# UFC Fight Prediction


In [None]:
# Import dependencies
import pandas as pd
import matplotlib.pyplot as plt
import os
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import make_column_selector as selector
from sklearn.linear_model import LogisticRegression
from sklearn import set_config
from sklearn.metrics import classification_report
from sklearn.metrics import ConfusionMatrixDisplay


In [None]:
# Read the scraped fight data (Resources/scraped_data.csv) into a DataFrame and display it.
# NOTE(review): a TODO further down records a DtypeWarning (mixed-type columns) raised by this
# read; `low_memory=False` or an explicit `dtype=` mapping may resolve it — confirm before changing.
ufc_df = pd.read_csv("Resources/scraped_data.csv")
ufc_df


## Data Cleaning


In [None]:
# Remove duplicate rows, which can appear when a new event is scraped again.
# A fight is identified by its event date plus both fighters' names.
fight_key = ["Event_Date", "B_Name", "R_Name"]
ufc_df = ufc_df.drop_duplicates(subset=fight_key)


In [None]:
# Parse Event_Date into datetime64 so the date-based filter below can compare against it.
ufc_df = ufc_df.assign(Event_Date=pd.to_datetime(ufc_df["Event_Date"]))


In [None]:
# Keep only fights on or after the cutoff date; older fights lack rule consistency.
rules_cutoff = pd.to_datetime("5/3/2001")
is_on_or_after_cutoff = ufc_df["Event_Date"] >= rules_cutoff
ufc_df = ufc_df[is_on_or_after_cutoff]


In [None]:
# TODO: Look into what the mixed datatypes are in each column (from the warning message after reading the csv).
# DtypeWarning: Columns (2,97,121,138,141,145,162,165,257,298,301,305,322,325) have mixed types.
# ufc_df.iloc[:,2].unique()

# Count how many columns currently have each dtype.
ufc_df.dtypes.value_counts()


In [None]:
# Treat the scraper's placeholder strings as missing values.
# TODO: Fix handling of `--`, `---`, or `No Time Limit`; I was unsure on what each value means so I just set them to NaN.
# `np.nan` is used because the `np.NaN` alias was removed in NumPy 2.0.
missing_markers = ["--", "---", "No Time Limit"]
ufc_df = ufc_df.replace(missing_markers, np.nan)


In [None]:
# Split B_Draws at the first "(": the left part holds the draws,
# the right part (when present) holds the no-contest value.
B_Draws_Split = ufc_df["B_Draws"].str.split(pat="(", n=1, expand=True)

# Store the no-contest part in a new column and keep only the draws in B_Draws.
ufc_df = ufc_df.assign(
    B_No_Contest=B_Draws_Split[1],
    B_Draws=B_Draws_Split[0],
)


In [None]:
# Strip the "N", "C" and ")" characters from both ends so only the no-contest value remains.
# No fill step is needed: `.str.strip` leaves missing entries as NaN, and the former
# `fillna(np.nan, inplace=True)` was a no-op performed through chained assignment.
ufc_df["B_No_Contest"] = ufc_df["B_No_Contest"].str.strip("NC)")
ufc_df["B_No_Contest"]


In [None]:
# Split R_Draws at the first "(": the left part holds the draws,
# the right part (when present) holds the no-contest value.
R_Draws_Split = ufc_df["R_Draws"].str.split(pat="(", n=1, expand=True)

# Store the no-contest part in a new column and keep only the draws in R_Draws.
ufc_df = ufc_df.assign(
    R_No_Contest=R_Draws_Split[1],
    R_Draws=R_Draws_Split[0],
)


In [None]:
# Strip the "N", "C" and ")" characters from both ends so only the no-contest value remains.
# No fill step is needed: `.str.strip` leaves missing entries as NaN, and the former
# `fillna(np.nan, inplace=True)` was a no-op performed through chained assignment.
ufc_df["R_No_Contest"] = ufc_df["R_No_Contest"].str.strip("NC)")
ufc_df["R_No_Contest"]


In [None]:
# NEW COLUMN ORDER
new_column_order = [
    "Event_Date",
    "Weight_Class",
    "Max_Rounds",
    "Ending_Round",
    "Winner",
    "Win_By",
    "B_Name",
    "B_Age",
    "B_Height",
    "B_Weight",
    "B_Reach",
    "B_Stance",
    "B_Wins",
    "B_Losses",
    "B_Draws",
    "B_No_Contest",
    "B_Career_Significant_Strikes_Landed_PM",
    "B_Career_Striking_Accuracy",
    "B_Career_Significant_Strike_Defence",
    "B_Career_Takedown_Average",
    "B_Career_Takedown_Accuracy",
    "B_Career_Takedown_Defence",
    "B_Career_Submission_Average",
    "B_Knockdowns",
    "B_Significant_Strikes_Landed",
    "B_Significant_Strikes_Attempted",
    "B_Significant_Strike_Perc",
    "B_Significant_Strikes_Distance_Landed",
    "B_Significant_Strikes_Distance_Attempted",
    "B_Significant_Strikes_Clinch_Landed",
    "B_Significant_Strikes_Clinch_Attempted",
    "B_Significant_Strikes_Ground_Landed",
    "B_Significant_Strikes_Ground_Attempted",
    "B_Head_Significant_Strikes_Attempted",
    "B_Head_Significant_Strikes_Landed",
    "B_Body_Significant_Strikes_Attempted",
    "B_Body_Significant_Strikes_Landed",
    "B_Leg_Significant_Strikes_Attempted",
    "B_Leg_Significant_Strikes_Landed",
    "B_Total_Strikes_Attempted",
    "B_Total_Strikes_Landed",
    "B_Takedowns_Attempted",
    "B_Takedowns_Landed",
    "B_Takedown_Perc",
    "B_Submission_Attempts",
    "B_Grappling_Reversals",
    "B_Grappling_Control_Time",
    "B_Round_One_Knockdowns",
    "B_Round_One_Significant_Strikes_Landed",
    "B_Round_One_Significant_Strikes_Attempted",
    "B_Round_One_Significant_Strike_Perc",
    "B_Round_One_Significant_Strikes_Distance_Landed",
    "B_Round_One_Significant_Strikes_Distance_Attempted",
    "B_Round_One_Significant_Strikes_Clinch_Landed",
    "B_Round_One_Significant_Strikes_Clinch_Attempted",
    "B_Round_One_Significant_Strikes_Ground_Landed",
    "B_Round_One_Significant_Strikes_Ground_Attempted",
    "B_Round_One_Head_Significant_Strikes_Attempted",
    "B_Round_One_Head_Significant_Strikes_Landed",
    "B_Round_One_Body_Significant_Strikes_Attempted",
    "B_Round_One_Body_Significant_Strikes_Landed",
    "B_Round_One_Leg_Significant_Strikes_Attempted",
    "B_Round_One_Leg_Significant_Strikes_Landed",
    "B_Round_One_Total_Strikes_Attempted",
    "B_Round_One_Total_Strikes_Landed",
    "B_Round_One_Takedowns_Attempted",
    "B_Round_One_Takedowns_Landed",
    "B_Round_One_Takedown_Perc",
    "B_Round_One_Submission_Attempts",
    "B_Round_One_Grappling_Reversals",
    "B_Round_One_Grappling_Control_Time",
    "B_Round_Two_Knockdowns",
    "B_Round_Two_Significant_Strikes_Landed",
    "B_Round_Two_Significant_Strikes_Attempted",
    "B_Round_Two_Significant_Strike_Perc",
    "B_Round_Two_Significant_Strikes_Distance_Landed",
    "B_Round_Two_Significant_Strikes_Distance_Attempted",
    "B_Round_Two_Significant_Strikes_Clinch_Landed",
    "B_Round_Two_Significant_Strikes_Clinch_Attempted",
    "B_Round_Two_Significant_Strikes_Ground_Landed",
    "B_Round_Two_Significant_Strikes_Ground_Attempted",
    "B_Round_Two_Head_Significant_Strikes_Attempted",
    "B_Round_Two_Head_Significant_Strikes_Landed",
    "B_Round_Two_Body_Significant_Strikes_Attempted",
    "B_Round_Two_Body_Significant_Strikes_Landed",
    "B_Round_Two_Leg_Significant_Strikes_Attempted",
    "B_Round_Two_Leg_Significant_Strikes_Landed",
    "B_Round_Two_Total_Strikes_Attempted",
    "B_Round_Two_Total_Strikes_Landed",
    "B_Round_Two_Takedowns_Attempted",
    "B_Round_Two_Takedowns_Landed",
    "B_Round_Two_Takedown_Perc",
    "B_Round_Two_Submission_Attempts",
    "B_Round_Two_Grappling_Reversals",
    "B_Round_Two_Grappling_Control_Time",
    "B_Round_Three_Knockdowns",
    "B_Round_Three_Significant_Strikes_Landed",
    "B_Round_Three_Significant_Strikes_Attempted",
    "B_Round_Three_Significant_Strike_Perc",
    "B_Round_Three_Significant_Strikes_Distance_Landed",
    "B_Round_Three_Significant_Strikes_Distance_Attempted",
    "B_Round_Three_Significant_Strikes_Clinch_Landed",
    "B_Round_Three_Significant_Strikes_Clinch_Attempted",
    "B_Round_Three_Significant_Strikes_Ground_Landed",
    "B_Round_Three_Significant_Strikes_Ground_Attempted",
    "B_Round_Three_Head_Significant_Strikes_Attempted",
    "B_Round_Three_Head_Significant_Strikes_Landed",
    "B_Round_Three_Body_Significant_Strikes_Attempted",
    "B_Round_Three_Body_Significant_Strikes_Landed",
    "B_Round_Three_Leg_Significant_Strikes_Attempted",
    "B_Round_Three_Leg_Significant_Strikes_Landed",
    "B_Round_Three_Total_Strikes_Attempted",
    "B_Round_Three_Total_Strikes_Landed",
    "B_Round_Three_Takedowns_Attempted",
    "B_Round_Three_Takedowns_Landed",
    "B_Round_Three_Takedown_Perc",
    "B_Round_Three_Submission_Attempts",
    "B_Round_Three_Grappling_Reversals",
    "B_Round_Three_Grappling_Control_Time",
    "B_Round_Four_Knockdowns",
    "B_Round_Four_Significant_Strikes_Landed",
    "B_Round_Four_Significant_Strikes_Attempted",
    "B_Round_Four_Significant_Strike_Perc",
    "B_Round_Four_Significant_Strikes_Distance_Landed",
    "B_Round_Four_Significant_Strikes_Distance_Attempted",
    "B_Round_Four_Significant_Strikes_Clinch_Landed",
    "B_Round_Four_Significant_Strikes_Clinch_Attempted",
    "B_Round_Four_Significant_Strikes_Ground_Landed",
    "B_Round_Four_Significant_Strikes_Ground_Attempted",
    "B_Round_Four_Head_Significant_Strikes_Attempted",
    "B_Round_Four_Head_Significant_Strikes_Landed",
    "B_Round_Four_Body_Significant_Strikes_Attempted",
    "B_Round_Four_Body_Significant_Strikes_Landed",
    "B_Round_Four_Leg_Significant_Strikes_Attempted",
    "B_Round_Four_Leg_Significant_Strikes_Landed",
    "B_Round_Four_Total_Strikes_Attempted",
    "B_Round_Four_Total_Strikes_Landed",
    "B_Round_Four_Takedowns_Attempted",
    "B_Round_Four_Takedowns_Landed",
    "B_Round_Four_Takedown_Perc",
    "B_Round_Four_Submission_Attempts",
    "B_Round_Four_Grappling_Reversals",
    "B_Round_Four_Grappling_Control_Time",
    "B_Round_Five_Knockdowns",
    "B_Round_Five_Significant_Strikes_Landed",
    "B_Round_Five_Significant_Strikes_Attempted",
    "B_Round_Five_Significant_Strike_Perc",
    "B_Round_Five_Significant_Strikes_Distance_Landed",
    "B_Round_Five_Significant_Strikes_Distance_Attempted",
    "B_Round_Five_Significant_Strikes_Clinch_Landed",
    "B_Round_Five_Significant_Strikes_Clinch_Attempted",
    "B_Round_Five_Significant_Strikes_Ground_Landed",
    "B_Round_Five_Significant_Strikes_Ground_Attempted",
    "B_Round_Five_Head_Significant_Strikes_Attempted",
    "B_Round_Five_Head_Significant_Strikes_Landed",
    "B_Round_Five_Body_Significant_Strikes_Attempted",
    "B_Round_Five_Body_Significant_Strikes_Landed",
    "B_Round_Five_Leg_Significant_Strikes_Attempted",
    "B_Round_Five_Leg_Significant_Strikes_Landed",
    "B_Round_Five_Total_Strikes_Attempted",
    "B_Round_Five_Total_Strikes_Landed",
    "B_Round_Five_Takedowns_Attempted",
    "B_Round_Five_Takedowns_Landed",
    "B_Round_Five_Takedown_Perc",
    "B_Round_Five_Submission_Attempts",
    "B_Round_Five_Grappling_Reversals",
    "B_Round_Five_Grappling_Control_Time",
    "R_Name",
    "R_Age",
    "R_Height",
    "R_Weight",
    "R_Reach",
    "R_Stance",
    "R_Wins",
    "R_Losses",
    "R_Draws",
    "R_No_Contest",
    "R_Career_Significant_Strikes_Landed_PM",
    "R_Career_Striking_Accuracy",
    "R_Career_Significant_Strike_Defence",
    "R_Career_Takedown_Average",
    "R_Career_Takedown_Accuracy",
    "R_Career_Takedown_Defence",
    "R_Career_Submission_Average",
    "R_Knockdowns",
    "R_Significant_Strikes_Landed",
    "R_Significant_Strikes_Attempted",
    "R_Significant_Strike_Perc",
    "R_Significant_Strikes_Distance_Landed",
    "R_Significant_Strikes_Distance_Attempted",
    "R_Significant_Strikes_Clinch_Landed",
    "R_Significant_Strikes_Clinch_Attempted",
    "R_Significant_Strikes_Ground_Landed",
    "R_Significant_Strikes_Ground_Attempted",
    "R_Head_Significant_Strikes_Attempted",
    "R_Head_Significant_Strikes_Landed",
    "R_Body_Significant_Strikes_Attempted",
    "R_Body_Significant_Strikes_Landed",
    "R_Leg_Significant_Strikes_Attempted",
    "R_Leg_Significant_Strikes_Landed",
    "R_Total_Strikes_Attempted",
    "R_Total_Strikes_Landed",
    "R_Takedowns_Attempted",
    "R_Takedowns_Landed",
    "R_Takedown_Perc",
    "R_Submission_Attempts",
    "R_Grappling_Reversals",
    "R_Grappling_Control_Time",
    "R_Round_One_Knockdowns",
    "R_Round_One_Significant_Strikes_Landed",
    "R_Round_One_Significant_Strikes_Attempted",
    "R_Round_One_Significant_Strike_Perc",
    "R_Round_One_Significant_Strikes_Distance_Attempted",
    "R_Round_One_Significant_Strikes_Distance_Landed",
    "R_Round_One_Significant_Strikes_Clinch_Attempted",
    "R_Round_One_Significant_Strikes_Clinch_Landed",
    "R_Round_One_Significant_Strikes_Ground_Attempted",
    "R_Round_One_Significant_Strikes_Ground_Landed",
    "R_Round_One_Head_Significant_Strikes_Attempted",
    "R_Round_One_Head_Significant_Strikes_Landed",
    "R_Round_One_Body_Significant_Strikes_Attempted",
    "R_Round_One_Body_Significant_Strikes_Landed",
    "R_Round_One_Leg_Significant_Strikes_Attempted",
    "R_Round_One_Leg_Significant_Strikes_Landed",
    "R_Round_One_Total_Strikes_Attempted",
    "R_Round_One_Total_Strikes_Landed",
    "R_Round_One_Takedowns_Attempted",
    "R_Round_One_Takedowns_Landed",
    "R_Round_One_Takedown_Perc",
    "R_Round_One_Submission_Attempts",
    "R_Round_One_Grappling_Reversals",
    "R_Round_One_Grappling_Control_Time",
    "R_Round_Two_Knockdowns",
    "R_Round_Two_Significant_Strikes_Landed",
    "R_Round_Two_Significant_Strikes_Attempted",
    "R_Round_Two_Significant_Strike_Perc",
    "R_Round_Two_Significant_Strikes_Distance_Attempted",
    "R_Round_Two_Significant_Strikes_Distance_Landed",
    "R_Round_Two_Significant_Strikes_Clinch_Attempted",
    "R_Round_Two_Significant_Strikes_Clinch_Landed",
    "R_Round_Two_Significant_Strikes_Ground_Attempted",
    "R_Round_Two_Significant_Strikes_Ground_Landed",
    "R_Round_Two_Head_Significant_Strikes_Attempted",
    "R_Round_Two_Head_Significant_Strikes_Landed",
    "R_Round_Two_Body_Significant_Strikes_Attempted",
    "R_Round_Two_Body_Significant_Strikes_Landed",
    "R_Round_Two_Leg_Significant_Strikes_Attempted",
    "R_Round_Two_Leg_Significant_Strikes_Landed",
    "R_Round_Two_Total_Strikes_Attempted",
    "R_Round_Two_Total_Strikes_Landed",
    "R_Round_Two_Takedowns_Attempted",
    "R_Round_Two_Takedowns_Landed",
    "R_Round_Two_Takedown_Perc",
    "R_Round_Two_Submission_Attempts",
    "R_Round_Two_Grappling_Reversals",
    "R_Round_Two_Grappling_Control_Time",
    "R_Round_Three_Knockdowns",
    "R_Round_Three_Significant_Strikes_Landed",
    "R_Round_Three_Significant_Strikes_Attempted",
    "R_Round_Three_Significant_Strike_Perc",
    "R_Round_Three_Significant_Strikes_Distance_Attempted",
    "R_Round_Three_Significant_Strikes_Distance_Landed",
    "R_Round_Three_Significant_Strikes_Clinch_Attempted",
    "R_Round_Three_Significant_Strikes_Clinch_Landed",
    "R_Round_Three_Significant_Strikes_Ground_Attempted",
    "R_Round_Three_Significant_Strikes_Ground_Landed",
    "R_Round_Three_Head_Significant_Strikes_Attempted",
    "R_Round_Three_Head_Significant_Strikes_Landed",
    "R_Round_Three_Body_Significant_Strikes_Attempted",
    "R_Round_Three_Body_Significant_Strikes_Landed",
    "R_Round_Three_Leg_Significant_Strikes_Attempted",
    "R_Round_Three_Leg_Significant_Strikes_Landed",
    "R_Round_Three_Total_Strikes_Attempted",
    "R_Round_Three_Total_Strikes_Landed",
    "R_Round_Three_Takedowns_Attempted",
    "R_Round_Three_Takedowns_Landed",
    "R_Round_Three_Takedown_Perc",
    "R_Round_Three_Submission_Attempts",
    "R_Round_Three_Grappling_Reversals",
    "R_Round_Three_Grappling_Control_Time",
    "R_Round_Four_Knockdowns",
    "R_Round_Four_Significant_Strikes_Landed",
    "R_Round_Four_Significant_Strikes_Attempted",
    "R_Round_Four_Significant_Strike_Perc",
    "R_Round_Four_Significant_Strikes_Distance_Attempted",
    "R_Round_Four_Significant_Strikes_Distance_Landed",
    "R_Round_Four_Significant_Strikes_Clinch_Attempted",
    "R_Round_Four_Significant_Strikes_Clinch_Landed",
    "R_Round_Four_Significant_Strikes_Ground_Attempted",
    "R_Round_Four_Significant_Strikes_Ground_Landed",
    "R_Round_Four_Head_Significant_Strikes_Attempted",
    "R_Round_Four_Head_Significant_Strikes_Landed",
    "R_Round_Four_Body_Significant_Strikes_Attempted",
    "R_Round_Four_Body_Significant_Strikes_Landed",
    "R_Round_Four_Leg_Significant_Strikes_Attempted",
    "R_Round_Four_Leg_Significant_Strikes_Landed",
    "R_Round_Four_Total_Strikes_Attempted",
    "R_Round_Four_Total_Strikes_Landed",
    "R_Round_Four_Takedowns_Attempted",
    "R_Round_Four_Takedowns_Landed",
    "R_Round_Four_Takedown_Perc",
    "R_Round_Four_Submission_Attempts",
    "R_Round_Four_Grappling_Reversals",
    "R_Round_Four_Grappling_Control_Time",
    "R_Round_Five_Knockdowns",
    "R_Round_Five_Significant_Strikes_Landed",
    "R_Round_Five_Significant_Strikes_Attempted",
    "R_Round_Five_Significant_Strike_Perc",
    "R_Round_Five_Significant_Strikes_Distance_Attempted",
    "R_Round_Five_Significant_Strikes_Distance_Landed",
    "R_Round_Five_Significant_Strikes_Clinch_Attempted",
    "R_Round_Five_Significant_Strikes_Clinch_Landed",
    "R_Round_Five_Significant_Strikes_Ground_Attempted",
    "R_Round_Five_Significant_Strikes_Ground_Landed",
    "R_Round_Five_Head_Significant_Strikes_Attempted",
    "R_Round_Five_Head_Significant_Strikes_Landed",
    "R_Round_Five_Body_Significant_Strikes_Attempted",
    "R_Round_Five_Body_Significant_Strikes_Landed",
    "R_Round_Five_Leg_Significant_Strikes_Attempted",
    "R_Round_Five_Leg_Significant_Strikes_Landed",
    "R_Round_Five_Total_Strikes_Attempted",
    "R_Round_Five_Total_Strikes_Landed",
    "R_Round_Five_Takedowns_Attempted",
    "R_Round_Five_Takedowns_Landed",
    "R_Round_Five_Takedown_Perc",
    "R_Round_Five_Submission_Attempts",
    "R_Round_Five_Grappling_Reversals",
    "R_Round_Five_Grappling_Control_Time",
]


In [None]:
# Reorder the columns to match `new_column_order`.
# Note: `reindex` drops any column not in the list and creates an all-missing
# column for any listed name that is absent from the frame (e.g. a misspelled name).
ufc_df = ufc_df.reindex(columns=new_column_order)


In [None]:
# Let pandas infer the best nullable dtype for each column,
# then count how many columns ended up with each dtype.
ufc_df = ufc_df.convert_dtypes()
ufc_df.dtypes.value_counts()


In [None]:
# Preview the first rows of the reordered, type-converted frame.
ufc_df.head()


In [None]:
# Display the data types of all columns.
# The row limit is lifted only inside this block so the full listing is not truncated.
with pd.option_context('display.max_rows', None):
    print(ufc_df.dtypes)

## Data Exploration


### Winner (Red vs. Blue)


In [None]:
# Pie chart, Red vs. Blue win rate
# Count each outcome once and reuse the result for both labels and values.
winner_counts = ufc_df["Winner"].value_counts()
pie_labels = winner_counts.index
pie_values = winner_counts.values

# Only "explode" the 2nd slice. The tuple is sized to the number of slices because
# `pie` requires `explode` to have the same length as `x`; a fixed 2-tuple fails
# whenever "Winner" does not hold exactly two distinct values.
explode = tuple(0.1 if position == 1 else 0 for position in range(len(pie_values)))

fig1, ax1 = plt.subplots()

ax1.pie(
    x=pie_values,
    labels=pie_labels,
    # NOTE(review): colors are applied by position, so this assumes the most
    # frequent winner is the red corner — confirm against the data.
    colors=["red", "blue"],
    explode=explode,
    autopct="%1.1f%%",
    shadow=True,
    startangle=90,
)
ax1.axis("equal")  # Equal aspect ratio ensures that pie is drawn as a circle.
ax1.set_title("Win Rate (Red vs. Blue)")

plt.show()


### Age


In [None]:
# Summary statistics (count, mean, min, max, quartiles) for both corners' ages.
age_columns = ["R_Age", "B_Age"]
ufc_df[age_columns].describe()


In [None]:
# Box-and-whisker plot to make Age outliers visible for both corners.
age_box_columns = ["R_Age", "B_Age"]
ufc_df.boxplot(column=age_box_columns)


### Height


In [None]:
# Summary statistics (count, mean, min, max, quartiles) for both corners' heights.
height_columns = ["R_Height", "B_Height"]
ufc_df[height_columns].describe()


In [None]:
# Box-and-whisker plot to make Height outliers visible for both corners.
height_box_columns = ["R_Height", "B_Height"]
ufc_df.boxplot(column=height_box_columns)


### Weight


In [None]:
# Summary statistics (count, mean, min, max, quartiles) for both corners' weights.
weight_columns = ["R_Weight", "B_Weight"]
ufc_df[weight_columns].describe()


In [None]:
# Box-and-whisker plot to make Weight outliers visible for both corners.
weight_box_columns = ["R_Weight", "B_Weight"]
ufc_df.boxplot(column=weight_box_columns)


## Feature Engineering


### Feature Creation


#### Age Bucket


In [None]:
# Bin each corner's age into quartile-based buckets (four bins cut at the sample quantiles).
for age_col, bucket_col in (
    ("R_Age", "R_Age_Bucket"),
    ("B_Age", "B_Age_Bucket"),
):
    ufc_df[bucket_col] = pd.qcut(ufc_df[age_col], q=4)

# Drop age columns
# ufc_df = ufc_df.drop(columns=["R_Age", "B_Age"])


In [None]:
# Bar chart of how many red-corner fighters fall into each age bucket.
red_age_counts = ufc_df["R_Age_Bucket"].value_counts()
red_age_counts.plot.bar(color="red", rot=0, title="Red Age Buckets")


In [None]:
# Bar chart of how many blue-corner fighters fall into each age bucket.
blue_age_counts = ufc_df["B_Age_Bucket"].value_counts()
blue_age_counts.plot.bar(color="blue", rot=0, title="Blue Age Buckets")


#### Height Bucket


In [None]:
# Bin each corner's height into quartile-based buckets (four bins cut at the sample quantiles).
for height_col, bucket_col in (
    ("R_Height", "R_Height_Bucket"),
    ("B_Height", "B_Height_Bucket"),
):
    ufc_df[bucket_col] = pd.qcut(ufc_df[height_col], q=4)

# Drop height columns
# ufc_df = ufc_df.drop(columns=["R_Height", "B_Height"])


In [None]:
# Bar chart of how many red-corner fighters fall into each height bucket.
red_height_counts = ufc_df["R_Height_Bucket"].value_counts()
red_height_counts.plot.bar(color="red", rot=0, title="Red Height Buckets")


In [None]:
# Bar chart of how many blue-corner fighters fall into each height bucket.
blue_height_counts = ufc_df["B_Height_Bucket"].value_counts()
blue_height_counts.plot.bar(color="blue", rot=0, title="Blue Height Buckets")


#### Weight Class


|   Weight Class    | Minimum Weight (lb) | Maximum Weight (lb) |
| :---------------: | :-----------------: | :-----------------: |
|    Heavyweight    |         205         |         265         |
| Light Heavyweight |         185         |         205         |
|   Middleweight    |         170         |         185         |
|   Welterweight    |         155         |         170         |
|    Lightweight    |         145         |         155         |
|   Featherweight   |         135         |         145         |
|   Bantamweight    |         125         |         135         |
|     Flyweight     |         115         |         125         |
|   Strawweight\*   |          0          |         115         |

- "The women’s UFC division is split into only 4 classes: strawweight, flyweight, bantamweight, and featherweight. The strawweight class is only used in the women’s division and men do not compete in it."


In [None]:
# Re-check the weight distribution before filtering by weight class.
weight_stat_columns = ["R_Weight", "B_Weight"]
ufc_df[weight_stat_columns].describe()


In [None]:
# Keep rows that contain standardized weight class
standard_weight_classes = [
    "Heavyweight",
    "Light Heavyweight",
    "Middleweight",
    "Welterweight",
    "Lightweight",
    "Featherweight",
    "Bantamweight",
    "Flyweight",
    "Strawweight",
    "Women's Strawweight",
    "Women's Flyweight",
    "Women's Bantamweight",
    "Women's Featherweight",
    # TODO: Should we exclude "Catch Weight", "Super Heavyweight", and "Open Weight"?
    # "Catch Weight",
    # "Super Heavyweight",
    # "Open Weight",
    # TODO: Create `other` category for anything that does not fall in Weight_Class (?)
]

# `.isin` replaces the long chain of `==` comparisons. `.copy()` makes the filtered
# result an independent frame, so the column assignments in later cells do not
# raise SettingWithCopyWarning.
ufc_df = ufc_df.loc[ufc_df["Weight_Class"].isin(standard_weight_classes)].copy()


#### Gender


In [None]:
# Derive a Gender column from the weight class name.

# TODO: There might be a *better* way of determining Gender, but this seems to work.
# Alternative with string labels instead of 0/1:
# ufc_df["Gender"] = np.where(
#     ufc_df["Weight_Class"].str.contains("Women's"), "Female", "Male"
# )

# Gender (sex) is 0 if female and 1 if male.
is_womens_division = ufc_df["Weight_Class"].str.contains("Women's")
ufc_df["Gender"] = np.where(is_womens_division, 0, 1)


In [None]:
# Cross-check (disabled): number of rows whose weight class is a women's division.
# display(
#     ufc_df.loc[
#         (ufc_df.Weight_Class == "Women's Strawweight")
#         | (ufc_df.Weight_Class == "Women's Flyweight")
#         | (ufc_df.Weight_Class == "Women's Bantamweight")
#         | (ufc_df.Weight_Class == "Women's Featherweight")
#     ].shape[0]
# )

# Count the rows per Gender value.
ufc_df["Gender"].value_counts()


#### Body Mass Index (BMI)


[Calculating BMI Using the English System](https://www.cdc.gov/nccdphp/dnpao/growthcharts/training/bmiage/page5_2.html)

Formula:

> **weight** (lb) / [**height** (in)]<sup>2</sup> \* **703**


> "The normal BMI scores may not be accurate if you're very muscular because muscle can add extra kilos, resulting in a high BMI when you're not an unhealthy weight." ([National Health Service](https://www.nhs.uk/conditions/obesity/diagnosis/))


In [None]:
# BMI (English units): weight / height^2 * 703, rounded to one decimal place.
for bmi_col, weight_col, height_col in (
    ("R_BMI", "R_Weight", "R_Height"),  # Red fighter
    ("B_BMI", "B_Weight", "B_Height"),  # Blue fighter
):
    ufc_df[bmi_col] = (ufc_df[weight_col] / ufc_df[height_col] ** 2 * 703).round(1)


#### Proposed new BMI


[Proposed formula](https://web.archive.org/web/20220321050055/https://people.maths.ox.ac.uk/trefethen/bmi.html) by Nick Trefethen, Professor of numerical analysis at University of Oxford.

New formula:

> **5734** \* **weight** (lb) / [**height** (in)]<sup>2.5</sup>


In [None]:
# Proposed BMI formula: 5734 * weight / height^2.5, rounded to one decimal place.
for proposed_col, weight_col, height_col in (
    ("R_BMI_proposed", "R_Weight", "R_Height"),  # Red fighter
    ("B_BMI_proposed", "B_Weight", "B_Height"),  # Blue fighter
):
    ufc_df[proposed_col] = (5734 * ufc_df[weight_col] / ufc_df[height_col] ** 2.5).round(1)


#### Estimation of Body Fat


Adult Body Fat Percentage

Formula:

> Body fat % = (1.39 \* BMI) + (0.16 \* Age) - (10.34 \* Gender) - 9

- Where gender (sex) is 0 if female and 1 if male to account for the lower body fat percentage of men (International Journal of Obesity and Related Metabolic Disorders in 2002)


In [None]:
# Body fat % = (1.39 * BMI) + (0.16 * Age) - (10.34 * Gender) - 9,
# where Gender is 0 for female and 1 for male.

# Estimate Red Fighter Body Fat Percentage
ufc_df["R_Body_Fat_Percentage"] = (1.39 * ufc_df["R_BMI"]) + (
    0.16 * ufc_df["R_Age"] - (10.34 * ufc_df["Gender"]) - 9
)

# Estimate Blue Fighter Body Fat Percentage
# Uses the blue corner's own BMI and age (this line previously reused R_BMI and R_Age,
# which made the blue estimate a copy of the red one).
ufc_df["B_Body_Fat_Percentage"] = (1.39 * ufc_df["B_BMI"]) + (
    0.16 * ufc_df["B_Age"] - (10.34 * ufc_df["Gender"]) - 9
)


#### Lean Body Mass


"For calculating lean body mass, body fat percentage was subtracted from 100 to get the lean mass percentage and lean mass percentage was divided by 100 to calculate the decimal for lean mass percentage, and then lean mass decimal was multiplied with total body weight." [(Prakash KO, Choudhary R, Singh G. Lean body mass, body fat percentage, and handgrip strength as predictors of bone mineral density in postmenopausal women. J Mid-life Health 2021;12:299-303)](https://www.jmidlifehealth.org/text.asp?2021/12/4/299/336149)

Formula(s):

> Lean Mass Percentage = 100 - Body Fat Percentage

> Lean Mass Decimal = Lean Mass Percentage / 100

> Lean Body Mass = Lean Mass Decimal \* Total Body Weight

> Lean Body Mass = ((100 - Body Fat Percentage) / 100) \* Total Body Weight


In [None]:
# Lean body mass = ((100 - body fat %) / 100) * total body weight, per corner.
for lean_col, fat_col, weight_col in (
    ("R_Lean_Body_Mass", "R_Body_Fat_Percentage", "R_Weight"),  # Red fighter
    ("B_Lean_Body_Mass", "B_Body_Fat_Percentage", "B_Weight"),  # Blue fighter
):
    lean_fraction = (100 - ufc_df[fat_col]) / 100
    ufc_df[lean_col] = lean_fraction * ufc_df[weight_col]


### Set Categories


In [None]:
# Cast the label-like columns (win method, stances, weight class, gender) to category dtype.
for label_col in ("Win_By", "R_Stance", "B_Stance", "Weight_Class", "Gender"):
    ufc_df[label_col] = ufc_df[label_col].astype("category")

# Gender is stored as 0/1; rename the categories to readable labels.
gender_names = {0: "Female", 1: "Male"}
ufc_df["Gender"] = ufc_df["Gender"].cat.rename_categories(gender_names)


### View Features


In [None]:
# List the names of all columns with category dtype.
category_frame = ufc_df.select_dtypes(include=["category"])
category_frame.columns.tolist()


In [None]:
# List the names of all columns with a numeric dtype.
numeric_frame = ufc_df.select_dtypes(include=["number"])
numeric_frame.columns.tolist()


In [None]:
# List the columns that are still string or object dtype, as candidates for explicit casting.

# TODO: Evaluate why some of these columns are inferred as strings
display(ufc_df.select_dtypes(include=["string"]).columns.tolist())

# TODO: Evaluate why some of these columns are inferred as object
display(ufc_df.select_dtypes(include=["object"]).columns.tolist())


In [None]:
# Cast blue corner columns to their intended dtypes: the name becomes a category,
# everything else listed here becomes float.
blue_float_columns = [
    "B_Reach",
    "B_Draws",
    "B_No_Contest",
    "B_Significant_Strike_Perc",
    "B_Takedown_Perc",
    "B_Round_One_Significant_Strike_Perc",
    "B_Round_One_Takedown_Perc",
    "B_Round_Two_Significant_Strike_Perc",
    "B_Round_Two_Takedown_Perc",
    "B_Round_Three_Significant_Strike_Perc",
    "B_Round_Three_Takedown_Perc",
    "B_Round_Four_Significant_Strike_Perc",
    "B_Round_Four_Takedown_Perc",
    "B_Round_Five_Significant_Strike_Perc",
    "B_Round_Five_Takedown_Perc",
]
blue_dtypes = {"B_Name": "category"}
blue_dtypes.update(dict.fromkeys(blue_float_columns, "float"))

ufc_df = ufc_df.astype(blue_dtypes)


In [None]:
# Converting red corner values to the correct dtype (Categorical or numeric).
# The mapping mirrors the blue corner conversion; "R_Round_Four_Significant_Strike_Perc"
# was missing here although its blue counterpart is cast, so it is now included.
ufc_df = ufc_df.astype(
    {
        "R_Name": "category",
        "R_Reach": "float",
        "R_Significant_Strike_Perc": "float",
        "R_Takedown_Perc": "float",
        "R_Round_One_Significant_Strike_Perc": "float",
        "R_Round_One_Takedown_Perc": "float",
        "R_Round_Two_Significant_Strike_Perc": "float",
        "R_Round_Two_Takedown_Perc": "float",
        "R_Round_Three_Significant_Strike_Perc": "float",
        "R_Round_Three_Takedown_Perc": "float",
        "R_Round_Four_Takedown_Perc": "float",
        "R_Round_Five_Takedown_Perc": "float",
        "R_Round_Four_Significant_Strike_Perc": "float",
        "R_Round_Five_Significant_Strike_Perc": "float",
        "R_Draws": "float",
        "R_No_Contest": "float",
    }
)


In [None]:
# Save the cleaned, feature-engineered frame.
# NOTE(review): unlike the final export at the end of the notebook, this call does not pass
# `index=False`, so the row index is written as an extra unnamed first column — confirm
# whether that is intended.
ufc_df.to_csv("Resources/clean_scraped_data.csv")

### Select Features


In [None]:
# Columns fed to the numeric branch of the preprocessing pipeline.
# NOTE(review): "Ending_Round" (and "Win_By" below) look like fight outcomes that are only
# known after the bout; if the goal is pre-fight prediction they would leak the result — confirm.
numerical_cols = [
    "Max_Rounds",
    "Ending_Round",
    "B_Age",
    "R_Age",
    "B_Reach",
    "R_Reach",
    "B_Height",
    "R_Height",
    "B_Weight",
    "R_Weight",
    "R_BMI",
    "B_BMI",
    # "R_BMI_proposed",
    # "B_BMI_proposed",
    "R_Body_Fat_Percentage",
    "B_Body_Fat_Percentage",
    "R_Lean_Body_Mass",
    "B_Lean_Body_Mass",
    "B_Career_Significant_Strikes_Landed_PM",
    "R_Career_Significant_Strikes_Landed_PM",
    "R_Career_Striking_Accuracy",
    "R_Career_Significant_Strike_Defence",
    "R_Career_Takedown_Average",
    "R_Career_Takedown_Accuracy",
    "R_Career_Takedown_Defence",
    "R_Career_Submission_Average",
    "B_Career_Striking_Accuracy",
    "B_Career_Significant_Strike_Defence",
    "B_Career_Takedown_Average",
    "B_Career_Takedown_Accuracy",
    "B_Career_Takedown_Defence",
    "B_Career_Submission_Average",
]
# Columns fed to the one-hot-encoding branch of the pipeline.
categorical_cols = ["Win_By", "B_Stance", "R_Stance", "Weight_Class", "Gender"]

# Alternative (disabled): select features by dtype instead of by explicit name.
# numerical_cols = selector(dtype_include="number")
# categorical_cols = selector(dtype_include="category")

# Label column the model is trained to predict.
target = "Winner"


## Split Train-Test


In [None]:
# Separate the label from the features, then hold out 20% of the rows for testing.
# The fixed random_state keeps the split reproducible.
y = ufc_df[target]
X = ufc_df.drop(columns=target)

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=1,
)


## Machine Learning Model


### Logistic Regression Pipeline


In [None]:
# Numeric branch: fill missing values with a constant and append missing-value indicator
# columns (`add_indicator=True`), then standardize features by removing the mean and
# scaling to unit variance with `StandardScaler()`.
# NOTE(review): no `fill_value` is passed, so scikit-learn's default constant applies
# (documented as 0 for numerical data) — confirm that 0 is a sensible fill for these features.
numeric_transformer = Pipeline(
    steps=[
        ("imputer", SimpleImputer(strategy="constant", add_indicator=True)),
        ("scaler", StandardScaler()),
    ]
)

# Categorical branch: one-hot encode, with `handle_unknown="ignore"` for categories not seen during fit.
# categorical_transformer = OneHotEncoder(handle_unknown="ignore")
categorical_transformer = Pipeline(
    steps=[
        ("onehot", OneHotEncoder(handle_unknown="ignore")),
    ]
)


# Route each list of columns to its transformer. Columns in neither list are not passed
# to any transformer here.
preprocessor = ColumnTransformer(
    transformers=[
        ("num", numeric_transformer, numerical_cols),
        ("cat", categorical_transformer, categorical_cols),
    ]
)

# Append classifier to preprocessing pipeline.
# Now we have a full prediction pipeline.
clf = Pipeline(
    steps=[
        ("preprocessor", preprocessor),
        ("classifier", LogisticRegression(max_iter=500)),
    ]
)


# Fit on the training split and report the score on the held-out test split.
clf.fit(X_train, y_train)
print("model score: %.3f" % clf.score(X_test, y_test))


### Display Diagram of Pipeline


In [None]:
# Switch scikit-learn's estimator display to the diagram mode, then show the pipeline.
set_config(display="diagram")
clf


### Classification Report


In [None]:
# Predict on the held-out test features and print the classification report
# comparing the predictions with the true labels.
y_pred = clf.predict(X_test)
print(classification_report(y_test, y_pred))


### Confusion Matrix


In [None]:
# Plot the confusion matrix of true vs. predicted winners on the test split,
# with cell values formatted as integers (`values_format="d"`).
disp = ConfusionMatrixDisplay.from_predictions(
    y_test, y_pred, cmap="Blues", values_format="d"
)
plt.show()


### Model Persistence


In [None]:
# from joblib import dump, load

# dump(clf, "clf.joblib")

# !!! WARNING: DO NOT LOAD RANDOM OBJECTS !!!
# clf = load("clf.joblib")

# # Prediction based on saved pipeline.
# selection = X.iloc[[5]]
# clf.predict(selection)


## Export Dataset


In [None]:
# Export the final dataset to CSV; `index=False` keeps the row index out of the file.
ufc_df.to_csv("Resources/new_data.csv", index=False)
