## UFC Fight Model

In [1]:
# Import dependencies
import pandas as pd
import matplotlib.pyplot as plt
import os
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import make_column_selector as selector
from sklearn import set_config
from sklearn.metrics import classification_report
from sklearn.metrics import ConfusionMatrixDisplay

In [2]:
# Read in the CSV file (data.csv) as a DataFrame
ufc_df = pd.read_csv("Resources/clean_scraped_data.csv", low_memory=False)
ufc_df

Unnamed: 0.1,Unnamed: 0,Event_Date,Weight_Class,Max_Rounds,Ending_Round,Winner,Win_By,B_Name,B_Age,B_Height,...,B_Height_Bucket,Gender,R_BMI,B_BMI,R_BMI_proposed,B_BMI_proposed,R_Body_Fat_Percentage,B_Body_Fat_Percentage,R_Lean_Body_Mass,B_Lean_Body_Mass
0,0,2022-03-19,Heavyweight,5,1,Blue,SUB,Tom Aspinall,28,77,...,"(73.0, 83.0]",Male,28.2,30.4,25.8,28.2,25.138,25.138,187.15500,191.64672
1,1,2022-03-19,Featherweight,3,1,Red,KO/TKO,Dan Hooker,32,72,...,"(70.0, 73.0]",Male,22.0,19.7,21.8,18.9,15.720,15.720,122.20600,122.20600
2,2,2022-03-19,Lightweight,3,1,Red,SUB,Kazula Vargas,36,68,...,"(59.999, 68.0]",Male,22.2,23.6,21.7,23.3,15.838,15.838,130.45110,130.45110
3,3,2022-03-19,Welterweight,3,3,Red,DEC,Takashi Sato,31,70,...,"(68.0, 70.0]",Male,23.7,24.4,22.9,23.8,18.883,18.883,137.89890,137.89890
4,4,2022-03-19,Women's Flyweight,3,3,Red,KO/TKO,Luana Carolina,28,66,...,"(59.999, 68.0]",Female,21.5,20.2,21.9,20.3,25.845,25.845,92.69375,92.69375
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5714,6095,2001-05-04,Welterweight,3,3,Red,KO/TKO,Matt Serra,26,66,...,"(59.999, 68.0]",Male,24.4,27.4,23.8,27.5,19.216,19.216,137.33280,137.33280
5715,6096,2001-05-04,Heavyweight,3,2,Red,KO/TKO,Pete Williams,25,75,...,"(73.0, 83.0]",Male,27.7,29.4,25.0,27.7,23.483,23.483,202.77005,179.81495
5716,6097,2001-05-04,Light Heavyweight,3,3,Red,,Ricardo Almeida,24,72,...,"(70.0, 73.0]",Male,25.1,23.1,24.1,22.2,20.349,20.349,147.35435,135.40670
5717,6098,2001-05-04,Lightweight,3,1,Red,KO/TKO,Joey Gilbert,24,71,...,"(70.0, 73.0]",Male,22.9,21.6,22.5,20.9,16.011,16.011,130.18295,130.18295


### Select Features


In [None]:
# numerical_cols = [
    # "Max_Rounds",
    # "Ending_Round",
    # "B_Age",
    # "R_Age",
    # "B_Reach",
    # "R_Reach",
    # "B_Height",
    # "R_Height",
    # "B_Weight",
    # "R_Weight",
    # "R_BMI",
    # "B_BMI",
    # "R_BMI_proposed",
    # "B_BMI_proposed",
    # "R_Body_Fat_Percentage",
    # "B_Body_Fat_Percentage",
    # "R_Lean_Body_Mass",
    # "B_Lean_Body_Mass",
    # "B_Career_Significant_Strikes_Landed_PM",
    # "R_Career_Significant_Strikes_Landed_PM",
    # "R_Career_Striking_Accuracy",
    # "R_Career_Significant_Strike_Defence",
    # "R_Career_Takedown_Average",
    # "R_Career_Takedown_Accuracy",
    # "R_Career_Takedown_Defence",
    # "R_Career_Submission_Average",
    # "B_Career_Striking_Accuracy",
    # "B_Career_Significant_Strike_Defence",
    # "B_Career_Takedown_Average",
    # "B_Career_Takedown_Accuracy",
    # "B_Career_Takedown_Defence",
    # "B_Career_Submission_Average",
# ]
# categorical_cols = ["Win_By", "B_Stance", "R_Stance", "Weight_Class", "Gender"]

numerical_cols = selector(dtype_include="number")
categorical_cols = selector(dtype_include="category")

# target = "Winner"


## Split Train-Test


In [None]:
X = ufc_df.drop(target, axis=1)
y = ufc_df[target]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)


## Machine Learning Model


### Logistic Regression Pipeline


In [None]:
# Imputation transformer to replace missing values using null values along each column.
# Standardize features by removing the mean and scaling to unit variance with `StandardScalar()`.
numeric_transformer = Pipeline(
    steps=[
        ("imputer", SimpleImputer(strategy="constant", add_indicator=True)),
        ("scaler", StandardScaler()),
    ]
)

# categorical_transformer = OneHotEncoder(handle_unknown="ignore")
categorical_transformer = Pipeline(
    steps=[
        ("onehot", OneHotEncoder(handle_unknown="ignore")),
    ]
)


preprocessor = ColumnTransformer(
    transformers=[
        ("num", numeric_transformer, numerical_cols),
        ("cat", categorical_transformer, categorical_cols),
    ]
)

# Append classifier to preprocessing pipeline.
# Now we have a full prediction pipeline.
clf = Pipeline(
    steps=[
        ("preprocessor", preprocessor),
        ("classifier", LogisticRegression(max_iter=500)),
    ]
)


clf.fit(X_train, y_train)
print("model score: %.3f" % clf.score(X_test, y_test))


### Display Diagram of Pipeline


In [None]:
set_config(display="diagram")
clf


### Classification Report


In [None]:
y_pred = clf.predict(X_test)
print(classification_report(y_test, y_pred))


### Confusion Matrix


In [None]:
disp = ConfusionMatrixDisplay.from_predictions(
    y_test, y_pred, cmap="Blues", values_format="d"
)
plt.show()


### Model Persistence


In [None]:
# from joblib import dump, load

# dump(clf, "clf.joblib")

# !!! WARNING: DO NOT LOAD RANDOM OBJECTS !!!
# clf = load("clf.joblib")

# # Prediction based on saved pipeline.
# selection = X.iloc[[5]]
# clf.predict(selection)
