In [1]:
# Import dependencies
import pandas as pd
import matplotlib.pyplot as plt
import os


In [2]:
# Read the Kaggle UFC fight data (Resources/kaggle_data.csv) into a DataFrame.
# The path is relative to the notebook's working directory.
ufc_df = pd.read_csv("Resources/kaggle_data.csv")
# Preview the first five rows to sanity-check the load.
ufc_df.head()


Unnamed: 0,BPrev,BStreak,B_Age,B_Height,B_HomeTown,B_ID,B_Location,B_Name,B_Weight,B__Round1_Grappling_Reversals_Landed,...,R__Round5_TIP_Ground Time,R__Round5_TIP_Guard Control Time,R__Round5_TIP_Half Guard Control Time,R__Round5_TIP_Misc. Ground Control Time,R__Round5_TIP_Mount Control Time,R__Round5_TIP_Neutral Time,R__Round5_TIP_Side Control Time,R__Round5_TIP_Standing Time,winby,winner
0,0,0,38.0,193.0,Hounslow England,808,Amsterdam The Netherlands,Alistair Overeem,120.0,,...,,,,,,,,,DEC,blue
1,0,0,36.0,172.0,"Chicago, Illinois United States",1054,"Chicago, Illinois United States",Ricardo Lamas,65.0,,...,,,,,,,,,DEC,red
2,0,0,39.0,167.0,"Isla Vista , California USA",959,"Sacramento, California USA",Urijah Faber,61.0,,...,,,,,,,,,KO/TKO,red
3,0,0,33.0,167.0,"San Diego, CA USA",1056,"San Diego, CA USA",Danny Martinez,56.0,,...,,,,,,,,,DEC,red
4,0,0,36.0,185.0,Southampton England,2005,Southampton England,Tom Watson,84.0,,...,,,,,,,,,DEC,red


In [3]:
# Drop the non-beneficial columns.
#
# `errors="ignore"` keeps this cell idempotent: the result is assigned back to
# `ufc_df`, so without it a second run of the cell would raise a KeyError for
# columns that were already removed by the first run.
#
# NOTE(review): "BStreak" is listed without a matching "RStreak", unlike every
# other B/R pair below — confirm whether that asymmetry is intended.
columns_to_drop = [
    "BPrev",
    "RPrev",
    "BStreak",
    "B_Location",
    "R_Location",
    "Event_ID",
    "Fight_ID",
    "B_ID",
    "R_ID",
    "B_HomeTown",
    "R_HomeTown",
    "Date",
]
ufc_df = ufc_df.drop(columns=columns_to_drop, errors="ignore")
ufc_df.head()


Unnamed: 0,B_Age,B_Height,B_Name,B_Weight,B__Round1_Grappling_Reversals_Landed,B__Round1_Grappling_Standups_Landed,B__Round1_Grappling_Submissions_Attempts,B__Round1_Grappling_Takedowns_Attempts,B__Round1_Grappling_Takedowns_Landed,B__Round1_Strikes_Body Significant Strikes_Attempts,...,R__Round5_TIP_Ground Time,R__Round5_TIP_Guard Control Time,R__Round5_TIP_Half Guard Control Time,R__Round5_TIP_Misc. Ground Control Time,R__Round5_TIP_Mount Control Time,R__Round5_TIP_Neutral Time,R__Round5_TIP_Side Control Time,R__Round5_TIP_Standing Time,winby,winner
0,38.0,193.0,Alistair Overeem,120.0,,,,,,,...,,,,,,,,,DEC,blue
1,36.0,172.0,Ricardo Lamas,65.0,,,,,,,...,,,,,,,,,DEC,red
2,39.0,167.0,Urijah Faber,61.0,,,,,,,...,,,,,,,,,KO/TKO,red
3,33.0,167.0,Danny Martinez,56.0,,,,,,,...,,,,,,,,,DEC,red
4,36.0,185.0,Tom Watson,84.0,,,,,,,...,,,,,,,,,DEC,red


In [4]:
# Keep only wins and losses: rows whose `winner` is exactly "blue" or "red".
# Any other value (including missing) is filtered out.
ufc_df = ufc_df[ufc_df["winner"].isin(["blue", "red"])]


## Logistic Regression


In [5]:
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import make_column_selector as selector
from sklearn.linear_model import LogisticRegression

# Features are every remaining column except the target; the target is the
# winning corner.
# NOTE(review): X still contains `winby` and the fighter-name columns at this
# point — confirm these are intended to be used as model inputs.
X = ufc_df.drop(columns="winner")
y = ufc_df["winner"]

# Hold out 20% of the rows for the final evaluation (fixed seed so the split
# is reproducible).
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=1
)

# Numeric columns: fill missing values with a constant, then standardize by
# removing the mean and scaling to unit variance with `StandardScaler()`.
numeric_transformer = Pipeline(
    [
        ("imputer", SimpleImputer(strategy="constant")),
        ("scaler", StandardScaler()),
    ]
)

# Object (string) columns: one-hot encode; categories unseen during fit are
# ignored at transform time instead of raising.
categorical_transformer = OneHotEncoder(handle_unknown="ignore")

# Route each column to the matching transformer based on its dtype.
preprocessor = ColumnTransformer(
    [
        ("num", numeric_transformer, selector(dtype_exclude="object")),
        ("cat", categorical_transformer, selector(dtype_include="object")),
    ]
)

# Full prediction pipeline: preprocessing followed by the classifier.
# The step names are referenced by the grid-search parameter keys, so they
# must not be renamed.
clf = Pipeline(
    [
        ("preprocessor", preprocessor),
        ("classifier", LogisticRegression(max_iter=500, random_state=1)),
    ]
)

clf.fit(X_train, y_train)
print("model score: %.3f" % clf.score(X_test, y_test))


model score: 0.531


### Display Diagram of Pipeline


In [6]:
from sklearn import set_config

# Ask scikit-learn to render estimators as a diagram rather than a text repr.
set_config(display="diagram")
# Bare last expression: the notebook displays the fitted pipeline.
clf


### Classification Report


In [7]:
from sklearn.metrics import classification_report

# Per-class precision / recall / F1 of the baseline pipeline on the held-out
# test split.
y_pred = clf.predict(X_test)
report = classification_report(y_true=y_test, y_pred=y_pred)
print(report)


              precision    recall  f1-score   support

        blue       0.44      0.36      0.40       195
         red       0.58      0.66      0.62       261

    accuracy                           0.53       456
   macro avg       0.51      0.51      0.51       456
weighted avg       0.52      0.53      0.52       456



## Grid Search


Grid search can also be performed on the different preprocessing steps defined in the `ColumnTransformer` object, together with the classifier’s hyperparameters as part of the `Pipeline`.


Search over the imputer strategy of the numeric preprocessing, together with the regularization strength (`C`) and the solver of the logistic regression, using `GridSearchCV`.


In [8]:
from sklearn.model_selection import GridSearchCV

# Candidate values, one list per tuned hyper-parameter.
imputer_strategies = ["mean", "median", "most_frequent", "constant"]
regularization_strengths = [0.1, 1.0, 10, 100]
solvers = ["newton-cg", "lbfgs", "liblinear", "sag", "saga"]

# Keys address nested parameters of the `clf` pipeline using the
# `<step>__<sub-step>__<parameter>` naming convention.
param_grid = {
    "preprocessor__num__imputer__strategy": imputer_strategies,
    "classifier__C": regularization_strengths,
    "classifier__solver": solvers,
}

# 5-fold cross-validation over every combination; n_jobs=-1 uses all cores.
grid_search = GridSearchCV(estimator=clf, param_grid=param_grid, cv=5, n_jobs=-1)
grid_search


Calling `grid_search.fit` triggers the cross-validated search for the best hyper-parameter combination:


In [9]:
# Run the search on the training split only; the test split stays untouched
# until the final evaluation.
grid_search.fit(X_train, y_train)

# Header line followed by the winning parameter combination on its own line.
print("Best params:", grid_search.best_params_, sep="\n")


Best params:
{'classifier__C': 0.1, 'classifier__solver': 'saga', 'preprocessor__num__imputer__strategy': 'most_frequent'}


In [10]:
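# Optional scratchpad: uncomment any of the snippets below to list the
# parameter names that can be used as keys in `param_grid`.
#
# grid_search.get_params().keys()
#
# for parameter in clf.get_params():
#     print(parameter)
#
# clf.get_params()

# References:
# LogisticRegression API: https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LogisticRegression.html
# Solver comparison: https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression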


The internal cross-validation score obtained with those parameters is:


In [11]:
# Mean cross-validated score of the best parameter combination; it is computed
# on the training folds only, so the held-out test split plays no part in it.
print(f"Internal CV score: {grid_search.best_score_:.3f}")


Internal CV score: 0.590


We can also introspect the top grid search results as a pandas dataframe:


In [12]:
# Columns worth showing: the score statistics plus the three tuned parameters.
summary_columns = [
    "mean_test_score",
    "std_test_score",
    "param_preprocessor__num__imputer__strategy",
    "param_classifier__C",
    "param_classifier__solver",
]

# One row per parameter combination, best mean CV score first.
cv_results = pd.DataFrame(grid_search.cv_results_).sort_values(
    by="mean_test_score", ascending=False
)

# Show the top 20 combinations.
cv_results.loc[:, summary_columns].head(20)


Unnamed: 0,mean_test_score,std_test_score,param_preprocessor__num__imputer__strategy,param_classifier__C,param_classifier__solver
18,0.590012,0.00986,most_frequent,0.1,saga
37,0.587807,0.008626,median,1.0,saga
57,0.587259,0.008617,median,10.0,saga
77,0.587259,0.008617,median,100.0,saga
38,0.584525,0.012382,most_frequent,1.0,saga
39,0.583983,0.0213,constant,1.0,saga
78,0.583976,0.01228,most_frequent,100.0,saga
58,0.583976,0.01228,most_frequent,10.0,saga
59,0.583435,0.02084,constant,10.0,saga
79,0.583435,0.02084,constant,100.0,saga


The best hyper-parameters have been used to re-fit a final model on the full training set. We now evaluate that final model on held-out test data that was not used for hyperparameter tuning.


In [13]:
# Score the refit best estimator on the held-out test split.
held_out_score = grid_search.score(X_test, y_test)
print("best logistic regression from grid search: %.3f" % held_out_score)


best logistic regression from grid search: 0.577
