# Data Exploration


In [None]:
# Import dependencies
import pandas as pd
import matplotlib.pyplot as plt
import os


In [None]:
# Load the raw fight data (data.csv) into a DataFrame and preview the first rows
data_path = "Resources/data.csv"
ufc_df = pd.read_csv(data_path)
ufc_df.head()


In [None]:
# Convert every column to the best-fitting dtype that supports missing values (pd.NA)
ufc_df = ufc_df.convert_dtypes()


In [None]:
# Store `winby` as a categorical column (mapping form of `astype` touches only that column)
ufc_df = ufc_df.astype({"winby": "category"})


In [None]:
# Columns judged non-beneficial for the analysis; they are removed in one go
non_beneficial_columns = [
    "BPrev",
    "RPrev",
    "BStreak",
    "B_Location",
    "R_Location",
    "Event_ID",
    "Fight_ID",
    "B_ID",
    "R_ID",
    "B_HomeTown",
    "R_HomeTown",
    "Date",
]
ufc_df = ufc_df.drop(columns=non_beneficial_columns)
ufc_df.head()


In [None]:
# Remove fights whose `winner` is a draw or a no contest.
# NOTE: the red/blue filter in the following cell makes this step redundant, and this
#    step in turn makes that cell's before/after printout redundant. Both are kept for
#    now and will be cleaned up later.
not_draw = ufc_df.winner != "draw"
not_no_contest = ufc_df.winner != "no contest"
ufc_df = ufc_df[not_draw & not_no_contest]


In [None]:
# Keep only wins and losses (i.e., Red & Blue)

# Display `value_counts()` on `winner` column before modification.
# The separator is "\n" rather than os.linesep: print() writes to a text-mode stream
# that already translates newlines, so os.linesep ("\r\n" on Windows) would be doubled.
print("Before", "-" * 20, ufc_df.winner.value_counts(), "-" * 20, "\n", sep="\n")

# Rows whose winner is anything other than "blue" or "red" (including missing) are dropped
ufc_df = ufc_df.loc[ufc_df.winner.isin(["blue", "red"])]

# Display results
print("After", "-" * 20, ufc_df.winner.value_counts(), "-" * 20, sep="\n")


In [None]:
# Pie chart, Red vs. Blue win rate
win_counts = ufc_df["winner"].value_counts()  # computed once, reused for labels and sizes
pie_labels = win_counts.index
pie_values = win_counts.values
explode = (0, 0.1)  # only "explode" the 2nd slice

# value_counts() orders the slices by frequency, so which corner comes first depends on
# the data. Colour each slice from its own label instead of a fixed ["red", "blue"] list,
# otherwise the colours are swapped whenever blue has more wins. The `winner` values left
# after filtering are "red" and "blue", which are valid Matplotlib colour names.
pie_colors = [str(label) for label in pie_labels]

fig1, ax1 = plt.subplots()

ax1.pie(
    x=pie_values,
    labels=pie_labels,
    colors=pie_colors,
    explode=explode,
    autopct="%1.1f%%",
    shadow=True,
    startangle=90,
)
ax1.axis("equal")  # Equal aspect ratio ensures that pie is drawn as a circle.
ax1.set_title("Win Rate (Red vs. Blue)")

plt.show()


## Data Exploration: Age


In [None]:
# Summary statistics (count, mean, spread, min/max, quartiles) for both Age columns
age_columns = ["R_Age", "B_Age"]
ufc_df[age_columns].describe()


In [None]:
# Box & Whisker plot of both Age columns to make outliers visible
age_boxplot_columns = ["R_Age", "B_Age"]
ufc_df.boxplot(column=age_boxplot_columns)


In [None]:
# Split each corner's Age into four quantile-based buckets (quartiles).
# Blue is processed first so the new columns are appended in the order B, R.
for corner in ("B", "R"):
    ufc_df[f"{corner}_Age_Bucket"] = pd.qcut(ufc_df[f"{corner}_Age"], q=4)

ufc_df.head()


In [None]:
# Bar chart of how many red-corner fighters fall in each age bucket
red_age_counts = ufc_df["R_Age_Bucket"].value_counts()
red_age_counts.plot.bar(color="red", rot=0, title="Red Age Buckets")


In [None]:
# Bar chart of how many blue-corner fighters fall in each age bucket
blue_age_counts = ufc_df["B_Age_Bucket"].value_counts()
blue_age_counts.plot.bar(color="blue", rot=0, title="Blue Age Buckets")


In [None]:
# Drop age columns; I do not think we should follow through with bucketing based on age other than for visualizations.
# ufc_df = ufc_df.drop(columns=["R_Age", "B_Age"])


## Data Exploration: Height


In [None]:
# Summary statistics (count, mean, spread, min/max, quartiles) for both Height columns
height_columns = ["R_Height", "B_Height"]
ufc_df[height_columns].describe()


In [None]:
# Box & Whisker plot of both Height columns to make outliers visible
height_boxplot_columns = ["R_Height", "B_Height"]
ufc_df.boxplot(column=height_boxplot_columns)


In [None]:
# Split each corner's Height into four quantile-based buckets (quartiles).
# Blue is processed first so the new columns are appended in the order B, R.
for corner in ("B", "R"):
    ufc_df[f"{corner}_Height_Bucket"] = pd.qcut(ufc_df[f"{corner}_Height"], q=4)

ufc_df.head()


In [None]:
# Bar chart of how many red-corner fighters fall in each height bucket
red_height_counts = ufc_df["R_Height_Bucket"].value_counts()
red_height_counts.plot.bar(color="red", rot=0, title="Red Height Buckets")


In [None]:
# Bar chart of how many blue-corner fighters fall in each height bucket
blue_height_counts = ufc_df["B_Height_Bucket"].value_counts()
blue_height_counts.plot.bar(color="blue", rot=0, title="Blue Height Buckets")


## Data Exploration: Weight


|   Weight Class    | Minimum Weight (kg) | Maximum Weight (kg) |
| :---------------: | :------------: | :------------: |
|    Heavyweight    |       93       |      120       |
| Light Heavyweight |      83.9      |       93       |
|   Middleweight    |      77.1      |      83.9      |
|   Welterweight    |      70.3      |      77.1      |
|    Lightweight    |      65.8      |      70.3      |
|   Featherweight   |      61.2      |      65.8      |
|   Bantamweight    |      56.7      |      61.2      |
|     Flyweight     |      52.2      |      56.7      |
|   Strawweight\*   |       0        |      52.2      |

- "The women’s UFC division is split into only 4 classes: strawweight, flyweight, bantamweight, and featherweight. The strawweight class is only used in the women’s division and men do not compete in it."


In [None]:
# Weight classes from lightest to heaviest, each paired with its upper limit.
# A class's lower limit is the previous class's upper limit (0 for the lightest),
# so the bin edges and the labels are both derived from this single table.
weight_class_limits = [
    ("Strawweight", 52.2),
    ("Flyweight", 56.7),
    ("Bantamweight", 61.2),
    ("Featherweight", 65.8),
    ("Lightweight", 70.3),
    ("Welterweight", 77.1),
    ("Middleweight", 83.9),
    ("Light Heavyweight", 93),
    ("Heavyweight", 120),
]

bins = [0] + [upper_limit for _, upper_limit in weight_class_limits]
labels = [class_name for class_name, _ in weight_class_limits]


### Red Weight


In [None]:
# Summary statistics for the red corner's weight
ufc_df["R_Weight"].describe()


In [None]:
# Box & Whisker plot of the red corner's weight
ufc_df.boxplot(column="R_Weight")


In [None]:
# Map each red-corner weight onto its UFC weight class using the shared bins/labels
red_weight_class = pd.cut(ufc_df["R_Weight"], bins=bins, labels=labels)
ufc_df["R_Weight_Class"] = red_weight_class


### Blue Weight


In [None]:
# Summary statistics for the blue corner's weight
ufc_df["B_Weight"].describe()


In [None]:
# Box & Whisker plot of the blue corner's weight
ufc_df.boxplot(column="B_Weight")


In [None]:
# Map each blue-corner weight onto its UFC weight class using the shared bins/labels
blue_weight_class = pd.cut(ufc_df["B_Weight"], bins=bins, labels=labels)
ufc_df["B_Weight_Class"] = blue_weight_class


## Testing

Making sure `R_Weight_Class` == `B_Weight_Class`; we should not need two columns for this.


In [None]:
# Count how many fights have matching (True) vs. differing (False) weight classes
same_weight_class = ufc_df["R_Weight_Class"] == ufc_df["B_Weight_Class"]
pd.DataFrame(same_weight_class).value_counts()


In [None]:
# TODO: Our current dataset has issues with weight.
# Many rows have weight classes that do not match between the two fighters
# (fighter_r vs. fighter_b).
# The weights are given in kg without decimals, so the values are rounded.
# The weight classes have also changed historically.

# For #2312:
# Fight outcome: http://ufcstats.com/fight-details/397ace87deeb8697
# Matchup stats: http://ufcstats.com/fight-details/06641a8c62e45661

# Open question: does ufcstats show the weight at the time of the fight or the current weight?

# List the fights whose two corners were assigned different weight classes
mismatch_columns = [
    "R_Weight",
    "B_Weight",
    "R_Weight_Class",
    "B_Weight_Class",
    "R_Name",
    "B_Name",
]
class_mismatch = ufc_df["R_Weight_Class"] != ufc_df["B_Weight_Class"]
ufc_df.loc[class_mismatch, mismatch_columns]


In [None]:
# Save mismatched weightclasses to csv file

# ufc_df.loc[ufc_df["R_Weight_Class"] != ufc_df["B_Weight_Class"]].to_csv(
#     "out.csv",
#     columns=[
#         "R_Weight",
#         "B_Weight",
#         "R_Weight_Class",
#         "B_Weight_Class",
#         "R_Name",
#         "B_Name",
#     ],
# )


In [None]:
# Print the number of missing values per column: red weight and weight class first,
# then blue weight and weight class
weight_columns = ("R_Weight", "R_Weight_Class", "B_Weight", "B_Weight_Class")
for column in weight_columns:
    print(ufc_df[column].isna().sum())


# Machine Learning Model


## Logistic Regression


In [None]:
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import make_column_selector as selector
from sklearn.linear_model import LogisticRegression


# Numeric features. The raw weight columns are commented out; weight enters the model
# through the weight-class categoricals below.
# NOTE(review): `Last_round` and `winby` look like information that only exists after a
# fight has finished - confirm they are available at prediction time (possible leakage).
numerical_cols = [
    "Last_round",
    "Max_round",
    "B_Height",
    # "B_Weight",
    "R_Height",
    # "R_Weight",
    "B_Age",
    "R_Age",
]
categorical_cols = [
    "winby",
    "B_Weight_Class",
    "R_Weight_Class",
]  # TODO: There should only be a single weight class

# numerical_cols = selector(dtype_include="number")
# categorical_cols = selector(dtype_include="category")

# Imputation transformer to replace missing values using the median along each column.
# (strategy="constant" would fill numeric gaps with 0, i.e. an age or height of 0,
# which contradicts the intent above and distorts the scaling that follows.)
# Standardize features by removing the mean and scaling to unit variance with `StandardScaler()`.
numeric_transformer = Pipeline(
    steps=[
        ("imputer", SimpleImputer(strategy="median")),
        ("scaler", StandardScaler()),
    ]
)

# One-hot encode the categorical features; categories unseen during fit are ignored
# at transform time instead of raising.
# categorical_transformer = OneHotEncoder(handle_unknown="ignore")
categorical_transformer = Pipeline(
    steps=[
        ("onehot", OneHotEncoder(handle_unknown="ignore")),
    ]
)

# Route each group of columns through its own transformer. Columns not listed in
# either group are not passed to the classifier.
preprocessor = ColumnTransformer(
    transformers=[
        ("num", numeric_transformer, numerical_cols),
        ("cat", categorical_transformer, categorical_cols),
    ]
)

# Append classifier to preprocessing pipeline.
# Now we have a full prediction pipeline.
clf = Pipeline(
    steps=[
        ("preprocessor", preprocessor),
        ("classifier", LogisticRegression(max_iter=500)),
    ]
)

# Features are every column except the target; the target is the winning corner.
X = ufc_df.drop(columns="winner")
y = ufc_df["winner"]

# Hold out 20% of the fights for evaluation; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)

clf.fit(X_train, y_train)
print(f"model score: {clf.score(X_test, y_test):.3f}")


### Display Diagram of Pipeline


In [None]:
from sklearn import set_config

# Switch scikit-learn's estimator display to "diagram" so that the bare `clf`
# expression below is rendered as a diagram of the pipeline.
set_config(display="diagram")
clf


### Classification Report


In [None]:
from sklearn.metrics import classification_report

# Predict the held-out fights, then print the per-class metrics report
y_pred = clf.predict(X_test)
report = classification_report(y_test, y_pred)
print(report)


### Model Persistence


In [None]:
# from joblib import dump, load

# dump(clf, "clf.joblib")

# !!! WARNING: DO NOT LOAD RANDOM OBJECTS !!!
# clf = load("clf.joblib")

# # Prediction based on saved pipeline.
# selection = X.iloc[[5]]
# clf.predict(selection)


### Export Dataset


In [None]:
# Write the processed DataFrame to CSV, leaving out the row index
export_path = "Resources/new_data.csv"
ufc_df.to_csv(export_path, index=False)
