In [None]:
# Import dependencies
import pandas as pd
import matplotlib.pyplot as plt
import os


In [None]:
# Load Resources/data.csv into a DataFrame and preview its first rows
csv_path = "Resources/data.csv"
ufc_df = pd.read_csv(csv_path)
ufc_df.head()


In [None]:
# Drop the non-beneficial columns.
# "BStreak" is deliberately not listed here: the following cell drops it on its
# own, and removing it in both places makes the second (strict) drop fail with a
# KeyError when the notebook is run top to bottom.
non_beneficial_columns = [
    "B_Location",
    "R_Location",
    "Event_ID",
    "Fight_ID",
    "B_Name",
    "R_Name",
    "R_ID",
]
ufc_df = ufc_df.drop(columns=non_beneficial_columns)
ufc_df.head()


In [7]:
# Drop the BStreak column since there is no accompanying RStreak column.
# errors="ignore" keeps this cell safe when BStreak is already gone (for example
# it was removed by an earlier drop, or this cell is re-executed on the
# reassigned frame); a strict drop would raise a KeyError in those cases.
ufc_df = ufc_df.drop(columns="BStreak", errors="ignore")
ufc_df.head()

Unnamed: 0,BPrev,B_Age,B_Height,B_HomeTown,B_ID,B_Name,B_Weight,B__Round1_Grappling_Reversals_Landed,B__Round1_Grappling_Standups_Landed,B__Round1_Grappling_Submissions_Attempts,...,R__Round5_TIP_Ground Time,R__Round5_TIP_Guard Control Time,R__Round5_TIP_Half Guard Control Time,R__Round5_TIP_Misc. Ground Control Time,R__Round5_TIP_Mount Control Time,R__Round5_TIP_Neutral Time,R__Round5_TIP_Side Control Time,R__Round5_TIP_Standing Time,winby,winner
0,0,38.0,193.0,Hounslow England,808,Alistair Overeem,120.0,,,,...,,,,,,,,,DEC,blue
1,0,36.0,172.0,"Chicago, Illinois United States",1054,Ricardo Lamas,65.0,,,,...,,,,,,,,,DEC,red
2,0,39.0,167.0,"Isla Vista , California USA",959,Urijah Faber,61.0,,,,...,,,,,,,,,KO/TKO,red
3,0,33.0,167.0,"San Diego, CA USA",1056,Danny Martinez,56.0,,,,...,,,,,,,,,DEC,red
4,0,36.0,185.0,Southampton England,2005,Tom Watson,84.0,,,,...,,,,,,,,,DEC,red


In [None]:
# Keep only wins and losses (i.e., Red & Blue)
divider = "-" * 20

# Display `value_counts()` on `winner` column before modification.
# The separator is a plain "\n" rather than os.linesep: stdout is a text stream,
# and os.linesep would emit "\r\n" on Windows.
print("Before", divider, ufc_df.winner.value_counts(), divider, "\n", sep="\n")

# Rows whose winner is neither "blue" nor "red" are discarded
is_blue_win = ufc_df.winner == "blue"
is_red_win = ufc_df.winner == "red"
ufc_df = ufc_df.loc[is_blue_win | is_red_win]

# Display results
print("After", divider, ufc_df.winner.value_counts(), divider, sep="\n")


In [None]:
# Pie chart, Red vs. Blue win rate
win_counts = ufc_df["winner"].value_counts()  # computed once, reused for labels and values
pie_labels = win_counts.index
pie_values = win_counts.values

# value_counts() orders labels by frequency, so each slice takes its colour from
# its own label instead of assuming "red" always comes first.
corner_colors = {"red": "red", "blue": "blue"}
pie_colors = [corner_colors.get(label, "gray") for label in pie_labels]

# only "explode" the 2nd slice; sized to the number of slices actually present
explode = tuple(0.1 if position == 1 else 0 for position in range(len(pie_labels)))

fig1, ax1 = plt.subplots()

ax1.pie(
    x=pie_values,
    labels=pie_labels,
    colors=pie_colors,
    explode=explode,
    autopct="%1.1f%%",
    shadow=True,
    startangle=90,
)
ax1.axis("equal")  # Equal aspect ratio ensures that pie is drawn as a circle.
ax1.set_title("Win Rate (Red vs. Blue)")

plt.show()


## Logistic Regression


In [None]:
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import make_column_selector as selector
from sklearn.linear_model import LogisticRegression

# Numeric columns: fill missing values with the column median (`SimpleImputer`),
# then standardize by removing the mean and scaling to unit variance (`StandardScaler`).
numeric_steps = [
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler()),
]
numeric_transformer = Pipeline(steps=numeric_steps)

# Object-dtype columns: one-hot encode; categories not seen during fit are
# ignored at transform time instead of raising.
categorical_transformer = OneHotEncoder(handle_unknown="ignore")

# Route each column to a transformer based on its dtype.
column_routes = [
    ("num", numeric_transformer, selector(dtype_exclude="object")),
    ("cat", categorical_transformer, selector(dtype_include="object")),
]
preprocessor = ColumnTransformer(transformers=column_routes)

# Chain preprocessing and the classifier into one full prediction pipeline.
pipeline_steps = [
    ("preprocessor", preprocessor),
    ("classifier", LogisticRegression(max_iter=500)),
]
clf = Pipeline(steps=pipeline_steps)

# Target is the `winner` column; every other column is a feature.
y = ufc_df["winner"]
X = ufc_df.drop(columns="winner")

# 80/20 train/test split with a fixed seed for repeatable results.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=1
)

clf.fit(X_train, y_train)
held_out_score = clf.score(X_test, y_test)
print("model score: %.3f" % held_out_score)


### Display Diagram of Pipeline


In [None]:
from sklearn import set_config

# Switch scikit-learn's estimator display to diagram mode, then leave `clf` as the
# cell's last expression so the notebook renders the pipeline.
set_config(display="diagram")
clf


### Classification Report


In [None]:
from sklearn.metrics import classification_report

# Predict on the held-out split and print the classification report for it
y_pred = clf.predict(X_test)
report_text = classification_report(y_test, y_pred)
print(report_text)


## Logistic Regression without ColumnTransformer

In [None]:
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score

# Build the feature matrix and the encoded target

# One-hot encode every object column, then remove the dummy columns generated
# from the target itself and replace remaining missing values with 0.
dummies_df = pd.get_dummies(ufc_df)
target_dummy_columns = ["winner_blue", "winner_red"]
X = dummies_df.drop(columns=target_dummy_columns).fillna(0)

# Encode the `winner` labels as integers
le = LabelEncoder()
y = ufc_df["winner"]
y_encoded = le.fit_transform(y)

In [None]:
# Hold out 20% of the rows for testing (fixed seed for repeatable results)
X_train, X_test, y_train, y_test = train_test_split(
    X, y_encoded, test_size=0.2, random_state=1
)

# Fit the scaler on the training split only, then apply it to both splits
scaler = StandardScaler()
X_scaler = scaler.fit(X_train)
X_train_scaled, X_test_scaled = (
    X_scaler.transform(split) for split in (X_train, X_test)
)

# Create and train the Logistic Regression Model
clf = LogisticRegression(solver="lbfgs", max_iter=200).fit(X_train_scaled, y_train)

# Predict outcomes for the scaled test split
y_pred = clf.predict(X_test_scaled)

# Print accuracy score
test_accuracy = accuracy_score(y_test, y_pred)
print(f" Logistic regression model accuracy: {test_accuracy:.3f}")