# UFC Fight EDA and Prediction

## Import necessary libraries

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import cross_val_score
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier


## Load and clean data

In [3]:
data = pd.read_csv('ufc-master.csv')
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6528 entries, 0 to 6527
Columns: 118 entries, RedFighter to BKOOdds
dtypes: bool(1), float64(60), int64(43), object(14)
memory usage: 5.8+ MB


In [4]:
not_stats = data.select_dtypes(include=['object']).columns
not_stats

Index(['RedFighter', 'BlueFighter', 'Date', 'Location', 'Country', 'Winner',
       'WeightClass', 'Gender', 'BlueStance', 'RedStance', 'BetterRank',
       'Finish', 'FinishDetails', 'FinishRoundTime'],
      dtype='object')

In [5]:
data.drop(columns = ['RedFighter', 'BlueFighter', 'Date', 'Location', 'Country', 'Gender', 'BetterRank','Finish', 'FinishDetails', 'FinishRoundTime'], inplace = True)

In [6]:
data.select_dtypes(include=['object'])

Unnamed: 0,Winner,WeightClass,BlueStance,RedStance
0,Red,Flyweight,Orthodox,Orthodox
1,Red,Welterweight,Orthodox,Orthodox
2,Red,Heavyweight,Orthodox,Orthodox
3,Red,Featherweight,Southpaw,Southpaw
4,Blue,Featherweight,Orthodox,Orthodox
...,...,...,...,...
6523,Blue,Lightweight,Orthodox,Orthodox
6524,Red,Welterweight,Southpaw,Orthodox
6525,Red,Heavyweight,Orthodox,Orthodox
6526,Red,Welterweight,Orthodox,Orthodox


In [7]:
stance_map = {'Orthodox': 0, 'Southpaw': 1, 'Switch': 2, 'Open Stance': 3}
data['BlueStance'] = data['BlueStance'].replace(stance_map)
data['RedStance'] = data['RedStance'].replace(stance_map)

win_map = {'Red': 0, 'Blue': 1}
data['Winner'] = data['Winner'].replace(win_map)
data['Winner'].replace(np.nan, 2, inplace = True)

  data['RedStance'] = data['RedStance'].replace(stance_map)
  data['Winner'] = data['Winner'].replace(win_map)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  data['Winner'].replace(np.nan, 2, inplace = True)


In [8]:
data['Winner'].unique()
data['Winner'].value_counts()

Winner
0    3787
1    2741
Name: count, dtype: int64

In [9]:
data.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6528 entries, 0 to 6527
Columns: 108 entries, RedOdds to BKOOdds
dtypes: bool(1), float64(60), int64(45), object(2)
memory usage: 5.3+ MB


In [10]:
x = data.drop(columns = ['Winner'])
print(x.columns)
y = data['Winner']

Index(['RedOdds', 'BlueOdds', 'RedExpectedValue', 'BlueExpectedValue',
       'TitleBout', 'WeightClass', 'NumberOfRounds', 'BlueCurrentLoseStreak',
       'BlueCurrentWinStreak', 'BlueDraws',
       ...
       'BFlyweightRank', 'BPFPRank', 'FinishRound', 'TotalFightTimeSecs',
       'RedDecOdds', 'BlueDecOdds', 'RSubOdds', 'BSubOdds', 'RKOOdds',
       'BKOOdds'],
      dtype='object', length=107)


In [11]:
def fill_nan(df):
    df_filled = df.copy()
    numeric_cols = df.select_dtypes(include=[np.number]).columns
    df_filled[numeric_cols] = df_filled.groupby('WeightClass')[numeric_cols].transform(lambda x: x.fillna(x.mean()))

    remaining_na = df_filled[numeric_cols].isna().sum().sum()
    if remaining_na > 0:
        df_filled[numeric_cols] = df_filled[numeric_cols].fillna(df_filled[numeric_cols].mean())
    
    return df_filled

x_filled = fill_nan(x)

x_filled.replace(np.nan, 0, inplace = True)

x_filled_num = x_filled.select_dtypes(include=['float64', 'int64'])


### Get Features

In [12]:
from sklearn.feature_selection import SelectKBest, f_classif

selector = SelectKBest(f_classif, k=30)
x_new = selector.fit_transform(x_filled_num, y)

selected_features = x_filled_num.columns[selector.get_support()]
print('Selected features:', selected_features)

Selected features: Index(['RedOdds', 'BlueOdds', 'RedExpectedValue', 'BlueExpectedValue',
       'BlueCurrentWinStreak', 'BlueAvgSigStrPct', 'BlueAvgTDLanded',
       'BlueLosses', 'RedCurrentLoseStreak', 'RedCurrentWinStreak',
       'RedAvgSigStrPct', 'RedAvgTDLanded', 'RedAvgTDPct', 'RedLosses',
       'RedTotalRoundsFought', 'RedWinsByDecisionSplit', 'RedAge', 'BlueAge',
       'WinStreakDif', 'TotalRoundDif', 'ReachDif', 'SigStrDif',
       'AvgSubAttDif', 'AvgTDDif', 'RedDecOdds', 'BlueDecOdds', 'RSubOdds',
       'BSubOdds', 'RKOOdds', 'BKOOdds'],
      dtype='object')


  f = msb / msw


In [13]:
train_x, test_x, y_train, y_test = train_test_split(x_new, y, test_size = 0.2, random_state = 42)

In [14]:
print(train_x.shape)
print(test_x.shape)
print(y_train.shape)
print(y_test.shape)

(5222, 30)
(1306, 30)
(5222,)
(1306,)


In [15]:
seed = 420
np.random.seed(seed)

## Create models

### Logistic Regression 

In [16]:
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
x_train = scaler.fit_transform(train_x)
x_test = scaler.transform(test_x)

lr1 = LogisticRegression(max_iter = 1000)
kfold = StratifiedKFold(n_splits = 10, shuffle = True, random_state = seed)
cv_lr = cross_val_score(lr1, x_train, y_train, cv = kfold)
lr_score = cv_lr.mean()

print('Logistic Regression K-fold scores: ', cv_lr)
print('Logistic Regression K-fold mean score: ', lr_score)

Logistic Regression K-fold scores:  [0.68451243 0.63671128 0.66283525 0.66091954 0.64750958 0.70881226
 0.66091954 0.63601533 0.64750958 0.66091954]
Logistic Regression K-fold mean score:  0.6606664322395843


### Decision Tree

In [17]:
from sklearn import tree

dt = tree.DecisionTreeClassifier(random_state = 1)
kfold = StratifiedKFold(n_splits = 10, shuffle = True, random_state = seed)
cv_dt = cross_val_score(dt, x_train, y_train, cv = kfold)
dt_score = cv_dt.mean()

print('Decision Tree K-fold scores: ', cv_dt)
print('Decision Tree K-fold mean score: ', dt_score)

Decision Tree K-fold scores:  [0.583174   0.58699809 0.57471264 0.55555556 0.57662835 0.56896552
 0.59003831 0.57471264 0.57854406 0.58812261]
Decision Tree K-fold mean score:  0.5777451777616609


### K-Nearest Neighbors

In [18]:
from sklearn.neighbors import KNeighborsClassifier

knn = KNeighborsClassifier()
kfold = StratifiedKFold(n_splits = 10, shuffle = True, random_state = seed)
cv_knn = cross_val_score(knn, x_train, y_train, cv = kfold)
knn_score = cv_knn.mean()

print('K-Nearest Neighbors K-fold scores: ', cv_knn)
print('K-Nearest Neighbors K-fold mean score: ', knn_score)

K-Nearest Neighbors K-fold scores:  [0.60038241 0.59655832 0.55938697 0.57088123 0.58812261 0.62643678
 0.57088123 0.6091954  0.62068966 0.59961686]
K-Nearest Neighbors K-fold mean score:  0.5942151454546787


### Random Forest

In [19]:
from sklearn.ensemble import RandomForestClassifier

rf = RandomForestClassifier(random_state = 1)
kfold = StratifiedKFold(n_splits = 10, shuffle = True, random_state = seed)
cv_rf = cross_val_score(rf, x_train, y_train, cv = kfold)
rf_score = cv_rf.mean()

print('Random Forest K-fold scores: ', cv_rf)
print('Random Forest K-fold mean score: ', rf_score)

Random Forest K-fold scores:  [0.66921606 0.63097514 0.63793103 0.64750958 0.62068966 0.68007663
 0.64750958 0.63793103 0.65517241 0.66475096]
Random Forest K-fold mean score:  0.6491762085814964


### Neural Network

In [20]:
import torch
import torch.nn as nn
from sklearn.preprocessing import LabelEncoder
import torch.nn.functional as F

encoder = LabelEncoder()
encoder.fit(y_train)
encoded_y_train = encoder.transform(y_train)
y_train_tensor = torch.tensor(encoded_y_train, dtype = torch.long)

encoded_y_test = encoder.transform(y_test)
y_test_tensor = torch.tensor(encoded_y_test, dtype = torch.long)

x_train_tensor = torch.tensor(x_train, dtype = torch.float32)
x_test_tensor = torch.tensor(x_test, dtype = torch.float32)

In [21]:
class NeuralNetwork(nn.Module):
    def __init__(self, input_dim):
        super(NeuralNetwork, self).__init__()

        self.layer1 = nn.Linear(input_dim, 64)
        self.layer2 = nn.Linear(64, 128)
        self.layer3 = nn.Linear(128, 128)
        self.layer4 = nn.Linear(128, 32)
        self.layer5 = nn.Linear(32, 2)

        self.dropout = nn.Dropout(0.1)

    def forward(self, x):
        x = F.relu(self.layer1(x))
        x = F.relu(self.layer2(x))
        x = self.dropout(x)
        x = F.relu(self.layer3(x))
        x = F.relu(self.layer4(x))
        x = self.dropout(x)
        x = self.layer5(x)

        return F.log_softmax(x, dim = 1)

In [22]:
model = NeuralNetwork(x_train_tensor.shape[1])

In [23]:
criterion  = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters())

In [24]:
def train_model(model, x_train, y_train, epochs = 10, batch_size = 32):
    model.train()
    for epoch in range(epochs):
        optimizer.zero_grad()
        outputs = model(x_train)
        loss = criterion(outputs, y_train)
        loss.backward()
        optimizer.step()

        if epoch % 10 == 0:
            print(f'Epoch {epoch}, Loss: {loss.item():.4f}')

In [25]:
def evaluate(model, x_test, y_test):
    model.eval()
    with torch.no_grad():
        outputs = model(x_test)
        _, predicted = torch.max(outputs.data, 1)
        accuracy = (predicted == y_test).sum().item()/len(y_test)
        return accuracy

In [26]:
train_model = train_model(model, x_train_tensor, y_train_tensor, epochs = 500)

Epoch 0, Loss: 0.6983
Epoch 10, Loss: 0.6611
Epoch 20, Loss: 0.6150
Epoch 30, Loss: 0.6109
Epoch 40, Loss: 0.6032
Epoch 50, Loss: 0.5971
Epoch 60, Loss: 0.5896
Epoch 70, Loss: 0.5776
Epoch 80, Loss: 0.5581
Epoch 90, Loss: 0.5362
Epoch 100, Loss: 0.4977
Epoch 110, Loss: 0.4752
Epoch 120, Loss: 0.4445
Epoch 130, Loss: 0.4184
Epoch 140, Loss: 0.3929
Epoch 150, Loss: 0.3802
Epoch 160, Loss: 0.3505
Epoch 170, Loss: 0.3467
Epoch 180, Loss: 0.3263
Epoch 190, Loss: 0.3107
Epoch 200, Loss: 0.3006
Epoch 210, Loss: 0.2756
Epoch 220, Loss: 0.2665
Epoch 230, Loss: 0.2528
Epoch 240, Loss: 0.2437
Epoch 250, Loss: 0.2318
Epoch 260, Loss: 0.2211
Epoch 270, Loss: 0.2156
Epoch 280, Loss: 0.2092
Epoch 290, Loss: 0.1944
Epoch 300, Loss: 0.1849
Epoch 310, Loss: 0.1833
Epoch 320, Loss: 0.1679
Epoch 330, Loss: 0.1609
Epoch 340, Loss: 0.1665
Epoch 350, Loss: 0.1442
Epoch 360, Loss: 0.1359
Epoch 370, Loss: 0.1380
Epoch 380, Loss: 0.1361
Epoch 390, Loss: 0.1516
Epoch 400, Loss: 0.1287
Epoch 410, Loss: 0.1345
Epo

In [27]:
def predict(model, x):
    model.eval()
    with torch.no_grad():
        x = torch.FloatTensor(x)
        outputs = model(x)
        _, predicted = torch.max(outputs.data, 1)
        return predicted

In [28]:
nn_score = evaluate(model, x_test_tensor, y_test_tensor)

print('Neural Network accuracy: ', nn_score)

Neural Network accuracy:  0.5934150076569679


### Best Performing Models

In [29]:
scores = [['Logistic Regression', lr_score],
 ['Random Forest', rf_score],
 ['Decision Tree', dt_score],
 ['K-Nearest Neighbor', knn_score],
 ['Neural Network', nn_score]]

df_scores = pd.DataFrame(scores, columns = ['Model', 'Accuracy'])

df_scores


Unnamed: 0,Model,Accuracy
0,Logistic Regression,0.660666
1,Random Forest,0.649176
2,Decision Tree,0.577745
3,K-Nearest Neighbor,0.594215
4,Neural Network,0.593415


In [30]:
print('x')


x


In [31]:
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
import matplotlib.pyplot as plt

In [32]:
seed = 300
np.random.seed(seed)
x_train_cm, x_test_cm, y_train_cm, y_test_cm = train_test_split(x_new, y, test_size = 0.2, random_state = 30)
scaler_cm = StandardScaler()
x_train_cm = scaler_cm.fit_transform(x_train_cm)
x_test_cm = scaler_cm.transform(x_test_cm)

In [33]:
lr_cm = LogisticRegression(max_iter = 1000)
lr_cm.fit(x_train_cm, y_train_cm)
#y_cm = lr_cm.predict(x_test)


: 