In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load in 

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input tree and echo the full path of every file found.
# `dirname` stays bound after the loop finishes; its name is kept unchanged
# in case code further down reads it.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for path in (os.path.join(dirname, name) for name in filenames):
        print(path)

# Any results you write to the current directory are saved as output.

In [None]:
# Locate the dataset folder explicitly instead of relying on whatever value a
# previously executed loop happened to leave in a variable: search the input
# tree for the first directory that contains both CSV files we need.
required_files = {'preprocessed_data.csv', 'data.csv'}
data_dir = next(
    (root for root, _dirs, files in os.walk('/kaggle/input')
     if required_files.issubset(files)),
    None,
)
if data_dir is None:
    raise FileNotFoundError(
        'No directory under /kaggle/input contains: {}'.format(
            ', '.join(sorted(required_files))))

preprocessed_data = pd.read_csv(os.path.join(data_dir, 'preprocessed_data.csv'))
raw_data = pd.read_csv(os.path.join(data_dir, 'data.csv'))

# Stack the red- and blue-corner name columns into one Series so that a fighter
# who appears in either corner is counted once.
nfighters = pd.concat([raw_data['R_fighter'], raw_data['B_fighter']], axis=0, ignore_index=True)
print('Number of unique fighters: {}'.format(nfighters.nunique()))


Next, we look at the features of the dataset.

In [None]:
# Display every column name as a plain Python list. Wrapping the Index in str()
# makes the cell output a single escaped string (literal "\n" sequences), and
# the Index repr may be abbreviated with "..." for long indexes.
raw_data.columns.tolist()

We only want to include numerical features such as strikes landed, takedowns, etc. To that end, we exclude the categorical columns as well as the fight-record columns (wins, stoppages, etc.). Additionally, we retain the fighter names for future predictions.

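We will take the following features (note that the code cell below does not include the four `*_current_lose_streak` / `*_current_win_streak` columns from this list):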

R_fighter
B_fighter
Winner
B_current_lose_streak
B_current_win_streak
B_avg_BODY_att
B_avg_BODY_landed
B_avg_CLINCH_att
B_avg_CLINCH_landed
B_avg_DISTANCE_att
B_avg_DISTANCE_landed
B_avg_GROUND_att
B_avg_GROUND_landed
B_avg_HEAD_att
B_avg_HEAD_landed
B_avg_KD
B_avg_LEG_att
B_avg_LEG_landed
B_avg_PASS
B_avg_REV
B_avg_SIG_STR_att
B_avg_SIG_STR_landed
B_avg_SIG_STR_pct
B_avg_SUB_ATT
B_avg_TD_att
B_avg_TD_landed
R_current_lose_streak
R_current_win_streak
R_avg_BODY_att
R_avg_BODY_landed
R_avg_CLINCH_att
R_avg_CLINCH_landed
R_avg_DISTANCE_att
R_avg_DISTANCE_landed
R_avg_GROUND_att
R_avg_GROUND_landed
R_avg_HEAD_att
R_avg_HEAD_landed
R_avg_KD
R_avg_LEG_att
R_avg_LEG_landed
R_avg_PASS
R_avg_REV
R_avg_SIG_STR_att
R_avg_SIG_STR_landed
R_avg_SIG_STR_pct
R_avg_SUB_ATT
R_avg_TD_att
R_avg_TD_landed

Once the features have been selected, we map the 'Winner' column of the data frame to 0 or 1, with 1 indicating that R_fighter won (every other outcome maps to 0).

In [None]:
# Per-fight average statistics used for modelling. The same statistics are
# taken for both corners, so they are listed once and expanded with the
# 'B_avg_' and 'R_avg_' prefixes. This keeps the two corners symmetric and
# avoids splitting a backslash-continued string on spaces.
avg_stat_names = [
    'BODY_att', 'BODY_landed',
    'CLINCH_att', 'CLINCH_landed',
    'DISTANCE_att', 'DISTANCE_landed',
    'GROUND_att', 'GROUND_landed',
    'HEAD_att', 'HEAD_landed',
    'KD',
    'LEG_att', 'LEG_landed',
    'PASS',
    'REV',
    'SIG_STR_att', 'SIG_STR_landed', 'SIG_STR_pct',
    'SUB_ATT',
    'TD_att', 'TD_landed',
]

# Column order matters downstream: fighter names and label first, then all
# blue-corner statistics, then all red-corner statistics.
features = (
    ['R_fighter', 'B_fighter', 'Winner']
    + ['B_avg_' + stat for stat in avg_stat_names]
    + ['R_avg_' + stat for stat in avg_stat_names]
)

# Keep only fights where every selected column is present.
selected_features = raw_data[features].dropna()

# Encode the label: 1 when the red corner won, 0 for any other value.
# `assign` returns a new frame, so nothing is written into a slice of raw_data.
selected_features = selected_features.assign(
    Winner=(selected_features['Winner'] == 'Red').astype('int64')
)

Now that we have grabbed our features and mapped the winner to a numeric value, we gather the average statistics for each fighter into a single table, regardless of whether they are in the B or R corner. We will try to use these averages as the inputs to our model during training.

In [None]:
# Partition the selected column names by corner prefix. The fighter-name
# columns ('R_fighter' / 'B_fighter') match these prefixes too, so they are
# included in the respective lists.
R_features = [name for name in features if name.startswith('R_')]
B_features = [name for name in features if name.startswith('B_')]

# One frame per corner; the blue frame is relabelled with the red column names
# (the two lists are positionally aligned) so both can be stacked row-wise.
R_fighters = selected_features[R_features].copy()
B_fighters = selected_features[B_features].copy()
B_fighters.columns = R_fighters.columns
Fighters = pd.concat([R_fighters, B_fighters], ignore_index=True, sort=False)


We now attempt to perform a logistic regression on the data (excluding the R_fighter and B_fighter columns) to try to predict the winner of a fight based on the fighters' stats. Additionally, we will try random forest and AdaBoost classifiers while we have the data wrangled.

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier

# Drop the fighter-name columns, then separate the inputs from the label.
# The label is kept as a one-column DataFrame.
train_data = selected_features.drop(columns=['R_fighter', 'B_fighter'])
x_torch = train_data.drop(columns=['Winner'])
y_torch = train_data[['Winner']]
x_train, x_test, y_train, y_test = train_test_split(x_torch, y_torch, test_size=0.3, random_state=0)

# scikit-learn estimators expect a 1-D label vector; flatten once and reuse.
y_train_flat = y_train.values.ravel()
y_test_flat = y_test.values.ravel()

print('SCORE')
model = LogisticRegression(solver='liblinear', penalty='l2')
model.fit(x_train, y_train_flat)
print('Logistic Regression with L2:', model.score(x_test, y_test_flat))

# The ensemble models are randomised; seed them (same seed as the split) so the
# reported scores are identical from run to run.
forest = RandomForestClassifier(n_estimators=100, random_state=0)
forest.fit(x_train, y_train_flat)
print('Random Forest:', forest.score(x_test, y_test_flat))

ab = AdaBoostClassifier(random_state=0)
ab.fit(x_train, y_train_flat)
print('AdaBoost:', ab.score(x_test, y_test_flat))



As an attempt at generalizing the solution, I tried normalization with L1 and L2 norms, which had almost no effect.

Next I will try a simple neural network: a feed-forward neural net with a sigmoid output. This will give a score between 0 and 1 predicting whether the red corner wins.

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader

class UFCData(Dataset):
    """Dataset pairing one row of fighter statistics with that fight's label.

    `fighters` and `winner` are stored as given and indexed positionally;
    `winner` must expose `.shape`, and each indexed element must support
    `.item()`.
    """

    def __init__(self, fighters, winner):
        self.fighters, self.winner = fighters, winner

    def __len__(self):
        # The number of samples is the length of the first axis of the labels.
        return self.winner.shape[0]

    def __getitem__(self, idx):
        # The label is unwrapped to a plain Python scalar via `.item()`.
        return self.fighters[idx], self.winner[idx].item()

class Network(nn.Module):
    """Feed-forward binary classifier: two tanh hidden layers and a sigmoid output.

    Args:
        n_features: width of the input vector. Defaults to 42, the value the
            original network hard-coded.
        hidden1: width of the first hidden layer (default 64).
        hidden2: width of the second hidden layer (default 32).

    The forward pass returns a tensor of shape (batch, 1) with values in (0, 1).
    """

    def __init__(self, n_features=42, hidden1=64, hidden2=32):
        super().__init__()
        self.fc1 = nn.Linear(n_features, hidden1)
        self.fc2 = nn.Linear(hidden1, hidden2)
        self.fc3 = nn.Linear(hidden2, 1)

    def forward(self, x):
        # Cast to float32 so inputs of another dtype (e.g. float64 from numpy)
        # match the dtype of the layer weights.
        hidden = torch.tanh(self.fc1(x.float()))
        hidden = torch.tanh(self.fc2(hidden))
        return torch.sigmoid(self.fc3(hidden))

    
from sklearn.preprocessing import StandardScaler

# Standardise the inputs: statistics are learned on the training split only and
# then applied unchanged to the test split.
scaler = StandardScaler()
x_train_scaled = scaler.fit_transform(x_train)
x_test_scaled = scaler.transform(x_test)

ufc_train_data = UFCData(x_train_scaled, y_train.to_numpy())
ufc_test_data = UFCData(x_test_scaled, y_test.to_numpy())
train_loader = DataLoader(ufc_train_data, batch_size=100)
test_loader = DataLoader(ufc_test_data, batch_size=100)

# Seed torch before the network is built so the initial weights, and therefore
# the training result, are the same on every run.
torch.manual_seed(0)
model = Network()
# NOTE(review): Rprop is usually described as a full-batch method; consider
# an optimizer such as Adam or SGD for mini-batch training - TODO confirm.
optimizer = optim.Rprop(model.parameters())
criterion = nn.BCELoss()

epochs = 5
for epoch in range(epochs):
    for inputs, labels in train_loader:
        # BCELoss needs float targets with the same (batch, 1) shape as the output.
        labels = labels.float().view(-1, 1)
        optimizer.zero_grad()
        outputs = model(inputs)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
  
        
 

In [None]:

# Evaluate on the held-out split. Gradients are not needed here, so the loop
# runs under no_grad; otherwise every accumulated loss would keep its autograd
# graph alive.
test_loss = 0.0
incorrect_pred = torch.zeros(())
with torch.no_grad():
    for inputs, labels in test_loader:
        labels = labels.float()
        outputs = model(inputs)
        test_loss += criterion(outputs, labels.view(-1, 1)).item()
        # Threshold the sigmoid output at 0.5 to obtain a hard 0/1 prediction.
        predictions = (outputs.view(-1) > 0.5).float()
        # |prediction - label| is 1 exactly where the prediction is wrong.
        incorrect_pred += torch.sum(torch.abs(predictions - labels))

print("summed per-batch test loss: {:.4f}".format(test_loss))
print("incorrect predictions:", incorrect_pred.item(), "out of:", len(ufc_test_data))
print("accuracy: {:.2f}%".format((len(ufc_test_data) - incorrect_pred.item())/len(ufc_test_data)*100))



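The dirty mess of code below is used to predict Covington vs. Usman. It predicts Covington at 66%, which is not good considering Covington lost the fight. (Note: the network outputs the probability that the fighter occupying the red-corner feature slots wins, and the code below places Usman's stats in those slots, so double-check which fighter the printed value actually refers to.)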

In [None]:


# Blue-corner statistic columns only (the fighter-name column is not a model input).
valfeat = [name for name in B_features if name != 'B_fighter']


def blue_corner_stats(fighter_name):
    """Return the statistics from the first listed fight in which
    `fighter_name` was in the blue corner.

    Raises:
        IndexError: if the fighter has no blue-corner row in `selected_features`.
    """
    rows = selected_features.loc[selected_features['B_fighter'] == fighter_name, valfeat]
    if rows.empty:
        raise IndexError(
            'No blue-corner fight found for {!r} in selected_features'.format(fighter_name))
    return rows.iloc[0]


colby = blue_corner_stats('Colby Covington')
kamaru = blue_corner_stats('Kamaru Usman')

# The model inputs are ordered blue-corner statistics first, then red-corner
# statistics. Concatenating [colby, kamaru] therefore puts Covington in the
# blue slots and Usman in the red slots.
validation_df = pd.concat([colby, kamaru], ignore_index=True, sort=False)
validation = scaler.transform(validation_df.to_numpy().reshape(1, -1))
validation = torch.tensor(validation, dtype=torch.float32)

# The network was trained with label 1 = red corner wins, so the value printed
# here is the predicted probability for the fighter in the red slots.
with torch.no_grad():
    winner = model(validation)
print(winner)