In [26]:
## Exploring algorithms for UFC project
import pandas as pd
import os
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_validate
from sklearn.model_selection import train_test_split

In [27]:
path = os.getcwd()
p = path.removesuffix('ufcPredictionProject')
p = p+'UFCdata.csv'


df = pd.read_csv(p)
print(df.shape)
df.head(3)

(6012, 144)


Unnamed: 0,R_fighter,B_fighter,Referee,date,location,Winner,title_bout,weight_class,B_avg_KD,B_avg_opp_KD,...,R_win_by_Decision_Unanimous,R_win_by_KO/TKO,R_win_by_Submission,R_win_by_TKO_Doctor_Stoppage,R_Stance,R_Height_cms,R_Reach_cms,R_Weight_lbs,B_age,R_age
0,Aalon Cruz,Spike Carlyle,Scott Howard,2/29/2020,"Norfolk, Virginia, USA",Blue,False,Featherweight,,,...,0,0,0,0,Switch,182.88,198.12,145.0,26.0,30.0
1,Aaron Phillips,Matt Hobar,Kevin Nix,8/23/2014,"Tulsa, Oklahoma, USA",Blue,False,Bantamweight,0.0,0.0,...,0,0,0,0,Southpaw,175.26,180.34,135.0,27.0,25.0
2,Aaron Riley,Justin Salas,Randy Corley,7/27/2013,"Seattle, Washington, USA",Blue,False,Lightweight,0.0,1.0,...,3,0,0,0,Southpaw,172.72,175.26,155.0,31.0,32.0


In [28]:
# removing features that are anticipated to not be significant
df = df.drop(['R_fighter','B_fighter','Referee','location'],axis=1)

# removing rows that contain NaN
df.dropna(inplace=True)
print(df.shape)
df.head(3)
print(df.isnull().sum().sum())

(3914, 140)
0


In [29]:
# test train split
(df_train,df_test) = train_test_split(df, train_size = 0.8)

X_train = df_train.drop('Winner',axis=1)
y_train = df_train.Winner
X_test = df_test.drop('Winner',axis=1)
y_test = df_test.Winner

X = X_train
y = y_train

print(X.isnull().sum().sum())

0


In [30]:
# make time a continuous feature
X.date = pd.to_datetime(X.date)
X.date = pd.to_numeric(X.date)

In [31]:
# split features into numerical and categorical
X_num = X.select_dtypes(include='number')
X_cat = X.select_dtypes(exclude='number')

In [32]:
# one hot encode
X_cat = pd.get_dummies(X_cat)

In [33]:
# finding columns that are all zeros from X_num
cols = pd.Series(X_num.columns)
for i in range(len(cols)):
    if (X_num.iloc[:,i] == 0).all():
        print(cols[i])

B_draw
R_draw


In [34]:
# removing columns of zeros
X_num = X_num.drop(['B_draw', 'R_draw'], axis=1)

In [35]:
# standardize
X_num = (X_num - X_num.mean())/X_num.std()

In [36]:
# put back together
X = pd.concat([X_num,X_cat], axis=1)

In [37]:
X.head()

Unnamed: 0,date,B_avg_KD,B_avg_opp_KD,B_avg_SIG_STR_pct,B_avg_opp_SIG_STR_pct,B_avg_TD_pct,B_avg_opp_TD_pct,B_avg_SUB_ATT,B_avg_opp_SUB_ATT,B_avg_REV,...,weight_class_WomenFlyweight,weight_class_WomenStrawweight,B_Stance_Open Stance,B_Stance_Orthodox,B_Stance_Southpaw,B_Stance_Switch,R_Stance_Open Stance,R_Stance_Orthodox,R_Stance_Southpaw,R_Stance_Switch
4181,0.584418,-0.525448,0.036349,-0.504604,-0.301239,-0.560353,-0.07109,-0.484238,-0.21659,1.406455,...,0,0,0,0,1,0,0,1,0,0
4220,1.090728,-0.689873,-0.565726,-1.409089,0.139743,-1.110965,0.961066,-0.66211,-0.21659,-0.460622,...,1,0,0,1,0,0,0,0,1,0
1920,-0.160981,0.648651,-0.562591,0.775451,-0.583021,-0.371044,-1.001829,-0.064572,-0.62176,-0.460622,...,0,0,0,1,0,0,0,1,0,0
3124,0.884454,1.28323,-0.565726,0.898908,-0.249962,-1.110965,-1.023849,-0.66211,-0.631484,-0.460622,...,0,0,0,0,1,0,0,1,0,0
1146,0.3172,-0.689873,-0.365034,0.883313,2.0575,0.096145,0.638517,-0.484238,0.820645,1.033039,...,0,0,0,1,0,0,0,1,0,0


In [38]:
# logistic regression
logr = LogisticRegression(random_state=0, max_iter=1000)
logr.fit(X,y)


LogisticRegression(max_iter=1000, random_state=0)

In [39]:
# Training score
print('Training R2:',logr.score(X,y).round(2))

Training R2: 0.69


## Finding Test Error

In [40]:
X_train = X
y_train = y

In [41]:
X = X_test
y = y_test

In [42]:
# make time a continuous feature
X.date = pd.to_datetime(X.date)
X.date = pd.to_numeric(X.date)

In [43]:
# split features into numerical and categorical
X_num = X.select_dtypes(include='number')
X_cat = X.select_dtypes(exclude='number')

In [44]:
# one hot encode
X_cat = pd.get_dummies(X_cat)

In [45]:
# removing columns of zeros
X_num = X_num.drop(['B_draw', 'R_draw'], axis=1)

In [46]:
# standardize
X_num = (X_num - X_num.mean())/X_num.std()

In [47]:
# put back together
X = pd.concat([X_num,X_cat], axis=1)
X.head()

Unnamed: 0,date,B_avg_KD,B_avg_opp_KD,B_avg_SIG_STR_pct,B_avg_opp_SIG_STR_pct,B_avg_TD_pct,B_avg_opp_TD_pct,B_avg_SUB_ATT,B_avg_opp_SUB_ATT,B_avg_REV,...,weight_class_WomenFlyweight,weight_class_WomenStrawweight,B_Stance_Open Stance,B_Stance_Orthodox,B_Stance_Southpaw,B_Stance_Switch,R_Stance_Open Stance,R_Stance_Orthodox,R_Stance_Southpaw,R_Stance_Switch
2013,1.402261,1.855281,-0.54313,0.064579,-0.760399,-1.150728,-0.014444,0.028436,2.843059,1.160344,...,0,0,0,1,0,0,0,0,1,0
1422,0.076009,-0.61575,-0.54313,0.912726,-0.528492,1.101117,-0.567876,-0.341062,-0.497393,1.166907,...,0,0,0,1,0,0,0,0,1,0
2739,0.307577,-0.69546,-0.54313,-0.170113,0.761595,-1.060103,1.677366,-0.248688,0.341968,0.110127,...,0,0,0,1,0,0,0,1,0,0
1053,1.278667,0.619766,-0.109944,-0.343523,0.766064,-0.519722,-1.00735,-0.306422,-0.283305,-0.467492,...,0,0,0,0,0,1,0,1,0,0
5746,-0.328046,2.492966,-0.54313,-0.024082,-1.082162,-0.967029,1.462022,0.905996,-0.201747,-0.520003,...,0,0,0,1,0,0,0,1,0,0


In [48]:
# logistic regression
logr = LogisticRegression(random_state=0, max_iter=1000)
logr.fit(X_train,y_train)

LogisticRegression(max_iter=1000, random_state=0)

In [49]:
# Test score
print('Test R2:',logr.score(X,y).round(2))

Test R2: 0.61


## Feature Selection