In [1]:
## Exploring algorithms for UFC project
import pandas as pd
import os
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_validate
from sklearn.model_selection import train_test_split

In [47]:
path = os.getcwd()
p = path.removesuffix('ufcPredictionProject')
p = p+'UFCdata.csv'


df = pd.read_csv(p)
print(df.shape)
df.head(3)

(6012, 144)


Unnamed: 0,R_fighter,B_fighter,Referee,date,location,Winner,title_bout,weight_class,B_avg_KD,B_avg_opp_KD,...,R_win_by_Decision_Unanimous,R_win_by_KO/TKO,R_win_by_Submission,R_win_by_TKO_Doctor_Stoppage,R_Stance,R_Height_cms,R_Reach_cms,R_Weight_lbs,B_age,R_age
0,Aalon Cruz,Spike Carlyle,Scott Howard,2/29/2020,"Norfolk, Virginia, USA",Blue,False,Featherweight,,,...,0,0,0,0,Switch,182.88,198.12,145.0,26.0,30.0
1,Aaron Phillips,Matt Hobar,Kevin Nix,8/23/2014,"Tulsa, Oklahoma, USA",Blue,False,Bantamweight,0.0,0.0,...,0,0,0,0,Southpaw,175.26,180.34,135.0,27.0,25.0
2,Aaron Riley,Justin Salas,Randy Corley,7/27/2013,"Seattle, Washington, USA",Blue,False,Lightweight,0.0,1.0,...,3,0,0,0,Southpaw,172.72,175.26,155.0,31.0,32.0


In [48]:
# removing features that are anticipated to not be significant
df = df.drop(['R_fighter','B_fighter','Referee','location'],axis=1)

# removing rows that contain NaN
df.dropna(inplace=True)
print(df.shape)
df.head(3)
print(df.isnull().sum().sum())

(3914, 140)
0


In [67]:
# test train split
(df_train,df_test) = train_test_split(df, train_size = 0.8)

X_train = df_train.drop('Winner',axis=1)
y_train = df_train.Winner
X_test = df_test.drop('Winner',axis=1)
y_test = df_test.Winner

X = X_train
y = y_train

print(X.isnull().sum().sum())

0


In [68]:
# make time a continuous feature
X.date = pd.to_datetime(X.date)
X.date = pd.to_numeric(X.date)

In [69]:
# split features into numerical and categorical
X_num = X.select_dtypes(include='number')
X_cat = X.select_dtypes(exclude='number')

In [70]:
# one hot encode
X_cat = pd.get_dummies(X_cat)

In [72]:
# finding columns that are all zeros from X_num
cols = pd.Series(X_num.columns)
for i in range(len(cols)):
    if (X_num.iloc[:,i] == 0).all():
        print(cols[i])

B_draw
R_draw


In [73]:
# removing columns of zeros
X_num = X_num.drop(['B_draw', 'R_draw'], axis=1)

In [74]:
# standardize
X_num = (X_num - X_num.mean())/X_num.std()

In [75]:
# put back together
X = pd.concat([X_num,X_cat], axis=1)

In [76]:
X_num.head()

Unnamed: 0,date,B_avg_KD,B_avg_opp_KD,B_avg_SIG_STR_pct,B_avg_opp_SIG_STR_pct,B_avg_TD_pct,B_avg_opp_TD_pct,B_avg_SUB_ATT,B_avg_opp_SUB_ATT,B_avg_REV,...,R_win_by_Decision_Split,R_win_by_Decision_Unanimous,R_win_by_KO/TKO,R_win_by_Submission,R_win_by_TKO_Doctor_Stoppage,R_Height_cms,R_Reach_cms,R_Weight_lbs,B_age,R_age
606,1.34031,-0.691349,-0.564623,-0.248648,1.429159,0.92393,-0.018191,-0.660515,0.223678,-0.462577,...,0.835616,-0.370054,-0.838396,-0.673232,-0.276159,-0.950946,-0.294774,-0.39061,-1.453669,-0.537603
1149,-0.196897,-0.032735,-0.355297,0.628703,-1.825607,1.102611,-0.923206,-0.437649,3.011525,-0.41437,...,-0.589392,-0.370054,1.108946,-0.673232,-0.276159,-0.666923,-0.532277,-0.680826,-0.456258,-0.057844
4954,-0.550549,0.585585,2.733073,-0.627371,1.553506,-0.169568,0.619264,-0.660515,0.223678,1.053412,...,0.835616,-0.370054,-0.838396,-0.673232,-0.276159,0.753191,0.655238,0.480038,-0.705611,-1.49712
1671,0.350085,-0.503675,-0.558182,-1.07353,-0.08075,1.364573,-0.985623,-0.660515,-0.590888,-0.460727,...,-0.589392,1.259538,1.595782,1.294306,-0.276159,-0.3829,-0.057271,-0.39061,0.042448,-0.777482
4211,1.113973,-0.691349,-0.564623,-1.258576,-0.270245,0.773103,-0.356839,-0.660515,-0.628766,-0.462577,...,-0.589392,-0.913252,0.135275,-0.673232,-0.276159,0.185146,-0.532277,0.044714,-0.954963,1.141553


In [85]:
# logistic regression
logr = LogisticRegression(random_state=0, max_iter=1000)
logr.fit(X,y)


LogisticRegression(max_iter=1000, random_state=0)

In [89]:
# Training score
print('Training R2:',logr.score(X,y).round(2))

Training R2: 0.68


In [104]:
coef = pd.Series(logr.coef_)
coef.sort_values(ascending=False)

ValueError: Data must be 1-dimensional

## Finding Test Error

In [90]:
X = X_test
y = y_test

In [91]:
# make time a continuous feature
X.date = pd.to_datetime(X.date)
X.date = pd.to_numeric(X.date)

In [92]:
# split features into numerical and categorical
X_num = X.select_dtypes(include='number')
X_cat = X.select_dtypes(exclude='number')

In [93]:
# one hot encode
X_cat = pd.get_dummies(X_cat)

In [94]:
# removing columns of zeros
X_num = X_num.drop(['B_draw', 'R_draw'], axis=1)

In [95]:
# standardize
X_num = (X_num - X_num.mean())/X_num.std()

In [96]:
# put back together
X = pd.concat([X_num,X_cat], axis=1)

In [97]:
# logistic regression
logr = LogisticRegression(random_state=0, max_iter=1000)
logr.fit(X,y)

LogisticRegression(max_iter=1000, random_state=0)

In [98]:
# Test score
print('Test R2:',logr.score(X,y).round(2))

Test R2: 0.75


In [105]:
logr.coef_?