# Examining Collinearity Within the Data

In [1]:
import pandas as pd
import numpy as np

from statsmodels.stats.outliers_influence import variance_inflation_factor

from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder

In [2]:
# Load the training set and keep every column except the target as features.
df = pd.read_csv('train.csv')
X = df.drop(columns=['outcome'])

In [3]:
# Columns with a numeric dtype; everything else is treated as categorical.
numeric_columns = X.select_dtypes(include=['number']).columns

# Keep the columns in their original order. A set difference would yield an
# order that depends on string-hash randomisation, so the one-hot column order
# (and the row order of the VIF table) could change between kernel restarts.
categorical_columns = [col for col in X.columns if col not in numeric_columns]

In [4]:
# Numeric features: fill missing values with the column median.
numeric_pipe = Pipeline([('imputer', SimpleImputer(strategy='median'))])

# Categorical features: fill missing values with the literal 'missing', then
# one-hot encode. The encoder is left at its default (sparse) output and the
# result is densified below: the `sparse=` keyword was deprecated in
# scikit-learn 1.2 and removed in 1.4, while its replacement `sparse_output=`
# does not exist before 1.2, so neither keyword works on every version.
categorical_pipe = Pipeline([
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('ohe', OneHotEncoder()),
])

num_X = pd.DataFrame(numeric_pipe.fit_transform(X[numeric_columns]), columns=numeric_columns)

cat_X = categorical_pipe.fit_transform(X[categorical_columns])
if hasattr(cat_X, 'toarray'):
    # scipy sparse matrix -> dense ndarray, matching the old sparse=False output
    cat_X = cat_X.toarray()
onehot_col_names = categorical_pipe.named_steps['ohe'].get_feature_names_out(input_features=categorical_columns)
cat_X = pd.DataFrame(cat_X, columns=onehot_col_names)

# Both frames carry a fresh RangeIndex, so joining on the index lines the rows
# up one-to-one.
# NOTE: this overwrites X with the encoded matrix, so this cell cannot be
# re-run without first re-running the cell that loads X.
X = pd.merge(num_X, cat_X, left_index=True, right_index=True)

In [5]:
# calculating VIF for each feature
# Materialise the design matrix once instead of rebuilding it for every column.
design_matrix = X.values

# VIF_i = 1 / (1 - R^2_i), where R^2_i comes from regressing column i on all
# other columns. A column that is an exact linear combination of the others
# has R^2_i = 1 and is reported as inf (with a divide-by-zero warning).
vif_data = pd.DataFrame()
vif_data["feature"] = X.columns
vif_data["VIF"] = [
    variance_inflation_factor(design_matrix, col_idx)
    for col_idx in range(design_matrix.shape[1])
]

# Show every row for this one print only. option_context restores the previous
# setting (not the pandas default) and does so even if printing raises.
with pd.option_context('display.max_rows', None):
    print(vif_data)

  vif = 1. / (1. - r_squared_i)
  return 1 - self.ssr/self.centered_tss


                                   feature           VIF
0                                     r_kd  1.714326e+00
1                                r_sub.att  1.664085e+00
2                                   r_rev.  1.284558e+00
3                                 r_ctrl_s  5.152235e+00
4                        r_sig_str_percent  1.698390e+01
5                            r_sig_str_att           inf
6                      r_total_str_percent  8.610538e+00
7                          r_total_str_att  2.138108e+01
8                             r_td_percent  1.285973e+00
9                                 r_td_att  2.330423e+00
10                          r_head_percent  8.989691e+00
11                              r_head_att           inf
12                          r_body_percent  1.643944e+00
13                              r_body_att           inf
14                           r_leg_percent  1.345170e+00
15                               r_leg_att           inf
16                      r_dista

# Examining Favourite Win Rate by Year

In [6]:
# Reload the raw data so this section starts from an unmodified frame.
df = pd.read_csv('train.csv')

# pd.to_datetime interprets plain numbers as nanoseconds since the epoch, so
# the stored value is scaled by 10**11 first.
# NOTE(review): this implies 'date' is stored in units of 100 seconds - confirm
# against the script that produced train.csv.
df['date'] = pd.to_datetime(df['date']*10**11)
df['year'] = df['date'].dt.year

# Share of fights per year whose recorded outcome is 'R'.
fights_per_year = df.groupby('year').size()
# Reindex onto every year present in the data so that a year without any 'R'
# outcome yields a rate of 0.0 instead of NaN from index misalignment.
red_wins_per_year = (
    df[df['outcome'] == 'R']
    .groupby('year')
    .size()
    .reindex(fights_per_year.index, fill_value=0)
)
fav_win_rates = red_wins_per_year/fights_per_year

In [7]:
# Display the per-year share of fights whose outcome is 'R'.
fav_win_rates

year
2001    1.000000
2002    1.000000
2003    1.000000
2004    1.000000
2005    1.000000
2006    1.000000
2007    1.000000
2008    1.000000
2009    1.000000
2010    0.612000
2011    0.600671
2012    0.622754
2013    0.593668
2014    0.631579
2015    0.571121
2016    0.593429
2017    0.556054
2018    0.541578
2019    0.572835
2020    0.612108
2021    0.567460
2022    0.595285
2023    0.532319
dtype: float64

In [9]:
# Inspect the rows whose recorded outcome is 'B'.
df.loc[df['outcome'].eq('B')]

Unnamed: 0,outcome,weightclass,time_format,title,r_kd,r_sub.att,r_rev.,r_ctrl_s,r_sig_str_percent,r_sig_str_att,...,date,r_height,r_reach,r_stance,r_dob,b_height,b_reach,b_stance,b_dob,year
1,B,Lightweight,3 Rnd (5-5-5),False,0.166667,0.000000,0.000000,195.666667,0.426600,132.833333,...,2023-07-01,70.0,74.0,Orthodox,6655392.0,70.0,72.0,Switch,7617024.0,2023
2,B,Welterweight,3 Rnd (5-5-5),False,0.571429,0.285714,0.214286,132.500000,0.502677,106.714286,...,2023-07-01,71.0,76.0,Orthodox,5020704.0,72.0,79.0,Orthodox,9301824.0,2023
4,B,Lightweight,3 Rnd (5-5-5),False,1.000000,0.000000,0.000000,47.000000,0.693878,49.000000,...,2023-07-01,68.0,71.0,Orthodox,8201088.0,71.0,73.0,Southpaw,8192448.0,2023
5,B,Middleweight,3 Rnd (5-5-5),False,1.000000,0.000000,0.000000,1.000000,0.488889,45.000000,...,2023-07-01,70.0,72.0,Orthodox,7208352.0,77.0,76.0,Orthodox,7536672.0,2023
6,B,Welterweight,3 Rnd (5-5-5),False,0.055556,0.611111,0.111111,298.555556,0.422748,110.388889,...,2023-07-01,69.0,77.0,Orthodox,7155648.0,72.0,74.0,Orthodox,6860160.0,2023
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5798,B,Light Heavyweight,3 Rnd (5-5-5),False,0.000000,2.000000,0.000000,359.000000,0.428571,35.000000,...,2010-03-27,69.0,72.0,Switch,3751488.0,75.0,75.0,Orthodox,3843072.0,2010
5800,B,Light Heavyweight,3 Rnd (5-5-5),False,0.636364,0.090909,0.000000,94.363636,0.549593,55.909091,...,2010-03-21,75.0,78.0,Orthodox,2452896.0,76.0,84.0,Orthodox,5536512.0,2010
5805,B,Light Heavyweight,3 Rnd (5-5-5),False,0.000000,0.666667,0.000000,29.666667,0.273649,98.666667,...,2010-03-21,74.0,77.0,Orthodox,3317760.0,72.0,74.0,Orthodox,317952.0,2010
5806,B,Lightweight,3 Rnd (5-5-5),False,0.333333,0.000000,0.000000,89.666667,0.482353,28.333333,...,2010-03-21,70.0,70.0,Orthodox,2710368.0,70.0,71.0,Orthodox,4535136.0,2010


Before 2010-03-21, the data records the winner of every fight as the red corner. This does not match reality; see, for example, Matt Serra vs. Georges St-Pierre.

Because of this, I shouldn't use the R/B outcome labels from the full dataset to set the baseline.

One option is to calculate the baseline using only fights from 2010-03-21 onward, where the recorded winner is no longer always the red corner.

(This may be why date is such an important feature in XGBoost.)

In [8]:
# Earliest date from which the data contains outcomes other than 'R'
# (see the filtered table above).
modern_era_start = pd.to_datetime('2010-03-21')
modern_df = df[df['date'] >= modern_era_start].copy()

# 1 where the recorded outcome is 'R', 0 otherwise. Comparing against 'R'
# directly (instead of get_dummies + drop_first + ravel) does not depend on
# which dummy column survives, and stays a correct 1-D vector even if a third
# outcome label ever appears in the data.
y = modern_df['outcome'].eq('R').to_numpy().astype(int)

# Mean of a 0/1 vector is the share of 'R' outcomes.
fav_win_rate_modern = y.mean()
fav_win_rate_modern

0.5826880055067974

The red-corner (favourite) win rate for fights on or after 2010-03-21 is shown above: about 58.3%.