# 1. Info

This notebook contains the feature importance analysis.

Before running this notebook, you should have run the "__data_preparation.ipynb__" notebook.

# 2. Feature importance analysis.

## 2.1. Import Libraries

In [22]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import mutual_info_score
from IPython.display import display

## 2.2. Read the data

In [4]:
# Load the enriched Premier League dataset (presumably the output of the
# data-preparation notebook -- confirm against that notebook).
csv_path = '../data/enriched_data/premier_league.csv'
data = pd.read_csv(csv_path)

In [5]:
# Normalise the column labels: lower-case first, then replace spaces with
# underscores.
lowercase_names = data.columns.str.lower()
data.columns = lowercase_names.str.replace(' ', '_')

## 2.3. Setting up the validation framework

In [6]:
# Columns treated as categorical features.
categorical_variables = [
    'team',
    'opponent',
    'season',
    'home',
]

# Columns treated as numerical features.
numerical_variables = [
    'ftg_scored_total',
    'ftg_received_total',
    'htg_scored_total',
    'htg_received_total',
    'shots_total',
    'shots_received_total',
    'shots_target_total',
    'shots_target_received_total',
    'fouls_commited_total',
    'fouls_received_total',
    'corners_total',
    'corners_against_total',
    'yellow_cards_total',
    'yellow_cards_opponent_total',
    'red_cards_total',
    'red_cards_opponent_total',
    'points',
    'goal_difference',
    'position',
    'win_rate',
    'mooving_win_rate',
    'mooving_goals_scored',
    'mooving_goals_received',
]

# Candidate target columns (unused; only 'win' is analysed in this notebook):
# target_variable = ['ftg_scored','ftg_received','ft_result','win']

In [8]:
# Keep only the feature columns plus the 'win' target, as an independent copy
# so later edits do not touch `data`.
selected_columns = [*categorical_variables, *numerical_variables, 'win']
df = data[selected_columns].copy()

In [10]:
# 60% / 20% / 20% split: hold out 20% of the rows for testing, then take 25% of
# the remaining 80% for validation. The seed is fixed so the split is repeatable.
df_full_train, df_test = train_test_split(df, test_size=0.2, random_state=1)
df_train, df_val = train_test_split(df_full_train, test_size=0.25, random_state=1)

# Report the partition sizes, one per line, in the order train, test, validation.
print(len(df_train), len(df_test), len(df_val), sep='\n')

# Replace the shuffled row labels with a fresh 0..n-1 index.
df_train = df_train.reset_index(drop=True)
df_val = df_val.reset_index(drop=True)
df_test = df_test.reset_index(drop=True)

# pop() removes the target column from each feature frame and returns it;
# df_full_train keeps its 'win' column for the analysis below.
y_train = df_train.pop('win').values
y_val = df_val.pop('win').values
y_test = df_test.pop('win').values

10546
3516
3516


## 2.4. Difference and risk ratio

In [17]:
# Overall mean of the 'win' column on the full training split; used as the
# baseline for the per-category comparison.
global_win = df_full_train['win'].mean()

In [21]:
# For every categorical feature, compare each category's mean of 'win' with
# the global mean, both as a difference and as a ratio.
for cat in categorical_variables:
    print(cat)
    win_stats = df_full_train.groupby(cat)['win'].agg(['mean', 'count'])
    df_group = win_stats.assign(
        diff=win_stats['mean'] - global_win,
        risk_ratio=win_stats['mean'] / global_win,
    )
    display(df_group)
    # Two blank lines between consecutive tables.
    print()
    print()

team


Unnamed: 0_level_0,mean,count,diff,risk_ratio
team,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,0.21831,426,-0.158024,0.580097
1,0.618705,695,0.242372,1.644034
2,0.295316,491,-0.081018,0.784718
3,0.376812,69,0.000478,1.001271
4,0.206573,213,-0.169761,0.548909
5,0.216667,60,-0.159667,0.575731
6,0.321951,205,-0.054382,0.855495
7,0.348416,221,-0.027917,0.925818
8,0.291262,206,-0.085071,0.773947
9,0.568214,711,0.19188,1.509868




opponent


Unnamed: 0_level_0,mean,count,diff,risk_ratio
opponent,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,0.472093,430,0.09576,1.254454
1,0.184922,703,-0.191412,0.491378
2,0.455466,494,0.079132,1.210272
3,0.418182,55,0.041848,1.1112
4,0.544601,213,0.168268,1.447124
5,0.587302,63,0.210968,1.560589
6,0.401961,204,0.025627,1.068098
7,0.43379,219,0.057457,1.152675
8,0.447115,208,0.070782,1.188083
9,0.202817,710,-0.173516,0.538929




season


Unnamed: 0_level_0,mean,count,diff,risk_ratio
season,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2000-2001,0.364238,604,-0.012095,0.967861
2001-2002,0.373183,619,-0.003151,0.991628
2002-2003,0.382496,617,0.006163,1.016375
2003-2004,0.358333,600,-0.018,0.95217
2004-2005,0.355738,610,-0.020596,0.945273
2005-2006,0.399671,608,0.023338,1.062013
2006-2007,0.372287,599,-0.004046,0.989248
2007-2008,0.379139,604,0.002806,1.007455
2008-2009,0.369775,622,-0.006558,0.982573
2009-2010,0.371941,613,-0.004392,0.988329




home


Unnamed: 0_level_0,mean,count,diff,risk_ratio
home,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,0.291912,6961,-0.084421,0.775674
1,0.45909,7101,0.082757,1.219903






## 2.5. Mutual information (Categorical variables)

In [23]:
def mutual_info_churn_score(series, target=None):
    """Return the mutual information between ``series`` and a target.

    Despite the "churn" in its name, the score is computed against the
    ``win`` column of ``df_full_train`` unless another target is supplied.

    Parameters
    ----------
    series : array-like
        Feature labels, one per row, aligned with ``target``.
    target : array-like, optional
        Labels to compare against. Defaults to ``df_full_train.win``, which
        is looked up at call time.

    Returns
    -------
    float
        The value returned by ``sklearn.metrics.mutual_info_score``.
    """
    if target is None:
        # Default kept for backward compatibility with
        # ``DataFrame.apply(mutual_info_churn_score)``.
        target = df_full_train.win
    return mutual_info_score(series, target)

In [24]:
# Score every categorical column against the target, then list the columns
# from most to least informative.
categorical_frame = df_full_train[categorical_variables]
mi = categorical_frame.apply(mutual_info_churn_score)
mi.sort_values(ascending=False)

team        0.034807
opponent    0.027517
home        0.014987
season      0.000718
dtype: float64

## 2.6. Correlation (Numerical variables)

In [31]:
# Absolute correlation of each numerical feature with 'win'. The sign is
# discarded, so only the strength of the relationship is ranked.
win_correlations = df_full_train[numerical_variables].corrwith(df_full_train['win'])
numerical_corr = (
    win_correlations
    .abs()
    .rename_axis('features')
    .reset_index(name='corr')
)
numerical_corr.sort_values('corr', ascending=False)

Unnamed: 0,features,corr
20,mooving_win_rate,0.485797
17,goal_difference,0.216905
18,position,0.193018
19,win_rate,0.173479
16,points,0.126763
0,ftg_scored_total,0.12297
2,htg_scored_total,0.120665
21,mooving_goals_scored,0.11694
6,shots_target_total,0.083519
4,shots_total,0.082223


In [36]:
# Keep the features whose absolute correlation with 'win' is above the
# average. A boolean mask is used rather than formatting the mean into a
# query string: a NaN mean then yields an empty selection instead of a query
# that cannot be evaluated.
corr_threshold = numerical_corr['corr'].mean()
numerical_corr.loc[numerical_corr['corr'] > corr_threshold, 'features'].values

array(['ftg_scored_total', 'htg_scored_total', 'points',
       'goal_difference', 'position', 'win_rate', 'mooving_win_rate',
       'mooving_goals_scored'], dtype=object)

End of the notebook