# 1. Info.

This notebook is to select the best model.

To understand why some features are used and not all, go to the notebook "__feature_importance.ipynb__". Before running this notebook, you should run the notebook "__data_preparation.ipynb__".

Note: Some cells are commented, those are in case someone wants to test the model selection with all the features. To do it, uncomment those cells and comment on the ones where the variable "most_meaningful_features" is used.

# 2. Select the model

## 2.1. Import libraries

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
import xgboost as xgb
from sklearn.feature_extraction import DictVectorizer
from sklearn.metrics import accuracy_score

## 2.2. Read the data

In [2]:
data = pd.read_csv('../data/enriched_data/premier_league.csv')

## 2.3 Prepare the data

In [3]:
data.columns = data.columns.str.lower().str.replace(' ', '_')

In [4]:
# categorical_variables = ['team','opponent','season','home']
# numerical_variables = ['ftg_scored_total','ftg_received_total','htg_scored_total','htg_received_total','shots_total','shots_received_total','shots_target_total','shots_target_received_total','fouls_commited_total','fouls_received_total','corners_total','corners_against_total','yellow_cards_total','yellow_cards_opponent_total','red_cards_total','red_cards_opponent_total','points','goal_difference','position','win_rate','mooving_win_rate','mooving_goals_scored','mooving_goals_received']

In [5]:
most_meaningful_features = ['ftg_scored_total','htg_scored_total','points','goal_difference','position','win_rate','mooving_win_rate','mooving_goals_scored']

In [6]:
# df = data[categorical_variables + numerical_variables + ['win']].copy()

In [7]:
df = data[most_meaningful_features + ['win']].copy()

## 2.4. split the data

In [8]:
df_full_train, df_test = train_test_split(df, test_size=0.2, random_state=1)
df_train, df_val = train_test_split(df_full_train, test_size=0.25, random_state=1)

df_train = df_train.reset_index(drop=True)
df_val = df_val.reset_index(drop=True)
df_test = df_test.reset_index(drop=True)

y_train = df_train.win.values
y_val = df_val.win.values
y_test = df_test.win.values

del df_train['win']
del df_val['win']
del df_test['win']

## 2.5 Vectorize the data

In [9]:
# dv = DictVectorizer(sparse=False)

# train_dict = df_train[categorical_variables + numerical_variables].to_dict(orient='records')
# X_train = dv.fit_transform(train_dict)

# val_dict = df_val[categorical_variables + numerical_variables].to_dict(orient='records')
# X_val = dv.transform(val_dict)

# test_dict = df_test[categorical_variables + numerical_variables].to_dict(orient='records')
# X_test = dv.transform(test_dict)

In [10]:
dv = DictVectorizer(sparse=False)

train_dict = df_train[most_meaningful_features].to_dict(orient='records')
X_train = dv.fit_transform(train_dict)

val_dict = df_val[most_meaningful_features].to_dict(orient='records')
X_val = dv.transform(val_dict)

test_dict = df_test[most_meaningful_features].to_dict(orient='records')
X_test = dv.transform(test_dict)

## 2.6 Train the models

In [11]:
def find_best_threshold(model, data):

    y_pred = model.predict_proba(data)[:, 1]

    thresholds = np.linspace(0, 1, 21)

    scores = []
    thr = []

    for t in thresholds:
        score = accuracy_score(y_val, y_pred >= t)
        scores.append(score)
        thr.append(t)

    return pd.DataFrame(data={'threshold':thr,'accuracy_score':scores}). sort_values(by=['accuracy_score'], ascending=False).head(1)

### 2.6.1 Logistic Regression

In [12]:
lr_model = LogisticRegression()
lr_model.fit(X_train, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [13]:
find_best_threshold(lr_model, X_val)

Unnamed: 0,threshold,accuracy_score
8,0.4,0.790387


In [14]:
find_best_threshold(lr_model, X_test)

Unnamed: 0,threshold,accuracy_score
20,1.0,0.615757


### 2.6.2 Decision Tree Classifier

In [15]:
dt = DecisionTreeClassifier()
dt.fit(X_train, y_train)

In [16]:
find_best_threshold(dt, X_val)

Unnamed: 0,threshold,accuracy_score
20,1.0,0.724687


In [17]:
find_best_threshold(dt, X_test)

Unnamed: 0,threshold,accuracy_score
7,0.35,0.529579


### 2.6.3. Random Forest Classifier

In [18]:
rf = RandomForestClassifier()
rf.fit(X_train, y_train)

In [19]:
find_best_threshold(rf, X_val)

Unnamed: 0,threshold,accuracy_score
10,0.5,0.768771


In [20]:
find_best_threshold(rf, X_test)

Unnamed: 0,threshold,accuracy_score
20,1.0,0.614334


### 2.6.4 Xgbost

In [21]:
features = dv.feature_names_
dtrain = xgb.DMatrix(X_train, label=y_train, feature_names=features)
dval = xgb.DMatrix(X_val, label=y_val, feature_names=features)
dtest = xgb.DMatrix(X_test, label=y_test, feature_names=features)

In [22]:
xgb_params = {
    'eta': 0.3, 
    'max_depth': 6,
    'min_child_weight': 1,
    
    'objective': 'binary:logistic',
    'nthread': 8,
    
    'seed': 1,
    'verbosity': 1,
}

model = xgb.train(xgb_params, dtrain, num_boost_round=10)

In [23]:
y_pred = model.predict(dval)

thresholds = np.linspace(0, 1, 21)

scores = []
thr = []

for t in thresholds:
    score = accuracy_score(y_val, y_pred >= t)
    scores.append(score)
    thr.append(t)

pd.DataFrame(data={'threshold':thr,'accuracy_score':scores}). sort_values(by=['accuracy_score'], ascending=False).head(1)

Unnamed: 0,threshold,accuracy_score
11,0.55,0.783845


In [24]:
y_pred = model.predict(dtest)

thresholds = np.linspace(0, 1, 21)

scores = []
thr = []

for t in thresholds:
    score = accuracy_score(y_test, y_pred >= t)
    scores.append(score)
    thr.append(t)

pd.DataFrame(data={'threshold':thr,'accuracy_score':scores}). sort_values(by=['accuracy_score'], ascending=False).head(1)

Unnamed: 0,threshold,accuracy_score
10,0.5,0.779863


# 3. Select the model.

The model with the best accuracy score is the Xboost