# 1. Info

Notebook for tuning the model parameters. The parameters to be tuned are:

    * eta
    * max_depth
    * min_child_weight

Before running this notebook you should have run the "__data_preparation.ipynb__" notebook. To understand why the XGBoost model is being tuned, check the "__model_selection.ipynb__" notebook.

# 2. Parameter Tuning

## 2.1. Import libraries

In [1]:
import pandas as pd
import numpy as np
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction import DictVectorizer
from sklearn.metrics import accuracy_score

## 2.2. Read the data

In [2]:
# Read the enriched Premier League CSV; per the notebook introduction,
# data_preparation.ipynb must have been run before this cell.
data = pd.read_csv(filepath_or_buffer='../data/enriched_data/premier_league.csv')

## 2.3 Prepare the data

In [3]:
# Normalise the column labels: lower-case them and swap spaces for underscores.
data.columns = [label.lower().replace(' ', '_') for label in data.columns]

In [4]:
# Predictor columns fed to the model in this notebook (one per line for
# easier diffing).
most_meaningful_features = [
    'ftg_scored_total',
    'htg_scored_total',
    'points',
    'goal_difference',
    'position',
    'win_rate',
    'mooving_win_rate',
    'mooving_goals_scored',
]

In [5]:
# Keep the selected predictors plus the 'win' target column; the copy keeps
# later edits to `df` from touching `data`.
df = data.loc[:, [*most_meaningful_features, 'win']].copy()

In [6]:
# Hold out 20% of the rows for testing, then 25% of the remainder for
# validation (roughly a 60/20/20 split). The fixed random_state keeps the
# split reproducible.
df_full_train, df_test = train_test_split(df, test_size=0.2, random_state=7)
df_train, df_val = train_test_split(df_full_train, test_size=0.25, random_state=7)

# Discard the shuffled row labels so each split is indexed from 0.
df_train, df_val, df_test = (
    split.reset_index(drop=True) for split in (df_train, df_val, df_test)
)

# Detach the target from every split: pop() removes the 'win' column from
# the frame and returns it, so the frames are left holding predictors only.
y_train = df_train.pop('win').values
y_val = df_val.pop('win').values
y_test = df_test.pop('win').values

In [7]:
# Dense output so the matrices can be handed to xgb.DMatrix as plain arrays.
dv = DictVectorizer(sparse=False)

# One list of {feature: value} records per split.
train_dict, val_dict, test_dict = (
    frame[most_meaningful_features].to_dict(orient='records')
    for frame in (df_train, df_val, df_test)
)

# The vectoriser is fitted on the training records only; validation and
# test records are transformed with that fitted mapping.
X_train = dv.fit_transform(train_dict)
X_val = dv.transform(val_dict)
X_test = dv.transform(test_dict)

## 2.3 Parameter tuning

In [8]:
def model_accuracy(model, x_data, y_data):
    """Find the decision threshold that gives ``model`` its highest accuracy.

    The scores from ``model.predict(x_data)`` are binarised as
    ``score >= threshold`` for 21 evenly spaced thresholds between 0 and 1,
    and each binarised prediction is scored against ``y_data`` with
    ``accuracy_score``.

    Returns:
        A one-row DataFrame with the columns ``threshold`` and
        ``accuracy_score``: the first row after sorting all candidates by
        accuracy in descending order. The row keeps its position in the
        threshold grid as its index label.
    """
    predicted_scores = model.predict(x_data)
    candidate_thresholds = np.linspace(0, 1, 21)

    accuracies = [
        accuracy_score(y_data, predicted_scores >= cutoff)
        for cutoff in candidate_thresholds
    ]

    results = pd.DataFrame(
        data={'threshold': list(candidate_thresholds), 'accuracy_score': accuracies}
    )
    ranked = results.sort_values(by=['accuracy_score'], ascending=False)
    return ranked.head(1)

In [9]:
# Wrap every split in an xgb.DMatrix, labelled with the vectoriser's
# feature names so they line up with the matrix columns.
features = dv.feature_names_
dtrain, dval, dtest = (
    xgb.DMatrix(matrix, label=target, feature_names=features)
    for matrix, target in ((X_train, y_train), (X_val, y_val), (X_test, y_test))
)

### 2.3.1. eta

In [10]:
# Sweep the learning rate (eta) with max_depth and min_child_weight held at
# the values set below. Each run contributes one row: the best threshold
# and its validation accuracy, as returned by model_accuracy.
eta_results = []

for eta in np.linspace(0.1, 1.5, 15):
    xgb_params = {
        'eta': eta,
        'max_depth': 6,
        'min_child_weight': 1,

        'objective': 'binary:logistic',
        'nthread': 8,

        'seed': 1,
        'verbosity': 1,
    }
    model = xgb.train(xgb_params, dtrain, num_boost_round=100)

    # assign() returns a new frame, so the frame returned by model_accuracy
    # is not mutated in place.
    accuracy_df = model_accuracy(model, dval, y_val).assign(eta=eta)
    eta_results.append(accuracy_df)

# Concatenate once after the loop instead of growing a DataFrame inside it;
# this also removes the need for a sentinel "empty" frame. Row labels (the
# position of the winning threshold) are kept as-is.
historical_accuracy_df = pd.concat(eta_results)

In [11]:
# Show the five eta runs with the highest validation accuracy.
historical_accuracy_df.sort_values('accuracy_score', ascending=False).iloc[:5]

Unnamed: 0,threshold,accuracy_score,eta
11,0.55,0.784699,0.2
10,0.5,0.782139,0.1
10,0.5,0.78157,0.3
8,0.4,0.78157,0.6
10,0.5,0.778441,0.5


Since the difference between the accuracy for eta 1.0 (0.7778) and eta 0.3 (0.7775) is negligible, both eta values will be considered for the next parameter.

### 2.3.2 max_depth

Select the value when the model performance on the validation set stops improving.

In [14]:
# For each shortlisted eta, deepen the trees step by step and stop as soon
# as the validation accuracy no longer improves. Every evaluated run,
# including the non-improving one that triggers the stop, is recorded.
depth_results = []

for eta in [1, 0.3]:

    # Best validation accuracy seen so far for the current eta.
    best_accuracy = 0

    for max_depth in range(3, 11):

        xgb_params = {
            'eta': eta,
            'max_depth': max_depth,
            'min_child_weight': 1,

            'objective': 'binary:logistic',
            'nthread': 8,

            'seed': 1,
            'verbosity': 1,
        }
        model = xgb.train(xgb_params, dtrain, num_boost_round=100)

        # One-row frame (best threshold + accuracy) tagged with the
        # parameters of this run.
        accuracy_df = model_accuracy(model, dval, y_val).assign(
            eta=eta, max_depth=max_depth
        )
        depth_results.append(accuracy_df)

        # check if the model stop improving
        accuracy_value = accuracy_df['accuracy_score'].iloc[0]
        if accuracy_value <= best_accuracy:
            break
        best_accuracy = accuracy_value

# Concatenate once after the loops instead of growing a DataFrame inside
# them; no sentinel "empty" frame is needed.
historical_accuracy_df = pd.concat(depth_results)


In [15]:
# Display every (eta, max_depth) run recorded by the sweep above, in
# evaluation order.
historical_accuracy_df

Unnamed: 0,threshold,accuracy_score,eta,max_depth
10,0.5,0.77901,1.0,3
12,0.6,0.775313,1.0,4
11,0.55,0.781286,0.3,3
10,0.5,0.795506,0.3,4
10,0.5,0.784699,0.3,5


The eta (0.3) and the max_depth (4) will be the parameter values to keep using.

### 2.3.3 min_child_weight

In [16]:
# Sweep min_child_weight with eta and max_depth fixed at the values set
# below. Each run contributes one row: the best threshold and its
# validation accuracy, as returned by model_accuracy.
min_child_results = []

for min_child in np.linspace(1, 10, 10):
    xgb_params = {
        'eta': 0.3,
        'max_depth': 4,
        'min_child_weight': min_child,

        'objective': 'binary:logistic',
        'nthread': 8,

        'seed': 1,
        'verbosity': 1,
    }
    model = xgb.train(xgb_params, dtrain, num_boost_round=100)

    # assign() returns a new frame, so the frame returned by model_accuracy
    # is not mutated in place.
    accuracy_df = model_accuracy(model, dval, y_val).assign(min_child_weight=min_child)
    min_child_results.append(accuracy_df)

# Concatenate once after the loop instead of growing a DataFrame inside it;
# no sentinel "empty" frame is needed.
historical_accuracy_df = pd.concat(min_child_results)

In [18]:
# Show the five min_child_weight runs with the highest validation accuracy.
historical_accuracy_df.sort_values('accuracy_score', ascending=False).iloc[:5]

Unnamed: 0,threshold,accuracy_score,min_child_weight
10,0.5,0.795506,1.0
10,0.5,0.787543,6.0
9,0.45,0.786689,4.0
11,0.55,0.786121,9.0
10,0.5,0.784699,5.0


# 3. Best parameters

The parameters that will be used are:

    * eta = 0.3
    * max_depth = 4
    * min_child_weight = 1
    * threshold = 0.5

End of the notebook