# 1. Info

Notebook to train the model.

Before running this notebook, run the notebook "__data_preparation.ipynb__". For a better understanding of the model parameters, run the notebook "__parameter_tuning.ipynb__".

# 2. Training and saving the model

In [1]:
# import libraries
import pandas as pd
import xgboost as xgb
from sklearn.feature_extraction import DictVectorizer
import pickle

In [2]:
# Load the enriched Premier League data and split it into features and target.
data_model = pd.read_csv('../data/enriched_data/premier_league.csv')

# Normalise the column names: lower-case, with spaces replaced by underscores.
data_model.columns = data_model.columns.str.lower().str.replace(' ', '_')

# Columns used as model inputs.
most_meaningful_features = [
    'ftg_scored_total',
    'htg_scored_total',
    'points',
    'goal_difference',
    'position',
    'win_rate',
    'mooving_win_rate',
    'mooving_goals_scored',
]

# Work on a copy restricted to the selected features plus the 'win' target,
# then move the target out of the frame into its own array so that
# model_df is left holding only the feature columns.
model_df = data_model[most_meaningful_features + ['win']].copy()
y_df = model_df.pop('win').values

In [3]:
# Vectorize the features: each row becomes a {column: value} record, and the
# DictVectorizer (dense output) is fitted on those records and applied to them.
dv = DictVectorizer(sparse=False)

X_dict = model_df[most_meaningful_features].to_dict('records')
X_df = dv.fit_transform(X_dict)

# Reuse the feature names learned by the vectorizer to label the DMatrix columns.
features = dv.feature_names_
dx = xgb.DMatrix(data=X_df, label=y_df, feature_names=features)

In [4]:
# Parameters handed to xgb.train; see the "parameter_tuning.ipynb" notebook
# for background on the model parameters.
xgb_params = dict(
    # tree booster settings
    eta=0.3,
    max_depth=4,
    min_child_weight=1,
    # learning objective and threading
    objective='binary:logistic',
    nthread=8,
    # random seed and log verbosity
    seed=1,
    verbosity=1,
)

In [5]:
# Fit the booster on the full DMatrix for 100 boosting rounds.
model = xgb.train(
    params=xgb_params,
    dtrain=dx,
    num_boost_round=100,
)

In [7]:
# Destination files for the trained booster and the fitted vectorizer.
output_model = '../xgboost_model.pkl'
output_vectorizer = '../dv.pkl'

# Save the model and the vectorizer: pickle each artifact to its own file,
# model first, then vectorizer.
for target_path, artifact in ((output_model, model), (output_vectorizer, dv)):
    with open(target_path, "wb") as f:
        pickle.dump(artifact, f)

End of the notebook