In [1]:
import pandas as pd
import numpy as np
import matplotlib as plt

%matplotlib inline

In [2]:

TABLE_FOLDER = '../../src/data'
RAW_FILE = 'modeling_v1_data.csv'

# Read the modelling table and discard the 'Unnamed: 0' column in one chained
# expression, then show the resulting (rows, columns) shape.
data = pd.read_csv(f'{TABLE_FOLDER}/{RAW_FILE}').drop(columns='Unnamed: 0')
data.shape

(8179, 10)

In [3]:
# Preview the first rows of the loaded table.
data.head()

Unnamed: 0,last_5_home,last_5_away,last_h_goals,last_a_goals,last_wh_goals,last_wa_goals,odd_1,odd_N,odd_2,winner
0,La/Wh/Wa/Dh/La,Dh/Wh/Wa/Wa/Wh,9.0,12.0,8.0,3.0,3.6,3.6,1.7,2.0
1,La/Lh/La/Wh/La,La/Dh/Lh/La/Wh,11.0,5.0,5.0,11.0,1.79,3.3,3.8,1.0
2,Wa/Lh/La/Wh/La,Wh/La/Wh/Wa/Wh,7.0,8.0,5.0,4.0,3.5,3.3,1.96,1.0
3,La/Wh/La/Lh/La,Wh/Wa/Dh/La/Lh,12.0,7.0,7.0,7.0,2.5,3.1,2.5,1.0
4,Wh/La/Wh/Wa/Wa,Wh/La/Lh/La/Lh,4.0,5.0,13.0,10.0,1.27,4.5,9.99,0.0


In [4]:
from sklearn.model_selection import train_test_split

# Separate the target column ('winner') from the feature columns.
X = data.drop('winner', axis=1)
y = data.winner

# Hold out 20% of the rows for evaluation. A fixed random_state makes the
# split (and therefore every score computed from it) reproducible across
# kernel restarts.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42)

In [5]:
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder

# Numeric columns: fill gaps with the column median, then standardise.
numeric_steps = [
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
]
numeric_transformer = Pipeline(steps=numeric_steps)

# Text columns: fill gaps with the literal 'missing', then one-hot encode;
# categories not seen during fit are ignored at transform time.
categorical_steps = [
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
]
categorical_transformer = Pipeline(steps=categorical_steps)

In [6]:
# Inspect the dtype of every column.
data.dtypes

last_5_home       object
last_5_away       object
last_h_goals     float64
last_a_goals     float64
last_wh_goals    float64
last_wa_goals    float64
odd_1            float64
odd_N            float64
odd_2            float64
winner           float64
dtype: object

In [7]:
# Column names grouped by dtype; the target 'winner' is removed from the
# numeric group so it is never treated as a feature.
numeric_columns = data.select_dtypes(include=['int64', 'float64'])
numeric_features = numeric_columns.drop(columns=['winner']).columns

text_columns = data.select_dtypes(include=['object'])
categorical_features = text_columns.columns

numeric_features, categorical_features

(Index(['last_h_goals', 'last_a_goals', 'last_wh_goals', 'last_wa_goals',
        'odd_1', 'odd_N', 'odd_2'],
       dtype='object'),
 Index(['last_5_home', 'last_5_away'], dtype='object'))

In [8]:
from sklearn.compose import ColumnTransformer

# Route each group of columns to its own transformer pipeline.
column_transformers = [
    ('num', numeric_transformer, numeric_features),
    ('cat', categorical_transformer, categorical_features),
]
preprocessor = ColumnTransformer(transformers=column_transformers)

In [9]:
# Insert the code for a custom transformer for the dataset
from sklearn.base import BaseEstimator
from sklearn.base import TransformerMixin
import random
class ComputeTrend(BaseEstimator, TransformerMixin):
    """Turn the two recent-form strings into numeric trend features.

    ``last_5_home`` and ``last_5_away`` each hold five '/'-separated result
    codes such as ``'Wa/Da/Wh/Wh/Da'``. Every code is mapped to a score (see
    ``RESULT_POINTS``), the scores are cumulatively summed in the order
    given, and a straight line is fitted through that cumulative sum against
    the positions 1..5. The slope and intercept of the line are stored as
    ``home_trend_slope`` / ``home_trend_intercept`` and
    ``away_trend_slope`` / ``away_trend_intercept``; the two string columns
    are then dropped.

    The transformer is stateless: ``fit`` learns nothing from the data.
    """

    # Score given to each result code.
    # NOTE(review): the codes look like Win/Draw/Loss + home/away -- confirm
    # with whoever produced the dataset.
    RESULT_POINTS = {'Wa': 7, 'Da': 4, 'Wh': 3, 'Dh': 1, 'Lh': -2, 'La': 0}

    # Number of results expected in every form string.
    N_RESULTS = 5

    # Output columns, in the order they are appended to the frame.
    TREND_COLUMNS = ['home_trend_slope', 'home_trend_intercept',
                     'away_trend_slope', 'away_trend_intercept']

    def __init__(self):
        # Kept for backward compatibility with previously pickled instances.
        self.initialization = 'OK'

    @staticmethod
    def compute(result5):
        """Return ``(slope, intercept)`` of the trend line for one form string.

        Parameters
        ----------
        result5 : str
            Five '/'-separated result codes, e.g. ``'Wa/Da/Wh/Wh/Da'``.

        Returns
        -------
        tuple of float
            Slope and intercept of the degree-1 least-squares fit of the
            cumulative score against positions 1..5.

        Raises
        ------
        ValueError
            If ``result5`` is not a string, does not contain exactly five
            codes, or contains a code missing from ``RESULT_POINTS``.
        """
        if not isinstance(result5, str):
            raise ValueError('form must be a string, got %r' % (result5,))

        codes = [code.strip() for code in result5.split('/')]
        if len(codes) != ComputeTrend.N_RESULTS:
            raise ValueError('expected %d results, got %r'
                             % (ComputeTrend.N_RESULTS, result5))
        try:
            scores = [ComputeTrend.RESULT_POINTS[code] for code in codes]
        except KeyError as exc:
            raise ValueError('unknown result code in %r' % (result5,)) from exc

        positions = np.arange(1, ComputeTrend.N_RESULTS + 1)
        slope, intercept = np.polyfit(positions, np.cumsum(scores), 1)
        return slope, intercept

    def _row_trends(self, home_form, away_form):
        """Return the four trend features for one row (all 0.0 if malformed)."""
        try:
            return self.compute(home_form) + self.compute(away_form)
        except ValueError:
            # Missing or malformed form on either side: neutral features for
            # the whole row, as the previous implementation did.
            return (0.0, 0.0, 0.0, 0.0)

    def fit(self, X, y=None):
        """No-op; present so the class can be used as a pipeline step."""
        return self

    def transform(self, X, y=None):
        """Return a copy of ``X`` with trend features replacing the form strings.

        Parameters
        ----------
        X : pandas.DataFrame
            Must contain the columns ``last_5_home`` and ``last_5_away``.
        y : ignored

        Returns
        -------
        pandas.DataFrame
            ``X`` without the two form columns and with the four
            ``TREND_COLUMNS`` added (float). ``X`` itself is not modified.

        Raises
        ------
        KeyError
            If either form column is missing from ``X``.
        """
        X_ = X.copy()

        rows = [self._row_trends(home_form, away_form)
                for home_form, away_form
                in zip(X_['last_5_home'], X_['last_5_away'])]

        # Assign whole columns at once instead of writing cell by cell; the
        # reshape keeps an empty input well-formed (0 rows, 4 columns).
        values = np.array(rows, dtype=float).reshape(
            len(X_), len(self.TREND_COLUMNS))
        for position, column in enumerate(self.TREND_COLUMNS):
            X_[column] = values[:, position]

        return X_.drop(['last_5_home', 'last_5_away'], axis=1)



    

In [10]:
from sklearn.ensemble import RandomForestClassifier

# Baseline model: trend features -> standardisation -> random forest.
# ComputeTrend is used as the preprocessing step here rather than the
# generic ColumnTransformer `preprocessor`.
rf_steps = [
    ('preprocessor', ComputeTrend()),
    ('scaler', StandardScaler()),
    ('classifier', RandomForestClassifier()),
]
rf = Pipeline(steps=rf_steps)

In [11]:
# Fit every step of the `rf` pipeline on the training split.
rf.fit(X_train, y_train)


Pipeline(steps=[('preprocessor', ComputeTrend()), ('scaler', StandardScaler()),
                ('classifier', RandomForestClassifier())])

In [12]:
# Display the training features.
X_train

Unnamed: 0,last_5_home,last_5_away,last_h_goals,last_a_goals,last_wh_goals,last_wa_goals,odd_1,odd_N,odd_2
704,Lh/Da/Wa/Dh/Da,Dh/La/La/Wh/Dh,8.0,7.0,9.0,8.0,1.48,3.96,5.90
4722,Da/Wh/Da/Dh/La,Lh/Wa/Wh/Wa/Wh,7.0,14.0,6.0,8.0,4.39,3.89,1.75
7874,Lh/La/Wa/La/Lh,Wh/Wa/La/Wh/Wa,13.0,12.0,4.0,4.0,2.95,3.31,2.50
74,Wh/Wa/Wa/Wh/Wh,Wa/La/Wh/Wh/La,0.0,7.0,6.0,4.0,2.35,3.10,3.60
3710,Wa/Wh/Da/Lh/La,La/Dh/Wh/La/Dh,7.0,8.0,7.0,8.0,2.10,3.20,3.69
...,...,...,...,...,...,...,...,...,...
2342,Da/Da/Dh/Wh/La,Lh/La/La/Dh/Da,9.0,2.0,6.0,6.0,1.81,3.31,4.44
2275,Wh/La/Dh/Wa/Wh,Wa/Wh/Wa/Wh/La,3.0,13.0,9.0,3.0,2.61,3.04,2.70
1392,Da/Da/Wh/Da/Wh,Lh/Da/Lh/Da/Wh,3.0,12.0,8.0,11.0,2.16,3.05,3.09
7655,La/Wh/La/La/Wh,Wh/Wa/Dh/Wa/La,7.0,8.0,8.0,2.0,1.80,3.99,4.23


In [13]:
# Predict a label for every row of the held-out test features.
y_pred = rf.predict(X_test)

In [14]:
from sklearn.metrics import accuracy_score, log_loss
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC, LinearSVC, NuSVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import (
    RandomForestClassifier,
    AdaBoostClassifier,
    GradientBoostingClassifier,
)
from sklearn.discriminant_analysis import (
    LinearDiscriminantAnalysis,
    QuadraticDiscriminantAnalysis,
)

# Candidate models; each one is trained behind the same two preprocessing
# steps so their test scores are directly comparable.
classifiers = [
    KNeighborsClassifier(3),
    SVC(kernel="rbf", C=0.025, probability=True),
    NuSVC(probability=True),
    DecisionTreeClassifier(),
    RandomForestClassifier(),
    AdaBoostClassifier(),
    GradientBoostingClassifier(),
]

for classifier in classifiers:
    pipe = Pipeline(steps=[
        ('preprocessor', ComputeTrend()),
        ('scaler', StandardScaler()),
        ('classifier', classifier),
    ])
    pipe.fit(X_train, y_train)
    # Report the model followed by its score on the held-out split.
    print(classifier)
    print("model score: %.3f" % pipe.score(X_test, y_test))

KNeighborsClassifier(n_neighbors=3)
model score: 0.465
SVC(C=0.025, probability=True)
model score: 0.519
NuSVC(probability=True)
model score: 0.463
DecisionTreeClassifier()
model score: 0.433
RandomForestClassifier()
model score: 0.516
AdaBoostClassifier()
model score: 0.526
GradientBoostingClassifier()
model score: 0.534


In [15]:
from sklearn.model_selection import GridSearchCV

# Candidate hyper-parameters; the 'classifier__' prefix addresses the
# pipeline step named 'classifier'.
param_grid = {
    'classifier__n_estimators': [100],
    'classifier__max_depth': [26, 28, 22],
}

clf = RandomForestClassifier()
pipe = Pipeline(steps=[
    ('preprocessor', ComputeTrend()),
    ('scaler', StandardScaler()),
    ('classifier', clf),
])

# Cross-validated search over the grid, run in a single process.
CV = GridSearchCV(pipe, param_grid, n_jobs=1)
CV.fit(X_train, y_train)

print(CV.best_params_)
print(CV.best_score_)

{'classifier__max_depth': 26, 'classifier__n_estimators': 100}
0.5207064477167014


In [17]:
# Draw one random match from the full table, strip the target column and
# ask the tuned model for its prediction.
sampled_match = data.sample(n=1)
row = sampled_match.drop(columns='winner')

CV.predict(row)

array([1.])

In [18]:
# Raw values of the sampled row as a NumPy array.
row.values

array([['La/Lh/Wa/Lh/Dh', 'Lh/La/Lh/La/Dh', 8.0, 3.0, 4.0, 14.0, 2.24,
        3.32, 3.46]], dtype=object)

In [19]:
# Show the sampled row as a DataFrame.
row

Unnamed: 0,last_5_home,last_5_away,last_h_goals,last_a_goals,last_wh_goals,last_wa_goals,odd_1,odd_N,odd_2
5799,La/Lh/Wa/Lh/Dh,Lh/La/Lh/La/Dh,8.0,3.0,4.0,14.0,2.24,3.32,3.46


In [20]:
# Display the held-out test features.
X_test

Unnamed: 0,last_5_home,last_5_away,last_h_goals,last_a_goals,last_wh_goals,last_wa_goals,odd_1,odd_N,odd_2
5759,La/Dh/La/Lh/Wa,Lh/Da/Dh/Wh/Da,6.0,7.0,3.0,5.0,2.72,3.27,2.72
6528,Wh/Da/Wa/Wh/Wa,Da/Lh/La/Lh/La,1.0,3.0,8.0,11.0,1.24,6.33,14.27
8140,Wa/Wh/Wh/La/Wh,Dh/Da/Dh/La/La,1.0,4.0,16.0,9.0,1.17,8.08,16.20
3599,Wa/Dh/Wa/Lh/La,Wh/La/Lh/Wh/Wa,8.0,10.0,8.0,8.0,3.61,3.42,2.01
5942,Wa/Lh/Wa/Wa/Dh,Dh/La/Lh/Wa/Wh,5.0,6.0,7.0,8.0,1.59,4.17,6.12
...,...,...,...,...,...,...,...,...,...
2627,Dh/Da/Wh/La/Wh,Lh/La/Lh/Wa/Dh,2.0,4.0,3.0,8.0,2.24,3.21,3.23
3018,Da/Wh/Da/Wh/La,Wh/Da/Dh/La/Wh,1.0,7.0,5.0,6.0,1.50,3.80,7.42
1193,La/Lh/Da/Dh/Wh,Lh/Da/Wh/La/Wh,8.0,6.0,4.0,7.0,2.46,3.08,2.63
5987,La/La/Lh/Da/Wh,Dh/Wh/Da/Wh/Wh,5.0,11.0,3.0,6.0,3.46,3.29,2.26


In [21]:
# Hand-built single match with the same feature columns the model expects.
feature_names = [
    'last_5_home', 'last_5_away',
    'last_h_goals', 'last_a_goals',
    'last_wh_goals', 'last_wa_goals',
    'odd_1', 'odd_N', 'odd_2',
]
feature_values = ['La/Dh/Da/Wh/La', 'Lh/La/Lh/Wa/Dh', 4, 2, 15, 4, 2.87, 3.41, 2.92]

r = pd.DataFrame([feature_values], columns=feature_names)
r

Unnamed: 0,last_5_home,last_5_away,last_h_goals,last_a_goals,last_wh_goals,last_wa_goals,odd_1,odd_N,odd_2
0,La/Dh/Da/Wh/La,Lh/La/Lh/Wa/Dh,4,2,15,4,2.87,3.41,2.92


In [22]:
# Class probabilities and the predicted label for the hand-built row `r`.
CV.predict_proba(r), CV.predict(r)

(array([[0.32, 0.3 , 0.38]]), array([2.]))

In [23]:
import pickle

filename = 'finalized_model.sav'

# Context managers guarantee the file is flushed and closed before it is
# read back, instead of relying on garbage collection of the open handle.
with open(filename, 'wb') as model_file:
    pickle.dump(CV, model_file)

# some time later...

# load the model from disk
# NOTE: pickle.load can execute arbitrary code -- only load files you wrote
# yourself. Unpickling also requires the ComputeTrend class to be defined
# under the same module name as when the model was saved.
with open(filename, 'rb') as model_file:
    loaded_model = pickle.load(model_file)
result = loaded_model.score(X_test, y_test)

In [24]:
# Score of the reloaded model on the test split.
result

0.5128361858190709

In [25]:
# Class probabilities from the reloaded model for the hand-built row `r`.
loaded_model.predict_proba(r)

array([[0.32, 0.3 , 0.38]])

In [26]:
# Manual check of the trend computation on a single five-result sequence:
# map each result code to its score string, then fit a line through the
# cumulative score against positions 1..5.
replacements = [
    ('Wa', '7'),
    ('Da', '4'),
    ('Wh', '3'),
    ('Dh', '1'),
    ('Lh', '-2'),
    ('La', '0'),
]

l = ['Wa', 'Da', 'Wh', 'Wh', 'Da']
for code, score in replacements:
    l = [item.replace(code, score) for item in l]

l = pd.DataFrame({'trend': l})
s = np.arange(1, 6)
cumulative_score = l['trend'].astype(int).cumsum()
model = np.polyfit(s, cumulative_score, 1)
print(model)


[3.4 3.8]
