In [1]:
import pandas as pd
import numpy as np
import matplotlib as plt

%matplotlib inline

In [2]:

TABLE_FOLDER = '../../src/data'
RAW_FILE = 'modeling_v1_data.csv'

data = pd.read_csv(f'{TABLE_FOLDER}/{RAW_FILE}')
data.drop('Unnamed: 0', axis=1, inplace=True)
data.shape

(8179, 10)

In [3]:
data.head()

Unnamed: 0,last_5_home,last_5_away,last_h_goals,last_a_goals,last_wh_goals,last_wa_goals,odd_1,odd_N,odd_2,winner
0,LWWDL,DWWWW,9.0,12.0,8.0,3.0,3.6,3.6,1.7,2.0
1,LLLWL,LDLLW,11.0,5.0,5.0,11.0,1.79,3.3,3.8,1.0
2,WLLWL,WLWWW,7.0,8.0,5.0,4.0,3.5,3.3,1.96,1.0
3,LWLLL,WWDLL,12.0,7.0,7.0,7.0,2.5,3.1,2.5,1.0
4,WLWWW,WLLLL,4.0,5.0,13.0,10.0,1.27,4.5,9.99,0.0


In [4]:
X = data.drop('winner', axis=1)
y = data.winner
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

In [5]:
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())])
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))])

In [6]:
data.dtypes

last_5_home       object
last_5_away       object
last_h_goals     float64
last_a_goals     float64
last_wh_goals    float64
last_wa_goals    float64
odd_1            float64
odd_N            float64
odd_2            float64
winner           float64
dtype: object

In [7]:
numeric_features = data.select_dtypes(include=['int64', 'float64']).drop(['winner'], axis=1).columns
categorical_features = data.select_dtypes(include=['object']).columns

numeric_features, categorical_features

(Index(['last_h_goals', 'last_a_goals', 'last_wh_goals', 'last_wa_goals',
        'odd_1', 'odd_N', 'odd_2'],
       dtype='object'),
 Index(['last_5_home', 'last_5_away'], dtype='object'))

In [8]:
from sklearn.compose import ColumnTransformer
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)])

In [9]:
from sklearn.ensemble import RandomForestClassifier
rf = Pipeline(steps=[('preprocessor', preprocessor),
                      ('classifier', RandomForestClassifier())])

In [10]:
rf.fit(X_train, y_train)

In [11]:
y_pred = rf.predict(X_test)

In [12]:
from sklearn.metrics import accuracy_score, log_loss
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC, LinearSVC, NuSVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
classifiers = [
    KNeighborsClassifier(3),
    SVC(kernel="rbf", C=0.025, probability=True),
    NuSVC(probability=True),
    DecisionTreeClassifier(),
    RandomForestClassifier(),
    AdaBoostClassifier(),
    GradientBoostingClassifier()
    ]
for classifier in classifiers:
    pipe = Pipeline(steps=[('preprocessor', preprocessor),
                      ('classifier', classifier)])
    pipe.fit(X_train, y_train)   
    print(classifier)
    print("model score: %.3f" % pipe.score(X_test, y_test))

KNeighborsClassifier(n_neighbors=3)
model score: 0.493
SVC(C=0.025, probability=True)
model score: 0.531
NuSVC(probability=True)
model score: 0.484
DecisionTreeClassifier()
model score: 0.463
RandomForestClassifier()
model score: 0.537
AdaBoostClassifier()
model score: 0.529
GradientBoostingClassifier()
model score: 0.536


In [13]:
param_grid = { 
    'classifier__n_estimators': [100],
    'classifier__max_depth' : [26,28,22]}

clf = RandomForestClassifier()
pipe = Pipeline(steps=[('preprocessor', preprocessor),
                    ('classifier', clf)])
from sklearn.model_selection import GridSearchCV
CV = GridSearchCV(pipe, param_grid, n_jobs= 1)
                  
CV.fit(X_train, y_train)  
print(CV.best_params_)    
print(CV.best_score_)

{'classifier__max_depth': 26, 'classifier__n_estimators': 100}
0.5431739334599561


In [14]:
row = data.sample(1).drop('winner',axis=1)

CV.predict(row)

array([0.])

In [15]:
row.values

array([['LLWWD', 'LLDWW', 7.0, 4.0, 6.0, 6.0, 1.37, 4.59, 9.04]],
      dtype=object)

In [16]:
row

Unnamed: 0,last_5_home,last_5_away,last_h_goals,last_a_goals,last_wh_goals,last_wa_goals,odd_1,odd_N,odd_2
4365,LLWWD,LLDWW,7.0,4.0,6.0,6.0,1.37,4.59,9.04


In [17]:
X_test

Unnamed: 0,last_5_home,last_5_away,last_h_goals,last_a_goals,last_wh_goals,last_wa_goals,odd_1,odd_N,odd_2
5737,LLLDD,DWLLD,7.0,6.0,3.0,6.0,2.46,3.20,3.11
3988,DWWWD,WDLWL,4.0,5.0,14.0,9.0,1.25,5.57,12.15
6678,DLWLL,LWLWW,9.0,7.0,9.0,2.0,5.74,4.56,1.56
2011,WWWWW,WWDLL,0.0,6.0,11.0,7.0,1.58,3.49,5.78
3435,DDWLL,DWWLD,6.0,7.0,4.0,7.0,2.25,3.22,3.27
...,...,...,...,...,...,...,...,...,...
679,LWLDW,LLWLL,6.0,5.0,9.0,14.0,1.35,4.00,4.00
6975,LLLDD,WLLLL,12.0,7.0,3.0,10.0,2.47,3.08,3.27
5370,LLLLL,WWDLW,11.0,6.0,4.0,6.0,3.34,3.42,2.21
4592,WWDWD,WWWLD,5.0,13.0,8.0,4.0,3.02,3.27,2.38


In [1]:
r=pd.DataFrame([['WLLLD', 'WDWWW', 4, 2, 15, 4, 2.87, 3.41, 2.92]])
r.columns=['last_5_home','last_5_away','last_h_goals','last_a_goals',
            'last_wh_goals','last_wa_goals','odd_1','odd_N','odd_2']
r

<IPython.core.display.Javascript object>

Unnamed: 0,last_5_home,last_5_away,last_h_goals,last_a_goals,last_wh_goals,last_wa_goals,odd_1,odd_N,odd_2
0,WLLLD,WDWWW,4,2,15,4,2.87,3.41,2.92


In [2]:
CV.predict_proba(r), CV.predict(r)

NameError: name 'CV' is not defined

In [21]:
import pickle
filename = 'finalized_model.sav'
pickle.dump(CV, open(filename, 'wb'))
 
# some time later...
 
# load the model from disk
loaded_model = pickle.load(open(filename, 'rb'))
result = loaded_model.score(X_test, y_test)

In [22]:
result

0.5342298288508558

In [23]:
loaded_model.predict_proba(r)

array([[0.41777782, 0.28342682, 0.29879536]])