In [1]:
from sklearn.datasets import make_classification
from sklearn.multioutput import MultiOutputClassifier
from sklearn.ensemble import RandomForestClassifier

from sklearn.model_selection import train_test_split

from sklearn.utils import shuffle
import numpy as np
import pyforest

In [2]:
X, y1 = make_classification(n_samples=10, n_features=6, n_informative=3, n_classes=3, random_state=1)
y2 = shuffle(y1, random_state=1)
y3 = shuffle(y1, random_state=2)
Y = np.vstack((y1, y2, y3)).T
n_samples, n_features = X.shape # 10,100
n_outputs = Y.shape[1] # 3
n_classes = 3

In [3]:
print(X[0])
print(y1[0])

[ 0.12485758 -0.8230969   0.02325242  1.1133852   1.63913742 -0.24407447]
0


In [4]:
data = pd.read_csv('bet.csv')
data.shape

<IPython.core.display.Javascript object>

(44626, 7)

In [5]:
# reduce the dataset for smaller volumne
data = data.sample(6000)

In [6]:
data.shape

(6000, 7)

In [7]:
data.head(5)

Unnamed: 0,Hteam,Ateam,Month,Odd-1,Odd-N,Odd-2,Target
35888,Blackpool,Blackburn,9,3.12,3.29,2.28,0.0
4986,Chelsea,Man United,10,2.4,3.4,2.9,0.0
36250,Stoke,Birmingham,9,2.15,3.35,4.05,2.0
12002,Stoke,Birmingham,9,2.1,3.2,3.2,2.0
26916,Liverpool,West Brom,4,1.44,4.33,7.0,2.0


In [8]:
X = data.drop('Target', axis=1)
y1 = data.Target.values
print(f"Dataset Size = {data.shape}")


Dataset Size = (6000, 7)


In [9]:
y2 = shuffle(y1, random_state=1)
y3 = shuffle(y1, random_state=2)
y = np.vstack((y1, y2, y3)).T

n_outputs = y.shape[1] # 3
n_classes = 3

In [10]:
# split the dataset
X_train, X_test, y_train, y_test = train_test_split(X, y1, test_size=0.1, random_state=42, shuffle=False )

In [11]:
categorical_features = ['Hteam','Ateam']
numerical_features = ['Odd-1','Odd-N','Odd-2','Month']

In [12]:
from sklearn.preprocessing import StandardScaler, OrdinalEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.multiclass import OneVsRestClassifier

In [13]:
numeric_transformer = Pipeline(steps=[
       ('imputer', SimpleImputer(strategy='mean'))
      ,('scaler', StandardScaler())
])
categorical_transformer = Pipeline(steps=[
       ('imputer', SimpleImputer(strategy='constant'))
      ,('encoder', OrdinalEncoder())
])


In [14]:
preprocessor = ColumnTransformer(
   transformers=[
    ('numeric', numeric_transformer, numerical_features)
   ,('categorical', categorical_transformer, categorical_features)
]) 

In [15]:
forest = RandomForestClassifier(random_state=1)
multi_target_forest = OneVsRestClassifier(forest, n_jobs=-1)


In [16]:
pipeline = Pipeline(steps = [
               ('preprocessor', preprocessor)
              ,('regressor',multi_target_forest)
           ])


In [17]:
proba=pipeline.fit(X_train, y_train).predict_proba(X_test)

In [18]:
# scoring the estimator
y_pred = pipeline.predict(X_test)
pipeline.score(X_test,np.array(y_test))

0.725

In [19]:
pipeline


Pipeline(steps=[('preprocessor',
                 ColumnTransformer(transformers=[('numeric',
                                                  Pipeline(steps=[('imputer',
                                                                   SimpleImputer()),
                                                                  ('scaler',
                                                                   StandardScaler())]),
                                                  ['Odd-1', 'Odd-N', 'Odd-2',
                                                   'Month']),
                                                 ('categorical',
                                                  Pipeline(steps=[('imputer',
                                                                   SimpleImputer(strategy='constant')),
                                                                  ('encoder',
                                                                   OrdinalEncoder())]),
                         

In [20]:
one_row = pd.DataFrame(X_test.loc[X_test.index[2],:]).T
y_pred=pipeline.predict_proba(one_row)
y_pred

<IPython.core.display.Javascript object>

array([[0.37614679, 0.26605505, 0.35779817]])

In [21]:
one_row

Unnamed: 0,Hteam,Ateam,Month,Odd-1,Odd-N,Odd-2
35751,Sunderland,Man City,8,3.7,3.3,1.9


In [22]:
one_row = pd.DataFrame(['Chelsea','Everton',4,2.34,3.78,3.03]).T
one_row.columns = ['Hteam','Ateam','Month','Odd-1','Odd-N','Odd-2']
one_row

<IPython.core.display.Javascript object>

Unnamed: 0,Hteam,Ateam,Month,Odd-1,Odd-N,Odd-2
0,Chelsea,Everton,4,2.34,3.78,3.03


In [23]:
y_pred=pipeline.predict_proba(one_row)
y_pred

array([[0.18367347, 0.20408163, 0.6122449 ]])

In [24]:
filename = '../foot-odds.pkl'

import pickle
pickle.dump(pipeline, open(filename, 'wb'))

In [25]:
loaded_model = pickle.load(open(filename, 'rb'))
result = loaded_model.predict(one_row)
print(result)

[2.]


In [26]:
X_test.loc[X_test.index[2],:]

Hteam    Sunderland
Ateam      Man City
Month             8
Odd-1           3.7
Odd-N           3.3
Odd-2           1.9
Name: 35751, dtype: object

In [27]:
X_test.query('Hteam=="Tottenham"')

Unnamed: 0,Hteam,Ateam,Month,Odd-1,Odd-N,Odd-2
3913,Tottenham,Fulham,3,1.48,4.25,6.5
27077,Tottenham,Newcastle,10,1.7,3.6,4.7
27329,Tottenham,Everton,11,2.38,3.65,3.39
15989,Tottenham,Crystal Palace,8,1.46,4.48,7.49
36946,Tottenham,Man United,1,2.84,3.27,2.49
2652,Tottenham,Wigan,3,1.5,4.33,6.5
26703,Tottenham,Liverpool,8,2.7,3.5,2.7
4919,Tottenham,Chelsea,10,2.7,3.0,2.8
2487,Tottenham,Aston Villa,7,1.45,4.28,7.4
27549,Tottenham,Burnley,12,1.5,4.0,6.1
