# 1. Import Libraries

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler

from sklearn.model_selection import GridSearchCV

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

In [2]:
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)

# 2. Read Dataset

In [3]:
df = pd.read_csv('../data/input/input_dataset.csv')
df = df.dropna(axis=0).reset_index(drop=True)

print(df.shape)
df.head()

(3297, 31)


Unnamed: 0,Referee,Day Of Week,Round,Days,Shots,ShotsOT,Corners,Fouls,YCards,RCards,GoalsScored,GoalsConceded,GoalsDiff,HTGoalsScored,HTGoalsConceded,HTGoalsDiff,Points,WinPercent,WinStreak,UnbPercent,UnbStreak,Def,Mid,Att,Ovr,LastSeasonRank,PromotedMatchup,WinnerOdd,DrawOdd,Venue,Result
0,L Probert,Sat,1.0,-4,10.4,6.6,3.8,-1.8,0.0,0.0,0.8,0.4,2.0,0.0,0.4,-2.0,0.4,20.0,0.0,0.0,1.0,7,8,8,7.67,-12,0,-9.71,5.5,Etihad Stadium,W
1,M Halsey,Sat,1.0,1,-1.0,-0.2,0.0,-1.8,0.4,0.0,0.2,0.0,1.0,0.4,0.2,1.0,-0.2,0.0,0.0,-20.0,-1.0,1,1,-2,0.0,1,0,-2.25,3.3,Bet365 Stadium,D
2,L Mason,Sat,1.0,0,6.8,5.0,1.8,-0.2,0.4,0.2,1.6,0.6,5.0,1.4,1.0,2.0,0.4,0.0,0.0,40.0,7.0,3,0,-1,0.67,-6,0,-6.56,4.2,Goodison Park,W
3,L Probert,Sun,1.0,0,3.4,2.2,2.0,0.2,-0.6,0.0,1.8,0.6,6.0,0.4,0.4,0.0,1.0,20.0,0.0,60.0,5.0,1,2,1,1.34,-4,0,-3.58,3.75,Craven Cottage,L
4,P Dowd,Sun,1.0,-1,-2.4,-1.8,-0.6,0.6,-1.0,0.0,-0.4,-1.2,4.0,0.2,0.0,1.0,1.0,20.0,0.0,60.0,6.0,-1,-3,-5,-3.0,-1,0,-1.5,3.3,Carrow Road,W


In [4]:
# label encode categorical feature- Day Of Week
dayOfWeek_le = LabelEncoder()
df['Day Of Week'] = dayOfWeek_le.fit_transform(df['Day Of Week'])

# label encode categorical feature- Referee
referee_le = LabelEncoder()
df['Referee'] = referee_le.fit_transform(df['Referee'])

# label encode categorical feature- Venue
venue_le = LabelEncoder()
df['Venue'] = venue_le.fit_transform(df['Venue'])

# 3. Feature Selection

In [5]:
# features + 'Result'
feature_set_1 = ['Referee', 'Shots', 'Corners', 'Points', 'UnbStreak', 'LastSeasonRank', 'PromotedMatchup', 'Venue', 'Result']
feature_set_2 = ['Shots', 'Corners', 'Points', 'UnbStreak', 'LastSeasonRank', 'Venue', 'Result']

# 4. Model Training

In [6]:
# get the training set consisting wins, draws, loss

# separate wins, draws, lose into win_df, draw_df, lose_df
win_df = df[df['Result'] == 'W'].sample(frac = 1, random_state=42)
draw_df = df[df['Result'] == 'D'].sample(frac = 1, random_state=42)
lose_df = df[df['Result'] == 'L'].sample(frac = 1, random_state=42)

wdl_train_df = pd.DataFrame(columns=list(df.columns))
wdl_train_df = pd.concat([wdl_train_df, win_df.iloc[0:618, :]])
wdl_train_df = pd.concat([wdl_train_df, draw_df.iloc[0:618, :]])
wdl_train_df = pd.concat([wdl_train_df, lose_df.iloc[0:618, :]])

print(wdl_train_df.Result.value_counts())

# shuffle wdl_train_df
wdl_train_df = wdl_train_df.sample(frac = 1, random_state=42).reset_index(drop=True)

W    618
D    618
L    618
Name: Result, dtype: int64


In [7]:
# get the training set consisting wins, non-wins

# separate wins, non-wins into win_df, non_win_df
win_df = df[df['Result'] == 'W'].sample(frac = 1, random_state=42)

non_win_df = df[(df['Result'] == 'D') | (df['Result'] == 'L')]
non_win_df['Result'] = non_win_df['Result'].map({"D": "NW", "L": "NW"}).sample(frac = 1, random_state=42)

wnw_train_df = pd.DataFrame(columns=list(df.columns))
wnw_train_df = pd.concat([wnw_train_df, win_df.iloc[0:1196, :]])
wnw_train_df = pd.concat([wnw_train_df, non_win_df.iloc[0:1196, :]])

print(wnw_train_df.Result.value_counts())

# shuffle wnw_train_df
wnw_train_df = wnw_train_df.sample(frac = 1, random_state=42).reset_index(drop=True)

W     1196
NW    1196
Name: Result, dtype: int64


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  non_win_df['Result'] = non_win_df['Result'].map({"D": "NW", "L": "NW"}).sample(frac = 1, random_state=42)


### Model 1 
- Model: Logistic Regression
- Target Variable: Win/ Draw/ Lose 
- Sampling: Balanced 
- Features: Feature Set 1

In [8]:
# filter df by Feature Set 1
train_df = wdl_train_df[feature_set_1]

# label encode target variable
train_df['Result'] = train_df['Result'].map({'D': 0, 'L': 1, 'W': 2})

# numerical encode
wdl_sc_1 = StandardScaler()
train_df.iloc[:, 1:-3] = wdl_sc_1.fit_transform(train_df.iloc[:, 1:-3])

X_train = train_df.iloc[:, :-1]
y_train = train_df.iloc[:, -1]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_df['Result'] = train_df['Result'].map({'D': 0, 'L': 1, 'W': 2})
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_df.iloc[:, 1:-3] = wdl_sc_1.fit_transform(train_df.iloc[:, 1:-3])
  train_df.iloc[:, 1:-3] = wdl_sc_1.fit_transform(train_df.iloc[:, 1:-3])


In [9]:
# hyperparameter tuning

classifier = LogisticRegression(random_state = 0)
param_grid = {
    'penalty' : ['l1', 'l2'],
    'C' : np.logspace(-4, 4, 20),
    'solver' : ['liblinear']
}
lr_classifier = GridSearchCV(estimator=classifier, param_grid=param_grid, cv=5, verbose=1, scoring='accuracy')
lr_classifier.fit(X_train, y_train)

# get the best parameters
print(lr_classifier.best_params_)

Fitting 5 folds for each of 40 candidates, totalling 200 fits
{'C': 0.08858667904100823, 'penalty': 'l1', 'solver': 'liblinear'}


In [10]:
# train model1
model1 = LogisticRegression(C=0.08859, penalty='l1', solver='liblinear', random_state = 0)
model1.fit(X_train, y_train)

### Model 2 

- Model: Logistic Regression
- Target Variable: Win/ Draw/ Lose 
- Sampling: Balanced 
- Features: Feature Set 2

In [11]:
# filter df by Feature Set 2
train_df = wdl_train_df[feature_set_2]

# label encode target variable
train_df['Result'] = train_df['Result'].map({'D': 0, 'L': 1, 'W': 2})

# numerical encode
wdl_sc_2 = StandardScaler()
train_df.iloc[:, :-2] = wdl_sc_2.fit_transform(train_df.iloc[:, :-2])

X_train = train_df.iloc[:, :-1]
y_train = train_df.iloc[:, -1]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_df['Result'] = train_df['Result'].map({'D': 0, 'L': 1, 'W': 2})
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_df.iloc[:, :-2] = wdl_sc_2.fit_transform(train_df.iloc[:, :-2])
  train_df.iloc[:, :-2] = wdl_sc_2.fit_transform(train_df.iloc[:, :-2])


In [12]:
# hyperparameter tuning

classifier = LogisticRegression(random_state = 0)
param_grid = {
    'penalty' : ['l1', 'l2'],
    'C' : np.logspace(-4, 4, 20),
    'solver' : ['liblinear']
}
lr_classifier = GridSearchCV(estimator=classifier, param_grid=param_grid, cv=5, verbose=1, scoring='accuracy') 
lr_classifier.fit(X_train, y_train)

# get the best parameters
print(lr_classifier.best_params_)

Fitting 5 folds for each of 40 candidates, totalling 200 fits
{'C': 0.03359818286283781, 'penalty': 'l2', 'solver': 'liblinear'}


In [13]:
# train model2
model2 = LogisticRegression(C=0.033598, penalty='l2', solver='liblinear', random_state = 0)
model2.fit(X_train, y_train)

### Model 3

- Model: Logistic Regression
- Target Variable: Win/ Non-Win
- Sampling: Balanced 
- Features: Feature Set 1

In [14]:
# filter df by Feature Set 1
train_df = wnw_train_df[feature_set_1]

# label encode targetr variable
train_df['Result'] = train_df['Result'].map({'W': 0, 'NW': 1})

# numerical encode
wdl_sc_3 = StandardScaler()
train_df.iloc[:, 1:-3] = wdl_sc_3.fit_transform(train_df.iloc[:, 1:-3])

X_train = train_df.iloc[:, :-1]
y_train = train_df.iloc[:, -1]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_df['Result'] = train_df['Result'].map({'W': 0, 'NW': 1})
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_df.iloc[:, 1:-3] = wdl_sc_3.fit_transform(train_df.iloc[:, 1:-3])
  train_df.iloc[:, 1:-3] = wdl_sc_3.fit_transform(train_df.iloc[:, 1:-3])


In [15]:
# hyperparameter tuning

classifier = LogisticRegression(random_state = 0)
param_grid = {
    'penalty' : ['l1', 'l2'],
    'C' : np.logspace(-4, 4, 20),
    'solver' : ['liblinear']
}
lr_classifier = GridSearchCV(estimator=classifier, param_grid=param_grid, cv=5, verbose=1, scoring='accuracy') 
lr_classifier.fit(X_train, y_train)

# get the best parameters
print(lr_classifier.best_params_)

Fitting 5 folds for each of 40 candidates, totalling 200 fits
{'C': 0.0018329807108324356, 'penalty': 'l2', 'solver': 'liblinear'}


In [16]:
# train model3
model3 = LogisticRegression(C=0.001833, penalty='l2', solver='liblinear', random_state = 0)
model3.fit(X_train, y_train)

### Model 4

- Model: Logistic Regression
- Target Variable: Win/ Non-Win
- Sampling: Balanced 
- Features: Feature Set 2

In [17]:
# filter df by Feature Set 2
train_df = wnw_train_df[feature_set_2]

# label encode target variable
train_df['Result'] = train_df['Result'].map({'W': 0, 'NW': 1})

# numerical encode
wdl_sc_4 = StandardScaler()
train_df.iloc[:, :-2] = wdl_sc_4.fit_transform(train_df.iloc[:, :-2])

X_train = train_df.iloc[:, :-1]
y_train = train_df.iloc[:, -1]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_df['Result'] = train_df['Result'].map({'W': 0, 'NW': 1})
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_df.iloc[:, :-2] = wdl_sc_4.fit_transform(train_df.iloc[:, :-2])
  train_df.iloc[:, :-2] = wdl_sc_4.fit_transform(train_df.iloc[:, :-2])


In [18]:
# hyperparameter tuning

classifier = LogisticRegression(random_state = 0)
param_grid = {
    'penalty' : ['l1', 'l2'],
    'C' : np.logspace(-4, 4, 20),
    'solver' : ['liblinear']
}
lr_classifier = GridSearchCV(estimator=classifier, param_grid=param_grid, cv=5, verbose=1, scoring='accuracy') 
lr_classifier.fit(X_train, y_train)

# get the best parameters
print(lr_classifier.best_params_)

Fitting 5 folds for each of 40 candidates, totalling 200 fits
{'C': 0.08858667904100823, 'penalty': 'l1', 'solver': 'liblinear'}


In [19]:
# train model4
model4 = LogisticRegression(C=0.08859, penalty='l1', solver='liblinear', random_state = 0)
model4.fit(X_train, y_train)

### Model 5
- Model: Random Forest
- Target Variable: Win/ Draw/ Lose 
- Sampling: Balanced 
- Features: Feature Set 1

In [20]:
# filter df by Feature Set 1
train_df = wdl_train_df[feature_set_1]

# label encode target variable
train_df['Result'] = train_df['Result'].map({'D': 0, 'L': 1, 'W': 2})

# numerical encode
wdl_sc_5 = StandardScaler()
train_df.iloc[:, 1:-3] = wdl_sc_5.fit_transform(train_df.iloc[:, 1:-3])

X_train = train_df.iloc[:, :-1]
y_train = train_df.iloc[:, -1]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_df['Result'] = train_df['Result'].map({'D': 0, 'L': 1, 'W': 2})
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_df.iloc[:, 1:-3] = wdl_sc_5.fit_transform(train_df.iloc[:, 1:-3])
  train_df.iloc[:, 1:-3] = wdl_sc_5.fit_transform(train_df.iloc[:, 1:-3])


In [21]:
# hyperparameter tuning

classifier = RandomForestClassifier(random_state = 0)
param_grid = { 
    'n_estimators': [100, 200, 300, 400, 500],
    'max_features': ['sqrt', 'log2'],
    'max_depth' : [2, 3, 4, 5, 6],
    'criterion' :['entropy']    
}
rf_classifier = GridSearchCV(estimator=classifier, param_grid=param_grid, cv=5, verbose=1, scoring='accuracy')
rf_classifier.fit(X_train, y_train)

# get the best parameters
print(rf_classifier.best_params_)

Fitting 5 folds for each of 50 candidates, totalling 250 fits
{'criterion': 'entropy', 'max_depth': 6, 'max_features': 'log2', 'n_estimators': 400}


In [22]:
# train model1
model5= RandomForestClassifier(criterion='entropy', max_depth=6, max_features='log2', n_estimators=400, random_state=0)
model5.fit(X_train, y_train)

### Model 6

- Model: Random Forest
- Target Variable: Win/ Draw/ Lose 
- Sampling: Balanced 
- Features: Feature Set 2

In [23]:
# filter df by Feature Set 2
train_df = wdl_train_df[feature_set_2]

# label encode target variable
train_df['Result'] = train_df['Result'].map({'D': 0, 'L': 1, 'W': 2})

# numerical encode
wdl_sc_6 = StandardScaler()
train_df.iloc[:, :-2] = wdl_sc_6.fit_transform(train_df.iloc[:, :-2])

X_train = train_df.iloc[:, :-1]
y_train = train_df.iloc[:, -1]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_df['Result'] = train_df['Result'].map({'D': 0, 'L': 1, 'W': 2})
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_df.iloc[:, :-2] = wdl_sc_6.fit_transform(train_df.iloc[:, :-2])
  train_df.iloc[:, :-2] = wdl_sc_6.fit_transform(train_df.iloc[:, :-2])


In [24]:
# hyperparameter tuning

classifier = RandomForestClassifier(random_state = 0)
param_grid = { 
    'n_estimators': [100, 200, 300, 400, 500],
    'max_features': ['sqrt', 'log2'],
    'max_depth' : [2, 3, 4, 5, 6],
    'criterion' :['entropy']    
}
rf_classifier = GridSearchCV(estimator=classifier, param_grid=param_grid, cv=5, verbose=1, scoring='accuracy')
rf_classifier.fit(X_train, y_train)

# get the best parameters
print(rf_classifier.best_params_)

Fitting 5 folds for each of 50 candidates, totalling 250 fits
{'criterion': 'entropy', 'max_depth': 6, 'max_features': 'sqrt', 'n_estimators': 500}


In [25]:
# train model6
model6 = RandomForestClassifier(criterion='entropy', max_depth=6, max_features='sqrt', n_estimators=500, random_state=0)
model6.fit(X_train, y_train)

### Model 7

- Model: Random Forest
- Target Variable: Win/ Non-Win
- Sampling: Balanced 
- Features: Feature Set 1

In [26]:
# filter df by Feature Set 1
train_df = wnw_train_df[feature_set_1]

# label encode targetr variable
train_df['Result'] = train_df['Result'].map({'W': 0, 'NW': 1})

# numerical encode
wdl_sc_7 = StandardScaler()
train_df.iloc[:, 1:-3] = wdl_sc_7.fit_transform(train_df.iloc[:, 1:-3])

X_train = train_df.iloc[:, :-1]
y_train = train_df.iloc[:, -1]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_df['Result'] = train_df['Result'].map({'W': 0, 'NW': 1})
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_df.iloc[:, 1:-3] = wdl_sc_7.fit_transform(train_df.iloc[:, 1:-3])
  train_df.iloc[:, 1:-3] = wdl_sc_7.fit_transform(train_df.iloc[:, 1:-3])


In [28]:
# hyperparameter tuning

classifier = RandomForestClassifier(random_state = 0)
param_grid = { 
    'n_estimators': [100, 200, 300, 400, 500],
    'max_features': ['sqrt', 'log2'],
    'max_depth' : [2, 3, 4, 5, 6],
    'criterion' :['entropy']    
}
rf_classifier = GridSearchCV(estimator=classifier, param_grid=param_grid, cv=5, verbose=1, scoring='accuracy')
rf_classifier.fit(X_train, y_train)

# get the best parameters
print(rf_classifier.best_params_)

Fitting 5 folds for each of 50 candidates, totalling 250 fits
{'criterion': 'entropy', 'max_depth': 4, 'max_features': 'log2', 'n_estimators': 500}


In [29]:
# train model7
model7= RandomForestClassifier(criterion='entropy', max_depth=4, max_features='log2', n_estimators=500, random_state=0)
model7.fit(X_train, y_train)

### Model 8

- Model: Random Forest
- Target Variable: Win/ Non-Win
- Sampling: Balanced 
- Features: Feature Set 2

In [30]:
# filter df by Feature Set 2
train_df = wnw_train_df[feature_set_2]

# label encode target variable
train_df['Result'] = train_df['Result'].map({'W': 0, 'NW': 1})

# numerical encode
wdl_sc_8 = StandardScaler()
train_df.iloc[:, :-2] = wdl_sc_8.fit_transform(train_df.iloc[:, :-2])

X_train = train_df.iloc[:, :-1]
y_train = train_df.iloc[:, -1]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_df['Result'] = train_df['Result'].map({'W': 0, 'NW': 1})
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_df.iloc[:, :-2] = wdl_sc_8.fit_transform(train_df.iloc[:, :-2])
  train_df.iloc[:, :-2] = wdl_sc_8.fit_transform(train_df.iloc[:, :-2])


In [31]:
# hyperparameter tuning

classifier = RandomForestClassifier(random_state = 0)
param_grid = { 
    'n_estimators': [100, 200, 300, 400, 500],
    'max_features': ['sqrt', 'log2'],
    'max_depth' : [2, 3, 4, 5, 6],
    'criterion' :['entropy']    
}
rf_classifier = GridSearchCV(estimator=classifier, param_grid=param_grid, cv=5, verbose=1, scoring='accuracy')
rf_classifier.fit(X_train, y_train)

# get the best parameters
print(rf_classifier.best_params_)

Fitting 5 folds for each of 50 candidates, totalling 250 fits
{'criterion': 'entropy', 'max_depth': 6, 'max_features': 'sqrt', 'n_estimators': 200}


In [32]:
# train model8
model8= RandomForestClassifier(criterion='entropy', max_depth=6, max_features='sqrt', n_estimators=200, random_state=0)
model8.fit(X_train, y_train)

# 5. Betting Simulation

In [33]:
betting_df = pd.read_csv('../data/input/input_dataset_2022_2023.csv')
betting_df.head()

Unnamed: 0,Season,Date,HomeTeam,AwayTeam,B365H,B365D,B365A,Referee,Day Of Week,Round,Days,Shots,ShotsOT,Corners,Fouls,YCards,RCards,GoalsScored,GoalsConceded,GoalsDiff,HTGoalsScored,HTGoalsConceded,HTGoalsDiff,Points,WinPercent,WinStreak,UnbPercent,UnbStreak,Def,Mid,Att,Ovr,LastSeasonRank,PromotedMatchup,WinnerOdd,Venue,Result
0,2022-2023,2022-09-04,united,arsenal,2.5,3.5,2.7,P Tierney,Sun,0,-1,-4.6,-2.2,-2.8,2.0,1.8,0.0,-1.6,0.6,-11.0,-0.8,1.2,-10.0,-1.2,-40.0,-2.0,-40.0,-2.0,4,3,5,4.0,1,0,-0.2,Old Trafford,W
1,2022-2023,2022-09-18,brentford,arsenal,4.2,3.8,1.8,D Coote,Sun,0,0,-5.0,-1.0,-1.4,-2.6,-0.2,0.0,0.2,0.0,1.0,0.4,0.6,-1.0,-0.8,-40.0,1.0,0.0,3.0,-3,-5,-3,-3.67,8,0,2.4,Brentford Community Stadium,L
2,2022-2023,2022-10-16,leeds,arsenal,6.0,4.2,1.55,C Kavanagh,Sun,-1,4,-4.8,-3.4,-0.6,5.8,0.8,0.2,-1.6,0.4,-10.0,-0.8,0.2,-5.0,-2.0,-80.0,-3.0,-40.0,-3.0,-3,-4,-5,-4.0,12,0,4.45,Elland Road,L
3,2022-2023,2022-10-23,southampton,arsenal,6.0,4.0,1.57,R Jones,Sun,1,1,-3.6,-2.4,-0.4,0.2,-0.4,0.0,-1.6,0.4,-10.0,-0.8,0.0,-4.0,-1.6,-60.0,-3.0,-40.0,-2.0,-4,-4,-8,-5.34,10,0,4.43,St Mary's Stadium,D
4,2022-2023,2022-11-06,chelsea,arsenal,2.55,3.4,2.7,M Oliver,Sun,0,1,-3.0,-1.6,-1.0,-1.8,0.4,0.0,-1.2,0.2,-7.0,-0.8,0.2,-5.0,-1.0,-40.0,-1.0,-20.0,-6.0,4,6,0,3.33,-2,0,-0.15,Stamford Bridge,L


In [34]:
# remove unseen-data by label-encoder
referee_le_classes = referee_le.classes_.tolist()
referees = betting_df.Referee.unique().tolist()

# get unseen referees
unseen_refs = []
for ref in referees:
    if ref not in referee_le_classes:
        unseen_refs.append(ref)
        
venue_le_classes = venue_le.classes_.tolist()
venues = betting_df.Venue.unique().tolist()

# get unseen venues
unseen_venues = []
for venue in venues:
    if venue not in venue_le_classes:
        unseen_venues.append(venue)
        
# remove unseen referees
print(betting_df.shape)
betting_df = betting_df[~betting_df['Referee'].isin(unseen_refs)]
betting_df = betting_df[~betting_df['Venue'].isin(unseen_venues)]
print(betting_df.shape)

(330, 37)
(300, 37)


In [35]:
# wdl_betting_df
wdl_betting_df = betting_df.copy()
wdl_betting_df['Result'] = wdl_betting_df['Result'].map({'D': 0, 'L': 1, 'W': 2})

print(wdl_betting_df.shape)
wdl_betting_df.head()

(300, 37)


Unnamed: 0,Season,Date,HomeTeam,AwayTeam,B365H,B365D,B365A,Referee,Day Of Week,Round,Days,Shots,ShotsOT,Corners,Fouls,YCards,RCards,GoalsScored,GoalsConceded,GoalsDiff,HTGoalsScored,HTGoalsConceded,HTGoalsDiff,Points,WinPercent,WinStreak,UnbPercent,UnbStreak,Def,Mid,Att,Ovr,LastSeasonRank,PromotedMatchup,WinnerOdd,Venue,Result
0,2022-2023,2022-09-04,united,arsenal,2.5,3.5,2.7,P Tierney,Sun,0,-1,-4.6,-2.2,-2.8,2.0,1.8,0.0,-1.6,0.6,-11.0,-0.8,1.2,-10.0,-1.2,-40.0,-2.0,-40.0,-2.0,4,3,5,4.0,1,0,-0.2,Old Trafford,2
1,2022-2023,2022-09-18,brentford,arsenal,4.2,3.8,1.8,D Coote,Sun,0,0,-5.0,-1.0,-1.4,-2.6,-0.2,0.0,0.2,0.0,1.0,0.4,0.6,-1.0,-0.8,-40.0,1.0,0.0,3.0,-3,-5,-3,-3.67,8,0,2.4,Brentford Community Stadium,1
2,2022-2023,2022-10-16,leeds,arsenal,6.0,4.2,1.55,C Kavanagh,Sun,-1,4,-4.8,-3.4,-0.6,5.8,0.8,0.2,-1.6,0.4,-10.0,-0.8,0.2,-5.0,-2.0,-80.0,-3.0,-40.0,-3.0,-3,-4,-5,-4.0,12,0,4.45,Elland Road,1
3,2022-2023,2022-10-23,southampton,arsenal,6.0,4.0,1.57,R Jones,Sun,1,1,-3.6,-2.4,-0.4,0.2,-0.4,0.0,-1.6,0.4,-10.0,-0.8,0.0,-4.0,-1.6,-60.0,-3.0,-40.0,-2.0,-4,-4,-8,-5.34,10,0,4.43,St Mary's Stadium,0
4,2022-2023,2022-11-06,chelsea,arsenal,2.55,3.4,2.7,M Oliver,Sun,0,1,-3.0,-1.6,-1.0,-1.8,0.4,0.0,-1.2,0.2,-7.0,-0.8,0.2,-5.0,-1.0,-40.0,-1.0,-20.0,-6.0,4,6,0,3.33,-2,0,-0.15,Stamford Bridge,1


In [36]:
# wnw_betting_df
wnw_betting_df = betting_df.copy()
wnw_betting_df['Result'] = wnw_betting_df['Result'].map({'W': 0, 'D': 1, 'L': 1})

print(wnw_betting_df.shape)
wnw_betting_df.head()

(300, 37)


Unnamed: 0,Season,Date,HomeTeam,AwayTeam,B365H,B365D,B365A,Referee,Day Of Week,Round,Days,Shots,ShotsOT,Corners,Fouls,YCards,RCards,GoalsScored,GoalsConceded,GoalsDiff,HTGoalsScored,HTGoalsConceded,HTGoalsDiff,Points,WinPercent,WinStreak,UnbPercent,UnbStreak,Def,Mid,Att,Ovr,LastSeasonRank,PromotedMatchup,WinnerOdd,Venue,Result
0,2022-2023,2022-09-04,united,arsenal,2.5,3.5,2.7,P Tierney,Sun,0,-1,-4.6,-2.2,-2.8,2.0,1.8,0.0,-1.6,0.6,-11.0,-0.8,1.2,-10.0,-1.2,-40.0,-2.0,-40.0,-2.0,4,3,5,4.0,1,0,-0.2,Old Trafford,0
1,2022-2023,2022-09-18,brentford,arsenal,4.2,3.8,1.8,D Coote,Sun,0,0,-5.0,-1.0,-1.4,-2.6,-0.2,0.0,0.2,0.0,1.0,0.4,0.6,-1.0,-0.8,-40.0,1.0,0.0,3.0,-3,-5,-3,-3.67,8,0,2.4,Brentford Community Stadium,1
2,2022-2023,2022-10-16,leeds,arsenal,6.0,4.2,1.55,C Kavanagh,Sun,-1,4,-4.8,-3.4,-0.6,5.8,0.8,0.2,-1.6,0.4,-10.0,-0.8,0.2,-5.0,-2.0,-80.0,-3.0,-40.0,-3.0,-3,-4,-5,-4.0,12,0,4.45,Elland Road,1
3,2022-2023,2022-10-23,southampton,arsenal,6.0,4.0,1.57,R Jones,Sun,1,1,-3.6,-2.4,-0.4,0.2,-0.4,0.0,-1.6,0.4,-10.0,-0.8,0.0,-4.0,-1.6,-60.0,-3.0,-40.0,-2.0,-4,-4,-8,-5.34,10,0,4.43,St Mary's Stadium,1
4,2022-2023,2022-11-06,chelsea,arsenal,2.55,3.4,2.7,M Oliver,Sun,0,1,-3.0,-1.6,-1.0,-1.8,0.4,0.0,-1.2,0.2,-7.0,-0.8,0.2,-5.0,-1.0,-40.0,-1.0,-20.0,-6.0,4,6,0,3.33,-2,0,-0.15,Stamford Bridge,1


### Betting Simulation with Model 1

In [37]:
betting_sim_df_1 = wdl_betting_df[['Referee', 'Shots', 'Corners', 'Points', 'UnbStreak', 'LastSeasonRank', 'PromotedMatchup', 'Venue', 'B365H', 'B365D', 'B365A', 'Result']]

# categorical encode
betting_sim_df_1['Referee'] = referee_le.transform(betting_sim_df_1['Referee'])
betting_sim_df_1['Venue'] = venue_le.transform(betting_sim_df_1['Venue'])

# numerical scaling
betting_sim_df_1.iloc[:, 1:6] = wdl_sc_1.transform(betting_sim_df_1.iloc[:, 1:6])

betting_sim_df_1.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  betting_sim_df_1['Referee'] = referee_le.transform(betting_sim_df_1['Referee'])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  betting_sim_df_1['Venue'] = venue_le.transform(betting_sim_df_1['Venue'])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  betting_sim_df_1.iloc[:, 1:6] = wdl_sc_1.transform(

Unnamed: 0,Referee,Shots,Corners,Points,UnbStreak,LastSeasonRank,PromotedMatchup,Venue,B365H,B365D,B365A,Result
0,28,-0.947889,-1.193764,-1.114497,-0.291173,0.035884,0,22,2.5,3.5,2.7,2
1,7,-1.038997,-0.542045,-0.70091,0.678707,0.789453,0,4,4.2,3.8,1.8,1
2,5,-0.993443,-0.169634,-1.941669,-0.485149,1.220063,0,10,6.0,4.2,1.55,1
3,30,-0.720119,-0.076531,-1.528083,-0.291173,1.004758,0,25,6.0,4.0,1.57,0
4,23,-0.583456,-0.355839,-0.907704,-1.067078,-0.287074,0,28,2.55,3.4,2.7,1


In [38]:
profit = []
win_odds = betting_sim_df_1['B365H'].tolist()
draw_odds = betting_sim_df_1['B365D'].tolist()
lose_odds = betting_sim_df_1['B365A'].tolist()
true_result = betting_sim_df_1['Result'].tolist()
model1_prediction = model1.predict(betting_sim_df_1[feature_set_1].iloc[:, :-1]).tolist()

for i in range(len(model1_prediction)):
    pred = model1_prediction[i]
    result = true_result[i]
    
#     print(f'{pred}, {result}')
    
    if pred == 0: # draw
        if result == 0:
            profit.append(draw_odds[i] - 1)
        else:
            profit.append(-1)
    elif pred == 1: # lose
        if result == 1:
            profit.append(lose_odds[i] - 1)
        else:
            profit.append(-1)
    elif pred == 2: # win
        if result == 2:
            profit.append(win_odds[i] - 1)
        else:
            profit.append(-1)
    else:
        print('You should not be seeing this!')

In [39]:
total = 100
for i in profit:
    total += i
    
total

89.67000000000004

### Betting Simulation with Model 2

In [40]:
betting_sim_df_2 = wdl_betting_df[['Shots', 'Corners', 'Points', 'UnbStreak', 'LastSeasonRank', 'Venue', 'B365H', 'B365D', 'B365A', 'Result']]

# categorical encode
betting_sim_df_2['Venue'] = venue_le.transform(betting_sim_df_2['Venue'])

# numerical scaling
betting_sim_df_2.iloc[:, :5] = wdl_sc_2.transform(betting_sim_df_2.iloc[:, :5])

betting_sim_df_2.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  betting_sim_df_2['Venue'] = venue_le.transform(betting_sim_df_2['Venue'])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  betting_sim_df_2.iloc[:, :5] = wdl_sc_2.transform(betting_sim_df_2.iloc[:, :5])


Unnamed: 0,Shots,Corners,Points,UnbStreak,LastSeasonRank,Venue,B365H,B365D,B365A,Result
0,-0.947889,-1.193764,-1.114497,-0.291173,0.035884,22,2.5,3.5,2.7,2
1,-1.038997,-0.542045,-0.70091,0.678707,0.789453,4,4.2,3.8,1.8,1
2,-0.993443,-0.169634,-1.941669,-0.485149,1.220063,10,6.0,4.2,1.55,1
3,-0.720119,-0.076531,-1.528083,-0.291173,1.004758,25,6.0,4.0,1.57,0
4,-0.583456,-0.355839,-0.907704,-1.067078,-0.287074,28,2.55,3.4,2.7,1


In [41]:
profit = []
win_odds = betting_sim_df_2['B365H'].tolist()
draw_odds = betting_sim_df_2['B365D'].tolist()
lose_odds = betting_sim_df_2['B365A'].tolist()
true_result = betting_sim_df_2['Result'].tolist()
model2_prediction = model2.predict(betting_sim_df_2[feature_set_2].iloc[:, :-1]).tolist()

for i in range(len(model2_prediction)):
    pred = model2_prediction[i]
    result = true_result[i]
    
#     print(f'{pred}, {result}')
    
    if pred == 0: # draw
        if result == 0:
            profit.append(draw_odds[i] - 1)
        else:
            profit.append(-1)
    elif pred == 1: # lose
        if result == 1:
            profit.append(lose_odds[i] - 1)
        else:
            profit.append(-1)
    elif pred == 2: # win
        if result == 2:
            profit.append(win_odds[i] - 1)
        else:
            profit.append(-1)
    else:
        print('You should not be seeing this!')

In [42]:
total = 100
for i in profit:
    total += i
    
total

89.99000000000004

### Betting Simulation with Model 3

In [43]:
betting_sim_df_3 = wnw_betting_df[['Referee', 'Shots', 'Corners', 'Points', 'UnbStreak', 'LastSeasonRank', 'PromotedMatchup', 'Venue', 'B365H', 'B365D', 'B365A', 'Result']]

# categorical encode
betting_sim_df_3['Referee'] = referee_le.transform(betting_sim_df_3['Referee'])
betting_sim_df_3['Venue'] = venue_le.transform(betting_sim_df_3['Venue'])

# numerical scaling
betting_sim_df_3.iloc[:, 1:6] = wdl_sc_3.transform(betting_sim_df_3.iloc[:, 1:6])

betting_sim_df_3.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  betting_sim_df_3['Referee'] = referee_le.transform(betting_sim_df_3['Referee'])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  betting_sim_df_3['Venue'] = venue_le.transform(betting_sim_df_3['Venue'])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  betting_sim_df_3.iloc[:, 1:6] = wdl_sc_3.transform(

Unnamed: 0,Referee,Shots,Corners,Points,UnbStreak,LastSeasonRank,PromotedMatchup,Venue,B365H,B365D,B365A,Result
0,28,-0.938333,-1.220295,-1.152586,-0.302797,0.074124,0,22,2.5,3.5,2.7,0
1,7,-1.027169,-0.577901,-0.745441,0.617054,0.81687,0,4,4.2,3.8,1.8,1
2,5,-0.982751,-0.210819,-1.966877,-0.486767,1.241296,0,10,6.0,4.2,1.55,1
3,30,-0.716242,-0.119048,-1.559732,-0.302797,1.029083,0,25,6.0,4.0,1.57,1
4,23,-0.582988,-0.39436,-0.949013,-1.038678,-0.244196,0,28,2.55,3.4,2.7,1


In [44]:
profit = []
win_odds = betting_sim_df_3['B365H'].tolist()
true_result = betting_sim_df_3['Result'].tolist()
model3_prediction = model3.predict(betting_sim_df_3[feature_set_1].iloc[:, :-1]).tolist()

for i in range(len(model3_prediction)):
    pred = model3_prediction[i]
    result = true_result[i]
    
#     print(f'{pred}, {result}')
    
    if pred == 0: # win
        if result == 0:
            profit.append(win_odds[i] - 1)
        else:
            profit.append(-1)
    elif pred == 1: # non-win
        profit.append(0)
    else:
        print('You should not be seeing this!')

In [45]:
total = 100
for i in profit:
    total += i
    
total

107.57000000000004

### Betting Simulation with Model 4

In [46]:
betting_sim_df_4 = wnw_betting_df[['Shots', 'Corners', 'Points', 'UnbStreak', 'LastSeasonRank', 'Venue', 'B365H', 'B365D', 'B365A', 'Result']]

# categorical encoding
betting_sim_df_4['Venue'] = venue_le.transform(betting_sim_df_4['Venue'])

# numerical scaling
betting_sim_df_4.iloc[:, :5] = wdl_sc_4.transform(betting_sim_df_4.iloc[:, :5])

betting_sim_df_4.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  betting_sim_df_4['Venue'] = venue_le.transform(betting_sim_df_4['Venue'])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  betting_sim_df_4.iloc[:, :5] = wdl_sc_4.transform(betting_sim_df_4.iloc[:, :5])


Unnamed: 0,Shots,Corners,Points,UnbStreak,LastSeasonRank,Venue,B365H,B365D,B365A,Result
0,-0.938333,-1.220295,-1.152586,-0.302797,0.074124,22,2.5,3.5,2.7,0
1,-1.027169,-0.577901,-0.745441,0.617054,0.81687,4,4.2,3.8,1.8,1
2,-0.982751,-0.210819,-1.966877,-0.486767,1.241296,10,6.0,4.2,1.55,1
3,-0.716242,-0.119048,-1.559732,-0.302797,1.029083,25,6.0,4.0,1.57,1
4,-0.582988,-0.39436,-0.949013,-1.038678,-0.244196,28,2.55,3.4,2.7,1


In [47]:
profit = []
win_odds = betting_sim_df_4['B365H'].tolist()
true_result = betting_sim_df_4['Result'].tolist()
model4_prediction = model4.predict(betting_sim_df_4[feature_set_2].iloc[:, :-1]).tolist()

for i in range(len(model4_prediction)):
    pred = model4_prediction[i]
    result = true_result[i]
  
    if pred == 0: # win
        if result == 0:
            profit.append(win_odds[i] - 1)
        else:
            profit.append(-1)
    elif pred == 1: # non-win
        profit.append(0)
    else:
        print('You should not be seeing this!')

In [48]:
total = 100
for i in profit:
    total += i
    
total

105.57000000000005

### Betting Simulation with Model 5

In [49]:
betting_sim_df_5 = wdl_betting_df[['Referee', 'Shots', 'Corners', 'Points', 'UnbStreak', 'LastSeasonRank', 'PromotedMatchup', 'Venue', 'B365H', 'B365D', 'B365A', 'Result']]

# categorical encode
betting_sim_df_5['Referee'] = referee_le.transform(betting_sim_df_5['Referee'])
betting_sim_df_5['Venue'] = venue_le.transform(betting_sim_df_5['Venue'])

# numerical scaling
betting_sim_df_5.iloc[:, 1:6] = wdl_sc_5.transform(betting_sim_df_5.iloc[:, 1:6])

betting_sim_df_5.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  betting_sim_df_5['Referee'] = referee_le.transform(betting_sim_df_5['Referee'])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  betting_sim_df_5['Venue'] = venue_le.transform(betting_sim_df_5['Venue'])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  betting_sim_df_5.iloc[:, 1:6] = wdl_sc_5.transform(

Unnamed: 0,Referee,Shots,Corners,Points,UnbStreak,LastSeasonRank,PromotedMatchup,Venue,B365H,B365D,B365A,Result
0,28,-0.947889,-1.193764,-1.114497,-0.291173,0.035884,0,22,2.5,3.5,2.7,2
1,7,-1.038997,-0.542045,-0.70091,0.678707,0.789453,0,4,4.2,3.8,1.8,1
2,5,-0.993443,-0.169634,-1.941669,-0.485149,1.220063,0,10,6.0,4.2,1.55,1
3,30,-0.720119,-0.076531,-1.528083,-0.291173,1.004758,0,25,6.0,4.0,1.57,0
4,23,-0.583456,-0.355839,-0.907704,-1.067078,-0.287074,0,28,2.55,3.4,2.7,1


In [50]:
profit = []
win_odds = betting_sim_df_5['B365H'].tolist()
draw_odds = betting_sim_df_5['B365D'].tolist()
lose_odds = betting_sim_df_5['B365A'].tolist()
true_result = betting_sim_df_5['Result'].tolist()
model5_prediction = model5.predict(betting_sim_df_5[feature_set_1].iloc[:, :-1]).tolist()

for i in range(len(model5_prediction)):
    pred = model5_prediction[i]
    result = true_result[i]
    
#     print(f'{pred}, {result}')
    
    if pred == 0: # draw
        if result == 0:
            profit.append(draw_odds[i] - 1)
        else:
            profit.append(-1)
    elif pred == 1: # lose
        if result == 1:
            profit.append(lose_odds[i] - 1)
        else:
            profit.append(-1)
    elif pred == 2: # win
        if result == 2:
            profit.append(win_odds[i] - 1)
        else:
            profit.append(-1)
    else:
        print('You should not be seeing this!')

In [51]:
total = 100
for i in profit:
    total += i
    
total

49.04

### Betting Simulation with Model 6

In [52]:
betting_sim_df_6 = wdl_betting_df[['Shots', 'Corners', 'Points', 'UnbStreak', 'LastSeasonRank', 'Venue', 'B365H', 'B365D', 'B365A', 'Result']]

# categorical encode
betting_sim_df_6['Venue'] = venue_le.transform(betting_sim_df_6['Venue'])

# numerical scaling
betting_sim_df_6.iloc[:, :5] = wdl_sc_6.transform(betting_sim_df_6.iloc[:, :5])

betting_sim_df_6.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  betting_sim_df_6['Venue'] = venue_le.transform(betting_sim_df_6['Venue'])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  betting_sim_df_6.iloc[:, :5] = wdl_sc_6.transform(betting_sim_df_6.iloc[:, :5])


Unnamed: 0,Shots,Corners,Points,UnbStreak,LastSeasonRank,Venue,B365H,B365D,B365A,Result
0,-0.947889,-1.193764,-1.114497,-0.291173,0.035884,22,2.5,3.5,2.7,2
1,-1.038997,-0.542045,-0.70091,0.678707,0.789453,4,4.2,3.8,1.8,1
2,-0.993443,-0.169634,-1.941669,-0.485149,1.220063,10,6.0,4.2,1.55,1
3,-0.720119,-0.076531,-1.528083,-0.291173,1.004758,25,6.0,4.0,1.57,0
4,-0.583456,-0.355839,-0.907704,-1.067078,-0.287074,28,2.55,3.4,2.7,1


In [53]:
profit = []
win_odds = betting_sim_df_6['B365H'].tolist()
draw_odds = betting_sim_df_6['B365D'].tolist()
lose_odds = betting_sim_df_6['B365A'].tolist()
true_result = betting_sim_df_6['Result'].tolist()
model6_prediction = model6.predict(betting_sim_df_6[feature_set_2].iloc[:, :-1]).tolist()

for i in range(len(model6_prediction)):
    pred = model6_prediction[i]
    result = true_result[i]
    
#     print(f'{pred}, {result}')
    
    if pred == 0: # draw
        if result == 0:
            profit.append(draw_odds[i] - 1)
        else:
            profit.append(-1)
    elif pred == 1: # lose
        if result == 1:
            profit.append(lose_odds[i] - 1)
        else:
            profit.append(-1)
    elif pred == 2: # win
        if result == 2:
            profit.append(win_odds[i] - 1)
        else:
            profit.append(-1)
    else:
        print('You should not be seeing this!')

In [54]:
total = 100
for i in profit:
    total += i
    
total

42.75999999999998

### Betting Simulation with Model 7

In [55]:
betting_sim_df_7 = wnw_betting_df[['Referee', 'Shots', 'Corners', 'Points', 'UnbStreak', 'LastSeasonRank', 'PromotedMatchup', 'Venue', 'B365H', 'B365D', 'B365A', 'Result']]

# categorical encode
betting_sim_df_7['Referee'] = referee_le.transform(betting_sim_df_7['Referee'])
betting_sim_df_7['Venue'] = venue_le.transform(betting_sim_df_7['Venue'])

# numerical scaling
betting_sim_df_7.iloc[:, 1:6] = wdl_sc_7.transform(betting_sim_df_7.iloc[:, 1:6])

betting_sim_df_7.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  betting_sim_df_7['Referee'] = referee_le.transform(betting_sim_df_7['Referee'])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  betting_sim_df_7['Venue'] = venue_le.transform(betting_sim_df_7['Venue'])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  betting_sim_df_7.iloc[:, 1:6] = wdl_sc_7.transform(

Unnamed: 0,Referee,Shots,Corners,Points,UnbStreak,LastSeasonRank,PromotedMatchup,Venue,B365H,B365D,B365A,Result
0,28,-0.938333,-1.220295,-1.152586,-0.302797,0.074124,0,22,2.5,3.5,2.7,0
1,7,-1.027169,-0.577901,-0.745441,0.617054,0.81687,0,4,4.2,3.8,1.8,1
2,5,-0.982751,-0.210819,-1.966877,-0.486767,1.241296,0,10,6.0,4.2,1.55,1
3,30,-0.716242,-0.119048,-1.559732,-0.302797,1.029083,0,25,6.0,4.0,1.57,1
4,23,-0.582988,-0.39436,-0.949013,-1.038678,-0.244196,0,28,2.55,3.4,2.7,1


In [56]:
profit = []
win_odds = betting_sim_df_7['B365H'].tolist()
true_result = betting_sim_df_7['Result'].tolist()
model7_prediction = model7.predict(betting_sim_df_7[feature_set_1].iloc[:, :-1]).tolist()

for i in range(len(model7_prediction)):
    pred = model7_prediction[i]
    result = true_result[i]
    
#     print(f'{pred}, {result}')
    
    if pred == 0: # win
        if result == 0:
            profit.append(win_odds[i] - 1)
        else:
            profit.append(-1)
    elif pred == 1: # non-win
        profit.append(0)
    else:
        print('You should not be seeing this!')

In [57]:
total = 100
for i in profit:
    total += i
    
total

103.92000000000003

### Betting Simulation with Model 8

In [58]:
betting_sim_df_8 = wnw_betting_df[['Shots', 'Corners', 'Points', 'UnbStreak', 'LastSeasonRank', 'Venue', 'B365H', 'B365D', 'B365A', 'Result']]

# categorical encoding
betting_sim_df_8['Venue'] = venue_le.transform(betting_sim_df_8['Venue'])

# numerical scaling
betting_sim_df_8.iloc[:, :5] = wdl_sc_8.transform(betting_sim_df_8.iloc[:, :5])

betting_sim_df_8.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  betting_sim_df_8['Venue'] = venue_le.transform(betting_sim_df_8['Venue'])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  betting_sim_df_8.iloc[:, :5] = wdl_sc_8.transform(betting_sim_df_8.iloc[:, :5])


Unnamed: 0,Shots,Corners,Points,UnbStreak,LastSeasonRank,Venue,B365H,B365D,B365A,Result
0,-0.938333,-1.220295,-1.152586,-0.302797,0.074124,22,2.5,3.5,2.7,0
1,-1.027169,-0.577901,-0.745441,0.617054,0.81687,4,4.2,3.8,1.8,1
2,-0.982751,-0.210819,-1.966877,-0.486767,1.241296,10,6.0,4.2,1.55,1
3,-0.716242,-0.119048,-1.559732,-0.302797,1.029083,25,6.0,4.0,1.57,1
4,-0.582988,-0.39436,-0.949013,-1.038678,-0.244196,28,2.55,3.4,2.7,1


In [59]:
profit = []
win_odds = betting_sim_df_8['B365H'].tolist()
true_result = betting_sim_df_8['Result'].tolist()
model8_prediction = model8.predict(betting_sim_df_8[feature_set_2].iloc[:, :-1]).tolist()

for i in range(len(model8_prediction)):
    pred = model8_prediction[i]
    result = true_result[i]
  
    if pred == 0: # win
        if result == 0:
            profit.append(win_odds[i] - 1)
        else:
            profit.append(-1)
    elif pred == 1: # non-win
        profit.append(0)
    else:
        print('You should not be seeing this!')

In [60]:
total = 100
for i in profit:
    total += i
    
total

101.02000000000002