In [1094]:
import pandas as pd
import numpy as np
%matplotlib inline
import matplotlib.pyplot as plt
from scipy.linalg import lstsq
# Where the pre-processed match table lives, relative to this notebook.
TABLE_FOLDER = '../../src/data'
RAW_FILE = 'transformed_dataset_v1.csv'

# Load the table with `date` parsed as datetime, then discard the
# 'Unnamed: 0' column (presumably an index saved with the CSV -- confirm).
raw_path = f'{TABLE_FOLDER}/{RAW_FILE}'
df = pd.read_csv(raw_path, parse_dates=['date']).drop(columns='Unnamed: 0')

df.dtypes

date                    datetime64[ns]
home_team                       object
away_team                       object
last_h_goals                   float64
last_a_goals                   float64
last_wh_goals                  float64
last_wa_goals                  float64
odd_1                          float64
odd_N                          float64
odd_2                          float64
home_trend_slope               float64
home_trend_intercept           float64
away_trend_slope               float64
away_trend_intercept           float64
target                         float64
dtype: object

In [1096]:
# The four most recent distinct match dates (np.unique returns them sorted).
np.unique(df['date'].to_numpy())[-4:]

array(['2022-05-16T00:00:00.000000000', '2022-05-17T00:00:00.000000000',
       '2022-05-19T00:00:00.000000000', '2022-05-22T00:00:00.000000000'],
      dtype='datetime64[ns]')

In [1097]:
# Show every row whose date equals the most recent date in the table.
last_date = df['date'].max()
df.loc[df['date'] == last_date]

Unnamed: 0,date,home_team,away_team,last_h_goals,last_a_goals,last_wh_goals,last_wa_goals,odd_1,odd_N,odd_2,home_trend_slope,home_trend_intercept,away_trend_slope,away_trend_intercept,target
938,2022-05-22,Liverpool,Wolves,3.0,4.0,8.0,12.0,1.13,9.19,19.26,4.4,3.4,0.2,0.4,0.0
1555,2022-05-22,Chelsea,Watford,5.0,3.0,7.0,13.0,1.19,7.41,15.39,2.5,0.5,-0.4,-0.8,0.0
2205,2022-05-22,Leicester,Southampton,7.0,4.0,11.0,11.0,1.7,4.26,4.54,1.7,5.5,0.6,-3.4,0.0
2648,2022-05-22,Manchester City,Aston Villa,4.0,8.0,21.0,5.0,1.17,8.21,15.57,5.0,-0.2,2.3,-2.9,0.0
4273,2022-05-22,Crystal Palace,Manchester Utd,5.0,5.0,6.0,12.0,2.81,3.47,2.5,4.0,-4.0,0.9,0.3,0.0
4873,2022-05-22,Brighton,West Ham,6.0,8.0,10.0,6.0,2.66,3.46,2.65,3.0,2.0,1.0,2.6,0.0
6083,2022-05-22,Brentford,Leeds,6.0,2.0,8.0,10.0,2.32,3.79,2.88,2.3,4.3,-0.2,1.986027e-16,2.0
6185,2022-05-22,Norwich,Tottenham,13.0,8.0,1.0,2.0,9.94,5.55,1.31,-1.0,5.4,3.5,-0.7,2.0


In [1099]:
dataset = df.copy()

# Chronological split: rows dated on or before the cutoff are used for
# modelling, rows dated strictly after it are held out for prediction.
# The two index lists must not overlap, so only the training side includes
# the cutoff day itself.
last_date = '2022-05-01' #dataset.date.max()
training_index = dataset.query('date <= @last_date').index.tolist()
unseen_index = dataset.query('date > @last_date').index.tolist()

# The hold-out is defined as "everything not in training" so that no row is
# lost from either side of the split.
data = dataset.loc[training_index, :].reset_index(drop=True)
data_unseen = dataset.drop(index=training_index).reset_index(drop=True)

print('Data for Modeling: ' + str(data.shape))
print('Unseen Data For Predictions: ' + str(data_unseen.shape))

Data for Modeling: (6513, 15)
Unseen Data For Predictions: (30, 15)


In [1100]:
# Regression target: the bookmaker odd of the outcome that actually happened.
# target == 0 -> odd_1, target == 1 -> odd_N, anything else -> odd_2.
# Vectorised with np.select instead of assigning row by row.
data['odds'] = np.select(
    [data['target'] == 0, data['target'] == 1],
    [data['odd_1'], data['odd_N']],
    default=data['odd_2'],
)

# Drop all rows whose realised odd is above the cap.
MAX_ODDS = 5.9
del_row = data.query('odds > @MAX_ODDS').index.tolist()
data.drop(index=del_row, inplace=True)
data.shape

(6296, 16)

In [1101]:
# Columns that are not model inputs: the three bookmaker odds, the outcome,
# and the fixture identifiers. What remains is the features plus `odds`.
r_cols = ['odd_1', 'odd_N', 'odd_2', 'target', 'date', 'home_team', 'away_team']
df = data.drop(columns=r_cols).copy()

In [1102]:
# EDA: summary statistics (count, mean, std, quartiles, min/max) for every
# remaining column, i.e. the model features and the `odds` target.
df.describe()

Unnamed: 0,last_h_goals,last_a_goals,last_wh_goals,last_wa_goals,home_trend_slope,home_trend_intercept,away_trend_slope,away_trend_intercept,odds
count,6296.0,6296.0,6296.0,6296.0,6296.0,6296.0,6296.0,6296.0,6296.0
mean,6.77986,6.768742,6.534307,6.586563,2.015867,0.469076,2.043949,-0.474603,2.562424
std,3.01942,3.194149,3.121694,2.978046,1.495391,3.392473,1.483806,3.038341,1.025843
min,0.0,0.0,0.0,0.0,-1.6,-9.2,-1.6,-10.8,1.04
25%,5.0,4.0,4.0,4.0,0.9,-2.1,1.0,-2.6,1.68
50%,7.0,6.0,6.0,6.0,1.9,0.3,2.0,-0.5,2.38
75%,9.0,9.0,8.0,8.0,3.1,2.9,3.1,1.7,3.29
max,20.0,24.0,29.0,20.0,7.0,10.0,6.2,9.9,5.89


In [1103]:
# Split the frame into the feature matrix and the regression target,
# both as plain NumPy arrays.
X = df.drop(columns='odds').to_numpy()
y = df['odds'].to_numpy()

In [1104]:
from sklearn.decomposition import PCA

# Reduce the feature matrix to a single principal component.
# The ARPACK solver is randomly initialised, so the seed is pinned to make
# the projection reproducible across kernel restarts.
pca = PCA(n_components=1, svd_solver='arpack', random_state=0)
X_pca = pca.fit_transform(X)

In [1105]:
import numpy as np

# Least-squares straight line through (principal component, odds).
# np.polyfit returns the highest power first: a is the slope, b the intercept.
coefs = np.polyfit(X_pca.ravel(), y, 1)
a, b = coefs[0], coefs[1]
print('Coefficients:', coefs)

Coefficients: [-0.01075727  2.56242376]


In [1107]:
# Preview the held-out games; show only the first rows instead of dumping
# the whole frame into the notebook output.
data_unseen.head(10)

Unnamed: 0,date,home_team,away_team,last_h_goals,last_a_goals,last_wh_goals,last_wa_goals,odd_1,odd_N,odd_2,home_trend_slope,home_trend_intercept,away_trend_slope,away_trend_intercept,target
0,2022-05-07,Brighton,Manchester Utd,6.0,8.0,8.0,10.0,3.03,3.34,2.44,3.7,0.9,0.8,2.0,0.0
1,2022-05-15,Watford,Leicester,10.0,6.0,3.0,6.0,5.12,3.95,1.69,-1.0,2.4,0.7,0.5,2.0
2,2022-05-02,Manchester Utd,Brentford,11.0,9.0,5.0,4.0,1.76,3.95,4.55,0.9,-0.5,4.4,-2.0,0.0
3,2022-05-22,Liverpool,Wolves,3.0,4.0,8.0,12.0,1.13,9.19,19.26,4.4,3.4,0.2,0.4,0.0
4,2022-05-08,Arsenal,Leeds,7.0,7.0,10.0,7.0,1.52,4.51,6.18,2.3,6.3,4.6,-6.6,0.0
5,2022-05-15,Aston Villa,Crystal Palace,7.0,4.0,6.0,4.0,1.84,3.56,4.54,3.1,-2.7,1.7,4.1,1.0
6,2022-05-22,Chelsea,Watford,5.0,3.0,7.0,13.0,1.19,7.41,15.39,2.5,0.5,-0.4,-0.8,0.0
7,2022-05-22,Leicester,Southampton,7.0,4.0,11.0,11.0,1.7,4.26,4.54,1.7,5.5,0.6,-3.4,0.0
8,2022-05-07,Chelsea,Wolves,6.0,4.0,10.0,9.0,1.37,4.89,9.21,2.5,-1.9,0.5,-2.7,1.0
9,2022-05-22,Manchester City,Aston Villa,4.0,8.0,21.0,5.0,1.17,8.21,15.57,5.0,-0.2,2.3,-2.9,0.0


In [1108]:
# Held-out feature matrix: drop the same non-feature columns as for training
# (r_cols) and convert to a plain array, because `pca` was fitted on an array
# and would otherwise warn about unexpected feature names.
X_u = data_unseen.drop(columns=r_cols).to_numpy()

In [1109]:
# Project the held-out features with the PCA already fitted on the training
# matrix (transform only -- no refit on the held-out games).
X_u_pca = pca.transform(X_u)



In [1110]:
import numpy as np

SCORE_COLUMNS = ['date','home_team','away_team','predict','odds','winner','you_play','diff','error','correct','possible-return','investment']


def score_game(bookmaker_odds, winner, predicted_odd):
    """Choose which outcome to play for one game and size the stake.

    Parameters
    ----------
    bookmaker_odds : list of float
        The game's odds in column order ``odd_1, odd_N, odd_2``; position
        0/1/2 lines up with the ``target`` values 0/1/2.
    winner : int
        Position of the outcome that actually happened. Used only for the
        evaluation columns, never for the betting decision.
    predicted_odd : float
        Odd predicted for this game by the fitted line.

    Returns
    -------
    dict
        Values for the 'winner', 'you_play', 'diff', 'error', 'correct',
        'possible-return' and 'investment' columns.
    """
    # Play the outcome whose bookmaker odd is closest to the prediction.
    gaps = np.abs(np.asarray(bookmaker_odds) - predicted_odd)
    pick = int(np.argmin(gaps))

    # Size the stake from the *picked* outcome only. The real winner is not
    # known when the bet is placed, so it must not influence the stake.
    pick_ratio = bookmaker_odds[pick] / predicted_odd
    investment = 3 if 0.95 < pick_ratio < 1.05 else 1

    return {
        'winner': winner,
        'you_play': pick,
        # Diagnostic: how far the prediction was from the odd that paid out.
        'diff': bookmaker_odds[winner] / predicted_odd,
        'error': bookmaker_odds[pick] - predicted_odd,
        'correct': winner == pick,
        # Payout of the winning outcome at the chosen stake; it is only
        # collected when `correct` is True (then pick == winner).
        'possible-return': bookmaker_odds[winner] * investment,
        'investment': investment,
    }


records = []
for key in range(data_unseen.shape[0]):
    *bookmaker_odds, target = data_unseen.loc[key, ['odd_1', 'odd_N', 'odd_2', 'target']].tolist()
    # Linear model on the single principal component: odd = a * pc + b.
    predicted_odd = float(a * X_u_pca[key, 0] + b)
    records.append({
        'date': data_unseen.loc[key, 'date'],
        'home_team': data_unseen.loc[key, 'home_team'],
        'away_team': data_unseen.loc[key, 'away_team'],
        'predict': np.round(predicted_odd, 2),
        'odds': bookmaker_odds,
        **score_game(bookmaker_odds, int(target), predicted_odd),
    })

# Passing the column names to the constructor keeps the frame well-formed
# (right columns, zero rows) even when there are no held-out games.
scores = pd.DataFrame(records, columns=SCORE_COLUMNS)

scores

Unnamed: 0,date,home_team,away_team,predict,odds,winner,you_play,diff,error,correct,possible-return,investment
0,2022-05-07,Brighton,Manchester Utd,2.57,"[3.03, 3.34, 2.44]",0,2,1.17712,-0.134079,False,3.03,1
1,2022-05-15,Watford,Leicester,2.59,"[5.12, 3.95, 1.69]",2,2,0.652008,-0.901993,True,1.69,1
2,2022-05-02,Manchester Utd,Brentford,2.54,"[1.76, 3.95, 4.55]",0,0,0.693056,-0.779476,True,1.76,1
3,2022-05-22,Liverpool,Wolves,2.6,"[1.13, 9.19, 19.26]",0,0,0.434599,-1.470096,True,1.13,1
4,2022-05-08,Arsenal,Leeds,2.53,"[1.52, 4.51, 6.18]",0,0,0.601678,-1.00627,True,1.52,1
5,2022-05-15,Aston Villa,Crystal Palace,2.58,"[1.84, 3.56, 4.54]",1,0,1.379329,-0.740965,False,3.56,1
6,2022-05-22,Chelsea,Watford,2.63,"[1.19, 7.41, 15.39]",0,0,0.453106,-1.436319,True,1.19,1
7,2022-05-22,Leicester,Southampton,2.59,"[1.7, 4.26, 4.54]",0,0,0.656571,-0.889209,True,1.7,1
8,2022-05-07,Chelsea,Wolves,2.59,"[1.37, 4.89, 9.21]",1,0,1.890934,-1.216024,False,4.89,1
9,2022-05-22,Manchester City,Aston Villa,2.49,"[1.17, 8.21, 15.57]",0,0,0.470493,-1.316753,True,1.17,1


In [1111]:
# Money collected comes only from the bets that were correct; money spent is
# the stake placed on every game, won or lost.
winning_payout = scores.loc[scores['correct'].eq(True), 'possible-return'].sum()
invest_cash = scores['investment'].sum()

total_returns = np.round(winning_payout, 2)
returns_cash = np.round(winning_payout - invest_cash, 2)
nb_games = len(scores)

print(f'Betting on {nb_games} games a total of {invest_cash}$ will return {total_returns}$ for net: {returns_cash}$')

Betting on 30 games a total of 34$ will return 35.66$ for net: 1.66$
