# Imports

In [54]:
import pandas as pd
import os
from os import listdir
from os.path import isfile, join
import matplotlib.pyplot as plt
from scipy.stats import mode
from sklearn.preprocessing import OneHotEncoder
from sklearn.linear_model import LogisticRegression
plt.style.use('dark_background')

# Loading the Dataframe

In [2]:
pwd

'/home/pedro/code/rafabertolace/OnThePitch/notebooks'

## Merging the Seasons csv files (2019-2020 untill 2021-2022)

In [3]:
print(os.listdir('../OnThePitch/data'))

['Turkey_2020_2021.csv', '.keep', 'Turkey_2021_2022.csv', 'T1.csv:Zone.Identifier', 'Turkey_2019_2020.csv']


In [4]:
files = [file for file in os.listdir('../OnThePitch/data') if file.endswith('.csv')]
data = pd.DataFrame()

for file in files:
    df = pd.read_csv('../OnThePitch/data/' + file)
    data = pd.concat([data, df])

In [5]:
data.head()

Unnamed: 0,Div,Date,Time,HomeTeam,AwayTeam,FTHG,FTAG,FTR,HTHG,HTAG,...,AvgC<2.5,AHCh,B365CAHH,B365CAHA,PCAHH,PCAHA,MaxCAHH,MaxCAHA,AvgCAHH,AvgCAHA
0,T1,11/09/2020,18:00,Rizespor,Fenerbahce,1,2,A,0,0,...,2.11,0.5,1.95,1.9,1.94,1.94,1.99,1.99,1.93,1.9
1,T1,12/09/2020,15:00,Karagumruk,Yeni Malatyaspor,3,0,H,1,0,...,1.93,0.0,2.02,1.83,2.01,1.89,2.08,1.9,2.01,1.84
2,T1,12/09/2020,15:00,Sivasspor,Alanyaspor,0,2,A,0,1,...,2.13,-0.25,1.85,2.0,1.85,2.06,1.93,2.06,1.84,2.01
3,T1,12/09/2020,18:00,Galatasaray,Gaziantep,3,1,H,3,0,...,2.32,-1.0,1.88,1.98,1.94,1.96,1.97,2.04,1.87,1.97
4,T1,12/09/2020,18:00,Goztep,Denizlispor,5,1,H,3,0,...,1.77,-0.25,1.93,1.93,1.94,1.96,1.97,1.98,1.91,1.93


In [6]:
for col_name in data.columns: 
    print(col_name)

Div
Date
Time
HomeTeam
AwayTeam
FTHG
FTAG
FTR
HTHG
HTAG
HTR
HS
AS
HST
AST
HF
AF
HC
AC
HY
AY
HR
AR
B365H
B365D
B365A
BWH
BWD
BWA
IWH
IWD
IWA
PSH
PSD
PSA
WHH
WHD
WHA
VCH
VCD
VCA
MaxH
MaxD
MaxA
AvgH
AvgD
AvgA
B365>2.5
B365<2.5
P>2.5
P<2.5
Max>2.5
Max<2.5
Avg>2.5
Avg<2.5
AHh
B365AHH
B365AHA
PAHH
PAHA
MaxAHH
MaxAHA
AvgAHH
AvgAHA
B365CH
B365CD
B365CA
BWCH
BWCD
BWCA
IWCH
IWCD
IWCA
PSCH
PSCD
PSCA
WHCH
WHCD
WHCA
VCCH
VCCD
VCCA
MaxCH
MaxCD
MaxCA
AvgCH
AvgCD
AvgCA
B365C>2.5
B365C<2.5
PC>2.5
PC<2.5
MaxC>2.5
MaxC<2.5
AvgC>2.5
AvgC<2.5
AHCh
B365CAHH
B365CAHA
PCAHH
PCAHA
MaxCAHH
MaxCAHA
AvgCAHH
AvgCAHA


# Features Engineering

### Number of Goals, Over and Under

In [7]:
# total number of goals = goals from the home team + goals from visiting team
data['nb_goals']=data['FTHG']+data['FTAG']

# boolean: true or false regarding whether they were more than 2.5 goals
data['over_2.5_goals']=data['nb_goals']>2.5

# boolean: true or false regarding whether they were less than 2.5 goals
data['under_2.5_goals']=data['nb_goals']<2.5

### Payout

  Payout of betting on over/under 2.5 goals: we get 0 if we lose the bet, we get the Avg if we win the bet (Avg = market average of the odds)


#### Payout Opening

In [8]:
# payout under 2.5 for Average OPENING odds
data['payout_avg_under_2.5'] = data['under_2.5_goals']*data['Avg<2.5']

# payout over 2.5 for Average OPENING odds
data['payout_avg_over_2.5'] = data['over_2.5_goals']*data['Avg>2.5']

#payout UNDER 2.5 for PINACLE specifically
data['payout_under_2.5_pinacle'] = data['under_2.5_goals']*data['P<2.5']

#payout OVER 2.5 for PINACLE specifically
data['payout_over_2.5_pinacle'] = data['over_2.5_goals']*data['P>2.5']

#payout UNDER 2.5 for 365 specifically
data['payout_under_2.5_365'] = data['under_2.5_goals']*data['B365<2.5']

#payout OVER 2.5 for 365 specifically
data['payout_over_2.5_365'] = data['over_2.5_goals']*data['B365>2.5']

#### Payout Closing

In [9]:
# payout under 2.5 for Average CLOSING odds
data['payout_avg_under_closing_2.5'] = data['under_2.5_goals']*data['AvgC<2.5']

# payout over 2.5 for Average CLOSING odds
data['payout_avg_over_closing_2.5'] = data['over_2.5_goals']*data['AvgC>2.5']

#payout UNDER 2.5 for PINACLE closing ddds specifically
data['payout_under_2.5_pinacle_closing'] = data['under_2.5_goals']*data['PC<2.5']

#payout OVER 2.5 for PINACLE closing odds specifically
data['payout_over_2.5_pinacle_closing'] = data['over_2.5_goals']*data['PC>2.5']

#payout UNDER 2.5 for 365 closing odds specifically
data['payout_under_2.5_365_closing'] = data['under_2.5_goals']*data['B365C<2.5']

#payout OVER 2.5 for 365 closing odds specifically
data['payout_over_2.5_365_closing'] = data['over_2.5_goals']*data['B365C>2.5']

### Implied Probability

#### Implied Probability Opening

In [10]:
#Implied Probability UNDER 2.5 goals for for overall market opening odds (Avg) 
data['Implied Probability <2.5 avg']=1/data['Avg<2.5']*100

#Implied Probability OVER 2.5 goals for for overall market opening odds (Avg) 
data['Implied Probability >2.5 avg']=1/data['Avg>2.5']*100

#Implied Probability UNDER 2.5 goals for PINACLE
data['Implied Probability <2.5 pinacle']=1/data['P<2.5']*100

#Implied Probability OVER 2.5 goals for PINACLE
data['Implied Probability >2.5 pinacle']=1/data['P>2.5']*100

#Implied Probability UNDER 2.5 goals for 365
data['Implied Probability <2.5 365']=1/data['B365<2.5']*100

#Implied Probability OVER 2.5 goals for 365
data['Implied Probability >2.5 365']=1/data['B365>2.5']*100

#### Implied Probability Closing

In [11]:
#Implied Probability UNDER 2.5 goals for overall market closing odds (AvgC)
data['Implied Probability <2.5 avg closing']=1/data['AvgC<2.5']*100

#Implied Probability OVER 2.5 goals for overall market closing odds (AvgC)
data['Implied Probability >2.5 avg closing']=1/data['AvgC>2.5']*100

#Implied Probability UNDER 2.5 goals for PINACLE closing odds
data['Implied Probability <2.5 pinacle closing']=1/data['PC<2.5']*100

#Implied Probability OVER 2.5 goals for PINACLE closing odds
data['Implied Probability >2.5 pinacle closing']=1/data['PC>2.5']*100

#Implied Probability UNDER 2.5 goals for 365 closing odds
data['Implied Probability <2.5 365 closing']=1/data['B365C<2.5']*100

#Implied Probability OVER 2.5 goals for 365 closing odds
data['Implied Probability >2.5 365 closing']=1/data['B365C>2.5']*100

### Binning the implied probabilities

#### Binning IP Opening

In [12]:
bins = [0, 5, 10, 15, 20, 25, 30, 35, 40, 45, 50, 55, 60, 65, 70, 75, 80, 85, 90, 95, 100]

#Binning UNDER 2.5 Average Market opening odds
data['binned <2.5 avg'] = pd.cut(data['Implied Probability <2.5 avg'], bins)

#Binning Over 2.5 Average Market opening odds
data['binned >2.5 avg'] = pd.cut(data['Implied Probability >2.5 avg'], bins)

#Binned UNDER 2.5 Pinnacle opening odds
data['binned <2.5 pinacle'] = pd.cut(data['Implied Probability <2.5 pinacle'], bins)

#Binned OVER 2.5 Pinnacle
data['binned >2.5 pinacle'] = pd.cut(data['Implied Probability >2.5 pinacle'], bins)

#Binned UNDER 2.5 bet365 OPENING odds
data['binned <2.5 365'] = pd.cut(data['Implied Probability <2.5 365'], bins)

#Binned OVER 2.5 bet365 OPENING odds
data['binned >2.5 365'] = pd.cut(data['Implied Probability >2.5 365'], bins)

#### Binning IP Closing

In [13]:
bins = [0, 5, 10, 15, 20, 25, 30, 35, 40, 45, 50, 55, 60, 65, 70, 75, 80, 85, 90, 95, 100]

#Binning UNDER 2.5 Average Market closing odds
data['binned <2.5 avg closing'] = pd.cut(data['Implied Probability <2.5 avg closing'], bins)

#Binning OVER 2.5 Average Market closing odds
data['binned >2.5 avg closing'] = pd.cut(data['Implied Probability >2.5 avg closing'], bins)

#Binned UNDER 2.5 Pinnacle closing odds
data['binned <2.5 pinacle closing'] = pd.cut(data['Implied Probability <2.5 pinacle closing'], bins)

#Binned OVER 2.5 Pinnacle CLOSING odds
data['binned >2.5 pinacle closing'] = pd.cut(data['Implied Probability >2.5 pinacle closing'], bins)

#Binned UNDER 2.5 bet365 CLOSING odds
data['binned <2.5 365 closing'] = pd.cut(data['Implied Probability <2.5 365 closing'], bins)

#Binned OVER 2.5 bet365 CLOSING odds
data['binned >2.5 365 closing'] = pd.cut(data['Implied Probability >2.5 365 closing'], bins)

# Final Dataset

### Opening Markets

In [14]:
#Average Market UNDER opening odds
fdf_under_avg = data[['Implied Probability <2.5 avg','under_2.5_goals','binned <2.5 avg','payout_avg_under_2.5']]

#Average Market OVER opening odds
fdf_over_avg = data[['Implied Probability >2.5 avg','over_2.5_goals','binned >2.5 avg','payout_avg_over_2.5']]

#pinnacle UNDER opening odds
fdf_under_pinacle = data[['Implied Probability <2.5 pinacle','under_2.5_goals','binned <2.5 pinacle','payout_under_2.5_pinacle']]

#pinnacle OVER opening odds
fdf_over_pinacle = data[['Implied Probability >2.5 pinacle','over_2.5_goals','binned >2.5 pinacle','payout_over_2.5_pinacle']]

#bet365 UNDER opening odds
fdf_under_365 = data[['Implied Probability <2.5 365','under_2.5_goals','binned <2.5 365','payout_under_2.5_365']]

#bet365 OVER opening odds
fdf_over_365 = data[['Implied Probability >2.5 365','over_2.5_goals','binned >2.5 365','payout_over_2.5_365']]

### Closing Markets

In [15]:
#Average Market UNDER closing odds
fdf_under_avg_closing = data[['Implied Probability <2.5 avg closing','under_2.5_goals','binned <2.5 avg closing','payout_avg_under_closing_2.5']]

#Average Market OVER opening odds
fdf_over_avg_closing = data[['Implied Probability >2.5 avg closing','over_2.5_goals','binned >2.5 avg closing','payout_avg_over_closing_2.5']]

#pinnacle UNDER closing odds
fdf_under_pinacle_closing = data[['Implied Probability <2.5 pinacle closing','under_2.5_goals','binned <2.5 pinacle closing','payout_under_2.5_pinacle_closing']]

#pinnacle OVER closing odds
fdf_over_pinacle_closing = data[['Implied Probability >2.5 pinacle closing','over_2.5_goals','binned >2.5 pinacle closing','payout_over_2.5_pinacle_closing']]

#bet365 UNDER closing odds
fdf_under_365_closing = data[['Implied Probability <2.5 365 closing','under_2.5_goals','binned <2.5 365 closing','payout_under_2.5_365_closing']]

#bet365 OVER closing odds
fdf_over_365_closing = data[['Implied Probability >2.5 365 closing','over_2.5_goals','binned >2.5 365 closing','payout_over_2.5_365_closing']]

# Results per implied prob Average Market

### Under

#### Opening Odds

In [16]:
fdf_under_avg_agg = fdf_under_avg.groupby('binned <2.5 avg')['payout_avg_under_2.5'].agg(['count', 'mean', 'median'])

fdf_under_pinacle_agg = fdf_under_pinacle.groupby('binned <2.5 pinacle')['payout_under_2.5_pinacle'].agg(['count', 'mean', 'median'])

fdf_under_bet365_agg = fdf_under_365.groupby('binned <2.5 365')['payout_under_2.5_365'].agg(['count', 'mean', 'median'])

In [17]:
fdf_under_avg_agg.rename(columns={"count": "count_avg", "mean": "mean_avg", "median": "median_avg"}, inplace=True)
fdf_under_avg_agg.index.names = ['bin<2.5']

fdf_under_pinacle_agg.rename(columns={"count": "count_p", "mean": "mean_p", "median": "median_p"}, inplace=True)
fdf_under_pinacle_agg.index.names = ['bin<2.5']

fdf_under_bet365_agg.rename(columns={"count": "count_b365", "mean": "mean_b365", "median": "median_b365"}, inplace=True)
fdf_under_bet365_agg.index.names = ['bin<2.5']

opening_test_1 = fdf_under_avg_agg.merge(fdf_under_pinacle_agg, on='bin<2.5')
opening_test_2 = opening_test_1.merge(fdf_under_bet365_agg, on='bin<2.5')
opening_test_2

Unnamed: 0_level_0,count_avg,mean_avg,median_avg,count_p,mean_p,median_p,count_b365,mean_b365,median_b365
bin<2.5,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
"(0, 5]",0,,,0,,,0,,
"(5, 10]",0,,,0,,,0,,
"(10, 15]",0,,,0,,,0,,
"(15, 20]",0,,,0,,,0,,
"(20, 25]",1,0.0,0.0,1,0.0,0.0,1,0.0,0.0
"(25, 30]",1,0.0,0.0,3,0.0,0.0,5,0.0,0.0
"(30, 35]",10,0.295,0.0,12,0.26,0.0,9,0.344444,0.0
"(35, 40]",36,0.7825,0.0,49,0.956939,0.0,26,0.788462,0.0
"(40, 45]",156,0.756154,0.0,192,0.835833,0.0,137,0.729197,0.0
"(45, 50]",349,1.049083,2.0,398,1.044196,0.0,458,1.023166,0.0


#### Closing Odds

In [18]:
fdf_under_avg_closing_gb_agg = fdf_under_avg_closing.groupby('binned <2.5 avg closing')['payout_avg_under_closing_2.5'].agg(['count', 'mean', 'median', mode])

fdf_under_pinacle_closing_gb_agg = fdf_under_pinacle_closing.groupby('binned <2.5 pinacle closing')['payout_under_2.5_pinacle_closing'].agg(['count', 'mean', 'median', mode])

fdf_under_bet365_closing_gb_agg = fdf_under_365_closing.groupby('binned <2.5 365 closing')['payout_under_2.5_365_closing'].agg(['count', 'mean', 'median', mode])

In [19]:
fdf_under_avg_closing_gb_agg.rename(columns={"count": "count_avg", "mean": "mean_avg", "median": "median_avg", "mode": "mode_avg"}, inplace=True)
fdf_under_avg_closing_gb_agg.index.names = ['bin<2.5']

fdf_under_pinacle_closing_gb_agg.rename(columns={"count": "count_p", "mean": "mean_p", "median": "median_p", "mode": "mode_p"}, inplace=True)
fdf_under_pinacle_closing_gb_agg.index.names = ['bin<2.5']

fdf_under_bet365_closing_gb_agg.rename(columns={"count": "count_b365", "mean": "mean_b365", "median": "median_b365", "mode": "mode_b365"}, inplace=True)
fdf_under_bet365_closing_gb_agg.index.names = ['bin<2.5']

closing_test_1 = fdf_under_avg_closing_gb_agg.merge(fdf_under_pinacle_closing_gb_agg, on='bin<2.5')
closing_test_2 = closing_test_1.merge(fdf_under_bet365_closing_gb_agg, on='bin<2.5')
closing_test_2['bins'] = closing_test_2.index
closing_test_2

Unnamed: 0_level_0,count_avg,mean_avg,median_avg,mode_avg,count_p,mean_p,median_p,mode_p,count_b365,mean_b365,median_b365,mode_b365,bins
bin<2.5,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
"(0, 5]",0,,,"([], [])",0,,,"([], [])",0,,,"([], [])","(0, 5]"
"(5, 10]",0,,,"([], [])",0,,,"([], [])",0,,,"([], [])","(5, 10]"
"(10, 15]",0,,,"([], [])",0,,,"([], [])",0,,,"([], [])","(10, 15]"
"(15, 20]",0,,,"([], [])",0,,,"([], [])",0,,,"([], [])","(15, 20]"
"(20, 25]",1,0.0,0.0,"([0.0], [1])",2,0.0,0.0,"([0.0], [2])",3,0.0,0.0,"([0.0], [3])","(20, 25]"
"(25, 30]",3,0.0,0.0,"([0.0], [3])",6,0.566667,0.0,"([0.0], [5])",7,0.5,0.0,"([0.0], [6])","(25, 30]"
"(30, 35]",14,0.233571,0.0,"([0.0], [13])",16,1.1025,0.0,"([0.0], [10])",15,0.956667,0.0,"([0.0], [10])","(30, 35]"
"(35, 40]",56,0.996964,0.0,"([0.0], [35])",79,0.573418,0.0,"([0.0], [62])",49,0.744898,0.0,"([0.0], [35])","(35, 40]"
"(40, 45]",167,0.760898,0.0,"([0.0], [112])",187,0.990428,0.0,"([0.0], [108])",148,0.8,0.0,"([0.0], [97])","(40, 45]"
"(45, 50]",315,1.059079,2.0,"([0.0], [156])",329,1.064468,2.0,"([0.0], [163])",391,1.031867,0.0,"([0.0], [197])","(45, 50]"


### Over

#### Opening Odds

In [20]:
fdf_over_avg_agg = fdf_over_avg.groupby('binned >2.5 avg')['payout_avg_over_2.5'].agg(['count', 'mean', 'median'])

fdf_over_pinacle_agg = fdf_over_pinacle.groupby('binned >2.5 pinacle')['payout_over_2.5_pinacle'].agg(['count', 'mean', 'median'])

fdf_over_bet365_agg = fdf_over_365.groupby('binned >2.5 365')['payout_over_2.5_365'].agg(['count', 'mean', 'median'])

In [21]:
fdf_over_avg_agg.rename(columns={"count": "count_avg", "mean": "mean_avg", "median": "median_avg"}, inplace=True)
fdf_over_avg_agg.index.names = ['bin>2.5']

fdf_over_pinacle_agg.rename(columns={"count": "count_p", "mean": "mean_p", "median": "median_p"}, inplace=True)
fdf_over_pinacle_agg.index.names = ['bin>2.5']

fdf_over_bet365_agg.rename(columns={"count": "count_b365", "mean": "mean_b365", "median": "median_b365"}, inplace=True)
fdf_over_bet365_agg.index.names = ['bin>2.5']

over_opening_test_1 = fdf_over_avg_agg.merge(fdf_over_pinacle_agg, on='bin>2.5')
over_opening_test_2 = over_opening_test_1.merge(fdf_over_bet365_agg, on='bin>2.5')
over_opening_test_2

Unnamed: 0_level_0,count_avg,mean_avg,median_avg,count_p,mean_p,median_p,count_b365,mean_b365,median_b365
bin>2.5,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
"(0, 5]",0,,,0,,,0,,
"(5, 10]",0,,,0,,,0,,
"(10, 15]",0,,,0,,,0,,
"(15, 20]",0,,,0,,,0,,
"(20, 25]",0,,,0,,,0,,
"(25, 30]",0,,,0,,,0,,
"(30, 35]",0,,,0,,,0,,
"(35, 40]",2,1.31,1.31,2,1.39,1.39,2,1.3,1.3
"(40, 45]",23,1.104348,0.0,46,0.912391,0.0,23,1.097826,0.0
"(45, 50]",185,1.016757,0.0,224,1.031205,0.0,246,1.03435,2.0


#### Closing Odds

In [22]:
fdf_over_avg_closing_agg = fdf_over_avg_closing.groupby('binned >2.5 avg closing')['payout_avg_over_closing_2.5'].agg(['count', 'mean', 'median'])

fdf_over_pinacle_closing_agg = fdf_over_pinacle_closing.groupby('binned >2.5 pinacle closing')['payout_over_2.5_pinacle_closing'].agg(['count', 'mean', 'median'])

fdf_over_bet365_closing_agg = fdf_over_365_closing.groupby('binned >2.5 365 closing')['payout_over_2.5_365_closing'].agg(['count', 'mean', 'median'])

In [23]:
fdf_over_avg_closing_agg.rename(columns={"count": "count_avg", "mean": "mean_avg", "median": "median_avg"}, inplace=True)
fdf_over_avg_closing_agg.index.names = ['bin>2.5']

fdf_over_pinacle_closing_agg.rename(columns={"count": "count_p", "mean": "mean_p", "median": "median_p"}, inplace=True)
fdf_over_pinacle_closing_agg.index.names = ['bin>2.5']

fdf_over_bet365_closing_agg.rename(columns={"count": "count_b365", "mean": "mean_b365", "median": "median_b365"}, inplace=True)
fdf_over_bet365_closing_agg.index.names = ['bin>2.5']

over_closing_test_1 = fdf_over_avg_closing_agg.merge(fdf_over_pinacle_closing_agg, on='bin>2.5')
over_closing_test_2 = over_closing_test_1.merge(fdf_over_bet365_closing_agg, on='bin>2.5')
over_closing_test_2

Unnamed: 0_level_0,count_avg,mean_avg,median_avg,count_p,mean_p,median_p,count_b365,mean_b365,median_b365
bin>2.5,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
"(0, 5]",0,,,0,,,0,,
"(5, 10]",0,,,0,,,0,,
"(10, 15]",0,,,0,,,0,,
"(15, 20]",0,,,0,,,0,,
"(20, 25]",0,,,0,,,0,,
"(25, 30]",0,,,0,,,0,,
"(30, 35]",0,,,1,0.0,0.0,0,,
"(35, 40]",5,0.502,0.0,12,1.0725,0.0,6,0.833333,0.0
"(40, 45]",46,1.256739,2.23,82,1.155732,1.115,53,1.083962,0.0
"(45, 50]",191,0.976545,0.0,203,0.931034,0.0,236,0.975636,0.0


# Starting Model

In [25]:
#Pedro's pet! Don't touch it!
"""odd = 2.15
iproba = 1/odd*100
for x in range(0, 20):
    if iproba in closing_test_2.iloc[6]['bins']"""

"odd = 2.15\niproba = 1/odd*100\nfor x in range(0, 20):\n    if iproba in closing_test_2.iloc[6]['bins']"

In [26]:
data_model_over_under = data.copy()

In [27]:
data_model_over_under = data_model_over_under[["Div",
"Date", "Time", "HomeTeam", "AwayTeam", "FTHG", "FTAG", "nb_goals", "Avg>2.5", "Avg<2.5", "B365>2.5", "B365<2.5", "P>2.5", 
"P<2.5", "AvgC>2.5", "AvgC<2.5", "B365C>2.5", "B365C<2.5", "PC>2.5", "PC<2.5", "over_2.5_goals", "under_2.5_goals", 
"payout_avg_under_2.5", "payout_avg_over_2.5", "payout_under_2.5_pinacle", "payout_over_2.5_pinacle", "payout_under_2.5_365", 
"payout_over_2.5_365", "payout_avg_under_closing_2.5", "payout_avg_over_closing_2.5", "payout_under_2.5_pinacle_closing", 
"payout_over_2.5_pinacle_closing", "payout_under_2.5_365_closing", "payout_over_2.5_365_closing", "Implied Probability <2.5 avg", 
"Implied Probability >2.5 avg", "Implied Probability <2.5 pinacle", "Implied Probability >2.5 pinacle", "Implied Probability <2.5 365", 
"Implied Probability >2.5 365", "Implied Probability <2.5 avg closing", "Implied Probability >2.5 avg closing", "Implied Probability <2.5 pinacle closing", 
"Implied Probability >2.5 pinacle closing", "Implied Probability <2.5 365 closing", "Implied Probability >2.5 365 closing", 
"binned <2.5 avg", "binned >2.5 avg", "binned <2.5 pinacle", "binned >2.5 pinacle", "binned <2.5 365", "binned >2.5 365", 
"binned <2.5 avg closing", "binned >2.5 avg closing", "binned <2.5 pinacle closing", "binned >2.5 pinacle closing", 
"binned <2.5 365 closing", "binned >2.5 365 closing"]]

In [28]:
data_model_over_under.dropna(inplace=True)

In [29]:
data_model_over_under

Unnamed: 0,Div,Date,Time,HomeTeam,AwayTeam,FTHG,FTAG,nb_goals,Avg>2.5,Avg<2.5,...,binned <2.5 pinacle,binned >2.5 pinacle,binned <2.5 365,binned >2.5 365,binned <2.5 avg closing,binned >2.5 avg closing,binned <2.5 pinacle closing,binned >2.5 pinacle closing,binned <2.5 365 closing,binned >2.5 365 closing
0,T1,11/09/2020,18:00,Rizespor,Fenerbahce,1,2,3,1.77,2.04,...,"(45, 50]","(55, 60]","(45, 50]","(55, 60]","(45, 50]","(55, 60]","(45, 50]","(55, 60]","(45, 50]","(55, 60]"
1,T1,12/09/2020,15:00,Karagumruk,Yeni Malatyaspor,3,0,3,1.84,1.95,...,"(50, 55]","(50, 55]","(50, 55]","(50, 55]","(50, 55]","(50, 55]","(50, 55]","(50, 55]","(50, 55]","(50, 55]"
2,T1,12/09/2020,15:00,Sivasspor,Alanyaspor,0,2,2,1.67,2.17,...,"(45, 50]","(55, 60]","(45, 50]","(60, 65]","(45, 50]","(55, 60]","(45, 50]","(55, 60]","(45, 50]","(55, 60]"
3,T1,12/09/2020,18:00,Galatasaray,Gaziantep,3,1,4,1.62,2.27,...,"(40, 45]","(60, 65]","(40, 45]","(60, 65]","(40, 45]","(60, 65]","(40, 45]","(60, 65]","(40, 45]","(60, 65]"
4,T1,12/09/2020,18:00,Goztep,Denizlispor,5,1,6,2.09,1.73,...,"(55, 60]","(45, 50]","(55, 60]","(45, 50]","(55, 60]","(45, 50]","(50, 55]","(45, 50]","(55, 60]","(45, 50]"
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
300,T1,25/07/2020,19:00,Genclerbirligi,Besiktas,0,3,3,1.55,2.39,...,"(40, 45]","(60, 65]","(40, 45]","(60, 65]","(35, 40]","(65, 70]","(35, 40]","(65, 70]","(30, 35]","(70, 75]"
301,T1,25/07/2020,19:00,Goztep,Sivasspor,3,1,4,1.68,2.16,...,"(45, 50]","(55, 60]","(45, 50]","(60, 65]","(40, 45]","(60, 65]","(35, 40]","(60, 65]","(40, 45]","(65, 70]"
302,T1,25/07/2020,19:00,Kayserispor,Trabzonspor,1,2,3,1.51,2.50,...,"(35, 40]","(65, 70]","(35, 40]","(65, 70]","(35, 40]","(65, 70]","(35, 40]","(65, 70]","(35, 40]","(65, 70]"
303,T1,25/07/2020,19:00,Konyaspor,Alanyaspor,2,3,5,1.57,2.35,...,"(40, 45]","(60, 65]","(40, 45]","(60, 65]","(35, 40]","(65, 70]","(35, 40]","(65, 70]","(35, 40]","(65, 70]"


In [30]:
ohe = OneHotEncoder(sparse=False)
ohe.fit(data_model_over_under[['binned <2.5 pinacle']])
bins_encoded = ohe.transform(data_model_over_under[['binned <2.5 pinacle']])
data_model_over_under["(20, 25]"], data_model_over_under["(25, 30]"], data_model_over_under["(30, 35]"], data_model_over_under["(35, 40]"], data_model_over_under["(40, 45]"], data_model_over_under["(45, 50]"], data_model_over_under["(50, 55]"], data_model_over_under["(55, 60]"], data_model_over_under["(60, 65]"], data_model_over_under["(65, 70]"]  = bins_encoded.T
data_model_over_under.drop(columns='binned <2.5 pinacle', inplace=True)

In [31]:
data_model_over_under

Unnamed: 0,Div,Date,Time,HomeTeam,AwayTeam,FTHG,FTAG,nb_goals,Avg>2.5,Avg<2.5,...,"(20, 25]","(25, 30]","(30, 35]","(35, 40]","(40, 45]","(45, 50]","(50, 55]","(55, 60]","(60, 65]","(65, 70]"
0,T1,11/09/2020,18:00,Rizespor,Fenerbahce,1,2,3,1.77,2.04,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
1,T1,12/09/2020,15:00,Karagumruk,Yeni Malatyaspor,3,0,3,1.84,1.95,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
2,T1,12/09/2020,15:00,Sivasspor,Alanyaspor,0,2,2,1.67,2.17,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
3,T1,12/09/2020,18:00,Galatasaray,Gaziantep,3,1,4,1.62,2.27,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
4,T1,12/09/2020,18:00,Goztep,Denizlispor,5,1,6,2.09,1.73,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
300,T1,25/07/2020,19:00,Genclerbirligi,Besiktas,0,3,3,1.55,2.39,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
301,T1,25/07/2020,19:00,Goztep,Sivasspor,3,1,4,1.68,2.16,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
302,T1,25/07/2020,19:00,Kayserispor,Trabzonspor,1,2,3,1.51,2.50,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
303,T1,25/07/2020,19:00,Konyaspor,Alanyaspor,2,3,5,1.57,2.35,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0


In [34]:
data_model_over_under['Pin_pays_better_under'] = data_model_over_under['PC<2.5'] > data_model_over_under['AvgC<2.5']

In [40]:
data_model_over_under['%vig_p'] = (1 - (1 / (1/data_model_over_under['PC>2.5'] + 1/data_model_over_under['PC<2.5'])))*100

In [42]:
data_model_over_under['%vig_avg'] = (1 - (1 / (1/data_model_over_under['AvgC>2.5'] + 1/data_model_over_under['AvgC<2.5'])))*100

In [44]:
#data_model_over_under['pin_diff_avg'] = data_model_over_under['PC<2.5']/data_model_over_under['AvgC<2.5']

In [45]:
data_model_over_under['PC<2.5_P_relative_diff'] = data_model_over_under['PC<2.5'] > data_model_over_under['P<2.5']

In [137]:
test_model = data_model_over_under.drop(columns=['FTHG', 'FTAG', 'nb_goals', 'over_2.5_goals', 'under_2.5_goals', 
                                                'Div', 'Date', 'Time', 'HomeTeam', 'AwayTeam', 'binned <2.5 avg', 
                                                'binned >2.5 avg', 'binned >2.5 pinacle', 'binned <2.5 365', 
                                                 'binned >2.5 365', 'binned <2.5 avg closing', 'binned >2.5 avg closing', 
                                                'binned <2.5 pinacle closing', 'binned >2.5 pinacle closing', 'binned <2.5 365 closing', 
                                                'binned >2.5 365 closing', 
                                                'B365>2.5', 'B365<2.5', 'B365C>2.5', 'B365C<2.5', 'payout_under_2.5_365', 'payout_over_2.5_365', 
                                                'payout_under_2.5_365_closing', 'payout_over_2.5_365_closing', 
                                                'Implied Probability <2.5 365', 'Implied Probability >2.5 365', 
                                                'Implied Probability <2.5 365 closing', 'Implied Probability >2.5 365 closing', 
                                                'payout_avg_under_2.5', 'payout_avg_over_2.5', 'payout_under_2.5_pinacle', 
                                                'payout_avg_over_closing_2.5', 'payout_over_2.5_pinacle_closing', 
                                                'Implied Probability <2.5 avg', 'Implied Probability >2.5 avg', 
                                                'Implied Probability <2.5 pinacle', 'Implied Probability >2.5 pinacle', 
                                                'Avg>2.5', 'P>2.5', 'AvgC>2.5', 'PC>2.5', 'payout_over_2.5_pinacle', 
                                                'Implied Probability >2.5 avg closing', 'Implied Probability >2.5 pinacle closing', 
                                                'payout_avg_under_closing_2.5', 'Avg<2.5', 'P<2.5', 'AvgC<2.5', 'PC<2.5', 
                                                'Implied Probability <2.5 avg closing', 'Implied Probability <2.5 pinacle closing'])

test_model

Unnamed: 0,payout_under_2.5_pinacle_closing,"(20, 25]","(25, 30]","(30, 35]","(35, 40]","(40, 45]","(45, 50]","(50, 55]","(55, 60]","(60, 65]","(65, 70]",Pin_pays_better_under,%vig_p,%vig_avg,PC<2.5_P_relative_diff
0,0.00,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,True,3.125000,4.940104,True
1,0.00,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,True,2.755784,5.023684,False
2,2.14,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,True,3.425641,5.148437,False
3,0.00,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,True,3.283582,5.306122,True
4,0.00,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,True,2.869231,5.013089,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
300,0.00,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,True,3.572093,5.607656,True
301,0.00,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,True,3.703163,5.669192,True
302,0.00,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,True,3.606557,5.662651,True
303,0.00,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,True,3.827751,6.083951,True


In [138]:
test_model.columns

Index(['payout_under_2.5_pinacle_closing', '(20, 25]', '(25, 30]', '(30, 35]',
       '(35, 40]', '(40, 45]', '(45, 50]', '(50, 55]', '(55, 60]', '(60, 65]',
       '(65, 70]', 'Pin_pays_better_under', '%vig_p', '%vig_avg',
       'PC<2.5_P_relative_diff'],
      dtype='object')

In [139]:
test_model['payout_under_2.5_pinacle_closing'] = test_model['payout_under_2.5_pinacle_closing'] > 1

In [147]:
X = test_model.drop(columns='payout_under_2.5_pinacle_closing')
y = test_model['payout_under_2.5_pinacle_closing']

model = LogisticRegression()
model.fit(X, y)
# get importance
importance = model.coef_[0]

# summarize feature importance
for i,v in enumerate(importance):
    print('Feature: %0d, Score: %.5f' % (i,v))

Feature: 0, Score: -0.34734
Feature: 1, Score: -0.69573
Feature: 2, Score: -1.13868
Feature: 3, Score: 0.02628
Feature: 4, Score: -0.08968
Feature: 5, Score: 0.43224
Feature: 6, Score: 0.53207
Feature: 7, Score: 0.47492
Feature: 8, Score: 0.49326
Feature: 9, Score: 0.22909
Feature: 10, Score: 0.14541
Feature: 11, Score: -0.01356
Feature: 12, Score: -0.03877
Feature: 13, Score: -0.42550


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [148]:
#Features that decrease our chances
for x in [2, 1, 0, 13, 4, 12, 11]:
    print(f'{X.columns[x]}: {importance[x]}')

(30, 35]: -1.1386835580034387
(25, 30]: -0.6957268657210008
(20, 25]: -0.3473446755568272
PC<2.5_P_relative_diff: -0.4254970433559474
(40, 45]: -0.08967795310898481
%vig_avg: -0.0387671044559982
%vig_p: -0.013563381866824393


In [149]:
#Features that increase our chances
for x in [6, 8, 7, 5, 9, 10, 3]:
    print(f'{X.columns[x]}: {importance[x]}')

(50, 55]: 0.5320707350421501
(60, 65]: 0.4932593762096327
(55, 60]: 0.474924400009327
(45, 50]: 0.4322410152361808
(65, 70]: 0.22908646916484923
Pin_pays_better_under: 0.14541098605403815
(35, 40]: 0.026277368815788238
