# Imports

In [84]:
import pandas as pd
import os
from os import listdir
from os.path import isfile, join
import matplotlib.pyplot as plt
from scipy.stats import mode
from sklearn.preprocessing import OneHotEncoder
from sklearn.linear_model import LogisticRegression

import numpy as np

# Loading the Dataframe

In [85]:
pwd

'/home/jonathan/code/rafabertolace/OnThePitch/notebooks'

## Merging the Seasons csv files (2019-2020 untill 2021-2022)

In [86]:
print(os.listdir('../raw_data/All3'))

['I1 (3).csv', 'F1.csv', 'B1 (2).csv:Zone.Identifier', 'F1 (2).csv', 'G1 (2).csv', 'D1.csv', 'I1 (1).csv:Zone.Identifier', 'F1.csv:Zone.Identifier', 'D1 (2).csv:Zone.Identifier', 'I1 (3).csv:Zone.Identifier', 'D1.csv:Zone.Identifier', 'I1 (2).csv', 'Turkey_2021_2022.csv.csv', 'B1 (2).csv', 'B1 (1).csv', 'F1 (1).csv', 'Turkey_2019_2020.csv.csv', 'Turkey_2020_2021.csv.csv', 'G1.csv', 'I1 (1).csv', 'F1 (2).csv:Zone.Identifier', 'D1 (2).csv', 'D1 (1).csv', 'I1 (2).csv:Zone.Identifier', 'B1 (1).csv:Zone.Identifier', 'F1 (1).csv:Zone.Identifier', 'G1 (1).csv', 'D1 (1).csv:Zone.Identifier', 'B1.csv:Zone.Identifier', 'B1.csv']


In [87]:
files = [file for file in os.listdir('../raw_data/All3') if file.endswith('.csv')]
data = pd.DataFrame()

for file in files:
    df = pd.read_csv('../raw_data/All3/' + file)
    df['country']=str(file)[0:2]
    data = pd.concat([data, df])

In [88]:
data['Div'].unique()

array(['I1', 'F1', 'G1', 'D1', 'T1', 'B1'], dtype=object)

In [89]:
for col_name in data.columns: 
    print(col_name)

Div
Date
Time
HomeTeam
AwayTeam
FTHG
FTAG
FTR
HTHG
HTAG
HTR
HS
AS
HST
AST
HF
AF
HC
AC
HY
AY
HR
AR
B365H
B365D
B365A
BWH
BWD
BWA
IWH
IWD
IWA
PSH
PSD
PSA
WHH
WHD
WHA
VCH
VCD
VCA
MaxH
MaxD
MaxA
AvgH
AvgD
AvgA
B365>2.5
B365<2.5
P>2.5
P<2.5
Max>2.5
Max<2.5
Avg>2.5
Avg<2.5
AHh
B365AHH
B365AHA
PAHH
PAHA
MaxAHH
MaxAHA
AvgAHH
AvgAHA
B365CH
B365CD
B365CA
BWCH
BWCD
BWCA
IWCH
IWCD
IWCA
PSCH
PSCD
PSCA
WHCH
WHCD
WHCA
VCCH
VCCD
VCCA
MaxCH
MaxCD
MaxCA
AvgCH
AvgCD
AvgCA
B365C>2.5
B365C<2.5
PC>2.5
PC<2.5
MaxC>2.5
MaxC<2.5
AvgC>2.5
AvgC<2.5
AHCh
B365CAHH
B365CAHA
PCAHH
PCAHA
MaxCAHH
MaxCAHA
AvgCAHH
AvgCAHA
country
Unnamed: 105


# Features Engineering

In [98]:
def feature_engineering(data, b=20, binned=False):
    '''
    This function creates all the columns that will be needed to create the analysis 
    and return the dataframe with all this changes
    
    b is the number of bins that we want to work with. Our start value for b will be 20.
        '''
    #------------------------Number of Goals, Over and Under -----------------------------------
    
    # total number of goals = goals from the home team + goals from visiting team
    data['nb_goals']=data['FTHG']+data['FTAG']

    # boolean: true or false regarding whether they were more than 2.5 goals
    data['over_2.5_goals']=data['nb_goals']>2.5

    # boolean: true or false regarding whether they were less than 2.5 goals
    data['under_2.5_goals']=data['nb_goals']<2.5
    
    #-----------------------------Payout Opening ----------------------------------------------
    
    # payout under 2.5 for Average OPENING odds
    data['payout_avg_under_2.5'] = data['under_2.5_goals']*data['Avg<2.5']

    # payout over 2.5 for Average OPENING odds
    data['payout_avg_over_2.5'] = data['over_2.5_goals']*data['Avg>2.5']

    #payout UNDER 2.5 for PINACLE specifically
    data['payout_under_2.5_pinacle'] = data['under_2.5_goals']*data['P<2.5']

    #payout OVER 2.5 for PINACLE specifically
    data['payout_over_2.5_pinacle'] = data['over_2.5_goals']*data['P>2.5']

    #payout UNDER 2.5 for 365 specifically
    data['payout_under_2.5_365'] = data['under_2.5_goals']*data['B365<2.5']

    #payout OVER 2.5 for 365 specifically
    data['payout_over_2.5_365'] = data['over_2.5_goals']*data['B365>2.5']
    
    #------------------------------Payout Closing --------------------------------------------
    
    # payout under 2.5 for Average CLOSING odds
    data['payout_avg_under_closing_2.5'] = data['under_2.5_goals']*data['AvgC<2.5']

    # payout over 2.5 for Average CLOSING odds
    data['payout_avg_over_closing_2.5'] = data['over_2.5_goals']*data['AvgC>2.5']

    #payout UNDER 2.5 for PINACLE closing ddds specifically
    data['payout_under_2.5_pinacle_closing'] = data['under_2.5_goals']*data['PC<2.5']

    #payout OVER 2.5 for PINACLE closing odds specifically
    data['payout_over_2.5_pinacle_closing'] = data['over_2.5_goals']*data['PC>2.5']

    #payout UNDER 2.5 for 365 closing odds specifically
    data['payout_under_2.5_365_closing'] = data['under_2.5_goals']*data['B365C<2.5']

    #payout OVER 2.5 for 365 closing odds specifically
    data['payout_over_2.5_365_closing'] = data['over_2.5_goals']*data['B365C>2.5']
    
    #-------------------------- Implied Probability Opening ----------------------------------------
    
    #Implied Probability UNDER 2.5 goals for for overall market opening odds (Avg) 
    data['Implied Probability <2.5 avg']=1/data['Avg<2.5']*100

    #Implied Probability OVER 2.5 goals for for overall market opening odds (Avg) 
    data['Implied Probability >2.5 avg']=1/data['Avg>2.5']*100

    #Implied Probability UNDER 2.5 goals for PINACLE
    data['Implied Probability <2.5 pinacle']=1/data['P<2.5']*100

    #Implied Probability OVER 2.5 goals for PINACLE
    data['Implied Probability >2.5 pinacle']=1/data['P>2.5']*100

    #Implied Probability UNDER 2.5 goals for 365
    data['Implied Probability <2.5 365']=1/data['B365<2.5']*100

    #Implied Probability OVER 2.5 goals for 365
    data['Implied Probability >2.5 365']=1/data['B365>2.5']*100
    
    #------------------------- Implied Probability Closing -----------------------------------
    
    #Implied Probability UNDER 2.5 goals for overall market closing odds (AvgC)
    data['Implied Probability <2.5 avg closing']=1/data['AvgC<2.5']*100

    #Implied Probability OVER 2.5 goals for overall market closing odds (AvgC)
    data['Implied Probability >2.5 avg closing']=1/data['AvgC>2.5']*100

    #Implied Probability UNDER 2.5 goals for PINACLE closing odds
    data['Implied Probability <2.5 pinacle closing']=1/data['PC<2.5']*100

    #Implied Probability OVER 2.5 goals for PINACLE closing odds
    data['Implied Probability >2.5 pinacle closing']=1/data['PC>2.5']*100

    #Implied Probability UNDER 2.5 goals for 365 closing odds
    data['Implied Probability <2.5 365 closing']=1/data['B365C<2.5']*100

    #Implied Probability OVER 2.5 goals for 365 closing odds
    data['Implied Probability >2.5 365 closing']=1/data['B365C>2.5']*100
    
    #---------------------------- Binning IP Opening -------------------------------------

    b=b
    bins = np.arange(0, 101, int(100/b))
    bins = bins.tolist()

    #Binning UNDER 2.5 Average Market opening odds
    data['binned <2.5 avg'] = pd.cut(data['Implied Probability <2.5 avg'], bins)

    #Binning Over 2.5 Average Market opening odds
    data['binned >2.5 avg'] = pd.cut(data['Implied Probability >2.5 avg'], bins)

    #Binned UNDER 2.5 Pinnacle opening odds
    data['binned <2.5 pinacle'] = pd.cut(data['Implied Probability <2.5 pinacle'], bins)

    #Binned OVER 2.5 Pinnacle
    data['binned >2.5 pinacle'] = pd.cut(data['Implied Probability >2.5 pinacle'], bins)

    #Binned UNDER 2.5 bet365 OPENING odds
    data['binned <2.5 365'] = pd.cut(data['Implied Probability <2.5 365'], bins)

    #Binned OVER 2.5 bet365 OPENING odds
    data['binned >2.5 365'] = pd.cut(data['Implied Probability >2.5 365'], bins)
    
    #----------------------------- Binning IP Closing ------------------------------------------------

    #Binning UNDER 2.5 Average Market closing odds
    data['binned <2.5 avg closing'] = pd.cut(data['Implied Probability <2.5 avg closing'], bins)

    #Binning OVER 2.5 Average Market closing odds
    data['binned >2.5 avg closing'] = pd.cut(data['Implied Probability >2.5 avg closing'], bins)

    #Binned UNDER 2.5 Pinnacle closing odds
    data['binned <2.5 pinacle closing'] = pd.cut(data['Implied Probability <2.5 pinacle closing'], bins)

    #Binned OVER 2.5 Pinnacle CLOSING odds
    data['binned >2.5 pinacle closing'] = pd.cut(data['Implied Probability >2.5 pinacle closing'], bins)

    #Binned UNDER 2.5 bet365 CLOSING odds
    data['binned <2.5 365 closing'] = pd.cut(data['Implied Probability <2.5 365 closing'], bins)

    #Binned OVER 2.5 bet365 CLOSING odds
    data['binned >2.5 365 closing'] = pd.cut(data['Implied Probability >2.5 365 closing'], bins)
    
    #---------------------------- Binning Odds Opening ----------------------------------------------------
    
    bins2 = [1, 1.5, 2, 3, 99999]

    #Binning UNDER 2.5 Average Market opening odds
    data['binned odds <2.5 avg'] = pd.cut(data['Avg<2.5'], bins2)

    #Binning Over 2.5 Average Market opening odds
    data['binned odds >2.5 avg'] = pd.cut(data['Avg>2.5'], bins2)

    #Binned UNDER 2.5 Pinnacle opening odds
    data['binned odds <2.5 pinacle'] = pd.cut(data['P<2.5'], bins2)

    #Binned OVER 2.5 Pinnacle
    data['binned odds >2.5 pinacle'] = pd.cut(data['P>2.5'], bins2)

    #Binned UNDER 2.5 bet365 OPENING odds
    data['binned odds <2.5 365'] = pd.cut(data['B365<2.5'], bins2)

    #Binned OVER 2.5 bet365 OPENING odds
    data['binned odds >2.5 365'] = pd.cut(data['B365>2.5'], bins2)
    
    #----------------------------- Binning Odds Closing ----------------------------------------------------------
    
    #Binning UNDER 2.5 Average Market opening odds
    data['binned odds <2.5 avg closing'] = pd.cut(data['AvgC<2.5'], bins2)

    #Binning Over 2.5 Average Market opening odds
    data['binned odds >2.5 avg closing'] = pd.cut(data['AvgC>2.5'], bins2)

    #Binned UNDER 2.5 Pinnacle opening odds
    data['binned odds <2.5 pinacle closing'] = pd.cut(data['PC<2.5'], bins2)

    #Binned OVER 2.5 Pinnacle
    data['binned odds >2.5 pinacle closing'] = pd.cut(data['PC>2.5'], bins2)

    #Binned UNDER 2.5 bet365 OPENING odds
    data['binned odds <2.5 365 closing'] = pd.cut(data['B365C<2.5'], bins2)

    #Binned OVER 2.5 bet365 OPENING odds
    data['binned odds >2.5 365 closing'] = pd.cut(data['B365C>2.5'], bins2)
    
    
    #----------------------------- Other Features from D3 ------------------------------------------------------
    
    data['Pin_pays_better_under_boolean'] = data['PC<2.5'] > data['AvgC<2.5']
    data['Pin_pays_better_under_difference'] = data['PC<2.5'] / data['AvgC<2.5']
    data['%vig_p'] = (1 - (1 / (1/data['PC>2.5'] + 1/data['PC<2.5'])))*100
    data['%vig_avg'] = (1 - (1 / (1/data['AvgC>2.5'] + 1/data['AvgC<2.5'])))*100
    data['PC<2.5_P_boolean'] = data['PC<2.5'] < data['P<2.5']
    data['PC<2.5_P_relative_diff'] = data['PC<2.5'] / data['P<2.5']
    
    #----------------------- Odds and probability of the home team scoring under 2.5 -------------------------------
    
    lst1 = []
    lst2 = []
    for i, team in enumerate(data['HomeTeam']):
        date = data['Date'].iloc[i]
        total = len(data[(data['HomeTeam'] == team) & (data['Date'] < date)])
        n_under_home = data[(data['HomeTeam'] == team) & (data['Date'] < date)]['under_2.5_goals'].value_counts()
        try:
            lst1.append(1/(n_under_home[1]/total))
            lst2.append(n_under_home[1]/total)
        except:
            lst1.append(np.nan)
            lst2.append(np.nan)

    data['odds_home_under'] = lst1
    data['prob_home_under'] = lst2
    
    #binning the probability of the home team to have a game of less than 2.5 score
    data['binned prob_home_under'] = pd.cut(data['prob_home_under']*100, bins)
    
    
    #----------------------- Odds and probability of the away team scoring under 2.5 -------------------------------
    
    lst3 = []
    lst4  = []
    for i, team in enumerate(data['AwayTeam']):
        date = data['Date'].iloc[i]
        total2 = len(data[(data['AwayTeam'] == team) & (data['Date'] < date)])
        n_under_away2 = data[(data['AwayTeam'] == team) & (data['Date'] < date)]['under_2.5_goals'].value_counts()
        try:
            lst3.append(1/(n_under_away2[1] / total2))
            lst4.append(n_under_away2[1] / total2)
        except:
            lst3.append(np.nan)
            lst4.append(np.nan)

    data['odds_away_under'] = lst3
    data['prob_away_under'] = lst4
    
    #binning the probability of the away team to have a game of less than 2.5 score
    data['binned prob_away_under'] = pd.cut(data['prob_away_under']*100, bins)

    #-------------------------- Creating the prob and odds of the game -----------------------------------------------
    '''the mean between the probability of the home team to have a score of under 2.5 and the probability 
    of the away team to do the same'''
    
    data['odds_game'] = (data['odds_away_under'] +  data['odds_home_under']) / 2
    data['prob_game'] = (data['prob_away_under'] + data['prob_home_under']) / 2
    
    #-------------------------- OneHotEncoding the binned probabilities columns ------------------------------------------
    

    if b == 5:
        data = data[~data['binned prob_home_under'].isna()]
        ohe = OneHotEncoder(sparse=False)
        ohe.fit(data[['binned prob_home_under']])
        bins_encoded = ohe.transform(data[['binned prob_home_under']])
        data["0, 20"], data["20, 40"], data["40, 60"], data["60, 80"], data["80, 100"] = bins_encoded.T
        
    if b == 10:
        data = data[~data['binned prob_home_under'].isna()]
        ohe = OneHotEncoder(sparse=False)
        ohe.fit(data[['binned prob_home_under']])
        bins_encoded = ohe.transform(data[['binned prob_home_under']])
        data["0, 10"], data["10, 20"], data["20, 30"], data["30, 40"], data["40, 50"], data["50, 60"], \
        data["60, 70"], data["70, 80"], data["80, 90"], data["90, 100"] = bins_encoded.T
        
    if b == 20:
        data = data[~data['binned prob_home_under'].isna()]
        ohe = OneHotEncoder(sparse=False)
        ohe.fit(data[['binned prob_home_under']])
        bins_encoded = ohe.transform(data[['binned prob_home_under']])
        data["0, 5"], data["5, 10"], data["10, 15"], data["15, 20"], data["20, 25"], data["25, 30"], \
        data["30, 35"], data["35, 40"], data["40, 45"], data["45, 50"], data["50, 55"], data["55, 60"], \
        data["60, 65"], data["65, 70"], data["70, 75"], data["75, 80"], data["80, 85"], data["85, 90"], \
        data["90, 95"], data["95, 100"]= bins_encoded.T
    
    
    #------------------------------------ Cleaning the data ---------------------------------------------------------
    
    #data = data.dropna(subset=['HomeTeam', 'AwayTeam'], how='any')
    data = data[~data['HomeTeam'].isna()]
    data = data[~data['AwayTeam'].isna()]
    data = data[~data['PC>2.5'].isna()]
    data.drop(columns=['Unnamed: 105'], inplace=True) #, 'Unnamed: 105' 'Referee', 
    #data.dropna()
    
    return data

In [99]:
data = feature_engineering(data, b=5)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data["0, 20"], data["20, 40"], data["40, 60"], data["60, 80"], data["80, 100"] = bins_encoded.T
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data["0, 20"], data["20, 40"], data["40, 60"], data["60, 80"], data["80, 100"] = bins_encoded.T
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data["0, 20"]

### Number of Goals, Over and Under

In [100]:
# total number of goals = goals from the home team + goals from visiting team
data['nb_goals']=data['FTHG']+data['FTAG']

# boolean: true or false regarding whether they were more than 2.5 goals
data['over_2.5_goals']=data['nb_goals']>2.5

# boolean: true or false regarding whether they were less than 2.5 goals
data['under_2.5_goals']=data['nb_goals']<2.5

### Payout

  Payout of betting on over/under 2.5 goals: we get 0 if we lose the bet, we get the Avg if we win the bet (Avg = market average of the odds)


#### Payout Opening

In [101]:
# payout under 2.5 for Average OPENING odds
data['payout_avg_under_2.5'] = data['under_2.5_goals']*data['Avg<2.5']

# payout over 2.5 for Average OPENING odds
data['payout_avg_over_2.5'] = data['over_2.5_goals']*data['Avg>2.5']

#payout UNDER 2.5 for PINACLE specifically
data['payout_under_2.5_pinacle'] = data['under_2.5_goals']*data['P<2.5']

#payout OVER 2.5 for PINACLE specifically
data['payout_over_2.5_pinacle'] = data['over_2.5_goals']*data['P>2.5']

#payout UNDER 2.5 for 365 specifically
data['payout_under_2.5_365'] = data['under_2.5_goals']*data['B365<2.5']

#payout OVER 2.5 for 365 specifically
data['payout_over_2.5_365'] = data['over_2.5_goals']*data['B365>2.5']

#### Payout Closing

In [102]:
# payout under 2.5 for Average CLOSING odds
data['payout_avg_under_closing_2.5'] = data['under_2.5_goals']*data['AvgC<2.5']

# payout over 2.5 for Average CLOSING odds
data['payout_avg_over_closing_2.5'] = data['over_2.5_goals']*data['AvgC>2.5']

#payout UNDER 2.5 for PINACLE closing ddds specifically
data['payout_under_2.5_pinacle_closing'] = data['under_2.5_goals']*data['PC<2.5']

#payout OVER 2.5 for PINACLE closing odds specifically
data['payout_over_2.5_pinacle_closing'] = data['over_2.5_goals']*data['PC>2.5']

#payout UNDER 2.5 for 365 closing odds specifically
data['payout_under_2.5_365_closing'] = data['under_2.5_goals']*data['B365C<2.5']

#payout OVER 2.5 for 365 closing odds specifically
data['payout_over_2.5_365_closing'] = data['over_2.5_goals']*data['B365C>2.5']

### Implied Probability

#### Implied Probability Opening

In [103]:
#Implied Probability UNDER 2.5 goals for for overall market opening odds (Avg) 
data['Implied Probability <2.5 avg']=1/data['Avg<2.5']*100

#Implied Probability OVER 2.5 goals for for overall market opening odds (Avg) 
data['Implied Probability >2.5 avg']=1/data['Avg>2.5']*100

#Implied Probability UNDER 2.5 goals for PINACLE
data['Implied Probability <2.5 pinacle']=1/data['P<2.5']*100

#Implied Probability OVER 2.5 goals for PINACLE
data['Implied Probability >2.5 pinacle']=1/data['P>2.5']*100

#Implied Probability UNDER 2.5 goals for 365
data['Implied Probability <2.5 365']=1/data['B365<2.5']*100

#Implied Probability OVER 2.5 goals for 365
data['Implied Probability >2.5 365']=1/data['B365>2.5']*100

#### Implied Probability Closing

In [104]:
#Implied Probability UNDER 2.5 goals for overall market closing odds (AvgC)
data['Implied Probability <2.5 avg closing']=1/data['AvgC<2.5']*100

#Implied Probability OVER 2.5 goals for overall market closing odds (AvgC)
data['Implied Probability >2.5 avg closing']=1/data['AvgC>2.5']*100

#Implied Probability UNDER 2.5 goals for PINACLE closing odds
data['Implied Probability <2.5 pinacle closing']=1/data['PC<2.5']*100

#Implied Probability OVER 2.5 goals for PINACLE closing odds
data['Implied Probability >2.5 pinacle closing']=1/data['PC>2.5']*100

#Implied Probability UNDER 2.5 goals for 365 closing odds
data['Implied Probability <2.5 365 closing']=1/data['B365C<2.5']*100

#Implied Probability OVER 2.5 goals for 365 closing odds
data['Implied Probability >2.5 365 closing']=1/data['B365C>2.5']*100

### Binning the implied probabilities

#### Binning IP Opening

In [105]:
bins = [0, 5, 10, 15, 20, 25, 30, 35, 40, 45, 50, 55, 60, 65, 70, 75, 80, 85, 90, 95, 100]

#Binning UNDER 2.5 Average Market opening odds
data['binned <2.5 avg'] = pd.cut(data['Implied Probability <2.5 avg'], bins)

#Binning Over 2.5 Average Market opening odds
data['binned >2.5 avg'] = pd.cut(data['Implied Probability >2.5 avg'], bins)

#Binned UNDER 2.5 Pinnacle opening odds
data['binned <2.5 pinacle'] = pd.cut(data['Implied Probability <2.5 pinacle'], bins)

#Binned OVER 2.5 Pinnacle
data['binned >2.5 pinacle'] = pd.cut(data['Implied Probability >2.5 pinacle'], bins)

#Binned UNDER 2.5 bet365 OPENING odds
data['binned <2.5 365'] = pd.cut(data['Implied Probability <2.5 365'], bins)

#Binned OVER 2.5 bet365 OPENING odds
data['binned >2.5 365'] = pd.cut(data['Implied Probability >2.5 365'], bins)

#### Binning IP Closing

In [106]:
bins = [0, 5, 10, 15, 20, 25, 30, 35, 40, 45, 50, 55, 60, 65, 70, 75, 80, 85, 90, 95, 100]

#Binning UNDER 2.5 Average Market closing odds
data['binned <2.5 avg closing'] = pd.cut(data['Implied Probability <2.5 avg closing'], bins)

#Binning OVER 2.5 Average Market closing odds
data['binned >2.5 avg closing'] = pd.cut(data['Implied Probability >2.5 avg closing'], bins)

#Binned UNDER 2.5 Pinnacle closing odds
data['binned <2.5 pinacle closing'] = pd.cut(data['Implied Probability <2.5 pinacle closing'], bins)

#Binned OVER 2.5 Pinnacle CLOSING odds
data['binned >2.5 pinacle closing'] = pd.cut(data['Implied Probability >2.5 pinacle closing'], bins)

#Binned UNDER 2.5 bet365 CLOSING odds
data['binned <2.5 365 closing'] = pd.cut(data['Implied Probability <2.5 365 closing'], bins)

#Binned OVER 2.5 bet365 CLOSING odds
data['binned >2.5 365 closing'] = pd.cut(data['Implied Probability >2.5 365 closing'], bins)

### Binning the odds

#### Binning Odds Opening

In [107]:
bins = [1, 1.5, 2, 3, 99999]

#Binning UNDER 2.5 Average Market opening odds
data['binned odds <2.5 avg'] = pd.cut(data['Avg<2.5'], bins)

#Binning Over 2.5 Average Market opening odds
data['binned odds >2.5 avg'] = pd.cut(data['Avg>2.5'], bins)

#Binned UNDER 2.5 Pinnacle opening odds
data['binned odds <2.5 pinacle'] = pd.cut(data['P<2.5'], bins)

#Binned OVER 2.5 Pinnacle
data['binned odds >2.5 pinacle'] = pd.cut(data['P>2.5'], bins)

#Binned UNDER 2.5 bet365 OPENING odds
data['binned odds <2.5 365'] = pd.cut(data['B365<2.5'], bins)

#Binned OVER 2.5 bet365 OPENING odds
data['binned odds >2.5 365'] = pd.cut(data['B365>2.5'], bins)

#### Binning Odds Closing

In [108]:
bins = [1, 1.5, 2, 3, 99999]

#Binning UNDER 2.5 Average Market opening odds
data['binned odds <2.5 avg closing'] = pd.cut(data['AvgC<2.5'], bins)

#Binning Over 2.5 Average Market opening odds
data['binned odds >2.5 avg closing'] = pd.cut(data['AvgC>2.5'], bins)

#Binned UNDER 2.5 Pinnacle opening odds
data['binned odds <2.5 pinacle closing'] = pd.cut(data['PC<2.5'], bins)

#Binned OVER 2.5 Pinnacle
data['binned odds >2.5 pinacle closing'] = pd.cut(data['PC>2.5'], bins)

#Binned UNDER 2.5 bet365 OPENING odds
data['binned odds <2.5 365 closing'] = pd.cut(data['B365C<2.5'], bins)

#Binned OVER 2.5 bet365 OPENING odds
data['binned odds >2.5 365 closing'] = pd.cut(data['B365C>2.5'], bins)

In [109]:
data['binned odds <2.5 pinacle closing'].isnull().sum()

0

### Other Features from D3

In [110]:
data['Pin_pays_better_under_boolean'] = data['PC<2.5'] > data['AvgC<2.5']

In [111]:
data['Pin_pays_better_under_difference'] = data['PC<2.5'] / data['AvgC<2.5']

In [112]:
data['%vig_p'] = (1 - (1 / (1/data['PC>2.5'] + 1/data['PC<2.5'])))*100

In [113]:
data['%vig_p_bool'] = data['%vig_p']>3.3

In [114]:
data['%vig_avg'] = (1 - (1 / (1/data['AvgC>2.5'] + 1/data['AvgC<2.5'])))*100

In [115]:
data['PC<2.5_P_boolean'] = data['PC<2.5'] < data['P<2.5']

In [116]:
data['PC<2.5_P_relative_diff'] = data['PC<2.5'] / data['P<2.5']

In [117]:
data['MaxC>2.5_AvgC_relative_diff'] = data['MaxC>2.5']/data['AvgC>2.5']
data['Market_consensus'] = data['MaxC>2.5_AvgC_relative_diff']<1.05

In [118]:
## Adding the Year Feature
data_date = data['Date']
data_time = data['Time']

data_date_2 = pd.to_datetime(data_date, dayfirst = True)
data_time_2 = pd.to_datetime(data_time, dayfirst = True)

data['month'] = pd.DatetimeIndex(data_date_2).month
data['month_after_July'] = data['month']>=7

data['year'] = pd.DatetimeIndex(data_date_2).year
data['year_2021_2022'] = data['year']>=2021

data['hour'] = pd.DatetimeIndex(data_time_2).hour
data['game_starts_after_4pm']=data['hour']>=16

In [119]:
data['game_starts_after_4pm'].sum()

3306

# [KILL] Final Dataset [KILL]

### Opening Markets

In [None]:
#Average Market UNDER opening odds
fdf_under_avg = data[['Implied Probability <2.5 avg','under_2.5_goals','binned <2.5 avg','payout_avg_under_2.5']]

#Average Market OVER opening odds
fdf_over_avg = data[['Implied Probability >2.5 avg','over_2.5_goals','binned >2.5 avg','payout_avg_over_2.5']]

#pinnacle UNDER opening odds
fdf_under_pinacle = data[['Implied Probability <2.5 pinacle','under_2.5_goals','binned <2.5 pinacle','payout_under_2.5_pinacle']]

#pinnacle OVER opening odds
fdf_over_pinacle = data[['Implied Probability >2.5 pinacle','over_2.5_goals','binned >2.5 pinacle','payout_over_2.5_pinacle']]

#bet365 UNDER opening odds
fdf_under_365 = data[['Implied Probability <2.5 365','under_2.5_goals','binned <2.5 365','payout_under_2.5_365']]

#bet365 OVER opening odds
fdf_over_365 = data[['Implied Probability >2.5 365','over_2.5_goals','binned >2.5 365','payout_over_2.5_365']]

### Closing Markets

In [None]:
#Average Market UNDER closing odds
fdf_under_avg_closing = data[['Implied Probability <2.5 avg closing','under_2.5_goals','binned <2.5 avg closing','payout_avg_under_closing_2.5']]

#Average Market OVER opening odds
fdf_over_avg_closing = data[['Implied Probability >2.5 avg closing','over_2.5_goals','binned >2.5 avg closing','payout_avg_over_closing_2.5']]

#pinnacle UNDER closing odds
fdf_under_pinacle_closing = data[['Implied Probability <2.5 pinacle closing','under_2.5_goals','binned <2.5 pinacle closing','payout_under_2.5_pinacle_closing']]

#pinnacle OVER closing odds
fdf_over_pinacle_closing = data[['Implied Probability >2.5 pinacle closing','over_2.5_goals','binned >2.5 pinacle closing','payout_over_2.5_pinacle_closing']]

#bet365 UNDER closing odds
fdf_under_365_closing = data[['Implied Probability <2.5 365 closing','under_2.5_goals','binned <2.5 365 closing','payout_under_2.5_365_closing']]

#bet365 OVER closing odds
fdf_over_365_closing = data[['Implied Probability >2.5 365 closing','over_2.5_goals','binned >2.5 365 closing','payout_over_2.5_365_closing']]

# [KILL] Results per implied prob Average Market [KILL]

### Under

#### Opening Odds

In [None]:
fdf_under_avg_agg = fdf_under_avg.groupby('binned <2.5 avg')['payout_avg_under_2.5'].agg(['count', 'mean', 'median'])

fdf_under_pinacle_agg = fdf_under_pinacle.groupby('binned <2.5 pinacle')['payout_under_2.5_pinacle'].agg(['count', 'mean', 'median'])

fdf_under_bet365_agg = fdf_under_365.groupby('binned <2.5 365')['payout_under_2.5_365'].agg(['count', 'mean', 'median'])

In [None]:
fdf_under_avg_agg.rename(columns={"count": "count_avg", "mean": "mean_avg", "median": "median_avg"}, inplace=True)
fdf_under_avg_agg.index.names = ['bin<2.5']

fdf_under_pinacle_agg.rename(columns={"count": "count_p", "mean": "mean_p", "median": "median_p"}, inplace=True)
fdf_under_pinacle_agg.index.names = ['bin<2.5']

fdf_under_bet365_agg.rename(columns={"count": "count_b365", "mean": "mean_b365", "median": "median_b365"}, inplace=True)
fdf_under_bet365_agg.index.names = ['bin<2.5']

opening_test_1 = fdf_under_avg_agg.merge(fdf_under_pinacle_agg, on='bin<2.5')
opening_test_2 = opening_test_1.merge(fdf_under_bet365_agg, on='bin<2.5')
opening_test_2

#### Closing Odds

In [None]:
fdf_under_avg_closing_gb_agg = fdf_under_avg_closing.groupby('binned <2.5 avg closing')['payout_avg_under_closing_2.5'].agg(['count', 'mean', 'median', mode])

fdf_under_pinacle_closing_gb_agg = fdf_under_pinacle_closing.groupby('binned <2.5 pinacle closing')['payout_under_2.5_pinacle_closing'].agg(['count', 'mean', 'median', mode])

fdf_under_bet365_closing_gb_agg = fdf_under_365_closing.groupby('binned <2.5 365 closing')['payout_under_2.5_365_closing'].agg(['count', 'mean', 'median', mode])

In [None]:
fdf_under_avg_closing_gb_agg.rename(columns={"count": "count_avg", "mean": "mean_avg", "median": "median_avg", "mode": "mode_avg"}, inplace=True)
fdf_under_avg_closing_gb_agg.index.names = ['bin<2.5']

fdf_under_pinacle_closing_gb_agg.rename(columns={"count": "count_p", "mean": "mean_p", "median": "median_p", "mode": "mode_p"}, inplace=True)
fdf_under_pinacle_closing_gb_agg.index.names = ['bin<2.5']

fdf_under_bet365_closing_gb_agg.rename(columns={"count": "count_b365", "mean": "mean_b365", "median": "median_b365", "mode": "mode_b365"}, inplace=True)
fdf_under_bet365_closing_gb_agg.index.names = ['bin<2.5']

closing_test_1 = fdf_under_avg_closing_gb_agg.merge(fdf_under_pinacle_closing_gb_agg, on='bin<2.5')
closing_test_2 = closing_test_1.merge(fdf_under_bet365_closing_gb_agg, on='bin<2.5')
closing_test_2['bins'] = closing_test_2.index
closing_test_2

### Over

#### Opening Odds

In [None]:
fdf_over_avg_agg = fdf_over_avg.groupby('binned >2.5 avg')['payout_avg_over_2.5'].agg(['count', 'mean', 'median'])

fdf_over_pinacle_agg = fdf_over_pinacle.groupby('binned >2.5 pinacle')['payout_over_2.5_pinacle'].agg(['count', 'mean', 'median'])

fdf_over_bet365_agg = fdf_over_365.groupby('binned >2.5 365')['payout_over_2.5_365'].agg(['count', 'mean', 'median'])

In [None]:
fdf_over_avg_agg.rename(columns={"count": "count_avg", "mean": "mean_avg", "median": "median_avg"}, inplace=True)
fdf_over_avg_agg.index.names = ['bin>2.5']

fdf_over_pinacle_agg.rename(columns={"count": "count_p", "mean": "mean_p", "median": "median_p"}, inplace=True)
fdf_over_pinacle_agg.index.names = ['bin>2.5']

fdf_over_bet365_agg.rename(columns={"count": "count_b365", "mean": "mean_b365", "median": "median_b365"}, inplace=True)
fdf_over_bet365_agg.index.names = ['bin>2.5']

over_opening_test_1 = fdf_over_avg_agg.merge(fdf_over_pinacle_agg, on='bin>2.5')
over_opening_test_2 = over_opening_test_1.merge(fdf_over_bet365_agg, on='bin>2.5')
over_opening_test_2

#### Closing Odds

In [None]:
fdf_over_avg_closing_agg = fdf_over_avg_closing.groupby('binned >2.5 avg closing')['payout_avg_over_closing_2.5'].agg(['count', 'mean', 'median'])

fdf_over_pinacle_closing_agg = fdf_over_pinacle_closing.groupby('binned >2.5 pinacle closing')['payout_over_2.5_pinacle_closing'].agg(['count', 'mean', 'median'])

fdf_over_bet365_closing_agg = fdf_over_365_closing.groupby('binned >2.5 365 closing')['payout_over_2.5_365_closing'].agg(['count', 'mean', 'median'])

In [None]:
fdf_over_avg_closing_agg.rename(columns={"count": "count_avg", "mean": "mean_avg", "median": "median_avg"}, inplace=True)
fdf_over_avg_closing_agg.index.names = ['bin>2.5']

fdf_over_pinacle_closing_agg.rename(columns={"count": "count_p", "mean": "mean_p", "median": "median_p"}, inplace=True)
fdf_over_pinacle_closing_agg.index.names = ['bin>2.5']

fdf_over_bet365_closing_agg.rename(columns={"count": "count_b365", "mean": "mean_b365", "median": "median_b365"}, inplace=True)
fdf_over_bet365_closing_agg.index.names = ['bin>2.5']

over_closing_test_1 = fdf_over_avg_closing_agg.merge(fdf_over_pinacle_closing_agg, on='bin>2.5')
over_closing_test_2 = over_closing_test_1.merge(fdf_over_bet365_closing_agg, on='bin>2.5')
over_closing_test_2

# Starting Model

In [31]:
#Pedro's pet! Don't touch it!
"""odd = 2.15
iproba = 1/odd*100
for x in range(0, 20):
    if iproba in closing_test_2.iloc[6]['bins']"""

"odd = 2.15\niproba = 1/odd*100\nfor x in range(0, 20):\n    if iproba in closing_test_2.iloc[6]['bins']"

In [120]:
data_linear_booleans = data.copy()

In [121]:
for col_name in data_linear_booleans.columns: 
    print(col_name)

Div
Date
Time
HomeTeam
AwayTeam
FTHG
FTAG
FTR
HTHG
HTAG
HTR
HS
AS
HST
AST
HF
AF
HC
AC
HY
AY
HR
AR
B365H
B365D
B365A
BWH
BWD
BWA
IWH
IWD
IWA
PSH
PSD
PSA
WHH
WHD
WHA
VCH
VCD
VCA
MaxH
MaxD
MaxA
AvgH
AvgD
AvgA
B365>2.5
B365<2.5
P>2.5
P<2.5
Max>2.5
Max<2.5
Avg>2.5
Avg<2.5
AHh
B365AHH
B365AHA
PAHH
PAHA
MaxAHH
MaxAHA
AvgAHH
AvgAHA
B365CH
B365CD
B365CA
BWCH
BWCD
BWCA
IWCH
IWCD
IWCA
PSCH
PSCD
PSCA
WHCH
WHCD
WHCA
VCCH
VCCD
VCCA
MaxCH
MaxCD
MaxCA
AvgCH
AvgCD
AvgCA
B365C>2.5
B365C<2.5
PC>2.5
PC<2.5
MaxC>2.5
MaxC<2.5
AvgC>2.5
AvgC<2.5
AHCh
B365CAHH
B365CAHA
PCAHH
PCAHA
MaxCAHH
MaxCAHA
AvgCAHH
AvgCAHA
country
nb_goals
over_2.5_goals
under_2.5_goals
payout_avg_under_2.5
payout_avg_over_2.5
payout_under_2.5_pinacle
payout_over_2.5_pinacle
payout_under_2.5_365
payout_over_2.5_365
payout_avg_under_closing_2.5
payout_avg_over_closing_2.5
payout_under_2.5_pinacle_closing
payout_over_2.5_pinacle_closing
payout_under_2.5_365_closing
payout_over_2.5_365_closing
Implied Probability <2.5 avg
Implied Probabilit

In [122]:
data_linear_booleans_lean_P_under = data_linear_booleans[['country','month_after_July','year_2021_2022','game_starts_after_4pm','binned odds <2.5 pinacle closing','Pin_pays_better_under_boolean','Market_consensus','%vig_p_bool','PC<2.5_P_boolean', 'payout_under_2.5_pinacle_closing','0, 20','20, 40','40, 60','60, 80','80, 100']]

In [35]:
# data_model_over_under = data_model_over_under[["Div",
# "Date", "Time", "HomeTeam", "AwayTeam", "FTHG", "FTAG", "nb_goals", "Avg>2.5", "Avg<2.5", "B365>2.5", "B365<2.5", "P>2.5", 
# "P<2.5", "AvgC>2.5", "AvgC<2.5", "B365C>2.5", "B365C<2.5", "PC>2.5", "PC<2.5", "over_2.5_goals", "under_2.5_goals", 
# "payout_avg_under_2.5", "payout_avg_over_2.5", "payout_under_2.5_pinacle", "payout_over_2.5_pinacle", "payout_under_2.5_365", 
# "payout_over_2.5_365", "payout_avg_under_closing_2.5", "payout_avg_over_closing_2.5", "payout_under_2.5_pinacle_closing", 
# "payout_over_2.5_pinacle_closing", "payout_under_2.5_365_closing", "payout_over_2.5_365_closing", "Implied Probability <2.5 avg", 
# "Implied Probability >2.5 avg", "Implied Probability <2.5 pinacle", "Implied Probability >2.5 pinacle", "Implied Probability <2.5 365", 
# "Implied Probability >2.5 365", "Implied Probability <2.5 avg closing", "Implied Probability >2.5 avg closing", "Implied Probability <2.5 pinacle closing", 
# "Implied Probability >2.5 pinacle closing", "Implied Probability <2.5 365 closing", "Implied Probability >2.5 365 closing", 
# "binned <2.5 avg", "binned >2.5 avg", "binned <2.5 pinacle", "binned >2.5 pinacle", "binned <2.5 365", "binned >2.5 365", 
# "binned <2.5 avg closing", "binned >2.5 avg closing", "binned <2.5 pinacle closing", "binned >2.5 pinacle closing", 
# "binned <2.5 365 closing", "binned >2.5 365 closing"]]

In [36]:
# data_model_over_under.dropna(inplace=True)

In [37]:
# data_model_over_under

In [38]:
# OneHotEncoders for Bin Odds

In [123]:
ohe = OneHotEncoder(sparse=False) 
ohe.fit(data_linear_booleans_lean_P_under[['binned odds <2.5 pinacle closing']])
bins_encoded = ohe.transform(data_linear_booleans_lean_P_under[['binned odds <2.5 pinacle closing']])
data_linear_booleans_lean_P_under["1.0_to_1.5"], data_linear_booleans_lean_P_under["1.5_to_2.0"], data_linear_booleans_lean_P_under["2.0_to_3.0"], data_linear_booleans_lean_P_under["3.0_to_99999.0"] = bins_encoded.T
#data_linear_booleans_lean_P_under.drop(columns='binned <2.5 pinacle', inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_linear_booleans_lean_P_under["1.0_to_1.5"], data_linear_booleans_lean_P_under["1.5_to_2.0"], data_linear_booleans_lean_P_under["2.0_to_3.0"], data_linear_booleans_lean_P_under["3.0_to_99999.0"] = bins_encoded.T
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_linear_booleans_lean_P_under["1.0_to_1.5"], data_linear_booleans_lean_P_under["1.5_to_2.0"], data_linear_booleans_lean_P_under["2.0_to_3.0"], data_linear_booleans_lean_P_under["3.0_to_99999.0"] = bins_encoded.T
A value is trying to be set on a copy 

In [125]:
ohe = OneHotEncoder(sparse=False) 
ohe.fit(data_linear_booleans_lean_P_under[['country']])
bins_encoded = ohe.transform(data_linear_booleans_lean_P_under[['country']])
#data_linear_booleans_lean_P_under["country_1"], data_linear_booleans_lean_P_under["country_2"], data_linear_booleans_lean_P_under["country_3"], data_linear_booleans_lean_P_under["country_4"], data_linear_booleans_lean_P_under["country_5"],data_linear_booleans_lean_P_under["country_6"], data_linear_booleans_lean_P_under["country_7"], data_linear_booleans_lean_P_under["country_8"], data_linear_booleans_lean_P_under["country_9"], data_linear_booleans_lean_P_under["country_10"], data_linear_booleans_lean_P_under["country_11"] = bins_encoded.T
data_linear_booleans_lean_P_under["country_1"], data_linear_booleans_lean_P_under["country_2"], data_linear_booleans_lean_P_under["country_3"], data_linear_booleans_lean_P_under["country_4"], data_linear_booleans_lean_P_under["country_5"], data_linear_booleans_lean_P_under["country_6"]= bins_encoded.T
#data_linear_booleans_lean_P_under.drop(columns='binned <2.5 pinacle', inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_linear_booleans_lean_P_under["country_1"], data_linear_booleans_lean_P_under["country_2"], data_linear_booleans_lean_P_under["country_3"], data_linear_booleans_lean_P_under["country_4"], data_linear_booleans_lean_P_under["country_5"], data_linear_booleans_lean_P_under["country_6"]= bins_encoded.T
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_linear_booleans_lean_P_under["country_1"], data_linear_booleans_lean_P_under["country_2"], data_linear_booleans_lean_P_under["country_3"], data_linear_booleans_le

In [41]:
#a =data_linear_booleans_lean_P_under[['country','country_1','country_2','country_3','country_4','country_5','country_6','country_7','country_8','country_9','country_10','country_11']]

In [None]:
#a[a['country_11']==1]

In [None]:
#data_linear_booleans_lean_P_under = data_linear_booleans_lean_P_under[data_linear_booleans_lean_P_under['extra']==0]

In [126]:
#extra
data_linear_booleans_lean_P_under = data_linear_booleans_lean_P_under.drop(['country'],axis=1)

In [127]:
#extra
data_linear_booleans_lean_P_under = data_linear_booleans_lean_P_under.drop(['binned odds <2.5 pinacle closing'],axis=1)

In [128]:
data_linear_booleans_lean_P_under.sum()

month_after_July                    2809.00
year_2021_2022                      3030.00
game_starts_after_4pm               3306.00
Pin_pays_better_under_boolean       4762.00
Market_consensus                    3270.00
%vig_p_bool                         2262.00
PC<2.5_P_boolean                    2527.00
payout_under_2.5_pinacle_closing    5153.33
0, 20                                190.00
20, 40                              1772.00
40, 60                              2634.00
60, 80                               705.00
80, 100                              145.00
1.0_to_1.5                           178.00
1.5_to_2.0                          2141.00
2.0_to_3.0                          2813.00
3.0_to_99999.0                       314.00
country_1                            805.00
country_2                            838.00
country_3                            985.00
country_4                            691.00
country_5                           1079.00
country_6                       

In [46]:
for col_name in data_linear_booleans_lean_P_under.columns: 
    print(col_name)

month_after_July
year_2021_2022
game_starts_after_4pm
Pin_pays_better_under_boolean
Market_consensus
%vig_p_bool
PC<2.5_P_boolean
payout_under_2.5_pinacle_closing
0, 20
20, 40
40, 60
60, 80
80, 100
1.0_to_1.5
1.5_to_2.0
2.0_to_3.0
3.0_to_99999.0
country_1
country_2
country_3
country_4
country_5


In [None]:
# test_model = data_model_over_under.drop(columns=['FTHG', 'FTAG', 'nb_goals', 'over_2.5_goals', 'under_2.5_goals', 
#                                                 'Div', 'Date', 'Time', 'HomeTeam', 'AwayTeam', 'binned <2.5 avg', 
#                                                 'binned >2.5 avg', 'binned >2.5 pinacle', 'binned <2.5 365', 
#                                                  'binned >2.5 365', 'binned <2.5 avg closing', 'binned >2.5 avg closing', 
#                                                 'binned <2.5 pinacle closing', 'binned >2.5 pinacle closing', 'binned <2.5 365 closing', 
#                                                 'binned >2.5 365 closing', 
#                                                 'B365>2.5', 'B365<2.5', 'B365C>2.5', 'B365C<2.5', 'payout_under_2.5_365', 'payout_over_2.5_365', 
#                                                 'payout_under_2.5_365_closing', 'payout_over_2.5_365_closing', 
#                                                 'Implied Probability <2.5 365', 'Implied Probability >2.5 365', 
#                                                 'Implied Probability <2.5 365 closing', 'Implied Probability >2.5 365 closing', 
#                                                 'payout_avg_under_2.5', 'payout_avg_over_2.5', 'payout_under_2.5_pinacle', 
#                                                 'payout_avg_over_closing_2.5', 'payout_over_2.5_pinacle_closing', 
#                                                 'Implied Probability <2.5 avg', 'Implied Probability >2.5 avg', 
#                                                 'Implied Probability <2.5 pinacle', 'Implied Probability >2.5 pinacle', 
#                                                 'Avg>2.5', 'P>2.5', 'AvgC>2.5', 'PC>2.5', 'payout_over_2.5_pinacle', 
#                                                 'Implied Probability >2.5 avg closing', 'Implied Probability >2.5 pinacle closing', 
#                                                 'payout_avg_under_closing_2.5', 'Avg<2.5', 'P<2.5', 'AvgC<2.5', 'PC<2.5', 
#                                                 'Implied Probability <2.5 avg closing', 'Implied Probability <2.5 pinacle closing'])

# test_model

In [129]:
data_linear_booleans_lean_P_under

Unnamed: 0,month_after_July,year_2021_2022,game_starts_after_4pm,Pin_pays_better_under_boolean,Market_consensus,%vig_p_bool,PC<2.5_P_boolean,payout_under_2.5_pinacle_closing,"0, 20","20, 40",...,1.0_to_1.5,1.5_to_2.0,2.0_to_3.0,3.0_to_99999.0,country_1,country_2,country_3,country_4,country_5,country_6
0,True,False,True,True,False,False,False,2.23,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
1,True,False,True,True,False,False,False,0.00,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
2,True,False,True,True,True,False,False,1.79,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
3,True,False,True,True,False,False,True,1.86,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
4,True,False,True,True,True,False,False,0.00,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
301,False,True,True,True,True,True,False,0.00,0.0,1.0,...,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0
302,False,True,True,True,True,True,False,0.00,0.0,1.0,...,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
303,False,True,True,True,True,True,False,2.67,0.0,1.0,...,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
304,False,True,True,True,True,True,False,0.00,0.0,0.0,...,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0


In [130]:
data_linear_booleans_lean_P_under.columns

Index(['month_after_July', 'year_2021_2022', 'game_starts_after_4pm',
       'Pin_pays_better_under_boolean', 'Market_consensus', '%vig_p_bool',
       'PC<2.5_P_boolean', 'payout_under_2.5_pinacle_closing', '0, 20',
       '20, 40', '40, 60', '60, 80', '80, 100', '1.0_to_1.5', '1.5_to_2.0',
       '2.0_to_3.0', '3.0_to_99999.0', 'country_1', 'country_2', 'country_3',
       'country_4', 'country_5', 'country_6'],
      dtype='object')

In [131]:
X = data_linear_booleans_lean_P_under[['Pin_pays_better_under_boolean','Market_consensus','%vig_p_bool','PC<2.5_P_boolean', '1.0_to_1.5', '1.5_to_2.0', '2.0_to_3.0','3.0_to_99999.0']]
X_step2 = data_linear_booleans_lean_P_under[['%vig_p_bool','PC<2.5_P_boolean', '1.0_to_1.5', '1.5_to_2.0', '2.0_to_3.0','3.0_to_99999.0']]
X_step3 = data_linear_booleans_lean_P_under[['country_1','country_2','country_3','country_4','country_5','country_6','country_7','country_8','country_9','country_10','country_11','month_after_July', 'year_2021_2022', 'game_starts_after_4pm','Pin_pays_better_under_boolean','Market_consensus','%vig_p_bool','PC<2.5_P_boolean','1.0_to_1.5', '1.5_to_2.0', '2.0_to_3.0','3.0_to_99999.0','0, 20','20, 40','40, 60','60, 80','80, 100']]
#X_step4 = data_linear_booleans_lean_P_under[['country_1','country_2','country_3','country_4','country_5','country_6','country_7','country_8','country_9','country_10','country_11']]
y = data_linear_booleans_lean_P_under['payout_under_2.5_pinacle_closing']

X_step_joao = data_linear_booleans_lean_P_under[['month_after_July', 'year_2021_2022', 'game_starts_after_4pm','Pin_pays_better_under_boolean','Market_consensus','%vig_p_bool','PC<2.5_P_boolean','1.0_to_1.5', '1.5_to_2.0', '2.0_to_3.0','3.0_to_99999.0','0, 20','20, 40','40, 60','60, 80','80, 100']]
X_step_f= data_linear_booleans_lean_P_under[['0, 20','20, 40','40, 60','60, 80','80, 100']]


KeyError: "['country_7', 'country_8', 'country_9', 'country_10', 'country_11'] not in index"

In [132]:
X_step_f= data_linear_booleans_lean_P_under[['0, 20','20, 40','40, 60','60, 80','80, 100']]


In [None]:
#X_step3['great_countries']=X_step3['country_3']+X_step3['country_7']+X_step3['country_8']+X_step3['country_9']+X_step3['country_10']

In [None]:
#X_step3 = X_step3[X_step3['great_countries']==1]

In [None]:
#X_step3 = X_step3[['country_3','country_7','country_8','country_9','country_10','month_after_July', 'year_2021_2022', 'game_starts_after_4pm','Pin_pays_better_under_boolean','Market_consensus','%vig_p_bool','1.0_to_1.5', '1.5_to_2.0', '2.0_to_3.0','3.0_to_99999.0','0, 20','20, 40','40, 60','60, 80','80, 100']]

In [133]:
#X_step4 = data_linear_booleans_lean_P_under[['country_1','country_2','country_3','country_4','country_5','country_6','country_7','country_8','country_9','country_10','country_11']]
y = data_linear_booleans_lean_P_under['payout_under_2.5_pinacle_closing']

In [134]:
import statsmodels.api as sm

In [135]:
results = sm.OLS(y,X_step_f.astype(float)).fit()

In [136]:
results.summary()

0,1,2,3
Dep. Variable:,payout_under_2.5_pinacle_closing,R-squared:,0.0
Model:,OLS,Adj. R-squared:,-0.0
Method:,Least Squares,F-statistic:,0.6636
Date:,"Fri, 03 Jun 2022",Prob (F-statistic):,0.617
Time:,00:43:34,Log-Likelihood:,-8069.9
No. Observations:,5446,AIC:,16150.0
Df Residuals:,5441,BIC:,16180.0
Df Model:,4,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
"0, 20",0.9794,0.077,12.671,0.000,0.828,1.131
"20, 40",0.9182,0.025,36.280,0.000,0.869,0.968
"40, 60",0.9635,0.021,46.412,0.000,0.923,1.004
"60, 80",0.9317,0.040,23.221,0.000,0.853,1.010
"80, 100",1.0040,0.088,11.348,0.000,0.831,1.177

0,1,2,3
Omnibus:,54905.306,Durbin-Watson:,2.001
Prob(Omnibus):,0.0,Jarque-Bera (JB):,562.967
Skew:,0.424,Prob(JB):,5.67e-123
Kurtosis:,1.672,Cond. No.,4.26


In [None]:
# Random Forest

In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
my_model = RandomForestRegressor(random_state=0).fit(X_step_joao, y)

In [None]:
y_pred = my_model.predict(X_step_joao)

In [None]:
import sklearn
import shap

In [None]:
r2train = sklearn.metrics.r2_score(y, y_pred)

In [None]:
r2train

In [None]:
# SHAP values

In [None]:
#import shap  # package used to calculate Shap values

# Create object that can calculate shap values
explainer = shap.TreeExplainer(my_model)

# calculate shap values. This is what we will plot.
# Calculate shap_values for all of val_X rather than a single row, to have more data for plot.
shap_values = explainer.shap_values(X_step_joao)

# Make plot. Index of [1] is explained in text below.
shap.summary_plot(shap_values, X_step_joao)

#- Vertical location shows what feature it is depicting
#- Color shows whether that feature was high or low for that row of the dataset
#- Horizontal location shows whether the effect of that value caused a higher or lower prediction.

In [None]:
## Each dot represents a row of the data. The horizontal location is the actual value from the dataset, and the vertical location shows what having that value did to the prediction.
#shap.dependence_plot('%vig_p_bool', shap_values, X_step3, dot_size=10)

In [None]:
shap.summary_plot(shap_values, X_step_joao, plot_type="bar")

In [None]:
pip install XGBoost

In [None]:
import xgboost as xgb

In [None]:
m = xgb.XGBRegressor()

In [None]:
X_step3.rename(columns = {'PC<2.5_P_boolean':'PC_under_2.5_P_boolean'}, inplace = True)

In [None]:
X_step3.columns

In [None]:
m.fit(X_step3,y) 

In [None]:
y_pred = m.predict(X_step3)

In [None]:
r2train = sklearn.metrics.r2_score(y, y_pred)

In [None]:
r2train

In [None]:
import XGBoost  # package used to calculate Shap values

# Create object that can calculate shap values
explainer = shap.TreeExplainer(my_model)

# calculate shap values. This is what we will plot.
# Calculate shap_values for all of val_X rather than a single row, to have more data for plot.
shap_values = explainer.shap_values(X_step3)

# Make plot. Index of [1] is explained in text below.
shap.summary_plot(shap_values, X_step3)

#- Vertical location shows what feature it is depicting
#- Color shows whether that feature was high or low for that row of the dataset
#- Horizontal location shows whether the effect of that value caused a higher or lower prediction.

In [None]:
data_linear_booleans_lean_P_under = data_linear_booleans_lean_P_under[data_linear_booleans_lean_P_under['payout_under_2.5_pinacle'].isnull()==False]

In [None]:
y

In [None]:
model = LinearRegression()
model_fit = model.fit(X, y)

In [None]:
result = sm.ols(formula='payout_under_2 ~ Pin_pays_better_under_boolean + PC<2.5_P_boolean + 1.0_to_1.5 + 1.5_to_2.0 + 2.0_to_3.0 + 3.0_to_99999.0', data=data_linear_booleans_lean_P_under).fit()
 
    #smf.ols(formula='weight ~ horsepower + cylinders', data=mpg).fit()
    
# printing the summary table
print(result.summary())

In [None]:
model_fit.summary()

In [None]:
#Features that decrease our chances
for x in [2, 1, 0, 13, 4, 12, 11]:
    print(f'{X.columns[x]}: {importance[x]}')

In [None]:
#Features that increase our chances
for x in [6, 8, 7, 5, 9, 10, 3]:
    print(f'{X.columns[x]}: {importance[x]}')