In [1]:
import pandas as pd
import os
from os import listdir
from os.path import isfile, join
import matplotlib.pyplot as plt
from scipy.stats import mode
from sklearn.preprocessing import OneHotEncoder
from sklearn.linear_model import LogisticRegression
import numpy as np
import xgboost as xgb
import sklearn
from sklearn.model_selection import train_test_split
import statsmodels.api as sm
import seaborn as sns
from sklearn.ensemble import RandomForestRegressor
import sklearn
import shap
from xgboost import XGBClassifier
from xgboost import XGBRegressor
from sklearn.model_selection import cross_val_score, KFold
from sklearn.preprocessing import MinMaxScaler

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Silence every warning for the rest of the session to keep the cell outputs short.
# NOTE(review): this also hides deprecation / future warnings raised by the libraries used below.
import warnings
warnings.filterwarnings('ignore')

In [3]:
def get_data(league1, wall=False):
    '''
    Load the raw match CSV files stored under ./../raw_data4/ into one DataFrame.

    league1 : name of the league sub-folder to read. Ignored when wall is True.
    wall    : when True, read the files of every league sub-folder instead of
              only league1.

    Returns the row-wise concatenation of every CSV that was read, or an empty
    DataFrame when no file was found. Each file keeps its own row labels, so
    index values repeat across files.

    Raises whatever listdir raises (e.g. FileNotFoundError) when the requested
    folder does not exist.
    '''
    root = './../raw_data4/'

    if wall:
        # Only folders are leagues; stray files next to them are skipped.
        leagues = [name for name in sorted(listdir(root))
                   if os.path.isdir(join(root, name))]
    else:
        leagues = [league1]

    # Collect the frames first and concatenate once: concatenating inside the
    # loop re-copies all previously loaded rows for every new file.
    frames = []
    for league in leagues:
        folder = join(root, league)
        # sorted() makes the row order independent of the file system's listing order
        for file in sorted(listdir(folder)):
            path = join(folder, file)
            if isfile(path):  # ignore nested directories
                frames.append(pd.read_csv(path))

    if not frames:
        return pd.DataFrame()
    return pd.concat(frames)

In [4]:
# wall=True makes get_data read every league folder, so the league name is not used here
data = get_data(league1='italy', wall=True)

In [6]:
# Over/under 2.5 goal markets handled by feature_engineering, as
# (odds column prefix, label used in derived column names, payout-under column, payout-over column).
# Opening odds come first, closing odds second. The payout names follow no single
# pattern, so they are listed in full.
_OU_MARKETS = (
    ('Avg', 'avg', 'payout_avg_under_2.5', 'payout_avg_over_2.5'),
    ('P', 'pinacle', 'payout_under_2.5_pinacle', 'payout_over_2.5_pinacle'),
    ('B365', '365', 'payout_under_2.5_365', 'payout_over_2.5_365'),
    ('AvgC', 'avg closing', 'payout_avg_under_closing_2.5', 'payout_avg_over_closing_2.5'),
    ('PC', 'pinacle closing', 'payout_under_2.5_pinacle_closing', 'payout_over_2.5_pinacle_closing'),
    ('B365C', '365 closing', 'payout_under_2.5_365_closing', 'payout_over_2.5_365_closing'),
)

# Bin edges for the bookmaker decimal odds and for the odds derived from team history.
_MARKET_ODDS_BINS = [1, 1.5, 2, 3, 99999]
_TEAM_ODDS_BINS = [1, 1.25, 1.42, 1.5, 1.6, 1.8, 2, 2.2, 2.5, 2.8, 3.5, 4, 100]


def _prior_rate(data, team_col, flag_col):
    '''
    For every row, return the share of the team's strictly earlier matches in
    which flag_col was True.

    "Team" is the value of team_col on that row and only matches where the team
    appears in that same column are counted; "earlier" compares the Date column.
    The result is a float array aligned by position with data. It is NaN when
    the team has no earlier match or when none of its earlier matches had the
    flag set.
    '''
    # Work on a positional copy: the frame's own row labels may repeat.
    frame = pd.DataFrame({
        'team': data[team_col].to_numpy(),
        'date': data['Date'].to_numpy(),
        'hit': data[flag_col].to_numpy(dtype=float),
    })
    rate = np.full(len(frame), np.nan)

    # One row per (team, date): flagged matches and total matches on that day.
    # Rows with a missing team or date are left out by groupby.
    per_day = (frame.groupby(['team', 'date'], sort=True)['hit']
                    .agg(['sum', 'size'])
                    .astype(float))
    if per_day.empty:
        return rate

    # Running totals per team minus the day itself = totals over strictly earlier dates.
    before = (per_day.groupby(level='team').cumsum() - per_day).reset_index()

    # A left merge keeps the original row order; (team, date) is unique on the right.
    matched = frame[['team', 'date']].merge(before, on=['team', 'date'], how='left')
    hits = matched['sum'].to_numpy(dtype=float)
    games = matched['size'].to_numpy(dtype=float)

    usable = (games > 0) & (hits > 0)
    rate[usable] = hits[usable] / games[usable]
    return rate


def _one_hot_bins(data, binned_col):
    '''
    Drop the rows whose binned_col is missing and append one 0.0/1.0 indicator
    column per bin of binned_col, named "<binned_col> <left>, <right>".
    Every bin gets a column, including bins that contain no row.
    '''
    kept = data[data[binned_col].notna()].copy()
    codes = kept[binned_col].cat.codes.to_numpy()
    for position, interval in enumerate(kept[binned_col].cat.categories):
        kept[f'{binned_col} {interval.left}, {interval.right}'] = (codes == position).astype(float)
    return kept


def feature_engineering(data, b=20, binned=False):
    '''
    Create all the columns needed for the analysis and return the resulting frame.

    data   : raw matches. Must contain Date, HomeTeam, AwayTeam, FTHG, FTAG and the
             over/under 2.5 odds columns (Avg, P, B365, AvgC, PC, B365C, each as
             "<prefix><2.5" and "<prefix>>2.5").
    b      : number of bins used for probabilities expressed in percent; the bin
             width is int(100 / b). The start value for b is 20.
    binned : when True, also bin the team-history probabilities / odds and
             one-hot encode 'binned prob_home_under', 'binned prob_home_over',
             'binned odds_away_under' and 'binned odds_away_over'. Rows where one
             of these four bins is missing are dropped. The indicator columns are
             named "<binned column> <left>, <right>" so that the under and over
             encodings do not overwrite each other.

    Returns a new DataFrame; the frame passed in is left untouched. Rows without
    HomeTeam, AwayTeam or PC>2.5 are removed from the result.

    Raises ValueError when b is not in the range (0, 100].
    '''
    if not 0 < b <= 100:
        raise ValueError(f'b must be between 1 and 100, got {b!r}')

    data = data.copy()

    #------------------------ Number of Goals, Over and Under -----------------------------------

    # total number of goals = goals from the home team + goals from the visiting team
    data['nb_goals'] = data['FTHG'] + data['FTAG']
    data['over_2.5_goals'] = data['nb_goals'] > 2.5
    data['under_2.5_goals'] = data['nb_goals'] < 2.5

    #----------------------------- Payouts (opening, then closing) ------------------------------

    # payout = decimal odds when the bet won, 0 otherwise
    for prefix, _, payout_under, payout_over in _OU_MARKETS:
        data[payout_under] = data['under_2.5_goals'] * data[f'{prefix}<2.5']
        data[payout_over] = data['over_2.5_goals'] * data[f'{prefix}>2.5']

    #-------------------------- Implied probabilities, in percent -------------------------------

    for prefix, label, _, _ in _OU_MARKETS:
        for side in ('<', '>'):
            data[f'Implied Probability {side}2.5 {label}'] = 1 / data[f'{prefix}{side}2.5'] * 100

    #---------------------------- Binning the implied probabilities -----------------------------

    bins = np.arange(0, 101, int(100 / b)).tolist()
    for _, label, _, _ in _OU_MARKETS:
        for side in ('<', '>'):
            data[f'binned {side}2.5 {label}'] = pd.cut(data[f'Implied Probability {side}2.5 {label}'], bins)

    #---------------------------- Binning the bookmaker odds ------------------------------------

    for prefix, label, _, _ in _OU_MARKETS:
        for side in ('<', '>'):
            data[f'binned odds {side}2.5 {label}'] = pd.cut(data[f'{prefix}{side}2.5'], _MARKET_ODDS_BINS)

    #----------------------------- Other Features from D3 ---------------------------------------

    data['Pin_pays_better_under_boolean'] = data['PC<2.5'] > data['AvgC<2.5']
    data['Pin_pays_better_under_difference'] = data['PC<2.5'] / data['AvgC<2.5']
    data['%vig_p'] = (1 - (1 / (1/data['PC>2.5'] + 1/data['PC<2.5'])))*100
    data['%vig_avg'] = (1 - (1 / (1/data['AvgC>2.5'] + 1/data['AvgC<2.5'])))*100
    data['PC<2.5_P_boolean'] = data['PC<2.5'] < data['P<2.5']
    data['PC<2.5_P_relative_diff'] = data['PC<2.5'] / data['P<2.5']

    #----------------------- Team history: odds and probability of under / over 2.5 -------------

    # NOTE(review): if the raw dates are day-first strings (dd/mm/yyyy), pass
    # dayfirst=True here -- confirm against the raw files.
    data['Date'] = pd.to_datetime(data['Date'])

    for outcome in ('under', 'over'):
        for role, team_col in (('home', 'HomeTeam'), ('away', 'AwayTeam')):
            prob = _prior_rate(data, team_col, f'{outcome}_2.5_goals')
            data[f'odds_{role}_{outcome}'] = 1 / prob
            data[f'prob_{role}_{outcome}'] = prob

    #-------------------- Binning the team-history odds and probabilities -----------------------

    if binned:
        for outcome in ('under', 'over'):
            data[f'binned prob_home_{outcome}'] = pd.cut(data[f'prob_home_{outcome}'] * 100, bins)
            data[f'binned prob_away_{outcome}'] = pd.cut(data[f'prob_away_{outcome}'] * 100, bins)
            data[f'binned odds_away_{outcome}'] = pd.cut(data[f'odds_away_{outcome}'], _TEAM_ODDS_BINS)
            # the home bins are built from the home odds (not from the away odds)
            data[f'binned odds_home_{outcome}'] = pd.cut(data[f'odds_home_{outcome}'], _TEAM_ODDS_BINS)

    #-------------------------- Prob and odds of the game ---------------------------------------

    # mean of the home-team value and the away-team value
    for outcome in ('under', 'over'):
        data[f'odds_game_{outcome}'] = (data[f'odds_away_{outcome}'] + data[f'odds_home_{outcome}']) / 2
        data[f'prob_game_{outcome}'] = (data[f'prob_away_{outcome}'] + data[f'prob_home_{outcome}']) / 2

    #-------------------------- One-hot encoding of the binned columns --------------------------

    if binned:
        for binned_col in ('binned prob_home_under', 'binned prob_home_over',
                           'binned odds_away_under', 'binned odds_away_over'):
            data = _one_hot_bins(data, binned_col)

    #------------------------------------ Cleaning the data -------------------------------------

    return data.dropna(subset=['HomeTeam', 'AwayTeam', 'PC>2.5'])

In [7]:
# Run the feature pipeline with its default settings spelled out
data = feature_engineering(data, b=20, binned=False)

In [8]:
# (rows, columns) of the engineered frame
data.shape

(20905, 177)

In [9]:
# Start the target from the closing Pinnacle "under 2.5" payout; a later cell turns it into a boolean.
data['target'] = data['payout_under_2.5_pinacle_closing']
# Show the payout column (0.0 wherever the under-2.5 bet lost).
data['payout_under_2.5_pinacle_closing']

0      1.46
1      1.50
2      1.66
3      0.00
4      0.00
       ... 
301    2.25
302    0.00
303    2.19
304    0.00
305    0.00
Name: payout_under_2.5_pinacle_closing, Length: 20905, dtype: float64

In [11]:
# Display the engineered frame, including the new target column
data

Unnamed: 0,Div,Date,Time,HomeTeam,AwayTeam,FTHG,FTAG,FTR,HTHG,HTAG,...,prob_away_under,odds_home_over,prob_home_over,odds_away_over,prob_away_over,odds_game_under,prob_game_under,odds_game_over,prob_game_over,target
0,F1,2020-08-21,18:00,Bordeaux,Nantes,0,0,D,0.0,0.0,...,0.533333,3.000000,0.333333,2.142857,0.466667,1.687500,0.600000,2.571429,0.400000,True
1,F1,2020-08-22,16:00,Dijon,Angers,0,1,A,0.0,1.0,...,0.466667,1.875000,0.533333,1.875000,0.533333,2.142857,0.466667,1.875000,0.533333,True
2,F1,2020-08-22,20:00,Lille,Rennes,1,1,D,1.0,0.0,...,0.800000,1.888889,0.529412,5.000000,0.200000,1.687500,0.635294,3.444444,0.364706,True
3,F1,2020-08-23,12:00,Monaco,Reims,2,2,D,1.0,2.0,...,0.785714,1.363636,0.733333,4.666667,0.214286,2.511364,0.526190,3.015152,0.473810,False
4,F1,2020-08-23,14:00,Lorient,Strasbourg,3,1,H,0.0,1.0,...,0.600000,1.875000,0.533333,2.500000,0.400000,1.904762,0.533333,2.187500,0.466667,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
301,B1,2021-04-18,17:00,Oostende,Cercle Brugge,1,1,D,0.0,0.0,...,0.451613,1.450000,0.689655,1.823529,0.548387,2.718254,0.380979,1.636765,0.619021,True
302,B1,2021-04-18,17:00,Oud-Heverlee Leuven,Waasland-Beveren,1,2,A,0.0,1.0,...,0.566667,1.875000,0.533333,2.307692,0.433333,1.953782,0.516667,2.091346,0.483333,False
303,B1,2021-04-18,17:00,St Truiden,Anderlecht,0,1,A,0.0,0.0,...,0.444444,2.066667,0.483871,1.800000,0.555556,2.093750,0.480287,1.933333,0.519713,True
304,B1,2021-04-18,17:00,Standard,Beerschot VA,3,0,H,1.0,0.0,...,0.294118,1.933333,0.517241,1.416667,0.705882,2.735714,0.388438,1.675000,0.611562,False


In [12]:
# A match is a positive example when the closing Pinnacle "under 2.5" payout is above 1.
# The label is derived from the payout column rather than from `target` itself so that
# re-running this cell cannot turn every label into False (True > 1 is False).
data['target'] = data['payout_under_2.5_pinacle_closing'] > 1

In [13]:
# Features used by the model that predicts the under
feature_columns = [
    'odds_game_over',
    'odds_game_under',
    'PC<2.5',
    'AvgC<2.5',
    'PC>2.5',
    'AvgC>2.5',
    'Pin_pays_better_under_difference',
    '%vig_p',
    '%vig_avg',
    'PC<2.5_P_relative_diff',
]
X = data[feature_columns]
y = data['target']

In [14]:
# Scaling our features with a min-max scaler
# NOTE(review): the scaler is fitted on all rows before the train/test split below,
# so the test rows take part in the scaling.
scaler = MinMaxScaler()
X = scaler.fit_transform(X)

## Under Model

In [15]:
# Split into Train/Test
# A fixed random_state keeps the split, and therefore every score below, reproducible across runs.
X_train_under, X_test_under, y_train_under, y_test_under = train_test_split(
    X, y, test_size=0.2, random_state=0
)

In [22]:
# Hyper-parameters of the "under" classifier
# NOTE(review): learning_rate is very small for 100 trees -- confirm the predictions
# can move away from base_score with these settings.
under_params = dict(
    base_score=0.3,
    booster='gbtree',
    callbacks=None,
    colsample_bylevel=1,
    colsample_bynode=1,
    colsample_bytree=1,
    early_stopping_rounds=None,
    enable_categorical=False,
    eval_metric=None,
    gamma=1,
    gpu_id=-1,
    grow_policy='depthwise',
    importance_type=None,
    interaction_constraints='',
    learning_rate=0.0009012,
    max_bin=512,
    max_cat_to_onehot=4,
    max_delta_step=0,
    max_depth=6,
    max_leaves=0,
    min_child_weight=3,
    monotone_constraints='()',
    n_estimators=100,
    n_jobs=1,
    num_parallel_tree=1,
    predictor='auto',
    random_state=0,
    reg_alpha=1,
    reg_lambda=1,
)

# Fit the model on the training data
model_under_p = XGBClassifier(**under_params)
model_under_p.fit(X_train_under, y_train_under)

In [23]:
# 10-fold cross-validation on the training split
scores = cross_val_score(model_under_p, X_train_under, y_train_under, cv=10)
mean_cv_score = scores.mean()
print("Mean cross-validation score: %.2f" % mean_cv_score)

Mean cross-validation score: 1.00


In [24]:
# Score of the fitted model on the rows it was trained on
score = model_under_p.score(X=X_train_under, y=y_train_under)
print("Training score: ", score)

Training score:  1.0


In [25]:
# Predictions of the fitted model for the held-out test split
y_pred = model_under_p.predict(X_test_under)
y_pred

array([0, 0, 0, ..., 0, 0, 0])

In [None]:
# NOTE(review): leftover scratch arithmetic; the result is not referenced by any visible cell.
226+137

In [26]:
# Element-wise difference between the true labels and the predictions (0 = correct prediction)
residual = y_test_under.sub(y_pred)
residual.value_counts()

0    4181
Name: target, dtype: int64

In [None]:
# True labels (x axis) against predicted labels (y axis)
plt.scatter(x=y_test_under, y=y_pred)

In [None]:
# NOTE(review): this builds an XGBRegressor but never assigns or fits it, so the cell has no
# effect on the visible analysis. It looks like a pasted estimator repr -- confirm whether
# it can be removed.
XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bynode=1, colsample_bytree=1, gamma=1,
       importance_type='gain', learning_rate=0.1, max_delta_step=0,
       max_depth=3, min_child_weight=1, missing=None, n_estimators=100,
       n_jobs=1, nthread=None, objective='reg:linear', random_state=0,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
       silent=None, subsample=1, verbosity=1)