In [None]:
import pandas as pd
from datetime import datetime
from sklearn.cluster import KMeans
import numpy as np

from sklearn.neural_network import MLPClassifier

from sklearn.model_selection import cross_val_score
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import LabelBinarizer

Scripting parameters

In [None]:
# First and last calendar year covered; a season spans two consecutive years.
starting_season, ending_season = 1993, 2017

# Positions of each statistic inside a team's six-element standings row.
POINTS, W, L, D, SCORED, CONCEEDED = range(6)

# Columns taken as-is from the raw CSV files.
basic_statistics = ['HomeTeam', 'AwayTeam', 'FTR']

# Bookmaker prefixes; each one may provide '<prefix>H', '<prefix>D' and '<prefix>A' columns.
bookies = 'B365 BW GB IW LB PS WH VC SJ BS SB SO SY'.split()

Iteratively create a list of all the seasons of interest.

In [None]:
# Labels of every season to load, e.g. "1993-1994", ..., "2016-2017".
# Built from a range instead of incrementing `starting_season` in place, so the
# configuration value is left untouched and the cell gives the same list when re-run.
seasons = ["{0}-{1}".format(year, year + 1)
           for year in range(starting_season, ending_season)]

In [None]:
def getTablePositionForTeam(team,table):
    """Return the 1-based rank of `team` in `table`, or -1 when it is absent.

    `table` is an ordered sequence of entries (here `(team_name, stats)` tuples);
    the rank is the position of the first entry that contains `team`.
    A missing team is treated as newly promoted.
    """
    for position, entry in enumerate(table, start=1):
        if team in entry:
            return position
    return -1 # if the team is newly promoted

A list of all the aggregated statistics computed in the next blocks.
* HPSF/APSF: Home Points So Far (Away)
* HWSF/AWSF: Home Wins So Far (Away)
* HLSF/ALSF: Home Losses So Far (Away)
* HDSF/ADSF: Home Draws So Far (Away)
* HSSF/ASSF: Home Goals Scored So Far (Away)
* HCSF/ACSF: Home Goals Conceded So Far (Away)


In [None]:
# Engineered columns kept in the final dataset, in output order:
# season/matchday context, averaged odds, last year's rank, then the
# "so far" counters for the home (H) and away (A) side of each statistic.
added_statistics = (
    ['Season', 'MatchDay']
    + ['HomeOddsAvg', 'AwayOddsAvg', 'DrawOddsAvg']
    + ['HomeTeamLastYear', 'AwayTeamLastYear']
    + [side + counter
       for counter in ('PSF', 'WSF', 'LSF', 'DSF', 'SSF', 'CSF')
       for side in ('H', 'A')]
)

# NOTE: the "HomeTeamLast<i>" / "AwayTeamLast<i>" columns (result of each team
# i matchdays earlier) are computed in the season loop but deliberately left
# out of this list: some trials showed they do not help the analysis.
# To keep them, add their names here and adjust N in the season loop.

Initializing a structure that will keep track of the total number of games won by every team over the last two decades. It will be used to create a unique index to assign to each team: the lower the index, the more prestigious the team.

In [None]:
# team name -> total number of wins accumulated over all the loaded seasons
total_wins = dict()

Importing 1992-1993 hardcoded table

In [None]:
# Hard-coded final standings of the season preceding the first loaded one.
# Each entry is (team, stats) and the list is ordered from first to last place:
# getTablePositionForTeam derives a team's rank purely from its position here.
# NOTE(review): the six stats presumably follow the POINTS, W, L, D, SCORED,
# CONCEEDED index constants - not verified against an official source.
table = [('Milan', [50, 18, 2, 14, 65, 32]),
 ('Inter', [46, 17, 5, 12, 59, 36]),
 ('Parma', [41, 16, 4, 9, 47, 34]),
 ('Juventus', [39, 15, 10, 9, 59, 47]),
 ('Lazio', [38, 13, 10, 12, 65, 51]),
 ('Cagliari', [37, 14, 11, 9, 45, 33]),
 ('Sampdoria', [36, 12, 10, 12, 50, 48]),
 ('Atalanta', [36, 14, 12, 8, 42, 44]),
 ('Torino', [35, 9, 11, 17, 38, 38]),
 ('Roma', [33, 8, 14, 17, 42, 39]),
 ('Napoli', [32, 10, 12, 12, 49, 50]),
 ('Foggia', [32, 10, 12, 12, 39, 55]),
 ('Genoa', [31, 7, 10, 17, 41, 55]),
 ('Udinese', [30, 10, 14, 10, 42, 48]),
 ('Brescia', [30, 9, 13, 12, 36, 44]),
 ('Fiorentina', [30, 8, 14, 9, 53, 56]),
 ('Ancona', [19, 6, 21, 7, 39, 73]),
 ('Pescara', [17, 6, 23, 5, 47, 75])]

In [None]:
def fromDateToMillis(Date):
    """Convert a day/month/year date string into a POSIX timestamp.

    Despite the name, the returned value is expressed in *seconds*
    (``datetime.timestamp()``), interpreted in the local timezone. The name is
    kept for the existing callers.

    Args:
        Date: date string, either ``dd/mm/yy`` or ``dd/mm/yyyy``.

    Returns:
        float: seconds since the epoch for midnight of that day.

    Raises:
        ValueError: if `Date` matches neither supported format.
    """
    # Two-digit years are tried first so inputs that already parsed keep the
    # same result; four-digit years are accepted as a fallback.
    for date_format in ("%d/%m/%y", "%d/%m/%Y"):
        try:
            return datetime.strptime(Date, date_format).timestamp()
        except ValueError:
            continue
    raise ValueError("unsupported date format: {0!r}".format(Date))

POINTS_PER_WIN = 2 #only for 1993-1994 season
index_season = 1

# Codes describing how a team fared in one of its previous matches.
WIN = 3
DRAW = 1
LOST = 0
MISSING = -1

# HOW MANY LAST GAMES ARE OF INTEREST?
# "HomeTeamLast<i>" / "AwayTeamLast<i>" are computed for i = 1..N. They only
# reach the final dataset if their names are listed in `added_statistics`.
N = 1


def getLastResultFor(row,hoa,df,last):
    """Outcome, from the team's own point of view, of its match `last` matchdays ago.

    Args:
        row: the current match (needs 'HomeTeam', 'AwayTeam' and 'MatchDay').
        hoa: 'H' to look up the home team of `row`, anything else for the away team.
        df: the matches of the season (needs 'HomeTeam', 'AwayTeam', 'MatchDay', 'FTR').
        last: how many matchdays to look back.

    Returns:
        WIN, DRAW or LOST, or MISSING when the team has no match on that matchday.
    """
    team = row['HomeTeam'] if hoa == 'H' else row['AwayTeam']
    target_day = int(row['MatchDay'] - last)

    # Boolean masks instead of a formatted query string: team names containing
    # quotes cannot break the lookup.
    involved = (df['HomeTeam'] == team) | (df['AwayTeam'] == team)
    played = df[involved & (df['MatchDay'] == target_day)]
    if played.empty:
        return MISSING

    previous = played.iloc[0]
    # FTR is relative to the home side, so work out who actually won before
    # deciding whether *this* team won or lost.
    if previous['FTR'] == 'H':
        winner = previous['HomeTeam']
    elif previous['FTR'] == 'A':
        winner = previous['AwayTeam']
    else:
        return DRAW
    return WIN if winner == team else LOST


# Start the historical win counter from scratch so that re-running this cell
# does not count every season twice.
total_wins.clear()

# One prepared frame per season; concatenated once after the loop
# (DataFrame.append no longer exists in pandas >= 2.0).
season_frames = []

# NOTE: `table` must hold the hard-coded standings of the previous season when
# this cell starts; it is rebound below, so re-run the table cell before
# re-running this one.
for season in seasons:

    season_df = pd.read_csv("../input/"+season+".csv")
    season_df["Season"] = index_season

    season_df['_Season'] = season #only for readability purposes

    teams = season_df['HomeTeam'].unique() #list of all the teams for a given season
    n_matchdays = (len(teams)-1)*2 #directly computed based on the number of teams

    # MATCHDAY FEATURE CREATION
    # The MatchDay feature is not available in the original dataset, so it is
    # created by clustering matches on their date and numbering the clusters
    # incrementally in row order.
    # If a match has been postponed and played on a different date it is merged
    # into the closest matchday.
    # This outcome is acceptable (if not desirable) for our purposes.

    season_df['DateMillis'] = season_df['Date'].apply(fromDateToMillis)
    kmeans = KMeans(n_clusters=n_matchdays, random_state=0)
    kmeans.fit(season_df['DateMillis'].values.reshape(-1,1))
    labels = kmeans.labels_
    day = 1
    matchdays = [day]
    # A new matchday starts every time the cluster label changes between
    # two consecutive rows.
    for previous_label, current_label in zip(labels[:-1], labels[1:]):
        if previous_label != current_label:
            day += 1
        matchdays.append(day)
    season_df["MatchDay"] = matchdays

    # ODDS AVG CREATION
    # For each of the three results, average the odds over all the bookies
    # that quote the match (1 for every result when no bookie is available).

    home_odds = []
    away_odds = []
    draw_odds = []

    for index, row in season_df.iterrows():
        quotes = [(row[bookie+'H'], row[bookie+'A'], row[bookie+'D'])
                  for bookie in bookies
                  if (bookie+'H') in row.index and pd.notna(row[bookie+'H'])]
        if quotes:
            n_bookies = len(quotes)
            home_odds.append(sum(quote[0] for quote in quotes) / n_bookies)
            away_odds.append(sum(quote[1] for quote in quotes) / n_bookies)
            draw_odds.append(sum(quote[2] for quote in quotes) / n_bookies)
        else:
            home_odds.append(1)
            away_odds.append(1)
            draw_odds.append(1)

    season_df["HomeOddsAvg"] = home_odds
    season_df["AwayOddsAvg"] = away_odds
    season_df["DrawOddsAvg"] = draw_odds

    # PREVIOUS YEAR POSITION
    # rank of the two teams in the previous season's final table (-1 if absent)
    season_df["HomeTeamLastYear"] = season_df['HomeTeam'].apply(getTablePositionForTeam,
                                                                args=(table,))
    season_df["AwayTeamLastYear"] = season_df['AwayTeam'].apply(getTablePositionForTeam,
                                                               args=(table,))

    # COMPUTING the "table" for the next season + incremental for this one

    # running standings of this season: team -> [POINTS, W, L, D, SCORED, CONCEEDED]
    table = {team: [0,0,0,0,0,0] for team in teams}
    # matchday -> team -> standings of that team *before* its match of that matchday
    incremental_tables = {m: {team: [0,0,0,0,0,0] for team in teams}
                          for m in range(1,n_matchdays+1)}

    for index, row in season_df.iterrows():
        home, away = row['HomeTeam'], row['AwayTeam']

        # snapshot all six counters before this match is accounted for
        snapshot = incremental_tables[row["MatchDay"]]
        snapshot[home] = list(table[home])
        snapshot[away] = list(table[away])

        table[home][SCORED] += row['FTHG']
        table[home][CONCEEDED] += row['FTAG']

        table[away][SCORED] += row['FTAG']
        table[away][CONCEEDED] += row['FTHG']

        if row['FTR'] == 'H':
            table[home][POINTS]+=POINTS_PER_WIN
            table[home][W]+=1
            table[away][L]+=1
        elif row['FTR'] == 'D':
            table[home][POINTS]+=1
            table[away][POINTS]+=1
            table[home][D]+=1
            table[away][D]+=1
        else:
            table[away][POINTS]+=POINTS_PER_WIN
            table[home][L]+=1
            table[away][W]+=1

    # updating the list of wins for all the teams in history!
    for team, stats in table.items():
        total_wins[team] = total_wins.get(team, 0) + stats[W]

    # final standings, best team first: becomes "last year's table" for the next season
    table = sorted(table.items(), key=lambda x: x[1], reverse=True)

    # CREATING ADVANCED INDICES on points, results and goals SO FAR earned/scored
    # (H... columns refer to the home team, A... columns to the away team)
    so_far_columns = (('PSF', POINTS), ('WSF', W), ('LSF', L),
                      ('DSF', D), ('SSF', SCORED), ('CSF', CONCEEDED))
    for suffix, stat in so_far_columns:
        for side, team_column in (('H', 'HomeTeam'), ('A', 'AwayTeam')):
            # label-free lookup: avoids positional access (x[0], x[1]) on a
            # labelled row, which recent pandas versions deprecate
            season_df[side + suffix] = [
                incremental_tables[matchday][team][stat]
                for matchday, team in zip(season_df['MatchDay'], season_df[team_column])
            ]

    # adding information on the result obtained in each of the last N matchdays
    for i in range(1,N+1):
        season_df["HomeTeamLast"+str(i)] = season_df.apply(getLastResultFor,
                                                  args=("H",season_df,i,),axis=1)
        season_df["AwayTeamLast"+str(i)] = season_df.apply(getLastResultFor,
                                                             args=("A",season_df,i,),axis=1)

    POINTS_PER_WIN = 3 # every season after the first one
    index_season+=1

    season_frames.append(season_df[['_Season']+ basic_statistics + added_statistics])

# Single concatenation; an empty frame is kept when no season was loaded.
df = pd.concat(season_frames, ignore_index=True) if season_frames else pd.DataFrame()

In [None]:
# Quick sanity check: peek at the last rows of the assembled dataset `df`.
df.tail(n=5)

Creating the conversion team->index based on prestige

In [None]:
# Teams ordered by historical number of wins, most successful first;
# a team's position in `ord_wins` becomes its numeric identifier.
wins = sorted(total_wins.items(), key=lambda x: x[1], reverse=True)
ord_wins = [team_name for team_name, n_wins in wins]

# these two teams had never been in SERIE A until 17-18
ord_wins.extend(["SPAL", "Benevento"])

In [None]:
# converting team names to indeces (converted_df is much less funny to read then df)
converted_df = df.copy()
converted_df['HomeTeam'] = df['HomeTeam'].apply(lambda x: (ord_wins.index(x) + 1))
converted_df['AwayTeam'] = df['AwayTeam'].apply(lambda x: (ord_wins.index(x) + 1))
converted_df.drop(['_Season'],inplace=True,axis=1)

**The DBO intuition (Different by Odds)**

A good line of research might be to search if the actual result of a match
is different from the result predicted by the odds (1 if different, 0 otherwise)

In [None]:
def favoriteByOdds(home_odds, draw_odds, away_odds):
    """Result the odds favour: 'H', 'D' or 'A', or '' when there is no strict favourite.

    The favourite is the result with the strictly lowest average odds; ties
    (for instance the 1/1/1 placeholder used when no bookmaker quoted the
    match) and missing values produce ''.
    """
    if home_odds < away_odds and home_odds < draw_odds:
        return 'H'
    if draw_odds < away_odds and draw_odds < home_odds:
        return 'D'
    if away_odds < home_odds and away_odds < draw_odds:
        return 'A'
    return ''


# DBO = 1 when the actual result differs from the odds' favourite, 0 otherwise.
# The favourite is recomputed for every row: a match without a strict
# favourite no longer inherits the favourite of the previous row and is
# always flagged as "different".
differentByOdds = []
predictedByOdds = ""
for home_odds, draw_odds, away_odds, result in zip(converted_df['HomeOddsAvg'],
                                                   converted_df['DrawOddsAvg'],
                                                   converted_df['AwayOddsAvg'],
                                                   converted_df['FTR']):
    predictedByOdds = favoriteByOdds(home_odds, draw_odds, away_odds)
    differentByOdds.append(1 if predictedByOdds != result else 0)

converted_df['DBO'] = differentByOdds

In [None]:
# Season 24 is held out as the test season; everything before it is training data.
season_index = df['Season']
in_all_seasons = season_index <= 24
in_train_seasons = season_index < 24
in_test_season = season_index == 24

# Match result targets ('H' / 'D' / 'A')
targetAll = converted_df.loc[in_all_seasons, 'FTR']
target9316 = converted_df.loc[in_train_seasons, 'FTR']
target1617 = converted_df.loc[in_test_season, 'FTR']

# "Different by odds" targets (1 / 0)
targetAllDBO = converted_df.loc[in_all_seasons, 'DBO']
target9316DBO = converted_df.loc[in_train_seasons, 'DBO']
target1617DBO = converted_df.loc[in_test_season, 'DBO']

In [None]:
# Feature matrix: every column except the two targets (match result and DBO flag).
df_norm = converted_df.drop(columns=['FTR', 'DBO'])
# Working copy that the next cells one-hot encode and rescale using all the data available.
norm_all = df_norm.copy()

In [None]:
# One indicator column per team and side, replacing the numeric team identifiers.
home_dummies = pd.get_dummies(norm_all['HomeTeam'], prefix="Home")
away_dummies = pd.get_dummies(norm_all['AwayTeam'], prefix="Away")
norm_all = (
    norm_all
    .join(home_dummies)
    .join(away_dummies)
    .drop(columns=['HomeTeam', 'AwayTeam'])
)

In [None]:
# Min-max scaling of every column to [0, 1], using all the available rows.
# Everything is cast to float first so the arithmetic is well defined for the
# indicator columns too (recent pandas versions create them as booleans).
norm_all = norm_all.astype(float)
column_min = norm_all.min()
# A constant column has a zero range: divide by 1 instead so it becomes 0 rather than NaN.
column_range = (norm_all.max() - column_min).replace(0, 1)
norm_all = (norm_all - column_min) / column_range

**Models Creation**


The performances will be assessed testing on the whole 16-17 season using all the previous seasons as training.

In [None]:
# The season index is read from df_norm because norm_all['Season'] has been rescaled.
raw_season = df_norm['Season']
df_train = norm_all.loc[raw_season < 24]
df_test = norm_all.loc[raw_season == 24]

In [None]:
from xgboost import XGBClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier #tanh - 175
from sklearn.svm import SVC,LinearSVC #C 0.75
from sklearn.ensemble import AdaBoostClassifier

The first classifier will try to assess when a match ends with a result different from the one foreseen by the odds.

In [None]:
# Binary classifier for the DBO flag: trained on the earlier seasons,
# evaluated on the held-out one.
xgDBO = XGBClassifier()
xgDBO.fit(df_train, target9316DBO)
predictionsDBO = xgDBO.predict(df_test)

dbo_accuracy_pct = accuracy_score(target1617DBO, predictionsDBO) * 100
dbo_report = "How many times a match has ended in a different way than predicted\nand the classifier has been able to detect it? %.2f %%"
print(dbo_report % (dbo_accuracy_pct,))

The second will trivially try to predict the match result.

In [None]:
# Three-class classifier for the match result ('H' / 'D' / 'A'),
# trained and evaluated on the same split as the DBO classifier.
xg = XGBClassifier()
xg.fit(df_train, target9316)
predictions = xg.predict(df_test)

result_accuracy_pct = accuracy_score(target1617, predictions) * 100
result_report = "How many times the classifier has been able to predict the correct result? %.2f %%"
print(result_report % (result_accuracy_pct,))

The accuracy figure by itself is not very relevant; we want to see how many times the classifier gives a prediction that is different from the one given by the odds.

In [None]:
# Raw (not rescaled) odds of the held-out season, selected explicitly from df
# instead of relying on `season_df`, a variable leaked by the season loop.
# Rows are in the same order as df_test, hence as `predictions` / `predictionsDBO`.
test_odds = df.loc[df['Season'] == 24, ['HomeOddsAvg', 'DrawOddsAvg', 'AwayOddsAvg']]
assert len(test_odds) == len(predictions) == len(predictionsDBO), \
    "odds rows and predictions are not aligned"

totalPred = 0      # matches where the classifier disagrees with the odds' favourite
totalActual = 0    # matches where the real result differs from the odds' favourite
predictedDBO = 0   # ...of which the DBO classifier had raised its flag

for i, (home_odds, draw_odds, away_odds) in enumerate(test_odds.itertuples(index=False)):
    # Recomputed for every match: without a strict favourite (ties) the value
    # stays '' instead of silently reusing the previous match's favourite.
    predictedByOdds = ''
    if home_odds < away_odds and home_odds < draw_odds:
        predictedByOdds = 'H'
    elif draw_odds < away_odds and draw_odds < home_odds:
        predictedByOdds = 'D'
    elif away_odds < home_odds and away_odds < draw_odds:
        predictedByOdds = 'A'

    if predictions[i] != predictedByOdds:
        totalPred += 1

    if target1617.iloc[i] != predictedByOdds:
        totalActual += 1
        predictedDBO += predictionsDBO[i]

i = len(test_odds)  # number of matches examined

# Each percentage is printed only when its denominator is non-zero.
if i:
    print("#times when the favorite result hasn't been predicted: %.2f %%" %(totalPred*100/i,))
    print("For a phenomenon that actually happens: %.2f %%" %(totalActual*100/i,))
if totalActual:
    print("This relevant phenomenon has been predicted: %.2f %% it has happened." %(predictedDBO*100/totalActual,))

Trying to predict the most probable result other than the one predicted by the odds, when the DBO predictor says so.

In [None]:
def indexByResult(res):
    """Map a match result to its column in the class-probability rows.

    'A' -> 0, 'D' -> 1, anything else (i.e. 'H') -> 2.
    """
    return {'A': 0, 'D': 1}.get(res, 2)

# Thresholds on the classifier's own confidence: when the classifier agrees
# with the odds but the DBO classifier flags the match, the prediction is kept
# only if its probability is above the threshold. A lower threshold means more
# trust in the result classifier (0 = complete trust, DBO useless).
# NOTE(review): the best threshold is selected on the test season itself, so
# the reported accuracy is optimistic.

treshes = [0,0.5,0.75,0.9]
best_t = 0
best_acc = 0

predictions = xg.predict(df_test)
pred_probs = xg.predict_proba((df_test))

predictionsDBO = xgDBO.predict(df_test)

# The odds' favourite has to be derived from the RAW odds: in df_test each odds
# column has been min-max scaled independently, so comparing them with each
# other no longer tells which result the bookmakers favour.
raw_test_odds = df.loc[df['Season'] == 24, ['HomeOddsAvg', 'DrawOddsAvg', 'AwayOddsAvg']]
assert len(raw_test_odds) == len(df_test), "raw odds and test rows are not aligned"

odds_favorites = []
for home_odds, draw_odds, away_odds in raw_test_odds.itertuples(index=False):
    favorite = ''  # no strict favourite (ties / placeholder odds)
    if home_odds < away_odds and home_odds < draw_odds:
        favorite = 'H'
    elif draw_odds < away_odds and draw_odds < home_odds:
        favorite = 'D'
    elif away_odds < home_odds and away_odds < draw_odds:
        favorite = 'A'
    odds_favorites.append(favorite)

# inverse of indexByResult: probability column -> result label
result_by_index = ['A', 'D', 'H']

for t in treshes:

    custompreds = []
    for i, predictedByOdds in enumerate(odds_favorites):
        predicted = predictions[i]
        own_index = indexByResult(predicted)
        probs = pred_probs[i]

        # The prediction is overridden in one case only: classifier and odds
        # agree, the DBO classifier says the favourite will NOT come out, and
        # the classifier is not confident enough (probability <= t).
        # In every other combination the classifier's prediction is kept.
        if predictedByOdds == predicted and predictionsDBO[i] == 1 and not probs[own_index] > t:
            # switch to the more probable of the two remaining results; the
            # index is tracked directly so that equal probabilities can never
            # resolve back to the prediction being replaced
            runner_up = max((r for r in range(0,3) if r != own_index),
                            key=lambda r: probs[r])
            custompreds.append(result_by_index[runner_up])
        else:
            custompreds.append(predicted)

    acc = accuracy_score(target1617,custompreds)
    if acc > best_acc:
        best_acc = acc
        best_t = t

print(str(best_acc) + " with t="+str(best_t))

Final outcome of this Kernel is that you cannot become rich betting on football.