# Predict match winners: Poisson

https://www.kdnuggets.com/2023/01/python-machine-learning-predict-football-match-winners.html

In [134]:
import pickle
from operator import itemgetter

import pandas as pd
from scipy.stats import poisson

In [3]:
# Read the processed training set from Parquet; the bare name on the last
# line makes the notebook display the resulting DataFrame.
training_set = pd.read_parquet("datasets/processed/training_set_processed.parquet")
training_set

Unnamed: 0,Sea,Lge,Date,HT,AT,HS,AS,GD,WDL,Goals
7102,00-01,CHN1,2000-03-19,Dalian Haichang,Beijing Guoan,2,0,2,W,2
7103,00-01,CHN1,2000-03-19,Qingdao,Shanghai Greenland,1,1,0,D,2
7108,00-01,CHN1,2000-03-19,Xiamen Xiaxin,Liaoning,3,0,3,W,3
7106,00-01,CHN1,2000-03-19,Shenzhen FC,Sichuan Guancheng,0,0,0,D,0
7105,00-01,CHN1,2000-03-19,Guangzhou Rich and Force,Chongqing Lifan,0,0,0,D,0
...,...,...,...,...,...,...,...,...,...,...
299041,22-23,TUN1,2023-02-08,EO Sidi Bouzid,ES Metlaoui,3,4,-1,L,7
299040,22-23,TUN1,2023-02-08,ES Sahel,CA Bizertin,0,2,-2,L,2
299039,22-23,TUN1,2023-02-08,US Ben Guerdane,Soliman,2,2,0,D,4
299037,22-23,TUN1,2023-02-08,Stade Tunisien,US Tataouine,2,1,1,W,3


In [181]:
# Unpickle the object stored in "teams.pickle" (presumably the per-team
# statistics used by the predictor — TODO confirm its exact structure).
# NOTE(security): pickle.load can execute arbitrary code while unpickling;
# only load this file if it was produced by a trusted run of this project.
# NOTE(review): predict_game reads a global named `teams`, whereas this cell
# binds `teams_stats_dict` — confirm which name is meant to hold the stats.
with open("teams.pickle", "rb") as f:
    teams_stats_dict = pickle.load(f)
# teams_stats_dict  (uncomment to display the loaded object)

In [198]:

def predict_game(home_team: str, away_team: str, max_goals: int = 14) -> tuple:
    """Predict one match with an independent-Poisson goal model.

    Each side's expected goals (lambda) is its own average goals scored
    multiplied by the opponent's average goals conceded. Home and away goal
    counts are then treated as independent Poisson variables, and every
    scoreline from 0-0 up to (max_goals - 1)-(max_goals - 1) is enumerated.

    Team statistics are read from the module-level ``teams`` collection,
    which is iterated as dicts carrying the keys ``"name"``,
    ``"avg_goals_scored"`` and ``"avg_goals_against"``.
    NOTE(review): the visible loading cell binds ``teams_stats_dict`` rather
    than ``teams`` — confirm that ``teams`` is defined before calling this.

    Args:
        home_team: Name of the home team, as stored under ``"name"``.
        away_team: Name of the away team, as stored under ``"name"``.
        max_goals: How many goal counts to enumerate per side (scores
            ``0 .. max_goals - 1``). The default of 14 keeps the original
            behaviour; probability mass beyond it is ignored.

    Returns:
        A tuple ``(HS, AS, pr_home, pr_draw, pr_away)``: the most probable
        exact score (home goals, away goals) followed by the home-win, draw
        and away-win probabilities summed over the enumerated scorelines.

    Raises:
        IndexError: If either team has no entry in ``teams``.
        ValueError: If ``max_goals`` is smaller than 1.
    """
    if max_goals < 1:
        raise ValueError("max_goals must be at least 1")

    ht_stats_dict = _team_stats(home_team)
    at_stats_dict = _team_stats(away_team)

    # Lambda (expected goals) per side: own attack times opponent's defence.
    lambda_home_team = (
        ht_stats_dict["avg_goals_scored"] * at_stats_dict["avg_goals_against"]
    )
    lambda_away_team = (
        at_stats_dict["avg_goals_scored"] * ht_stats_dict["avg_goals_against"]
    )

    # The PMF of each side does not depend on the other side's goal count, so
    # evaluate it once per side instead of once per scoreline.
    goal_counts = list(range(max_goals))
    home_pmf = poisson.pmf(goal_counts, lambda_home_team)
    away_pmf = poisson.pmf(goal_counts, lambda_away_team)

    pr_home, pr_draw, pr_away = 0, 0, 0
    result_probs = []
    for x in goal_counts:  # goals scored by the home team
        for y in goal_counts:  # goals scored by the away team
            p = home_pmf[x] * away_pmf[y]
            result_probs.append((x, y, p))
            if x == y:
                pr_draw += p
            elif x > y:
                pr_home += p
            else:
                pr_away += p

    # max() keeps the first maximum, so ties resolve to the lowest scoreline
    # in enumeration order.
    HS, AS, _ = max(result_probs, key=itemgetter(2))

    return HS, AS, pr_home, pr_draw, pr_away


def _team_stats(team_name: str) -> dict:
    """Return the stats dict of *team_name* from the module-level ``teams``."""
    for team in teams:
        if team["name"] == team_name:
            return team
    # IndexError is kept on purpose: the previous ``[...][0]`` lookup raised
    # it for unknown teams, and callers may rely on that exception type.
    raise IndexError(f"no statistics available for team {team_name!r}")

def fill_predictions(test_set: pd.DataFrame):
    """Fill the prediction columns of *test_set* row by row, in place.

    For every game (row) the home team ``HT`` and away team ``AT`` are passed
    to ``predict_game``. The predicted exact score is written to ``pr_HS`` /
    ``prd_AS`` and the win/draw/loss probabilities to ``prd_W`` / ``prd_D`` /
    ``prd_L``. The home-score column is spelled ``pr_HS`` (not ``prd_HS``)
    because that is the column name used by the prediction set.

    Games for which ``predict_game`` raises a ``LookupError`` (a team or one
    of its statistics is missing) are skipped and keep their existing values.

    Args:
        test_set: Games to predict, one per row, with ``HT`` and ``AT``
            columns.

    Returns:
        The same ``test_set`` object, after modification.
    """
    for i, game in test_set.iterrows():
        try:
            HS, AS, W, D, L = predict_game(game.HT, game.AT)
        except LookupError:
            # Team not present in the training statistics: leave the row as
            # is. Only lookup failures are ignored so real bugs still surface.
            continue

        test_set.at[i, "pr_HS"] = HS
        test_set.at[i, "prd_AS"] = AS

        test_set.at[i, "prd_W"] = W
        test_set.at[i, "prd_D"] = D
        test_set.at[i, "prd_L"] = L

    return test_set
        
    

In [199]:
# Sanity-check the model on a single fixture. The function defined in this
# notebook is predict_game; the previous call to `predict` referred to a name
# that is not defined here.
HS, AS, W, D, L = predict_game("Real Sociedad", "Elche")
print(HS, AS, W, D, L)

1 1 0.4285002255616886 0.2340073026753857 0.337492467286655


### Fill predictions in Prediction Set

In [200]:
# Read the prediction set (the games to be predicted) from the Excel file;
# the bare name on the last line makes the notebook display the DataFrame.
test_set = pd.read_excel("datasets/raw/PredictionSet_2023_01_31.xlsx")
test_set

Unnamed: 0,Lge,Sea,Date,HT,AT,HS,AS,GD,WDL,pr_HS,prd_AS,prd_W,prd_D,prd_L
0,ARG1,23-24,16/04/2023,CA Banfield,Central Cordoba,-1,-1,0,D,-1,-1,0,0,0
1,ARG1,23-24,16/04/2023,Defensa y Justicia,Instituto de Córdoba,-1,-1,0,D,-1,-1,0,0,0
2,ARG1,23-24,16/04/2023,Velez Sarsfield,Barracas Central,-1,-1,0,D,-1,-1,0,0,0
3,ARG1,23-24,16/04/2023,Platense,CA Colon,-1,-1,0,D,-1,-1,0,0,0
4,ARG1,23-24,16/04/2023,Talleres Cordoba,San Lorenzo,-1,-1,0,D,-1,-1,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
625,ZAF1,22-23,28/04/2023,Moroka Swallows,Maritzburg United,-1,-1,0,D,-1,-1,0,0,0
626,ZAF1,22-23,29/04/2023,Tshakhuma Tsha Madzivhandila,SuperSport United,-1,-1,0,D,-1,-1,0,0,0
627,ZAF1,22-23,29/04/2023,Stellenbosch,Golden Arrows,-1,-1,0,D,-1,-1,0,0,0
628,ZAF1,22-23,29/04/2023,Mamelodi Sundowns,Richards Bay FC,-1,-1,0,D,-1,-1,0,0,0


In [194]:
def fill_predictions(test_set: pd.DataFrame):
    """Fill the prediction columns of *test_set* row by row, in place.

    For every game (row) the home team ``HT`` and away team ``AT`` are passed
    to ``predict_game``. The predicted exact score is written to ``pr_HS`` /
    ``prd_AS`` and the win/draw/loss probabilities to ``prd_W`` / ``prd_D`` /
    ``prd_L``.

    Games for which ``predict_game`` raises a ``LookupError`` (a team or one
    of its statistics is missing) are skipped and keep their existing values.

    Args:
        test_set: Games to predict, one per row, with ``HT`` and ``AT``
            columns.

    Returns:
        The same ``test_set`` object, after modification.
    """
    for i, game in test_set.iterrows():
        try:
            HS, AS, W, D, L = predict_game(game.HT, game.AT)
        except LookupError:
            # Team not present in the training statistics: leave the row as
            # is. Only lookup failures are ignored so real bugs still surface.
            continue

        test_set.at[i, "pr_HS"] = HS
        test_set.at[i, "prd_AS"] = AS

        test_set.at[i, "prd_W"] = W
        test_set.at[i, "prd_D"] = D
        test_set.at[i, "prd_L"] = L

    return test_set

In [195]:
# Fill the prediction columns of test_set in place; the returned DataFrame is
# shown as the cell output.
fill_predictions(test_set)

Unnamed: 0,Lge,Sea,Date,HT,AT,HS,AS,GD,WDL,pr_HS,prd_AS,prd_W,prd_D,prd_L
0,ARG1,23-24,16/04/2023,CA Banfield,Central Cordoba,-1,-1,0,D,1,1,0.456825,0.245911,0.297264
1,ARG1,23-24,16/04/2023,Defensa y Justicia,Instituto de Córdoba,-1,-1,0,D,0,0,0.000000,0.367879,0.632121
2,ARG1,23-24,16/04/2023,Velez Sarsfield,Barracas Central,-1,-1,0,D,1,1,0.511057,0.226284,0.262659
3,ARG1,23-24,16/04/2023,Platense,CA Colon,-1,-1,0,D,1,1,0.392273,0.253332,0.354395
4,ARG1,23-24,16/04/2023,Talleres Cordoba,San Lorenzo,-1,-1,0,D,1,1,0.330920,0.244354,0.424726
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
625,ZAF1,22-23,28/04/2023,Moroka Swallows,Maritzburg United,-1,-1,0,D,1,1,0.411401,0.268286,0.320313
626,ZAF1,22-23,29/04/2023,Tshakhuma Tsha Madzivhandila,SuperSport United,-1,-1,0,D,0,1,0.195545,0.284021,0.520434
627,ZAF1,22-23,29/04/2023,Stellenbosch,Golden Arrows,-1,-1,0,D,1,1,0.372532,0.280040,0.347427
628,ZAF1,22-23,29/04/2023,Mamelodi Sundowns,Richards Bay FC,-1,-1,0,D,0,0,0.391319,0.358574,0.250107


In [206]:
# Print the prediction tuple for each fixture, one per line, in this order.
fixtures = [
    ("Valladolid", "Athletic Bilbao"),
    ("Almeria", "Cadiz"),
    ("Vallecano", "Girona"),
    ("Espanyol Barcelona", "Celta de Vigo"),
    ("Athletico Madrid", "Valencia"),
    ("Real Betis", "Mallorca"),
    ("Osasuna", "Villarreal"),
    ("Real Sociedad", "Elche"),
    ("Getafe", "Sevilla FC"),
    ("FC Barcelona", "Real Madrid"),
]
for home, away in fixtures:
    print(predict_game(home, away))

(1, 1, 0.38062116877455004, 0.2305995728709916, 0.38877925304683375)
(1, 1, 0.3640946156100736, 0.25268029495124505, 0.3832250887303924)
(1, 1, 0.3153525923616397, 0.2233038085036607, 0.46134358509850815)
(1, 1, 0.33151592540621877, 0.22913394249755192, 0.4393501247202423)
(1, 1, 0.45261263946263003, 0.22812392354308272, 0.31926342830929005)
(1, 1, 0.4319115160173203, 0.22786159296718142, 0.3402268830547751)
(1, 1, 0.2614570357226383, 0.22651639185456282, 0.512026560179152)
(1, 1, 0.4285002255616886, 0.2340073026753857, 0.337492467286655)
(1, 1, 0.26715617756721016, 0.22486231019715744, 0.5079814981580819)
(2, 2, 0.4254376154420775, 0.19771185588540796, 0.37685036285361084)


In [202]:
# Show what get_teams_in_league returns for league code "SPA1" in the
# training set (presumably useful for checking the exact team-name spellings
# that predict_game expects — TODO confirm).
from utils import get_teams_in_league

get_teams_in_league(training_set, "SPA1")

Distinct leagues in set: 51


['Santander',
 'Malaga',
 'Valencia',
 'Real Sociedad',
 'Granada',
 'Getafe',
 'Hercules',
 'Albacete',
 'FC Barcelona',
 'Athletico Madrid',
 'La Coruna',
 'Numancia',
 'Las Palmas',
 'Levante',
 'Cadiz',
 'Xerez',
 'Zaragoza',
 'Almeria',
 'Athletic Bilbao',
 'Real Madrid',
 'Huesca',
 'Real Betis',
 'Elche',
 'Leganes',
 'Celta de Vigo',
 'Oviedo',
 'Valladolid',
 'Alaves',
 'Gimnastic Tarragona',
 'Girona',
 'Espanyol Barcelona',
 'Eibar',
 'Vallecano',
 'Recreativo',
 'Sporting Gijon',
 'Villarreal',
 'Osasuna',
 'Sevilla FC',
 'Mallorca',
 'Murcia',
 'Tenerife',
 'Cordoba']

### Evaluate predictions

HS, AS (predicted exact scores): evaluate with the root-mean-square error (RMSE)

W, D, L probabilities: evaluate with the ranked probability score (RPS)