In [600]:
import pandas as pd
import numpy as np
import statsmodels.api as sm
import statsmodels.formula.api as smf

from scipy import stats

In [601]:
# Supported competitions: lookup key -> one-item list holding the dataset folder name.
league = dict(
    brasileirao_a=['Brasileirao'],
    premier_league=['Premier_League'],
)

# Dataset folder of the competition analysed in this notebook.
liga = league['brasileirao_a'][0]

In [602]:
# Load the three spreadsheets for the selected league folder (`liga`):
# match history, rounds and a standings snapshot.
historico = pd.read_excel(
    f'../datasets/{liga}/match_history/all_games/historico-2015a2023.xlsx'
)
rodadas = pd.read_excel(
    f'../datasets/{liga}/rounds/all_rounds/rodadas.xlsx'
)
# Standings file for round 29 of the 2023 season; the commented line below
# is the round-10 alternative.
tabela = pd.read_excel(
    f'../datasets/{liga}/standings/all_standings/season-2023/tabela-rodada29.xlsx'
)
#tabela = pd.read_excel(f'../datasets/{liga}/standings/all_standings/season-2023/tabela-rodada10.xlsx')

In [603]:
# Parse the match dates and derive a recency weight for every match.
historico['date'] = pd.to_datetime(historico['date'], format='%Y-%m-%d')
# Days elapsed between each match and the most recent match in the history.
# Series.max() is vectorised and skips NaT; the builtin max() iterates in
# Python and returns an order-dependent result when a NaT is present.
historico['days'] = (historico['date'].max() - historico['date']).dt.days
# Exponential decay with rate 0.001 per day: the latest match gets weight 1.0.
historico['time_diff'] = np.exp(-0.001*historico['days'])

In [604]:
# Keep only the rows whose `season` is 2023.
is_2023 = historico['season'] == 2023
historico = historico.loc[is_2023]

In [605]:
# Training rows: history dated before 2023-10-01, with incomplete rows removed.
train = historico.loc[historico['date'] < '2023-10-01'].dropna()
# Evaluation rows: rounds after week 24, with incomplete rows removed.
test = rodadas.loc[rodadas['wk'] > 24].dropna()

# Alternative split (earlier cut-off), kept for reference:
#train = historico[historico['date'] < '2023-09-15'].dropna()
#test = rodadas[rodadas['wk'] > 4].dropna()

In [606]:
class Modelagem:
  """Poisson goal model for football matches, fitted as a statsmodels GLM.

  The fitted model predicts an expected goal count (lambda) for each side of
  a fixture; the two lambdas are turned into a truncated score-probability
  grid from which home-win / draw / away-win probabilities are read.
  """

  def __init__(self, formula, data, max_goals= 5, output= None):
    """Fit the Poisson GLM.

    Args:
      formula: patsy formula passed to `smf.glm`.
      data: DataFrame holding every column referenced by `formula`.
      max_goals: highest goal count (inclusive) represented in the score grid.
      output: when truthy, print the fitted model summary. `None` and `False`
        both stay silent (previously `False` still printed the summary).
    """
    self.modelo = smf.glm(formula= formula, data= data, family= sm.families.Poisson()).fit()
    self.max_goals = max_goals
    if output:
      print(self.modelo.summary())

  def lambd(self, team, opponent, venue, stat, fator_casa= None):
    """Predict the expected number of goals for `team` against `opponent`.

    Args:
      team: team whose goals are predicted.
      opponent: opposing team.
      venue: venue label fed to the model; 'Home' selects the `_casa` columns
        when `fator_casa` is True, any other value selects `_fora`.
      stat: statistics for `team`: a one-row DataFrame, or any mapping whose
        values are scalars or one-element sequences. It must expose
        `xg_*`, `xga_*`, `poss_*`, `sot_*` and `sh_*` for the chosen suffix.
      fator_casa: only the literal `True` enables the venue-specific columns;
        every other value uses the `_total` columns.

    Returns:
      The first value returned by the fitted model's `predict`.

    Raises:
      ValueError: if a statistic does not resolve to exactly one value
        (for example, the team is missing from or duplicated in the table).
    """
    if fator_casa is True:
      suffix = 'casa' if venue == 'Home' else 'fora'
    else:
      suffix = 'total'

    def single_value(prefix):
      # float() on a one-element Series is deprecated in pandas, and an empty
      # selection would fail with an unhelpful TypeError; extract explicitly.
      column = f'{prefix}_{suffix}'
      values = np.asarray(stat[column]).ravel()
      if values.size != 1:
        raise ValueError(
          f"expected exactly one '{column}' value for team {team!r}, found {values.size}"
        )
      return float(values[0])

    data = {
        'team': team,
        'opponent': opponent,
        'venue': venue,
        'xg': single_value('xg'),
        'xga': single_value('xga'),
        'poss': single_value('poss'),
        'sot': single_value('sot'),
        'sh': single_value('sh'),
        # Recency weight column is fixed to 1 for predictions.
        'time_diff': 1
    }
    return self.modelo.predict(pd.DataFrame(data, index=[0])).values[0]

  def matrix(self, lambda_x, lambda_y):
    """Build the score-probability grid for two independent Poisson rates.

    Entry [i, j] is P(X = i) * P(Y = j) for i, j in 0..max_goals. The grid is
    truncated at `max_goals`, so its entries sum to slightly less than 1.
    """
    goals = np.arange(self.max_goals + 1)
    prob_x = stats.poisson.pmf(goals, lambda_x)
    prob_y = stats.poisson.pmf(goals, lambda_y)

    return np.outer(prob_x, prob_y)

  def predict_scores(self, matches, stats):
    """Add home-win / draw / away-win probabilities to a set of fixtures.

    Args:
      matches: DataFrame with at least the columns `wk`, `xg_casa`, `xg_fora`,
        `home`, `gols_casa`, `gols_fora` and `away`. It is not modified.
      stats: standings/statistics DataFrame with a `squad` column and the
        `*_casa` / `*_fora` statistic columns read by `lambd`.

    Returns:
      A new DataFrame with the columns listed above plus `vit_casa`, `empate`
      and `vit_fora`.
    """
    # The parameter name shadows the scipy `stats` module inside this method
    # only; alias it so the intent stays clear. `matrix` still sees the module.
    standings = stats
    result = matches.copy()

    home_win, draw, away_win = [], [], []
    for home, away in zip(result['home'], result['away']):
      lambda_x = self.lambd(home, away, 'Home', standings.loc[standings['squad'] == home], fator_casa= True)
      lambda_y = self.lambd(away, home, 'Away', standings.loc[standings['squad'] == away], fator_casa= True)
      grid = self.matrix(lambda_x, lambda_y)

      # Rows index home goals and columns index away goals.
      home_win.append(np.sum(np.tril(grid, -1)))
      draw.append(np.sum(np.diag(grid)))
      away_win.append(np.sum(np.triu(grid, 1)))

    # Assign whole columns at once: row-wise `.loc` writes hit every row that
    # shares a label when the index has duplicates.
    result['vit_casa'] = pd.Series(home_win, index= result.index, dtype= float)
    result['empate'] = pd.Series(draw, index= result.index, dtype= float)
    result['vit_fora'] = pd.Series(away_win, index= result.index, dtype= float)

    return result[['wk','xg_casa','xg_fora','home','gols_casa','gols_fora','away','vit_casa','empate','vit_fora']].copy()

  def accuracy_score(self, matches):
    """Compare the most probable outcome with the actual result.

    Args:
      matches: DataFrame with `gols_casa`, `gols_fora`, `vit_casa`, `empate`
        and `vit_fora` columns. It is not modified.

    Returns:
      A tuple `(frame, score)`: `frame` is a copy of `matches` with a
      `resultado` column ('acerto' for a correct prediction, 'fracasso'
      otherwise) and `score` is the fraction of 'acerto' rows. `score` is
      0.0 when nothing was predicted correctly and NaN for an empty input.
    """
    result = matches.copy()

    home_goals = result['gols_casa'].to_numpy()
    away_goals = result['gols_fora'].to_numpy()
    actual = np.select(
      [home_goals > away_goals, away_goals > home_goals],
      ['casa', 'fora'],
      default= 'empate',
    )

    p_home = result['vit_casa'].to_numpy()
    p_draw = result['empate'].to_numpy()
    p_away = result['vit_fora'].to_numpy()
    # A win is predicted only when it is strictly the most likely outcome;
    # ties between probabilities fall back to a draw.
    predicted = np.select(
      [(p_home > p_away) & (p_home > p_draw), (p_away > p_home) & (p_away > p_draw)],
      ['casa', 'fora'],
      default= 'empate',
    )

    hits = actual == predicted
    result['resultado'] = np.where(hits, 'acerto', 'fracasso')
    # Taking the mean of the hit mask avoids the KeyError that
    # value_counts()['acerto'] raises when there is no correct prediction.
    score = float(hits.mean()) if len(hits) else float('nan')

    return result, score
        

In [607]:
# Show the training-frame columns that can be referenced in the model formula.
train.columns

Index(['date', 'comp', 'round', 'day', 'venue', 'result', 'gf', 'ga',
       'opponent', 'xg', 'xga', 'poss', 'sh', 'sot', 'season', 'team', 'days',
       'time_diff'],
      dtype='object')

In [608]:
# Alternative, richer formula kept for reference:
#model = Modelagem(formula= 'gf ~ team*venue*poss*xg*sh + time_diff + opponent', data= train) #
# Fit goals-for on team, xg and venue (with their interactions) plus the opponent,
# and print the fit summary.
model = Modelagem(
    formula='gf ~ team*xg*venue + opponent',
    data=train,
    output=True,
)

                 Generalized Linear Model Regression Results                  
Dep. Variable:                     gf   No. Observations:                  490
Model:                            GLM   Df Residuals:                      391
Model Family:                 Poisson   Df Model:                           98
Link Function:                    Log   Scale:                          1.0000
Method:                          IRLS   Log-Likelihood:                -575.12
Date:                Tue, 31 Oct 2023   Deviance:                       337.87
Time:                        18:26:49   Pearson chi2:                     286.
No. Iterations:                     5   Pseudo R-squ. (CS):             0.3928
Covariance Type:            nonrobust                                         
                                              coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------------------------------

In [609]:
# Outcome probabilities for the held-out rounds, using the standings table as team statistics.
pred = model.predict_scores(matches=test, stats=tabela)

In [610]:
# Label each prediction as 'acerto' / 'fracasso' and compute the hit rate.
pred, score = model.accuracy_score(matches=pred)

In [611]:
# Fraction of held-out matches whose most probable outcome matched the result.
score

0.5102040816326531

In [612]:
# Preview the first scored predictions.
pred.head()

Unnamed: 0,wk,xg_casa,xg_fora,home,gols_casa,gols_fora,away,vit_casa,empate,vit_fora,resultado
240,25,1.9,1.5,fortaleza,1.0,1.0,gremio,0.354924,0.289982,0.353082,fracasso
241,25,1.4,0.5,flamengo,1.0,0.0,bahia,0.699961,0.222443,0.070342,acerto
242,25,1.2,0.2,cuiaba,3.0,0.0,fluminense,0.492291,0.286766,0.218491,acerto
243,25,2.3,0.2,sao_paulo,2.0,1.0,corinthians,0.509146,0.267492,0.219314,acerto
244,25,0.5,0.9,coritiba,2.0,0.0,ath_paranaense,0.173123,0.273834,0.54966,fracasso
