In [2]:
import os
import pandas as pd
import numpy as np
from unidecode import unidecode

import warnings
from pandas.errors import SettingWithCopyWarning
# Ignore these two warning categories for the rest of the session.
for categoria_aviso in (SettingWithCopyWarning, FutureWarning):
    warnings.simplefilter(action="ignore", category=categoria_aviso)

In [3]:
# League key -> single-item list holding the folder name used under `datasets/`.
league = dict(
    brasileirao_a=['Brasileirao'],
    premier_league=['Premier_League'],
)

# League processed by this run; every path below is built from it.
liga = league['brasileirao_a'][0]

# IMPORTS E FUNCOES

In [4]:
def remover_acentos(texto):
    """Return ``texto`` after passing it through ``unidecode``.

    Applied element-wise (``Series.apply``) to the team-name columns below.
    NOTE(review): going by the name, the goal is stripping accents; the exact
    transliteration rules are whatever ``unidecode`` implements.
    """
    texto_transliterado = unidecode(texto)
    return texto_transliterado

In [5]:
def convert_to_int(value):
    """Convert ``value`` with ``int()``, falling back to ``pd.NA``.

    Returns:
        ``int(value)`` when the conversion succeeds; ``pd.NA`` when ``int()``
        raises ``ValueError`` or ``TypeError`` (e.g. non-numeric text, ``None``,
        a float NaN).
    """
    try:
        convertido = int(value)
    except (ValueError, TypeError):
        return pd.NA
    return convertido

In [6]:
# Load the three untreated workbooks of the selected league:
# match history, standings and the fixtures list.
historico, tabela, rodadas = (
    pd.read_excel(caminho_bruto)
    for caminho_bruto in (
        f'datasets/{liga}/match_history/historico-2015a2023(naotratada).xlsx',
        f'datasets/{liga}/standings/tabela-2015a2023(naotratada).xlsx',
        f'datasets/{liga}/rounds/prox_partidas(naotratada).xlsx',
    )
)

In [7]:
# Most recent season present in the match history.
season_atual = historico['season'].max()

# `round` is still raw text at this point: the token after the first space is
# read as the round number, and the highest one in the current season is kept.
jogos_season_atual = historico[historico['season'] == season_atual]
numero_da_rodada = jogos_season_atual['round'].str.split(' ', expand=True)[1]
rodada_recente = numero_da_rodada.astype(int).max()

# HISTORICO

### Data cleaning

In [8]:
# Discard the match-history columns that are not used anywhere below.
historico = historico.drop(
    columns=['notes', 'captain', 'formation', 'referee', 'match report', 'attendance', 'time']
)

In [9]:
# Goals for/against and season become plain integers.
for coluna_inteira in ('gf', 'ga', 'season'):
    historico[coluna_inteira] = historico[coluna_inteira].astype(int)

# Keep only the token after the first space of `round`, as an integer.
partes_da_rodada = historico['round'].str.split(' ', expand=True)
historico['round'] = partes_da_rodada[1].astype(int)

In [10]:
# Parse the match date and derive two recency features:
#   days      - whole days between each match and the latest match in the data
#   time_diff - exponential decay exp(-0.001 * days), equal to 1.0 for the latest match
historico['date'] = pd.to_datetime(historico['date'], format='%Y-%m-%d')

# Series.max() is vectorised and skips NaT; the builtin max() iterates in Python
# and returns NaT whenever the first element is NaT, which would blank out `days`.
data_mais_recente = historico['date'].max()

historico['days'] = (data_mais_recente - historico['date']).dt.days
historico['time_diff'] = np.exp(-0.001 * historico['days'])

In [11]:
# Normalise `opponent`: transliterate, lowercase, spaces -> underscores, drop parentheses.
# regex=False is passed explicitly so '(' and ')' are always treated as literal
# characters (the default of `regex` differs between pandas 1.x and 2.x).
historico['opponent'] = (
    historico['opponent']
    .apply(remover_acentos)
    .str.lower()
    .str.replace(' ', '_', regex=False)
    .str.replace('(', '', regex=False)
    .str.replace(')', '', regex=False)
)

# Literal substitutions for `opponent`, applied in this order.
substituicoes_opponent = [
    # Brasileirao-specific
    ('atletico', 'atl'),
    # Premier League-specific
    ('united', 'utd'),
    ("nott'ham_forest", "nottingham_forest"),
]
for trecho_antigo, trecho_novo in substituicoes_opponent:
    historico['opponent'] = historico['opponent'].str.replace(trecho_antigo, trecho_novo, regex=False)

# Literal substitutions for `team`. Order matters: 'united' -> 'utd' must run
# before 'west_ham_utd', and '_town' is stripped before 'luton' is expanded again.
# NOTE(review): `team` is not lowercased/transliterated here - presumably it already
# arrives in that form; confirm against the scraping step.
substituicoes_team = [
    # Brasileirao-specific
    ('athletico', 'ath'),
    ('atletico', 'atl'),
    # Premier League-specific
    ('_hotspur', ''),
    ('rhampton_wanderers', 's'),
    ('united', 'utd'),
    ('west_ham_utd', 'west_ham'),
    ('_and_hove_albion', ''),
    ('wich_albion', ''),
    ('_town', ''),
    ('luton', 'luton_town'),
]
for trecho_antigo, trecho_novo in substituicoes_team:
    historico['team'] = historico['team'].str.replace(trecho_antigo, trecho_novo, regex=False)

In [12]:
# Sanity check: opponents whose normalised name never appears in `team` (expected: none).
times_em_team = historico['team'].unique()
[adversario for adversario in historico['opponent'].unique() if adversario not in times_em_team]

[]

In [13]:
# Distinct team names after normalisation, in order of first appearance.
pd.unique(historico['team'])

array(['botafogo_rj', 'bragantino', 'palmeiras', 'flamengo',
       'ath_paranaense', 'gremio', 'atl_mineiro', 'fluminense',
       'fortaleza', 'sao_paulo', 'internacional', 'cuiaba', 'cruzeiro',
       'corinthians', 'bahia', 'santos', 'goias', 'vasco_da_gama',
       'coritiba', 'america_mg', 'ceara', 'atl_goianiense', 'avai',
       'juventude', 'sport_recife', 'chapecoense', 'csa', 'vitoria',
       'parana', 'ponte_preta', 'figueirense', 'santa_cruz', 'joinville'],
      dtype=object)

In [14]:
# Preview the first five cleaned rows.
historico.head(n=5)

Unnamed: 0,date,comp,round,day,venue,result,gf,ga,opponent,xg,xga,poss,sh,sot,season,team,days,time_diff
0,2023-04-15,Série A,1,Sat,Home,W,2,1,sao_paulo,0.9,2.0,33.0,12.0,3.0,2023,botafogo_rj,194,0.823658
1,2023-04-24,Série A,2,Mon,Away,W,2,1,bahia,0.2,0.9,42.0,5.0,3.0,2023,botafogo_rj,185,0.831104
2,2023-04-30,Série A,3,Sun,Away,W,3,2,flamengo,1.9,3.1,26.0,11.0,5.0,2023,botafogo_rj,179,0.836106
3,2023-05-07,Série A,4,Sun,Home,W,2,0,atl_mineiro,2.0,0.5,39.0,22.0,7.0,2023,botafogo_rj,172,0.841979
4,2023-05-11,Série A,5,Thu,Home,W,3,0,corinthians,2.6,0.6,53.0,15.0,5.0,2023,botafogo_rj,168,0.845354


### Divisão e Export

Todos

In [15]:
# Export the full cleaned match history.
# The target folder is created first so the export also works on a fresh checkout;
# the per-season exports create their folders, but this one was never created.
dir_todos_jogos = f'datasets/{liga}/match_history/all_games'
os.makedirs(dir_todos_jogos, exist_ok=True)
historico.to_excel(f'{dir_todos_jogos}/historico-2015a2023.xlsx', index=False)

In [16]:
# One workbook per season, each in its own `season-<year>` folder.
for season in historico['season'].unique():
    dir_season = f'datasets/{liga}/match_history/all_games/season-{season}'
    # exist_ok=True replaces the separate exists() check (no check-then-create gap).
    os.makedirs(dir_season, exist_ok=True)
    df = historico[historico['season'] == season]
    df.to_excel(f'{dir_season}/historico.xlsx', index=False)

Casa

In [17]:
# Home fixtures only; this frame is reused by the feature-engineering cells below.
historico_casa = historico[historico['venue'] == 'Home']

# Create the target folder first so the export does not depend on it already existing.
dir_jogos_casa = f'datasets/{liga}/match_history/home_only'
os.makedirs(dir_jogos_casa, exist_ok=True)
historico_casa.to_excel(f'{dir_jogos_casa}/historico-2015a2023.xlsx', index=False)

In [18]:
# One home-only workbook per season, each in its own `season-<year>` folder.
for season in historico_casa['season'].unique():
    dir_season = f'datasets/{liga}/match_history/home_only/season-{season}'
    # exist_ok=True replaces the separate exists() check (no check-then-create gap).
    os.makedirs(dir_season, exist_ok=True)
    df = historico_casa[historico_casa['season'] == season]
    df.to_excel(f'{dir_season}/historico.xlsx', index=False)

Por Time

In [19]:
# One workbook per team, holding every match of that team across all seasons.
# The folder is created up front so the export does not depend on it already existing.
dir_times = f'datasets/{liga}/match_history/teams'
os.makedirs(dir_times, exist_ok=True)

for time in historico['team'].unique():
    df = historico[historico['team'] == time]
    df.to_excel(f'{dir_times}/{time}.xlsx', index=False)

# TABELA

### Data Cleaning

In [20]:
# Discard the standings columns that are not used anywhere below.
tabela = tabela.drop(
    columns=['attendance', 'top team scorer', 'goalkeeper', 'notes', 'last 5']
)

In [21]:
# Normalise `squad`: transliterate, lowercase, spaces -> underscores, drop parentheses.
# regex=False is passed explicitly so '(' and ')' are always treated as literal
# characters (the default of `regex` differs between pandas 1.x and 2.x).
tabela['squad'] = (
    tabela['squad']
    .apply(remover_acentos)
    .str.lower()
    .str.replace(' ', '_', regex=False)
    .str.replace('(', '', regex=False)
    .str.replace(')', '', regex=False)
)

# Literal substitutions, applied in this order.
substituicoes_squad = [
    # Brasileirao-specific
    ('atletico', 'atl'),
    # Premier League-specific
    ('united', 'utd'),
    ("nott'ham_forest", "nottingham_forest"),
]
for trecho_antigo, trecho_novo in substituicoes_squad:
    tabela['squad'] = tabela['squad'].str.replace(trecho_antigo, trecho_novo, regex=False)

In [22]:
# Sanity check: standings squads that never appear as an opponent in the history (expected: none).
adversarios_conhecidos = historico['opponent'].unique()
[squad for squad in tabela['squad'].unique() if squad not in adversarios_conhecidos]

[]

In [23]:
# Distinct opponent names after normalisation, in order of first appearance.
pd.unique(historico['opponent'])

array(['sao_paulo', 'bahia', 'flamengo', 'atl_mineiro', 'corinthians',
       'goias', 'fluminense', 'america_mg', 'ath_paranaense', 'fortaleza',
       'cuiaba', 'palmeiras', 'vasco_da_gama', 'gremio', 'bragantino',
       'santos', 'coritiba', 'cruzeiro', 'internacional', 'botafogo_rj',
       'ceara', 'juventude', 'atl_goianiense', 'avai', 'sport_recife',
       'chapecoense', 'csa', 'parana', 'vitoria', 'ponte_preta',
       'santa_cruz', 'figueirense', 'joinville'], dtype=object)

### Feature Engineering

In [24]:
# Goals scored per match for each standings row, and its mean over every row of `tabela`.
gols_por_jogo = tabela['gf'] / tabela['mp']
media_gols_liga = gols_por_jogo.mean()

tabela['xg_conv'] = tabela['gf'] - tabela['xg']      # goals scored minus xG
tabela['xga_conv'] = tabela['ga'] - tabela['xga']    # goals conceded minus xGA
tabela['att_rating'] = gols_por_jogo / media_gols_liga                      # goals per match / league mean
tabela['def_rating'] = (tabela['ga'] / tabela['mp']) / media_gols_liga      # goals conceded per match / league mean
tabela['naive_rating'] = tabela['att_rating'] * tabela['def_rating'] * media_gols_liga  # att * def * league mean

# Per-team means are computed once with groupby and mapped onto `squad`, instead of
# re-filtering the match history for every standings row (a team appears once per
# season in `tabela`, so the row-wise loop repeated identical work).
# `historico_casa` holds one row per match from the home side's perspective:
#   grouped by `team`     -> the team's own numbers when playing at home
#   grouped by `opponent` -> the numbers of the home side faced when playing away
colunas_gols = ['xg', 'xga', 'gf', 'ga']
medias_como_mandante = historico_casa.groupby('team')[colunas_gols].mean()
medias_como_visitante = historico_casa.groupby('opponent')[colunas_gols].mean()

# (output prefix, column read for home games, column read for away games).
# Away values use the mirrored column because the rows are written from the host's view.
espelhamento = [
    ('xg', 'xg', 'xga'),
    ('xga', 'xga', 'xg'),
    ('gf', 'gf', 'ga'),
    ('ga', 'ga', 'gf'),
]
for prefixo, coluna_casa, coluna_fora in espelhamento:
    tabela[f'{prefixo}_casa'] = tabela['squad'].map(medias_como_mandante[coluna_casa])
    tabela[f'{prefixo}_fora'] = tabela['squad'].map(medias_como_visitante[coluna_fora])
    tabela[f'{prefixo}_total'] = (tabela[f'{prefixo}_casa'] + tabela[f'{prefixo}_fora']) / 2

# Possession, shots on target and shots come from the team's own rows, split by venue.
colunas_jogo = ['poss', 'sot', 'sh']
medias_jogo_casa = historico[historico['venue'] == 'Home'].groupby('team')[colunas_jogo].mean()
medias_jogo_fora = historico[historico['venue'] == 'Away'].groupby('team')[colunas_jogo].mean()

for estatistica in colunas_jogo:
    tabela[f'{estatistica}_casa'] = tabela['squad'].map(medias_jogo_casa[estatistica])
    tabela[f'{estatistica}_fora'] = tabela['squad'].map(medias_jogo_fora[estatistica])
    tabela[f'{estatistica}_total'] = (tabela[f'{estatistica}_casa'] + tabela[f'{estatistica}_fora']) / 2


In [25]:
# Form over the last five rounds of the current season.
# The season condition is required: `historico_casa` spans every season loaded above,
# so filtering on `round` alone also selected the late rounds of all earlier seasons.
filtro_recente = (
    (historico_casa['season'] == season_atual)
    & (historico_casa['round'] > rodada_recente - 5)
)
historico_5jogos = historico_casa[filtro_recente]

# `historico_5jogos` holds one row per match from the home side's perspective:
#   grouped by `team`     -> the team's own numbers when playing at home
#   grouped by `opponent` -> the numbers of the home side faced when playing away
colunas_gols_5 = ['xg', 'xga', 'gf', 'ga']
medias_5_mandante = historico_5jogos.groupby('team')[colunas_gols_5].mean()
medias_5_visitante = historico_5jogos.groupby('opponent')[colunas_gols_5].mean()

# (output prefix, column read for home games, column read for away games).
espelhamento_5 = [
    ('xg', 'xg', 'xga'),
    ('xga', 'xga', 'xg'),
    ('gf', 'gf', 'ga'),
    ('ga', 'ga', 'gf'),
]
for prefixo, coluna_casa, coluna_fora in espelhamento_5:
    tabela[f'{prefixo}_5casa'] = tabela['squad'].map(medias_5_mandante[coluna_casa])
    tabela[f'{prefixo}_5fora'] = tabela['squad'].map(medias_5_visitante[coluna_fora])
    # Every *_5total is the average of the home and away means. gf_5total and
    # ga_5total used to be a plain sum, unlike xg_5total / xga_5total and unlike
    # all the full-history *_total columns.
    tabela[f'{prefixo}_5total'] = (tabela[f'{prefixo}_5casa'] + tabela[f'{prefixo}_5fora']) / 2
  

### Divisões e Exports

In [26]:
# Standings rows of the current season. Uses `season_atual` (derived from the match
# history) instead of a hard-coded year, so it stays in sync with the export paths
# below, which are already built from `season_atual`.
# .copy() makes this an independent frame, since columns are added to it later.
tabela_atual = tabela[tabela['season'] == season_atual].copy()

In [27]:
# Finished seasons: one `tabela.xlsx` per season folder.
# The current season is skipped by value rather than by position (`unique()[1:]`
# only worked while the current season happened to be the first one listed).
for season in tabela['season'].unique():
    if season == season_atual:
        continue
    dir_season = f'datasets/{liga}/standings/all_standings/season-{season}'
    os.makedirs(dir_season, exist_ok=True)
    df = tabela[tabela['season'] == season]
    df.to_excel(f'{dir_season}/tabela.xlsx', index=False)

# Current season: the file name carries the latest round, so each round gets its own snapshot.
dir_season_atual = f'datasets/{liga}/standings/all_standings/season-{season_atual}'
os.makedirs(dir_season_atual, exist_ok=True)
tabela_atual.to_excel(f'{dir_season_atual}/tabela-rodada{rodada_recente}.xlsx', index=False)

### Tabelas casa e fora

In [28]:
# Points won at home and away by each team in the current season.
# Both lists follow the order of historico_atual['team'].unique().
historico_atual = historico[historico['season'] == season_atual]

pontos_por_resultado = {'W': 3, 'D': 1, 'L': 0}
jogos_casa_atual = historico_atual[historico_atual['venue'] == 'Home']
jogos_fora_atual = historico_atual[historico_atual['venue'] == 'Away']

tabela_casa = [
    sum(jogos_casa_atual.loc[jogos_casa_atual['team'] == time, 'result'].map(pontos_por_resultado))
    for time in historico_atual['team'].unique()
]
tabela_fora = [
    sum(jogos_fora_atual.loc[jogos_fora_atual['team'] == time, 'result'].map(pontos_por_resultado))
    for time in historico_atual['team'].unique()
]

In [29]:
# `tabela_casa` / `tabela_fora` were accumulated in the order of
# historico_atual['team'].unique(), which is not guaranteed to be the row order of
# `tabela_atual`. Attach the points by team name instead of by position, so a
# different ordering can never hand one team another team's points.
times_na_ordem_dos_pontos = historico_atual['team'].unique()
pts_casa_por_time = dict(zip(times_na_ordem_dos_pontos, tabela_casa))
pts_fora_por_time = dict(zip(times_na_ordem_dos_pontos, tabela_fora))

# Work on an independent frame so the new columns are not written into a slice of `tabela`.
tabela_atual = tabela_atual.copy()
# A squad missing from the current-season history gets NaN rather than a wrong value.
tabela_atual['pts_casa'] = tabela_atual['squad'].map(pts_casa_por_time)
tabela_atual['pts_fora'] = tabela_atual['squad'].map(pts_fora_por_time)

In [30]:
# Away standings: current table ordered by away points, re-ranked from 1.
# The rank length follows the table instead of assuming exactly 20 teams.
tabela_fora = tabela_atual.sort_values(by='pts_fora', ascending=False, ignore_index=True)
tabela_fora['rk'] = range(1, len(tabela_fora) + 1)

In [31]:
# Home standings: current table ordered by home points, re-ranked from 1.
# The rank length follows the table instead of assuming exactly 20 teams.
tabela_casa = tabela_atual.sort_values(by='pts_casa', ascending=False, ignore_index=True)
tabela_casa['rk'] = range(1, len(tabela_casa) + 1)

In [32]:
# Export the home and away standings of the current season, one snapshot per round.
# exist_ok=True replaces the separate exists() checks (no check-then-create gap).
dir_tabela_casa = f'datasets/{liga}/standings/home_standings/season-{season_atual}'
dir_tabela_fora = f'datasets/{liga}/standings/away_standings/season-{season_atual}'
os.makedirs(dir_tabela_casa, exist_ok=True)
os.makedirs(dir_tabela_fora, exist_ok=True)

tabela_casa.to_excel(f'{dir_tabela_casa}/tabela-rodada{rodada_recente}.xlsx', index=False)
tabela_fora.to_excel(f'{dir_tabela_fora}/tabela-rodada{rodada_recente}.xlsx', index=False)

# RODADAS

### Data Cleaning

In [33]:
# Keep only fixture rows that have a home team.
rodadas = rodadas.dropna(subset=['home'])

In [34]:
# Discard the fixture columns that are not used anywhere below.
rodadas = rodadas.drop(
    columns=['day', 'date', 'time', 'attendance', 'venue', 'referee', 'match report', 'notes']
)

In [35]:
# Normalise `home`: transliterate, lowercase, spaces -> underscores, drop parentheses.
# regex=False is passed explicitly so '(' and ')' are always treated as literal
# characters (the default of `regex` differs between pandas 1.x and 2.x).
rodadas['home'] = (
    rodadas['home']
    .apply(remover_acentos)
    .str.lower()
    .str.replace(' ', '_', regex=False)
    .str.replace('(', '', regex=False)
    .str.replace(')', '', regex=False)
    # Brasileirao-specific
    .str.replace('atletico', 'atl', regex=False)
    # Premier League-specific
    .str.replace("nott'ham_forest", "nottingham_forest", regex=False)
)

In [36]:
# Normalise `away`: transliterate, lowercase, spaces -> underscores, drop parentheses.
# regex=False is passed explicitly so '(' and ')' are always treated as literal
# characters (the default of `regex` differs between pandas 1.x and 2.x).
rodadas['away'] = (
    rodadas['away']
    .apply(remover_acentos)
    .str.lower()
    .str.replace(' ', '_', regex=False)
    .str.replace('(', '', regex=False)
    .str.replace(')', '', regex=False)
    # Brasileirao-specific
    .str.replace('atletico', 'atl', regex=False)
    # Premier League-specific
    .str.replace("nott'ham_forest", "nottingham_forest", regex=False)
)

In [37]:
# Sanity check: home teams in the fixtures that never appear as an opponent in the history (expected: none).
adversarios_no_historico = historico['opponent'].unique()
[mandante for mandante in rodadas['home'].unique() if mandante not in adversarios_no_historico]

[]

In [38]:
# Split `score` on the en dash into home and away goals, then drop the raw column.
placar_separado = rodadas['score'].str.split('–', expand=True)
rodadas[['gols_casa', 'gols_fora']] = placar_separado
rodadas = rodadas.drop(columns=['score'])

# Place the two goal columns at positions 5 and 6.
for posicao, coluna_gols in enumerate(('gols_casa', 'gols_fora'), start=5):
    rodadas.insert(posicao, coluna_gols, rodadas.pop(coluna_gols))

rodadas = rodadas.rename(columns={'xg.1': 'xg_fora', 'xg': 'xg_casa'})

# Goals become ints; anything int() rejects (e.g. a missing score) becomes pd.NA.
for coluna_gols in ('gols_casa', 'gols_fora'):
    rodadas[coluna_gols] = rodadas[coluna_gols].apply(convert_to_int)


In [39]:
# Preview the first five cleaned fixtures.
rodadas.head(n=5)

Unnamed: 0,wk,home,xg_casa,xg_fora,away,gols_casa,gols_fora
0,1.0,america_mg,1.1,3.1,fluminense,0,3
1,1.0,palmeiras,3.0,1.0,cuiaba,2,1
2,1.0,bragantino,2.0,0.7,bahia,2,1
3,1.0,botafogo_rj,0.9,2.0,sao_paulo,2,1
4,1.0,ath_paranaense,0.8,1.0,goias,2,0


In [40]:
# Final column order for the exported fixtures.
ordem_colunas_rodadas = ['wk', 'xg_casa', 'xg_fora', 'home', 'gols_casa', 'gols_fora', 'away']
rodadas = rodadas.loc[:, ordem_colunas_rodadas]

### Divisão e Export

In [41]:
# Export the cleaned fixtures.
# The target folder is created first so the export does not depend on it already existing.
dir_rodadas = f'datasets/{liga}/rounds/all_rounds'
os.makedirs(dir_rodadas, exist_ok=True)
rodadas.to_excel(f'{dir_rodadas}/rodadas.xlsx', index=False)