In [59]:
import os
import pandas as pd
import numpy as np
from unidecode import unidecode

import warnings
from pandas.errors import SettingWithCopyWarning
# Silence pandas' SettingWithCopyWarning and every FutureWarning for the whole
# notebook.
# NOTE(review): this also hides genuine chained-assignment problems; the
# RODADAS cells filter `rodadas` with .loc and then mutate the result in place.
warnings.simplefilter(action="ignore", category=SettingWithCopyWarning)
warnings.simplefilter(action='ignore', category=FutureWarning)

# IMPORTS E FUNÇÕES

In [60]:
def remover_acentos(texto):
    """Transliterate ``texto`` to plain ASCII using ``unidecode``.

    Parameters
    ----------
    texto : str or any
        Value taken from a name column. Only ``str`` values are transliterated.

    Returns
    -------
    str or any
        The ASCII version of ``texto`` when it is a string; otherwise the
        value is returned unchanged.
    """
    # Empty Excel cells arrive as NaN/None; handing those to unidecode raises,
    # which would abort a whole `.apply(remover_acentos)` over the column.
    if not isinstance(texto, str):
        return texto
    return unidecode(texto)

In [61]:
def convert_to_int(value):
    """Convert ``value`` to ``int``, falling back to ``pd.NA``.

    Parameters
    ----------
    value : any
        Typically a string fragment of a score or a missing value.

    Returns
    -------
    int or pandas.NA
        ``int(value)`` when the conversion succeeds, otherwise ``pd.NA``.
    """
    try:
        return int(value)
    # ValueError: non-numeric text or NaN; TypeError: None / pd.NA;
    # OverflowError: +/- infinity, which int() cannot represent.
    except (ValueError, TypeError, OverflowError):
        return pd.NA

In [62]:
# Raw (untreated) inputs: the match history and the league table.
historico = pd.read_excel('../datasets/xlsx/historico(naotratado).xlsx')
tabela = pd.read_excel('../datasets/xlsx/tabela(naotratado).xlsx')
# NOTE(review): the fixtures load below is disabled, yet the RODADAS cells
# further down read `rodadas`; on a fresh kernel they raise NameError.
#rodadas = pd.read_excel(f'datasets/xlsx/rounds/prox_partidas(naotratado).xlsx')

In [63]:
# Most recent season present in the raw history.
season_atual = historico['season'].max()

# Highest round number found in that season: the raw 'round' text is split on
# a space and its second token is read as the integer round number.
rodadas_season_atual = historico.loc[historico['season'] == season_atual, 'round']
numero_rodada = rodadas_season_atual.str.split(' ', expand=True)[1].astype(int)
rodada_recente = numero_rodada.max()

# HISTORICO

### Data cleaning

In [64]:
# Columns discarded from the match history before cleaning.
colunas_descartadas = [
    'notes', 'captain', 'formation', 'referee',
    'match report', 'attendance', 'time',
]
historico.drop(columns=colunas_descartadas, inplace=True)

In [65]:
# Goals for, goals against and season are stored as plain integers.
for coluna in ('gf', 'ga', 'season'):
    historico[coluna] = historico[coluna].astype(int)

# Keep only the numeric part of 'round' (second space-separated token).
rodada_partes = historico['round'].str.split(' ', expand=True)
historico['round'] = rodada_partes[1].astype(int)

In [66]:
# Parse match dates written as 'YYYY-MM-DD'.
historico['date'] = pd.to_datetime(historico['date'], format='%Y-%m-%d')

# Age of every match, in days, relative to the most recent match in the data.
# Series.max() skips NaT, whereas the builtin max() compares element by element
# and can return NaT (turning every 'days' value into NaN) if a date is missing.
historico['days'] = (historico['date'].max() - historico['date']).dt.days

# Exponential time-decay weight: 1.0 for the newest match, shrinking by a
# factor of exp(-0.001) per day of age.
historico['time_diff'] = np.exp(-0.001 * historico['days'])

In [67]:
# Turn opponent names into lowercase, accent-free, underscore-separated slugs.
# regex=False keeps every pattern literal, so '(' and ')' can never be parsed
# as (invalid) regular expressions on pandas versions where str.replace
# defaults to regex=True. The duplicated ')' removal was dropped.
historico['opponent'] = (
    historico['opponent']
    .apply(remover_acentos)
    .str.lower()
    .str.replace(' ', '_', regex=False)
    .str.replace('(', '', regex=False)
    .str.replace(')', '', regex=False)
)

# Literal substitutions, applied in order (later entries rely on earlier ones,
# e.g. 'united' -> 'utd' must run before 'west_ham_utd' -> 'west_ham').
substituicoes_opponent = [
    # Specific to the Brasileirão
    ('atletico', 'atl'),
    # Specific to the Premier League
    ('united', 'utd'),
    ("nott'ham_forest", "nottingham_forest"),
]
substituicoes_team = [
    # Specific to the Brasileirão
    ('athletico', 'ath'),
    ('atletico', 'atl'),
    # Specific to the Premier League
    ('_hotspur', ''),
    ('rhampton_wanderers', 's'),
    ('united', 'utd'),
    ('west_ham_utd', 'west_ham'),
    ('_and_hove_albion', ''),
    ('wich_albion', ''),
    ('_town', ''),
    ('luton', 'luton_town'),
]

for antigo, novo in substituicoes_opponent:
    historico['opponent'] = historico['opponent'].str.replace(antigo, novo, regex=False)

# 'team' is not lowercased or slugified here; the patterns assume it already
# arrives in slug form.
for antigo, novo in substituicoes_team:
    historico['team'] = historico['team'].str.replace(antigo, novo, regex=False)

In [68]:
# Sanity check: opponents that have no matching 'team' name (empty list when
# both columns use the same naming). The set is built once instead of calling
# .unique() again for every opponent.
times_conhecidos = set(historico['team'].unique())
[item for item in historico['opponent'].unique() if item not in times_conhecidos]

[]

In [69]:
# Distinct team names, in order of first appearance.
pd.unique(historico['team'])

array(['tottenham', 'manchester_city', 'arsenal', 'liverpool',
       'aston_villa', 'newcastle_utd', 'brighton', 'manchester_utd',
       'west_ham', 'chelsea', 'crystal_palace', 'wolves', 'fulham',
       'brentford', 'nottingham_forest', 'everton', 'luton_town',
       'burnley', 'bournemouth', 'sheffield_utd', 'leicester_city',
       'leeds_utd', 'southampton', 'watford', 'norwich_city', 'west_brom',
       'cardiff_city', 'huddersfield', 'swansea_city', 'stoke_city',
       'hull_city', 'middlesbrough', 'sunderland'], dtype=object)

In [70]:
# Every distinct club name appearing in either 'team' or 'opponent'.
nomes_times = historico[['team', 'opponent']].values.ravel('K')
all_teams = pd.unique(nomes_times)

# Integer code for each distinct name, gathered into one lookup table that is
# shared by both id columns.
team_mapping, _ = pd.factorize(all_teams)
id_por_time = dict(zip(all_teams, team_mapping))

# NOTE(review): despite the column names, 'team' is not always the home side
# (rows with venue == 'Away' exist), so these are really team/opponent ids.
historico['home_team_id'] = historico['team'].map(id_por_time)
historico['away_team_id'] = historico['opponent'].map(id_por_time)

In [71]:
# Promote the current row index to a regular column and call it 'game_id'.
historico.reset_index(drop=False, inplace=True)
historico.rename({'index': 'game_id'}, axis='columns', inplace=True)

In [72]:
# Preview the first five cleaned rows.
historico.head(5)

Unnamed: 0,game_id,date,comp,round,day,venue,result,gf,ga,opponent,...,xga,poss,sh,sot,season,team,days,time_diff,home_team_id,away_team_id
0,0,2023-08-13,Premier League,1,Sun,Away,D,2,2,brentford,...,2.2,69,18,6,2023,tottenham,75,0.927743,0,13
1,1,2023-08-19,Premier League,2,Sat,Home,W,2,0,manchester_utd,...,2.1,55,17,6,2023,tottenham,69,0.933327,0,7
2,2,2023-08-26,Premier League,3,Sat,Away,W,2,0,bournemouth,...,0.7,56,17,6,2023,tottenham,62,0.939883,0,18
3,3,2023-09-02,Premier League,4,Sat,Away,W,5,2,burnley,...,1.3,53,21,11,2023,tottenham,55,0.946485,0,17
4,4,2023-09-16,Premier League,5,Sat,Home,W,2,1,sheffield_utd,...,0.7,70,28,10,2023,tottenham,41,0.959829,0,19


In [77]:
# Lookup table with one row per (id, name) pair found in the history.
# NOTE(review): league_id is hard-coded to 2; its meaning is not defined in
# this notebook (presumably the Premier League) — confirm.
teams_id = (
    historico[['home_team_id', 'team']]
    .drop_duplicates()
    .reset_index(drop=True)
    .rename(columns={'home_team_id': 'team_id', 'team': 'team_name'})
    .assign(league_id=2)
)
teams_id.head()

Unnamed: 0,team_id,team_name,league_id
0,0,tottenham,2
1,1,manchester_city,2
2,2,arsenal,2
3,3,liverpool,2
4,4,aston_villa,2


### Divisão e Export

In [78]:
# Write the team lookup table (the row index is exported as well).
teams_id.to_excel('../datasets/xlsx/teams_id(tratado).xlsx')

In [74]:
# Write the cleaned match history (the row index is exported as well).
historico.to_excel('../datasets/xlsx/historico(tratado).xlsx')

# TABELA

### Data Cleaning

In [51]:
# Columns discarded from the league table before cleaning.
colunas_tabela_descartadas = [
    'attendance', 'top team scorer', 'goalkeeper', 'notes', 'last 5',
]
tabela.drop(columns=colunas_tabela_descartadas, inplace=True)

In [52]:
# Turn squad names into lowercase, accent-free, underscore-separated slugs.
# regex=False keeps every pattern literal, so '(' and ')' can never be parsed
# as (invalid) regular expressions on pandas versions where str.replace
# defaults to regex=True. The duplicated ')' removal was dropped.
tabela['squad'] = (
    tabela['squad']
    .apply(remover_acentos)
    .str.lower()
    .str.replace(' ', '_', regex=False)
    .str.replace('(', '', regex=False)
    .str.replace(')', '', regex=False)
)

# Literal substitutions, applied in order.
substituicoes_squad = [
    # Specific to the Brasileirão
    ('atletico', 'atl'),
    # Specific to the Premier League
    ('united', 'utd'),
    ("nott'ham_forest", "nottingham_forest"),
]
for antigo, novo in substituicoes_squad:
    tabela['squad'] = tabela['squad'].str.replace(antigo, novo, regex=False)

In [53]:
# Sanity check: squads in `tabela` with no matching name in
# historico['opponent']. The set is built once instead of calling .unique()
# again for every squad.
oponentes_conhecidos = set(historico['opponent'].unique())
[item for item in tabela['squad'].unique() if item not in oponentes_conhecidos]

['botafogo_rj',
 'bragantino',
 'palmeiras',
 'flamengo',
 'ath_paranaense',
 'gremio',
 'atl_mineiro',
 'fluminense',
 'fortaleza',
 'sao_paulo',
 'internacional',
 'cuiaba',
 'cruzeiro',
 'corinthians',
 'bahia',
 'santos',
 'goias',
 'vasco_da_gama',
 'coritiba',
 'america_mg',
 'ceara',
 'atl_goianiense',
 'avai',
 'juventude',
 'sport_recife',
 'chapecoense',
 'csa',
 'vitoria',
 'parana',
 'ponte_preta',
 'figueirense',
 'santa_cruz',
 'joinville']

In [54]:
# Distinct opponent names, in order of first appearance.
pd.unique(historico['opponent'])

array(['Brentford', 'Manchester Utd', 'Bournemouth', 'Burnley',
       'Sheffield Utd', 'Arsenal', 'Liverpool', 'Luton Town', 'Fulham',
       'Crystal Palace', 'Newcastle Utd', 'West Ham', "Nott'ham Forest",
       'Wolves', 'Brighton', 'Everton', 'Tottenham', 'Manchester City',
       'Chelsea', 'Aston Villa', 'Southampton', 'Leicester City',
       'Leeds United', 'Norwich City', 'Watford', 'West Brom',
       'Huddersfield', 'Cardiff City', 'Stoke City', 'Swansea City',
       'Hull City', 'Middlesbrough', 'Sunderland'], dtype=object)

### Feature Engineering

In [98]:
# Goals scored per match for each row, and its mean over every row of `tabela`
# (used as the league baseline in the ratings below).
gols_por_jogo = tabela['gf'] / tabela['mp']
media_liga = gols_por_jogo.mean()

# Over/under-performance against expected goals (for and against).
tabela['xg_conv'] = tabela['gf'] - tabela['xg']
tabela['xga_conv'] = tabela['ga'] - tabela['xga']

# Goals scored / conceded per match, both relative to the same baseline.
tabela['att_rating'] = gols_por_jogo / media_liga
tabela['def_rating'] = (tabela['ga'] / tabela['mp']) / media_liga

# Attack rating * defence rating * baseline.
tabela['naive_rating'] = tabela['att_rating'] * tabela['def_rating'] * media_liga

### Divisões e Exports

In [106]:
# Write the treated league table next to the other treated files. The raw
# table is read from '../datasets/xlsx/' and every other export in this
# notebook targets that folder; the previous path lacked the '../' prefix.
tabela.to_excel('../datasets/xlsx/tabela(tratado).xlsx', index=False)

# RODADAS

### Data Cleaning

In [107]:
# Keep only rows that have a home team. The explicit .copy() makes the result
# an independent frame, so the in-place drops and column assignments in the
# following cells do not operate on a slice of the original data (the source
# of SettingWithCopyWarning).
rodadas = rodadas.loc[rodadas['home'].notna()].copy()

In [108]:
# Columns discarded from the fixtures table before cleaning.
colunas_rodadas_descartadas = [
    'day', 'date', 'time', 'attendance', 'venue',
    'referee', 'match report', 'notes',
]
rodadas.drop(columns=colunas_rodadas_descartadas, inplace=True)

In [109]:
# Turn home-team names into lowercase, accent-free, underscore-separated
# slugs. regex=False keeps every pattern literal, so '(' and ')' can never be
# parsed as (invalid) regular expressions on pandas versions where str.replace
# defaults to regex=True. The duplicated ')' removal was dropped.
rodadas['home'] = (
    rodadas['home']
    .apply(remover_acentos)
    .str.lower()
    .str.replace(' ', '_', regex=False)
    .str.replace('(', '', regex=False)
    .str.replace(')', '', regex=False)
)

# Literal substitutions, applied in order.
substituicoes_home = [
    # Specific to the Brasileirão
    ('atletico', 'atl'),
    # Specific to the Premier League
    ("nott'ham_forest", "nottingham_forest"),
]
for antigo, novo in substituicoes_home:
    rodadas['home'] = rodadas['home'].str.replace(antigo, novo, regex=False)

In [110]:
# Turn away-team names into lowercase, accent-free, underscore-separated
# slugs. regex=False keeps every pattern literal, so '(' and ')' can never be
# parsed as (invalid) regular expressions on pandas versions where str.replace
# defaults to regex=True. The duplicated ')' removal was dropped.
rodadas['away'] = (
    rodadas['away']
    .apply(remover_acentos)
    .str.lower()
    .str.replace(' ', '_', regex=False)
    .str.replace('(', '', regex=False)
    .str.replace(')', '', regex=False)
)

# Literal substitutions, applied in order.
substituicoes_away = [
    # Specific to the Brasileirão
    ('atletico', 'atl'),
    # Specific to the Premier League
    ("nott'ham_forest", "nottingham_forest"),
]
for antigo, novo in substituicoes_away:
    rodadas['away'] = rodadas['away'].str.replace(antigo, novo, regex=False)

In [111]:
# Sanity check: home teams in `rodadas` with no matching name in
# historico['opponent']. The set is built once instead of calling .unique()
# again for every home team.
oponentes_historico = set(historico['opponent'].unique())
[item for item in rodadas['home'].unique() if item not in oponentes_historico]

[]

In [112]:
# Split the 'score' text on the en dash into home goals and away goals.
placar = rodadas['score'].str.split('–', expand=True)
rodadas[['gols_casa', 'gols_fora']] = placar
rodadas.drop(columns=['score'], inplace=True)

# Move the two goal columns to positions 5 and 6.
for posicao, coluna in ((5, 'gols_casa'), (6, 'gols_fora')):
    rodadas.insert(posicao, coluna, rodadas.pop(coluna))

# 'xg' is the home side's expected goals and 'xg.1' the away side's.
rodadas = rodadas.rename(columns={'xg.1': 'xg_fora', 'xg': 'xg_casa'})

# Goals become ints; anything that cannot be converted becomes pd.NA.
for coluna in ('gols_casa', 'gols_fora'):
    rodadas[coluna] = rodadas[coluna].apply(convert_to_int)


In [113]:
# Preview the first five cleaned fixture rows.
rodadas.head(5)

Unnamed: 0,wk,home,xg_casa,xg_fora,away,gols_casa,gols_fora
0,1.0,america_mg,1.1,3.1,fluminense,0,3
1,1.0,palmeiras,3.0,1.0,cuiaba,2,1
2,1.0,bragantino,2.0,0.7,bahia,2,1
3,1.0,botafogo_rj,0.9,2.0,sao_paulo,2,1
4,1.0,ath_paranaense,0.8,1.0,goias,2,0


In [114]:
# Final column order for the export.
ordem_colunas = ['wk', 'xg_casa', 'xg_fora', 'home', 'gols_casa', 'gols_fora', 'away']
rodadas = rodadas.loc[:, ordem_colunas]

### Divisão e Export

In [115]:
# Export the cleaned fixtures table without the index column.
# NOTE(review): `liga` is not defined in any cell visible in this notebook, so
# this f-string raises NameError on a fresh kernel. Presumably it is the league
# folder name — define it in a configuration cell before running this export.
rodadas.to_excel(f'datasets/{liga}/rounds/all_rounds/rodadas.xlsx', index= False)