In [None]:
import time
import warnings
from io import StringIO

import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup
from pandas.errors import SettingWithCopyWarning

warnings.simplefilter(action="ignore", category=SettingWithCopyWarning)
warnings.simplefilter(action='ignore', category=FutureWarning)

# Leagues

1 - Brasileirao A \
2 - Premier League \
3 - Serie A TIM \
4 - Bundesliga \
5 - La Liga \
6 - Ligue 1

In [None]:
def _fbref_league(comp_name, comp_id, slug, label):
  """Build one league entry: [Comp name, stats URL, label, schedule URL]."""
  base = f"https://fbref.com/en/comps/{comp_id}"
  return [
    comp_name,                                          # [0] compared with the "Comp" column of match logs
    f"{base}/{slug}-Stats",                             # [1] league stats page (scraping entry point)
    label,                                              # [2] label stored in the exported tables
    f"{base}/schedule/{slug}-Scores-and-Fixtures",      # [3] fixtures page
  ]


# League key -> [Comp name, stats URL, label, schedule URL]
ligas = {
    'brasileirao_a': _fbref_league("Série A", 24, "Serie-A", 'Brasileirao'),
    'premier_league': _fbref_league("Premier League", 9, "Premier-League", 'Premier_League'),
    'serie_a_tim': _fbref_league("Serie A", 11, "Serie-A", 'Serie_A_TIM'),
    'bundesliga': _fbref_league("Bundesliga", 20, "Bundesliga", 'Bundesliga'),
    'la_liga': _fbref_league("La Liga", 12, "La-Liga", 'La_Liga'),
    'ligue_1': _fbref_league("Ligue 1", 13, "Ligue-1", 'Ligue_1'),
}


# Seasons to back-fill, newest first: 2023 down to 2015.
anos = [*range(2023, 2014, -1)]
# Uncomment to leave out the 4th entry (2020): anos.pop(3)

# Funcoes

In [None]:
# One entry per match-log table merged into the fixtures frame:
# (href fragment, text passed to read_html(match=...), columns to keep,
#  renames applied after the merge, columns created as NaN when absent)
_MATCH_LOG_TABLES = [
  ('all_comps/shooting/', 'Shooting', ['Sh', 'SoT'], None, ()),
  ('all_comps/keeper', 'Goalkeeping', ['Saves'], None, ()),
  # Trailing slash so the fragment cannot also match ".../passing_types/...".
  ('all_comps/passing/', 'Passing', ['Cmp', 'Att', 'PrgP', 'KP', '1/3'],
   {'1/3': 'pass_3rd'}, ()),
  ('all_comps/passing_types', 'Pass Types', ['Sw', 'Crs'], None, ()),
  ('all_comps/gca', 'Goal and Shot Creation', ['SCA', 'GCA'], None, ()),
  ('all_comps/defense', 'Defensive Actions',
   ['Tkl', 'TklW', 'Def 3rd', 'Att 3rd', 'Blocks', 'Int'],
   {'Att 3rd': 'Tkl_Att_3rd', 'Def 3rd': 'Tkl_Def_3rd'}, ()),
  # Must come after the defensive table: its 'Att 3rd' has already been renamed.
  ('all_comps/possession', 'Possession', ['Att 3rd'],
   {'Att 3rd': 'Touches_Att_3rd'}, ()),
  ('all_comps/misc', 'Miscellaneous Stats', ['Fls', 'Off', 'Recov'], None, ('Recov',)),
]


def _merge_match_log(matches_df, hrefs, link_fragment, table_match, columns,
                     rename=None, optional=()):
  """Fetch one match-log table and inner-join its columns onto matches_df by 'Date'.

  Returns matches_df unchanged (and prints a notice) when no link containing
  link_fragment exists or the table cannot be parsed/merged.
  """
  candidates = [h for h in hrefs if h and link_fragment in h]
  if not candidates:
    print(f'   ! {table_match}: link not found, columns skipped')
    return matches_df

  response = requests.get(f"https://fbref.com{candidates[0]}")
  time.sleep(2)  # pause after every request, successful or not
  try:
    stats = pd.read_html(StringIO(response.text), match=table_match)[0]
    stats.columns = stats.columns.droplevel()
    for col in optional:
      if col not in stats.columns:
        stats[col] = np.nan
    merged = matches_df.merge(stats[['Date'] + list(columns)], on='Date')
  except (ValueError, IndexError):
    print(f'   ! {table_match}: table not available, columns skipped')
    return matches_df

  if rename:
    merged = merged.rename(columns=rename)
  return merged


def match_history(league, years = [2023], update = None):
  """Scrape per-match statistics for every team of a league, season by season.

  Starts at the league stats page (league[1]) and follows the page's
  "previous season" link once per entry in ``years``; the values in ``years``
  are only used as labels, in that order (newest first).

  Parameters
  ----------
  league : sequence
      [competition name matched against the 'Comp' column, league stats URL,
      label, schedule URL]; only items 0 and 1 are used here.
  years : list of int
      Season labels, one per page visited.
  update : pandas.DataFrame, optional
      Previously exported history. Rows whose ``season`` is in ``years`` are
      left out (for every league present in the frame) and the rest is
      appended to the freshly scraped rows. The frame itself is not modified.

  Returns
  -------
  pandas.DataFrame
      One row per league match and team, lower-cased column names, plus
      ``season`` and ``team``. Empty when nothing could be scraped.
  """
  print("Getting the seasons: ", years)
  liga = league[1]
  frames = []
  per = 0
  for year in years:
    print("Scraping the season: ", year)
    season_page = requests.get(liga)
    season_soup = BeautifulSoup(season_page.text)
    tabela = season_soup.select('table.stats_table')[0]      # main league table
    links = [a.get('href') for a in tabela.find_all('a')]    # relative links
    links = [link for link in links if link and '/squads/' in link]  # team pages only
    urls = [f"https://fbref.com{link}" for link in links]

    try:
      prev_season = season_soup.select('a.prev')[0].get('href')  # move to the previous season
      liga = f"https://fbref.com{prev_season}"
    except IndexError:
      # Kept from the original: a season page without a "previous" link is skipped.
      print(f'***Erro para o ano: {year}')
      continue

    perx = 0
    # Iterate over every team of the league table
    for team in urls:
      nome_time = team.split('/')[-1].replace('-Stats', '').replace('-','_').lower()
      per+=1
      perx+=1
      percent_total = round((per/(len(urls)*len(years)))*100, 2)
      print(f'-> {percent_total}% ({perx}/{len(urls)} {nome_time})')
      team_page = requests.get(team)
      team_soup = BeautifulSoup(team_page.text)

      # Matches played. Every merge below starts from this frame, so a missing
      # stats table can never leave a stale frame from the previous team.
      matches_df = pd.read_html(StringIO(team_page.text), match= 'Scores & Fixtures')[0]

      hrefs = [a.get("href") for a in team_soup.find_all('a')]
      for fragment, caption, columns, rename, optional in _MATCH_LOG_TABLES:
        matches_df = _merge_match_log(
          matches_df, hrefs, fragment, caption, columns, rename, optional)

      # .copy() so the assignments below act on an independent frame.
      matches_df = matches_df[matches_df["Comp"] == league[0]].copy()
      matches_df['Season'] = year
      matches_df['Team'] = nome_time

      frames.append(matches_df)
      time.sleep(4)

  history = pd.concat(frames) if frames else pd.DataFrame()
  history.columns = [c.lower() for c in history.columns]
  if update is not None:
    previous = update.copy()
    previous.columns = [c.lower() for c in previous.columns]
    # Replace every re-scraped season, not just the last one visited.
    previous = previous[~previous['season'].isin(list(years))]
    history = pd.concat([history, previous], axis = 0)

  return history

In [None]:
def standings(league, years= [2023], update = None):
  """Scrape the league table ('Regular season') for each season in ``years``.

  Starts at the league stats page (league[1]) and follows the page's
  "previous season" link after each season; ``years`` only supplies the
  labels, in that order (newest first).

  Parameters
  ----------
  league : sequence
      [competition name, league stats URL, label, schedule URL]; items 1 and 2
      are used here (URL and ``league_name`` value).
  years : list of int
      Season labels, one per page visited.
  update : pandas.DataFrame, optional
      Previously exported standings. Rows whose ``season`` is in ``years`` are
      left out (for every league present in the frame) and the rest is
      appended. The frame itself is not modified.

  Returns
  -------
  pandas.DataFrame
      Standings with lower-cased columns plus ``season`` and ``league_name``.
  """
  frames = []
  liga = league[1]
  print("Getting the seasons: ", years)
  for year in years:
    print("Scraping the season: ", year)
    data = requests.get(liga)
    soup = BeautifulSoup(data.text)
    table_df = pd.read_html(StringIO(data.text), match= 'Regular season')[0]
    table_df['Season'] = year
    table_df['league_name'] = league[2]

    frames.append(table_df)
    time.sleep(10)

    prev_links = soup.select('a.prev')      # link to the previous season
    if not prev_links:
      # Without a previous-season link the next iteration would fetch this same
      # page again and store it under the wrong season label, so stop here.
      print('*Erro no ano: ', year)
      break
    liga = f"https://fbref.com{prev_links[0].get('href')}"

  table = pd.concat(frames) if frames else pd.DataFrame()
  table.columns = [c.lower() for c in table.columns]
  if update is not None:
    previous = update.copy()
    previous.columns = [c.lower() for c in previous.columns]
    # Replace every re-scraped season, not just the last one visited.
    previous = previous[~previous['season'].isin(list(years))]
    table = pd.concat([table, previous], axis = 0)

  return table

In [None]:
def next_matches(league):
  """Download the 'Scores & Fixtures' table of a league's schedule page.

  Parameters
  ----------
  league : sequence
      [competition name, league stats URL, label, schedule URL]; items 2 and 3
      are used here (``league_name`` value and the page to fetch).

  Returns
  -------
  pandas.DataFrame
      The fixtures table with lower-cased columns and a ``league_name`` column.
  """
  data = requests.get(league[3])
  # read_html expects a file-like object; literal HTML strings are deprecated.
  matches = pd.read_html(StringIO(data.text), match= 'Scores & Fixtures')[0]
  matches['league_name'] = league[2]
  matches.columns = [c.lower() for c in matches.columns]
  return matches

In [None]:
def get_squads(league, years = [2023], update = None):
  """Scrape the 'Standard Stats' squad table of every team, season by season.

  Starts at the league stats page (league[1]) and follows the page's
  "previous season" link once per entry in ``years``; ``years`` only supplies
  the labels, in that order (newest first).

  Parameters
  ----------
  league : sequence
      [competition name, league stats URL, label, schedule URL]; items 1 and 2
      are used here (URL and ``League`` value).
  years : list of int
      Season labels, one per page visited.
  update : pandas.DataFrame, optional
      Previously exported squads. Rows whose ``season`` is in ``years`` are
      left out (for every league present in the frame) and the rest is
      appended. The frame itself is not modified.

  Returns
  -------
  pandas.DataFrame
      Squad rows with lower-cased columns plus ``season``, ``team``, ``league``
      and ``coach``. Empty when nothing could be scraped.
  """
  print("Getting the seasons: ", years)
  liga = league[1]
  frames = []
  per = 0
  for year in years:
    print("Scraping the season: ", year)
    season_page = requests.get(liga)
    season_soup = BeautifulSoup(season_page.text)
    tabela = season_soup.select('table.stats_table')[0]      # main league table
    links = [a.get('href') for a in tabela.find_all('a')]    # relative links
    links = [link for link in links if link and '/squads/' in link]  # team pages only
    urls = [f"https://fbref.com{link}" for link in links]

    try:
      prev_season = season_soup.select('a.prev')[0].get('href')  # move to the previous season
      liga = f"https://fbref.com{prev_season}"
    except IndexError:
      # Kept from the original: a season page without a "previous" link is skipped.
      print(f'* Erro para o ano: {year}')
      continue

    # Iterate over every team of the league table
    perx = 0
    for team in urls:
      nome_time = team.split('/')[-1].replace('-Stats', '').replace('-','_').lower()
      per+=1
      perx+=1
      percent_total = round((per/(len(urls)*len(years)))*100, 2)
      print(f'-> {percent_total}% ({perx}/{len(urls)} {nome_time})')
      team_page = requests.get(team)
      team_soup = BeautifulSoup(team_page.text)

      # Squad table
      squads_df = pd.read_html(StringIO(team_page.text), match= 'Standard Stats')[0]
      squads_df.columns = squads_df.columns.droplevel()

      squads_df['Season'] = year
      squads_df['Team'] = nome_time
      squads_df['League'] = league[2]
      # Drop the last two rows; .copy() so 'coach' is set on an independent frame.
      squads_df = squads_df.iloc[:-2].copy()
      try:
        # NOTE(review): relies on the 7th <p> of the page holding the manager
        # line — confirm against the current fbref layout.
        coach = str(team_soup.find_all('p')[6])
        squads_df['coach'] = coach.split('<')[3].split('> ')[1]
      except IndexError:
        squads_df['coach'] = None
      # Keep only the first of any repeated column labels left by droplevel().
      squads_df = squads_df.loc[:,~squads_df.columns.duplicated(keep= 'first')]
      frames.append(squads_df)
      time.sleep(5)

  squads = pd.concat(frames) if frames else pd.DataFrame()
  squads.columns = [c.lower() for c in squads.columns]
  if update is not None:
    previous = update.copy()
    previous.columns = [c.lower() for c in previous.columns]
    # Replace every re-scraped season, not just the last one visited.
    previous = previous[~previous['season'].isin(list(years))]
    squads = pd.concat([squads, previous], axis = 0)

  return squads

# Exports

### Baixar anos anteriores:

In [99]:
# Back-fill the leagues still missing from the stored history and append them
# to the existing export.
historico = []
historico.append(match_history(ligas['la_liga'], years= [2023, 2022]))
historico.append(match_history(ligas['ligue_1'], years= anos))

historico = pd.concat(historico)

historico_incomp = pd.read_excel('../datasets/xlsx/raw/historico.xlsx')
historico1 = pd.concat([historico_incomp, historico])
# index=False: otherwise every read/write round trip adds an "Unnamed: 0" column.
historico1.to_excel('../datasets/xlsx/raw/historico.xlsx', index= False)

-> 5.0% (2/20 real_madrid)
-> 7.5% (3/20 barcelona)
-> 10.0% (4/20 atletico_madrid)
-> 12.5% (5/20 athletic_club)
-> 15.0% (6/20 real_sociedad)
-> 17.5% (7/20 real_betis)
-> 20.0% (8/20 las_palmas)
-> 22.5% (9/20 valencia)
-> 25.0% (10/20 rayo_vallecano)
-> 27.5% (11/20 getafe)
-> 30.0% (12/20 osasuna)
-> 32.5% (13/20 sevilla)
-> 35.0% (14/20 villarreal)
-> 37.5% (15/20 alaves)
-> 40.0% (16/20 cadiz)
-> 42.5% (17/20 mallorca)
-> 45.0% (18/20 celta_vigo)
-> 47.5% (19/20 granada)
-> 50.0% (20/20 almeria)
Scraping the season:  2022
-> 52.5% (1/20 barcelona)
-> 55.0% (2/20 real_madrid)
-> 57.5% (3/20 atletico_madrid)
-> 60.0% (4/20 real_sociedad)
-> 62.5% (5/20 villarreal)
-> 65.0% (6/20 real_betis)
-> 67.5% (7/20 osasuna)
-> 70.0% (8/20 athletic_club)
-> 72.5% (9/20 mallorca)
-> 75.0% (10/20 girona)
-> 77.5% (11/20 rayo_vallecano)
-> 80.0% (12/20 sevilla)
-> 82.5% (13/20 celta_vigo)
-> 85.0% (14/20 cadiz)
-> 87.5% (15/20 getafe)
-> 90.0% (16/20 valencia)
-> 92.5% (17/20 almeria)
-> 95.0% 

KeyboardInterrupt: 

In [None]:
# Full back-fill: match history of every configured league for all seasons in `anos`.
historico = []
for liga, info in ligas.items():
  print(f'Scraping the league: {info[2]}')
  season_frames = match_history(info, years= anos)
  historico.append(season_frames)

historico = pd.concat(historico)

historico.to_excel('../datasets/xlsx/raw/historico.xlsx', index= False)

In [None]:
# Full back-fill: squads of every configured league for all seasons in `anos`.
elenco = []
for liga, info in ligas.items():
  print(f'Scraping the league: {info[2]}')
  league_squads = get_squads(info, years= anos)
  elenco.append(league_squads)

elenco = pd.concat(elenco)

elenco.to_excel('../datasets/xlsx/raw/elenco.xlsx', index= False)

In [None]:
# Full back-fill: league tables of every configured league for all seasons in `anos`.
tabela = []
for liga, info in ligas.items():
  print(f'Scraping the league: {info[2]}')
  league_table = standings(info, years= anos)
  tabela.append(league_table)

tabela = pd.concat(tabela)

tabela.to_excel('../datasets/xlsx/raw/tabela.xlsx', index= False)

In [None]:
# Fixtures ("Scores & Fixtures") of every configured league.
rodadas = []
for liga, info in ligas.items():
  print(f'Scraping the league: {info[2]}')
  league_fixtures = next_matches(info)
  rodadas.append(league_fixtures)

rodadas = pd.concat(rodadas)

rodadas.to_excel('../datasets/xlsx/raw/rodadas.xlsx', index= False)

### Atualizar temporada atual:

In [None]:
# Refresh the current season: re-scrape it for every league and swap it into
# the stored match history.
historico_old = pd.read_excel('../datasets/xlsx/raw/historico.xlsx')
historico_old.columns = [c.lower() for c in historico_old.columns]

historico_novo = []
for liga in ligas:
  print(f'Scraping the league: {ligas[liga][2]}')
  # Scrape without `update`: passing the old frame to every call would return
  # the whole old history once per league and duplicate it in the concat.
  historico_novo.append(match_history(ligas[liga]))

historico_novo = pd.concat(historico_novo)

# Drop the stored rows of the re-scraped season(s) once, then add the fresh ones.
temporadas = historico_novo['season'].unique()
historico = pd.concat(
  [historico_novo, historico_old[~historico_old['season'].isin(temporadas)]])

# Write back to the same file that was read (the other cells use '../datasets').
historico.to_excel('../datasets/xlsx/raw/historico.xlsx', index= False)

In [None]:
# Refresh the current season: re-scrape the league tables and swap them into
# the stored standings.
tabela_old = pd.read_excel('../datasets/xlsx/raw/tabela.xlsx')
tabela_old.columns = [c.lower() for c in tabela_old.columns]

tabela_nova = []
for liga in ligas:
  print(f'Scraping the league: {ligas[liga][2]}')
  # standings(), not match_history(): this cell maintains the league tables.
  # Scraped without `update` so the old rows are merged in only once below.
  tabela_nova.append(standings(ligas[liga]))

tabela_nova = pd.concat(tabela_nova)

# Drop the stored rows of the re-scraped season(s) once, then add the fresh ones.
temporadas = tabela_nova['season'].unique()
tabela = pd.concat(
  [tabela_nova, tabela_old[~tabela_old['season'].isin(temporadas)]])

# Write back to the same file that was read (the other cells use '../datasets').
tabela.to_excel('../datasets/xlsx/raw/tabela.xlsx', index= False)

In [None]:
# Refresh the current season: re-scrape the squads and swap them into the
# stored squad export.
elenco_old = pd.read_excel('../datasets/xlsx/raw/elenco.xlsx')
elenco_old.columns = [c.lower() for c in elenco_old.columns]

elenco_novo = []
for liga in ligas:
  print(f'Scraping the league: {ligas[liga][2]}')
  # Scrape without `update`: passing the old frame to every call would return
  # the whole old export once per league and duplicate it in the concat.
  elenco_novo.append(get_squads(ligas[liga]))

elenco_novo = pd.concat(elenco_novo)

# Drop the stored rows of the re-scraped season(s) once, then add the fresh ones.
temporadas = elenco_novo['season'].unique()
elenco = pd.concat(
  [elenco_novo, elenco_old[~elenco_old['season'].isin(temporadas)]])

# Write back to the same file that was read (the other cells use '../datasets').
elenco.to_excel('../datasets/xlsx/raw/elenco.xlsx', index= False)