# Automatizing new matches downloads

In [1]:
import pandas as pd
from bs4 import BeautifulSoup
import requests
from io import StringIO
import time
import re

### Obtaining individual data in the website

In [2]:
class FBREFData:
    def __init__(self, base_url):
        self.base_url = base_url
        self.links = []
        self.gameweeks = []
        self.get_links()
        self.get_gameweek()
        self.create_matches_csv()
    
    def get_links(self):
        response = requests.get(self.base_url)
        soup = BeautifulSoup(response.text, 'html.parser')

        report_cells = soup.find_all('td', {'data-stat': 'match_report'})

        for idx, cell in enumerate(report_cells):
            link = cell.find('a')
            if link:
                url = link['href']
                self.links.append(f'https://fbref.com{url}')

            # Stop after getting x links
            if len(self.links) >= 3:
                break
            
            time.sleep(6)

        df_links = pd.DataFrame(self.links, columns=['link'])
        df_links.to_csv('links.csv', index=False)
        print(f"Links saved in links.csv")
    
    def get_gameweek(self):
        response = requests.get(self.base_url)
        soup = BeautifulSoup(response.text, 'html.parser')

        gameweek_cells = soup.find_all('th', {'data-stat': 'gameweek'})

        for idx, cell in enumerate(gameweek_cells):
            gameweek_value = cell.text.strip() if cell else 'N/A'
            self.gameweeks.append(gameweek_value)

            # Stop after getting x gameweeks
            if len(self.gameweeks) >= 4:
                break

            time.sleep(6)
        
        df_gameweeks = pd.DataFrame(self.gameweeks, columns=['gameweek'])

        df_gameweeks.to_csv('gameweeks.csv', index=False)
        print("Gameweeks saved in gameweeks.csv")

        df_gameweeks = df_gameweeks[~df_gameweeks['gameweek'].isin(['Sem.', ''])]

        df_gameweeks.to_csv('gameweeks.csv', index=False)
        
    def create_matches_csv(self):
        df_matches = pd.DataFrame({
            'date_of_match': pd.Series([None] * len(self.links), dtype='object'),  
            'hour_of_the_match': pd.Series([None] * len(self.links), dtype='object'),  
            'home_team_name': pd.Series([None] * len(self.links), dtype='object'),  
            'away_team_name': pd.Series([None] * len(self.links), dtype='object'),
            'home_trainer': pd.Series([None] * len(self.links), dtype='object'),
            'away_trainer': pd.Series([None] * len(self.links), dtype='object'),
            'stadium': pd.Series([None] * len(self.links), dtype='object'),
            'attendance': pd.Series([None] * len(self.links), dtype='object'),
            'referee': pd.Series([None] * len(self.links), dtype='object'),
            'var': pd.Series([None] * len(self.links), dtype='object'),
            'home_team_lineup': pd.Series([None] * len(self.links), dtype='object'),
            'away_team_lineup': pd.Series([None] * len(self.links), dtype='object'),
            ##     'home_team_goal_count': pd.Series([0] * len(self.links), dtype='int32'),  
            ##     'away_team_goal_count': pd.Series([0] * len(self.links), dtype='int32'),  
            #     'home_possession': pd.Series([0.0] * len(self.links), dtype='float32'),
            #     'away_possession': pd.Series([0.0] * len(self.links), dtype='float32'),  
            ##     'total_goal_count': pd.Series([0] * len(self.links), dtype='int32'),  
            ##     'total_goals_at_half_time': pd.Series([0] * len(self.links), dtype='int32'),  
            ##     'home_team_goal_count_half_time': pd.Series([0] * len(self.links), dtype='int32'),  
            ##     'away_team_goal_count_half_time': pd.Series([0] * len(self.links), dtype='int32'),
            ##     'home_team_passings': pd.Series([0] * len(self.links), dtype='int32'),
            ##     'away_team_passings': pd.Series([0] * len(self.links), dtype='int32'),
            #     'penaltis': pd.Series([0.0] * len(self.links), dtype='float32'),
            ##     'home_team_passings_success': pd.Series([0] * len(self.links), dtype='int32'),
            ##     'away_team_passings_success': pd.Series([0] * len(self.links), dtype='int32'),  
            ##     'home_team_shots': pd.Series([0] * len(self.links), dtype='int32'),  
            ##     'away_team_shots': pd.Series([0] * len(self.links), dtype='int32'),  
            ##     'home_team_shots_on_target': pd.Series([0] * len(self.links), dtype='int32'),  
            ##     'away_team_shots_on_target': pd.Series([0] * len(self.links), dtype='int32'),  
            ##     'home_team_fouls': pd.Series([0] * len(self.links), dtype='int32'),  
            ##     'away_team_fouls': pd.Series([0] * len(self.links), dtype='int32'),  
            ##     'home_team_yellow_cards': pd.Series([0] * len(self.links), dtype='int32'),  
            ##     'home_team_red_cards': pd.Series([0] * len(self.links), dtype='int32'),  
            ##     'away_team_yellow_cards': pd.Series([0] * len(self.links), dtype='int32'),  
            ##     'away_team_red_cards': pd.Series([0] * len(self.links), dtype='int32'),  
            ##     'home_team_first_half_cards': pd.Series([0] * len(self.links), dtype='int32'),  
            ##     'home_team_second_half_cards': pd.Series([0] * len(self.links), dtype='int32'),  
            ##     'away_team_first_half_cards': pd.Series([0] * len(self.links), dtype='int32'),  
            ##     'away_team_second_half_cards': pd.Series([0] * len(self.links), dtype='int32'),  
            ##     'home_team_shots_off_target': pd.Series([0] * len(self.links), dtype='int32'),  
            ##     'away_team_shots_off_target': pd.Series([0] * len(self.links), dtype='int32'),  
            ##     'home_team_corner_count': pd.Series([0] * len(self.links), dtype='int32'),  
            ##     'away_team_corner_count': pd.Series([0] * len(self.links), dtype='int32'),  
            ##     'home_ppg': pd.Series([0.0] * len(self.links), dtype='float32'),  
            ##     'away_ppg': pd.Series([0.0] * len(self.links), dtype='float32'),
            ##     'home_xg': pd.Series([0.0] * len(self.links), dtype='float32'),  
            ##     'away_xg': pd.Series([0.0] * len(self.links), dtype='float32'),  
            ##     'match_result': pd.Series([0.0] * len(self.links), dtype='float32'),
        })
        
        df_matches.to_csv('matches.csv', index=False)

    def get_statistics(self):
        try:
            df_links = pd.read_csv('links.csv')
            links = df_links['link'].tolist()

            df_matches = pd.read_csv('matches.csv')

            for idx, link in enumerate(links):
                try:
                    print(f"Processing link {idx + 1}: {link}")
                    response = requests.get(link)
                    soup = BeautifulSoup(response.text, 'html.parser')

                    match_date = "Unknown"
                    match_time = "Unknown"
                    home_team_name = "Unknown"
                    away_team_name = "Unknown"
                    home_trainer = "Unknown"
                    away_trainer = "Unknown"
                    stadium = "Unknown"
                    attendance = "Unknown"
                    referee = "Unknown"
                    var = "Unknown"
                    home_team_lineup = "Unknown"
                    away_team_lineup = "Unknown"

                    # date_of_match
                    date_element = soup.find('span', {'class': 'venuetime'})
                    if date_element:
                        match_date = date_element.get('data-venue-date', "Unknown")
                        match_time = date_element.get('data-venue-time', "Unknown")
                    df_matches.at[idx, 'date_of_match'] = match_date
                    df_matches.at[idx, 'hour_of_the_match'] = match_time

                    # home_team_name and away_team_name
                    teams_elements = soup.find_all('span', class_='teamandlogo')
                    if len(teams_elements) >= 2:
                        home_team_name = teams_elements[0].text.strip()
                        away_team_name = teams_elements[1].text.strip()
                    df_matches.at[idx, 'home_team_name'] = home_team_name
                    df_matches.at[idx, 'away_team_name'] = away_team_name

                    # home_trainer and away_trainer
                    trainers_elements = soup.find_all('div', class_='datapoint')
                    trainer_count = 0
                    for trainer_element in trainers_elements:
                        if 'Director Técnico' in trainer_element.text:
                            trainer_name = trainer_element.text.split(':')[-1].strip()
                            if trainer_count == 0:
                                home_trainer = trainer_name
                                trainer_count += 1
                            elif trainer_count == 1:
                                away_trainer = trainer_name
                                trainer_count += 1
                    df_matches.at[idx, 'home_trainer'] = home_trainer
                    df_matches.at[idx, 'away_trainer'] = away_trainer

                    # stadium
                    stadium_element = soup.find('div', class_='scorebox_meta')
                    if stadium_element:
                        stadium_info = stadium_element.find('strong', string="Sedes")
                        if stadium_info:
                            stadium = stadium_info.find_next('small').find_next('small').text.strip()
                    df_matches.at[idx, 'stadium'] = stadium

                    # attendance
                    attendance_element = soup.find('div', class_='scorebox_meta')
                    if attendance_element:
                        attendance_info = attendance_element.find('strong', string="Asistencia")
                        if attendance_info:
                            attendance = attendance_info.find_next('small').find_next('small').text.strip()
                    df_matches.at[idx, 'attendance'] = attendance

                    # referee
                    referee_element = soup.find('div', class_='scorebox_meta')
                    if referee_element:
                        referee_info = referee_element.find_next('strong', string="Autoridades")
                        if referee_info:
                            referee_span = referee_info.find_next('small').find_next('small').find('span', style="display:inline-block")
                            if referee_span:
                                referee = referee_span.text.strip()
                    df_matches.at[idx, 'referee'] = referee

                    # var
                    var_element = soup.find('div', class_='scorebox_meta')
                    if var_element:
                        var_info = var_element.find_next('strong', string="Autoridades")
                        if var_info:
                            var_span = var_info.find_next('small').find_next('small').find_next('span').find_next('span').find_next('span').find_next('span').find_next('span')
                            if var_span:
                                var = var_span.text.strip()
                    df_matches.at[idx, 'var'] = var

                    # lineup
                    lineup_elements = soup.find_all('th', string=lambda text: text and '(' in text and ')' in text)
                    if len(lineup_elements) >= 1:
                        home_match = re.search(r'\((.*?)\)', lineup_elements[0].text)
                        if home_match:
                            home_team_lineup = home_match.group(1)
                    if len(lineup_elements) >= 2:
                        away_match = re.search(r'\((.*?)\)', lineup_elements[1].text)
                        if away_match:
                            away_team_lineup = away_match.group(1)
                    df_matches.at[idx, 'home_team_lineup'] = home_team_lineup
                    df_matches.at[idx, 'away_team_lineup'] = away_team_lineup

                except Exception as e:
                    print(f"Error processing link {link}: {e}")

                time.sleep(6)

            df_matches.to_csv('matches.csv', index=False)
            print("File 'matches.csv' updated successfully.")

        except FileNotFoundError as e:
            print(f"Error: {e}. Make sure 'links.csv' and 'matches.csv' exist.")
        except Exception as e:
            print(f"Unexpected error: {e}")

### Obtaining global data in the website

In [3]:
class MultiTableExtractor:
    def __init__(self, url, gameweek):
        self.url = url
        self.gameweek = gameweek
        self.soup = None
        self.teams_data = {}
        self.fetch_page()

    def fetch_page(self):
        response = requests.get(self.url)
        if response.status_code != 200:
            raise Exception(f"Error al acceder a la página: {response.status_code}")
        self.soup = BeautifulSoup(response.content, "html.parser")
        time.sleep(6)

    def extract_teams_ids(self):
        if not self.soup:
            raise Exception("Primero debes cargar la página con `fetch_page`.")
        
        team_imgs = self.soup.find_all('img', class_='teamlogo', src=True)
        if len(team_imgs) >= 2:
            self.teams_data = {
                "home": {
                    "id": team_imgs[0]['src'].split('/')[-1].split('.')[0],
                    "name": team_imgs[0]['alt'].split(" ")[0],
                },
                "away": {
                    "id": team_imgs[1]['src'].split('/')[-1].split('.')[0],
                    "name": team_imgs[1]['alt'].split(" ")[0],
                },
            }
        else:
            raise Exception("No se encontraron suficientes logos de equipos.")

    def extract_table(self, team_type, table_type, header_offset, columns_to_drop):
        team_id = self.teams_data[team_type]["id"]
        table_selector = f"#div_stats_{team_id}_{table_type}"
        table = self.soup.select_one(table_selector)
        if not table:
            raise Exception(f"No se encontró la tabla {table_type} para el equipo {team_type}.")
        
        headers = [th.text.strip() for th in table.find("thead").find_all("th")][header_offset:]
        rows = [
            [cell.text.strip() for cell in row.find_all(["td", "th"])]
            for row in table.find("tbody").find_all("tr")
        ]
        df = pd.DataFrame(rows, columns=headers)
        df = df.loc[:, ~df.columns.isin(columns_to_drop)]
        return df

    def process_team_data(self, team_type):
        # Columnas que se eliminarán para cada tipo de tabla
        columns_to_drop = {
            "summary": ['Ass', 'TP', 'TPint', 'TA', 'TR', 'Toques', 'Tkl', 'Int', 'Bloqueos', 'xG', 'npxG', 'xAG', 'ACT', 'ACG', 'Cmp', 'Int.', '% Cmp', 'PrgP', 'Transportes', 'PrgC', 'Att', 'Succ'],
            "passing": ['Jugador', 'núm.', 'País', 'Posc', 'Edad', 'Mín'],
            "passing_types": ['Jugador', 'núm.', 'País', 'Posc', 'Edad', 'Mín', 'Int.', 'Cmp'],
            "defense": ['Jugador', 'núm.', 'País', 'Posc', 'Edad', 'Mín'],
            "possession": ['Jugador', 'núm.', 'País', 'Posc', 'Edad', 'Mín', 'Tkld', 'Tkld%'],
            "misc": ['Jugador', 'núm.', 'País', 'Posc', 'Edad', 'Mín', 'Pcz', 'PA', 'Int', 'TklG', 'GC'],
        }

        # Inicializar tablas
        tables = {}
        for table_type, header_offset in [("summary", 7), ("passing", 9), ("passing_types", 4), ("defense", 5), ("possession", 5), ("misc", 3)]:
            tables[table_type] = self.extract_table(
                team_type, table_type, header_offset, columns_to_drop.get(table_type, [])
            )
            
        return tables

    def save_team_tables(self):
        self.extract_teams_ids()
        for team_type in ["home", "away"]:
            team_name = self.teams_data[team_type]["name"]
            team_tables = self.process_team_data(team_type)
            final_table = pd.concat(team_tables.values(), axis=1)
            final_table.to_csv(f"{self.gameweek}_{team_name}_{team_type}_team_final_table.csv", index=False)

            new_columns = ["Player", "Number", "Nationality", "Position", "Age",
                  "Minutes", "Goals", "Shots", "ShotsOnTarget", "CompletedPasses", "AttemptedPasses", 
                  "%CompletedPasses", "DistancePasses", "DistanceProgression", 
                  "ShortPasses", "AttemptedShortPasses", "%ShortCompletedPasses", 
                  "MediumPasses", "AttemptedMediumPasses", "%MediumCompletedPasses", 
                  "LongPasses", "AttemptedLongPasses", "%LlongCompletedPasses", 
                  "Assistance", "ExpectedGoalsAssistance", "ExpectedAssistance", 
                  "KeyPasses", "Last1/3Passes", "GoalAreaPasses", "GoalAreaCrosses", 
                  "GoalPasses", "LiveBallPasses", "DeadBallPasses", "FreeKick", 
                  "LongPasses", "SidePasses", "Crosses", "Strongcrosses", "Corner", 
                  "CornerIn", "CornerOut", "CornerRect", "OffsidePasses", 
                  "PassesBlocked", "Tackles", "SuccessfulTackles", "TacklesInDefense", 
                  "TacklesInMedium", "TacklesInAttack", "DribblerTackles", 
                  "AttemptedDribblerTackles", "%DribblerTacklesCompleted", 
                  "DribblerTacklesNonCompleted", "BallsBlocked", "ShotsBlocked", 
                  "PassesBlocked", "Interceptions", "Tackles+Interceptions", 
                  "Clearances", "MistakesRivalShots", "Touches", 
                  "OwnPenaltyAreaTouches", "TouchesInDefense", "TouchesInMedium",
                  "AttemptedDribbles" , "DribblesCompleted", "%DribblesCompleted",
                  "TouchesInAttack", "AwayPenaltyAreaTouches", "LiveBallTouches", 
                  "BallCarries","DistanceCarried", "ForwardDistanceCarried", 
                  "ForwardCarries", "CarriesInAttack", "AwayPenaltyAreaCarries",
                  "LostControlCarries", "LostCarries", "PassesReception", 
                  "AttackPassesReception", "YellowCards", "RedCards", 
                  "SecondYellowCards", "Fouls", "Offside", "Penalties",
                  "PenaltiesConceded", "LostBallRecoveries", "AerialsWon", 
                  "AerialsLost", "%AerialsWon"
                  ]

            # Cambiar los nombres de las columnas del 0 al 89
            final_table.columns = new_columns

            # Guardar el CSV actualizado
            output_filename = f"{self.gameweek}_{team_name}_{team_type}_team_final_table.csv"
            final_table.to_csv(output_filename, index=False)

            print(f"Tabla del equipo {team_type} guardada como {output_filename}")
    
    def process_links_and_gameweeks(self, links_file, gameweeks_file):
        links_df = pd.read_csv(links_file)
        gameweeks_df = pd.read_csv(gameweeks_file)
        
        for index, link in enumerate(links_df['link']):
            gameweek = gameweeks_df.iloc[index]['gameweek']
            print(f"Procesando URL: {link} con Gameweek: {gameweek}")
            extractor = MultiTableExtractor(link, gameweek)
            extractor.save_team_tables()


### Full process

In [18]:
'''base_url = 'https://fbref.com/es/comps/12/2023-2024/horario/Marcadores-y-partidos-de-2023-2024-La-Liga'

extractor = MultiTableExtractor(base_url, None)
extractor.process_links_and_gameweeks('links.csv', 'gameweeks.csv')'''

Procesando URL: https://fbref.com/es/partidos/30d3212e/Almeria-Rayo-Vallecano-Agosto-11-2023-La-Liga con Gameweek: 1
Tabla del equipo home guardada como 1_Almería_home_team_final_table.csv
Tabla del equipo away guardada como 1_Rayo_away_team_final_table.csv
Procesando URL: https://fbref.com/es/partidos/525758cc/Sevilla-Valencia-Agosto-11-2023-La-Liga con Gameweek: 1
Tabla del equipo home guardada como 1_Sevilla_home_team_final_table.csv
Tabla del equipo away guardada como 1_Valencia_away_team_final_table.csv
Procesando URL: https://fbref.com/es/partidos/706596c6/Real-Sociedad-Girona-Agosto-12-2023-La-Liga con Gameweek: 1
Tabla del equipo home guardada como 1_Real_home_team_final_table.csv
Tabla del equipo away guardada como 1_Girona_away_team_final_table.csv


In [4]:
def run_full_process():
    base_url = 'https://fbref.com/es/comps/12/2023-2024/horario/Marcadores-y-partidos-de-2023-2024-La-Liga'
    
    fbref_data_instance = FBREFData(base_url)
    fbref_data_instance.get_statistics()
    
    extractor = MultiTableExtractor(base_url, None)
    extractor.process_links_and_gameweeks('links.csv', 'gameweeks.csv')


In [5]:
run_full_process()

Links saved in links.csv
Gameweeks saved in gameweeks.csv
Processing link 1: https://fbref.com/es/partidos/30d3212e/Almeria-Rayo-Vallecano-Agosto-11-2023-La-Liga


  df_matches.at[idx, 'date_of_match'] = match_date
  df_matches.at[idx, 'hour_of_the_match'] = match_time
  df_matches.at[idx, 'home_team_name'] = home_team_name
  df_matches.at[idx, 'away_team_name'] = away_team_name
  df_matches.at[idx, 'home_trainer'] = home_trainer
  df_matches.at[idx, 'away_trainer'] = away_trainer
  df_matches.at[idx, 'stadium'] = stadium
  df_matches.at[idx, 'attendance'] = attendance
  df_matches.at[idx, 'referee'] = referee
  df_matches.at[idx, 'var'] = var
  df_matches.at[idx, 'home_team_lineup'] = home_team_lineup
  df_matches.at[idx, 'away_team_lineup'] = away_team_lineup


Processing link 2: https://fbref.com/es/partidos/525758cc/Sevilla-Valencia-Agosto-11-2023-La-Liga
Processing link 3: https://fbref.com/es/partidos/706596c6/Real-Sociedad-Girona-Agosto-12-2023-La-Liga
File 'matches.csv' updated successfully.
Procesando URL: https://fbref.com/es/partidos/30d3212e/Almeria-Rayo-Vallecano-Agosto-11-2023-La-Liga con Gameweek: 1
Tabla del equipo home guardada como 1_Almería_home_team_final_table.csv
Tabla del equipo away guardada como 1_Rayo_away_team_final_table.csv
Procesando URL: https://fbref.com/es/partidos/525758cc/Sevilla-Valencia-Agosto-11-2023-La-Liga con Gameweek: 1
Tabla del equipo home guardada como 1_Sevilla_home_team_final_table.csv
Tabla del equipo away guardada como 1_Valencia_away_team_final_table.csv
Procesando URL: https://fbref.com/es/partidos/706596c6/Real-Sociedad-Girona-Agosto-12-2023-La-Liga con Gameweek: 1
Tabla del equipo home guardada como 1_Real_home_team_final_table.csv
Tabla del equipo away guardada como 1_Girona_away_team_final_

### Funcion para ver cuanto tiempo de espera

In [25]:
import requests
import time
from bs4 import BeautifulSoup

def fetch_page_with_retry(url):
    # Intentamos hacer la solicitud
    response = requests.get(url)

    # Si el código de estado es 429 (Too Many Requests), esperamos
    if response.status_code == 429:
        retry_after = response.headers.get('Retry-After', 60)  # Si no hay "Retry-After", esperamos 60 segundos
        print(f"Demasiadas solicitudes. Espera {retry_after} segundos antes de intentar de nuevo.")
        time.sleep(int(retry_after))  # Esperamos el tiempo indicado
        return fetch_page_with_retry(url)  # Hacemos la solicitud nuevamente después de esperar

    # Si la solicitud es exitosa (200 OK), procesamos la página
    if response.status_code == 200:
        return response.text  # Devolvemos el contenido de la página

    # Si no es un código 200 o 429, retornamos None
    print(f"Error al acceder a la página: {response.status_code}")
    return None

def extract_match_reports(url):
    page_content = fetch_page_with_retry(url)
    
    if page_content:
        soup = BeautifulSoup(page_content, 'html.parser')
        informes = soup.find_all('td', {'data-stat': 'match_report'})
        enlaces = ['https://fbref.com' + informe.find('a')['href'] for informe in informes if informe.find('a')]
        
        for enlace in enlaces:
            print(enlace)
    else:
        print("No se pudo obtener el contenido de la página.")

# URL de ejemplo
url = 'https://fbref.com/es/comps/12/2023-2024/horario/Marcadores-y-partidos-de-2023-2024-La-Liga'

# Llamamos a la función
extract_match_reports(url)


https://fbref.com/es/partidos/30d3212e/Almeria-Rayo-Vallecano-Agosto-11-2023-La-Liga
https://fbref.com/es/partidos/525758cc/Sevilla-Valencia-Agosto-11-2023-La-Liga
https://fbref.com/es/partidos/706596c6/Real-Sociedad-Girona-Agosto-12-2023-La-Liga
https://fbref.com/es/partidos/8c6555db/Las-Palmas-Mallorca-Agosto-12-2023-La-Liga
https://fbref.com/es/partidos/c31f0a31/Athletic-Club-Real-Madrid-Agosto-12-2023-La-Liga
https://fbref.com/es/partidos/8689fb39/Celta-Vigo-Osasuna-Agosto-13-2023-La-Liga
https://fbref.com/es/partidos/b8258a85/Villarreal-Real-Betis-Agosto-13-2023-La-Liga
https://fbref.com/es/partidos/b34ff242/Getafe-Barcelona-Agosto-13-2023-La-Liga
https://fbref.com/es/partidos/c36e6859/Cadiz-Alaves-Agosto-14-2023-La-Liga
https://fbref.com/es/partidos/a96e4e26/Atletico-Madrid-Granada-Agosto-14-2023-La-Liga
https://fbref.com/es/partidos/d447f323/Mallorca-Villarreal-Agosto-18-2023-La-Liga
https://fbref.com/es/partidos/8766ab05/Valencia-Las-Palmas-Agosto-18-2023-La-Liga
https://fbref.