In [1]:
import pandas as pd
import numpy as np
import requests
from bs4 import BeautifulSoup
import os
from pathlib import Path
import time
import warnings
import re
from pathlib import Path
warnings.filterwarnings("ignore")

### Ver nombres de equipos

In [2]:

# Output CSV file names: outfield players, goalkeepers and teams.
final_nongk, final_gk, final_teams = (
    'stats_big5_25_26.csv',
    'stats_big5_gk_25_26.csv',
    'stats_big5_teams_25_26.csv',
)

# Output directory: "data/season" under the parent of the current working
# directory (i.e. next to "notebooks/" when the notebook is run from there).
root = Path.cwd().parent.joinpath("data", "season")

# Map from internal (English) column names to the Spanish names written to the
# output CSV files. Only keys present in a DataFrame are applied.
#
# NOTE(review): three output names were corrected here
#   'TklWinPoss': 'Entradas_canadas'  -> 'Entradas_ganadas'
#   'ShBlocks':   'Bloqueos_cisparo'  -> 'Bloqueos_disparo'
#   'Fld':        'Faltas_cometidas'  -> 'Faltas_recibidas'
# ('SCAFld' below is already labelled as fouls *received*, so 'Fld' is too).
# Any consumer that reads the old column names must be updated accordingly.
column_map = {
    # Common columns
    'Player': 'Jugador',
    'Nation': 'Nacionalidad',
    'Pos': 'Posicion',
    'Squad': 'Equipo',
    'Comp': 'Competicion',
    'Age': 'Edad',
    'Born': 'Nacimiento',
    'Min': 'Minutos',
    'MP': 'Partidos',
    'Starts': 'Titular',
    'Main Position': 'Posicion_princ',
    'Position2': 'Posicion_2',

    # Attacking statistics
    'Goals': 'Goles',
    'Assists': 'Asistencias',
    'G+A': 'Goles+Asistencias',
    'npGoals': 'Goles_sin_penal',
    'PK': 'Penaltis',
    'PKatt': 'Penaltis_intentados',
    'xG': 'xG',
    'npxG': 'xG_sin_penal',
    'xAG': 'xAG',
    'xA': 'xA',
    'Shots': 'Disparos',
    'SoT': 'Disparos_porteria',
    'FKShots': 'Tiros_falta',
    'PKsAtt': 'Penaltis_intentados',

    # Passing statistics
    'PassesCompleted': 'Pases_completados',
    'PassesAttempted': 'Pases_intentados',
    'KeyPasses': 'Pases_clave',
    'TotCmp%': 'Pct_pases',
    'ProgPasses': 'Pases_progresivos',
    'Final1/3Cmp': 'Pases_ultimo_tercio',
    'PenAreaCmp': 'Pases_area',
    'ThruBalls': 'Pases_filtrados',
    'Switches': 'Cambios_orientacion',
    'Crs': 'Centros',
    'CK': 'Corners',
    'InSwingCK': 'Corner_cerrado',
    'OutSwingCK': 'Corner_abierto',
    'StrCK': 'Corner_recto',
    'PassesToOff': 'Pases_a_fuera_de_juego',
    'PassesBlocked': 'Pases_bloqueados',
    'SCA': 'Acc_llevan_a_tiro',
    'SCASh': 'Disparo_mas_disparo',
    'SCAFld': 'Faltas_recib_mas_disparo',
    'SCADef': 'Acc_defensivas_mas_disparo',
    'GCA': 'Acc_llevan_a_gol',

    # Defensive statistics
    'Tkl': 'Entradas',
    'TklWinPoss': 'Entradas_ganadas',
    'Int': 'Intercepciones',
    'Tkl+Int': 'Entradas+Intercepciones',
    'Clr': 'Despejes',
    'Blocks': 'Bloqueos',
    'ShBlocks': 'Bloqueos_disparo',

    # Possession statistics
    'Touches': 'Toques',
    'Carries': 'Conducciones',
    'ProgCarries': 'Conducciones_progresivas',
    'SuccDrb': 'Regates_exitosos',
    'AttDrb': 'Regates_intentados',

    # Possession-adjusted metrics
    'pAdjTkl+IntPer90': 'pAdj_Entradas+Intercepciones_90min',
    'pAdjIntPer90': 'pAdj_Intercepciones_90min',
    'TouchCentrality': 'Centralidad_Toques',

    # Goalkeeper statistics
    'GA': 'Goles_recibidos',
    'GA90': 'Goles_recibidos_90min',
    'SoTA': 'Disparos_recibidos_porteria',
    'Saves': 'Paradas',
    'Save%': 'Pct_paradas',
    'CS': 'Porterías_cero',
    'PSxG': 'PSxG',
    'PSxG+/-': 'PSxG+/-',
    '#OPA': 'Acciones_fuera_area',

    # Miscellaneous statistics
    'Yellows': 'Tarjetas_amarillas',
    'Yellow2': 'Segunda_amarilla',
    'Reds': 'Tarjetas_rojas',
    'Fls': 'Faltas',
    'Fld': 'Faltas_recibidas',
    'Off': 'Fueras_de_juego',
    'OG': 'Goles_pp',
    'PKwon': 'Penal_provocado',
    'PKcon': 'Penal_cometido',
    'AerialWins': 'Duelos_aereos_ganados',
    'AerialLoss': 'Duelos_aereos_perdidos',
    'AerialWin%': 'Pct_duelos_aereos',
    'AvgTeamPoss': 'Posesion_media_equipo',
}

# Squad-name replacements: maps a squad name as scraped to the short name used
# in the output files. Names not listed here are left unchanged.
reemplazos_equipos = {
    'Alavés': 'Alaves',
    'Aston Villa': 'Villa',
    'Athletic Club': 'Athletic',
    'Atlético Madrid': 'Atleti',
    'Celta Vigo': 'Celta',
    'Espanyol': 'Espanyol',
    'Manchester City': 'City',
    'Manchester United': 'United',
    'Marseille': 'Marsella',
    'Newcastle Utd': 'Newcastle',
    "Nott'ham Forest": 'Forest',
    'Paris FC': 'Paris',
    'Paris S-G': 'PSG',
    'Rayo Vallecano': 'Rayo',
    'Real Sociedad': 'Real',
    'West Ham': 'West Ham',
    'Real Madrid': 'Madrid',
    'Real Betis': 'Betis',
    'Crystal Palace': 'Palace',
    'Leeds United': 'Leeds',
}

In [3]:
# Funciones de web scraping
def _get_table(soup):
    return soup.find_all('table')[0]

def _get_opp_table(soup):
    return soup.find_all('table')[1]

def _parse_row(row):
    cols = None
    cols = row.find_all('td')
    cols = [ele.text.strip() for ele in cols]
    return cols

# Pause before every request, in seconds.
# NOTE(review): chosen to stay under FBref's published limit of 10 requests
# per minute as I recall it — confirm against the current Sports Reference
# bot policy.
_REQUEST_DELAY_SECONDS = 7
# Give up on a request that has not answered after this many seconds.
_REQUEST_TIMEOUT_SECONDS = 30


def _scrape_table(url, pick_table, delay):
    """Download ``url`` and turn one of its HTML tables into a DataFrame.

    Args:
        url: page to download.
        pick_table: callable that receives the parsed page and returns the
            table element to read.
        delay: seconds to sleep before issuing the request.

    Returns:
        DataFrame whose columns are the table's ``<th scope="col">`` texts
        (minus the first one) and whose rows are the ``<td>`` texts of each
        ``<tr>`` in the table body. Empty strings are replaced by 0. The
        index starts at 1 because the header row (label 0) is dropped.

    Raises:
        requests.HTTPError: if the server answers with an error status, so a
            blocked or rate-limited request fails here with a clear message.
    """
    time.sleep(delay)
    page = requests.get(url, timeout=_REQUEST_TIMEOUT_SECONDS)
    page.raise_for_status()
    soup = BeautifulSoup(page.content, "html.parser")
    table = pick_table(soup)

    # Read the headers from the selected table only; collecting them from the
    # whole page appends the headers of every other table as extra columns.
    # The first header is dropped because its cell is not a <td> in the rows.
    headings = [th.get_text() for th in table.find_all("th", scope="col")][1:]
    rows = [_parse_row(tr) for tr in table.find('tbody').find_all('tr')]

    # Build from a ragged list (rows without <td> cells become empty rows)
    # and promote the first row to column names.
    data = pd.DataFrame([headings] + rows)
    data = data.rename(columns=data.iloc[0])
    data = data.reindex(data.index.drop(0))
    data = data.replace('', 0)
    return data


def get_df(path, delay=_REQUEST_DELAY_SECONDS):
    """Scrape the first table of the page at ``path`` into a DataFrame.

    Args:
        path: URL of the page.
        delay: seconds to wait before the request.

    Returns:
        DataFrame as described in ``_scrape_table``.
    """
    return _scrape_table(path, _get_table, delay)


def get_opp_df(path, delay=_REQUEST_DELAY_SECONDS):
    """Scrape the second table of the page at ``path`` into a DataFrame.

    Args:
        path: URL of the page.
        delay: seconds to wait before the request.

    Returns:
        DataFrame as described in ``_scrape_table``.
    """
    return _scrape_table(path, _get_opp_table, delay)

# Initial clean-up step
def limpieza_inicial(df, is_team_df=False):
    """Return a copy of ``df`` with duplicate players removed and nulls set to 0.

    Args:
        df: DataFrame to clean; it is not modified.
        is_team_df: when True the duplicate-removal step is skipped.

    Returns:
        New DataFrame. For player frames (both 'Player' and 'Squad' present)
        duplicates are dropped on Player/Nation/Squad, or Player/Squad when
        there is no 'Nation' column, and the index is reset. All missing
        values are replaced by 0.
    """
    print("Realizando limpieza inicial...")
    cleaned = df.copy()

    is_player_frame = not is_team_df and {'Player', 'Squad'}.issubset(cleaned.columns)
    if is_player_frame:
        key_cols = ['Player', 'Squad']
        if 'Nation' in cleaned.columns:
            key_cols = ['Player', 'Nation', 'Squad']
        cleaned = cleaned.drop_duplicates(subset=key_cols).reset_index(drop=True)

    # Null handling
    return cleaned.fillna(0)

# Column transformation step
def transformar_columnas(df):
    """Return a copy of ``df`` with identity columns normalised.

    Each step runs only when its column exists:

    * 'Nation': cast to str and reduced to its last three characters.
    * 'Country': dropped.
    * 'Pos': split on ','; the first part stays in 'Pos', the second goes to
      a new 'Position2' column ('' when there is none). 'Main Position' is
      created from 'Pos', or filled from it where it is NaN, '' or 0.
    * 'Comp': a leading ``eng``/``es``/``it``/``fr``/``de`` prefix followed by
      whitespace is removed.
    * 'Age': string values of at least two characters are cut to their first
      two characters; other values are left as they are.

    Args:
        df: DataFrame to transform; it is not modified.

    Returns:
        The transformed copy.
    """
    print("Transformando columnas...")
    df = df.copy()

    # 1. Nationality: keep only the last three characters.
    if 'Nation' in df.columns:
        df['Nation'] = df['Nation'].astype(str).apply(lambda x: x[-3:])

    # 2. Drop the Country column when present.
    if 'Country' in df.columns:
        df = df.drop(columns=['Country'])

    # 3. Split the position into primary and secondary.
    if 'Pos' in df.columns:
        # Second listed position, or '' when only one is listed.
        df['Position2'] = df['Pos'].apply(
            lambda x: x.split(',')[1].strip() if isinstance(x, str) and ',' in x else ''
        )

        # Keep only the first listed position in Pos.
        df['Pos'] = df['Pos'].apply(
            lambda x: x.split(',')[0].strip() if isinstance(x, str) and ',' in x else x
        )

        # Use Pos as Main Position when that column is missing or empty.
        if 'Main Position' not in df.columns:
            df['Main Position'] = df['Pos']
        else:
            mask = (df['Main Position'].isna()) | (df['Main Position'] == '') | (df['Main Position'] == 0)
            df.loc[mask, 'Main Position'] = df.loc[mask, 'Pos']

    # 4. Competition: strip the leading country-code prefix.
    if 'Comp' in df.columns:
        df['Comp'] = df['Comp'].astype(str).apply(lambda x: re.sub(r'^(eng|es|it|fr|de)\s+', '', x))

    # 5. Age: keep the first two characters of string values.
    # Done as a whole-column assignment: element-wise chained assignment
    # (df['Age'].iloc[i] = ...) does not update the frame under pandas
    # Copy-on-Write.
    if 'Age' in df.columns:
        df['Age'] = df['Age'].apply(
            lambda age: age[:2] if isinstance(age, str) and len(age) >= 2 else age
        )

    return df

# Squad-name correction step
def corregir_equipos(df):
    """Return a copy of ``df`` whose 'Squad' values use the short squad names.

    Values found as keys in the module-level ``reemplazos_equipos`` mapping
    are replaced by the mapped name; all other values are kept.

    Args:
        df: DataFrame with a 'Squad' column; it is not modified.

    Returns:
        The corrected copy.
    """
    print("Corrigiendo nombres de equipos...")
    result = df.copy()
    renamed_squads = result['Squad'].replace(reemplazos_equipos)
    result['Squad'] = renamed_squads
    return result

# Position clean-up step
def limpiar_posiciones(df):
    """Return a copy of ``df`` with empty position columns filled in.

    Nothing is changed unless both 'Main Position' and 'Position2' exist.
    When they do:

    1. An empty (NaN or '') 'Position2' takes the row's 'Main Position'.
    2. An empty (NaN or '') 'Main Position' takes the first comma-separated
       entry of 'Pos', with DF/MF/FW/GK expanded to a descriptive name.

    Args:
        df: DataFrame to clean; it is not modified.

    Returns:
        The cleaned copy.
    """
    print("Limpiando posiciones...")
    result = df.copy()

    if not ('Main Position' in result.columns and 'Position2' in result.columns):
        return result

    def _secondary_or_main(row):
        # Fall back to the main position when no secondary one is recorded.
        secondary = row['Position2']
        if pd.isna(secondary) or secondary == '':
            return row['Main Position']
        return secondary

    result['Position2'] = result.apply(_secondary_or_main, axis=1)

    # Expansion of the generic position codes found in 'Pos'.
    generic_to_specific = {
        'DF': 'Centre-Back',
        'MF': 'Central Midfield',
        'FW': 'Centre-Forward',
        'GK': 'Goalkeeper',
    }
    missing_main = result['Main Position'].isna() | (result['Main Position'] == '')
    for idx in result[missing_main].index:
        raw_pos = result.loc[idx, 'Pos']
        if pd.isna(raw_pos) or raw_pos == '':
            continue
        first_pos = str(raw_pos).split(',')[0].strip()
        result.loc[idx, 'Main Position'] = generic_to_specific.get(first_pos, first_pos)

    return result

# Per-player manual corrections step
def corregir_datos_especificos(df):
    """Return a copy of ``df`` with hard-coded corrections for specific players.

    Rows are matched by exact 'Player' value; nationality ('Nation'), birth
    year ('Born') and age ('Age') are overwritten with the values below.
    Nothing is changed when there is no 'Player' column.

    Args:
        df: DataFrame to correct; it is not modified.

    Returns:
        The corrected copy.
    """
    print("Corrigiendo datos específicos de jugadores...")
    result = df.copy()

    # (target column, {player name: corrected value}), applied in this order.
    overrides_by_column = (
        # Nationalities
        ('Nation', {
            'Atakan Karazor': 'TUR',
            'Plamedi Nsingi': 'FRA',
            'Fer López': 'ESP',
        }),
        # Birth years
        ('Born', {
            'Hannes Behrens': '2005',
            'Max Moerstedt': '2006',
            'Fer López': '2004',
            'Pape Daouda Diongue': '2006',
        }),
        # Ages
        ('Age', {
            'Hannes Behrens': '19',
            'Max Moerstedt': '18',
            'Fer López': '20',
            'Pape Daouda Diongue': '18',
        }),
    )

    if 'Player' not in result.columns:
        return result

    for column, overrides in overrides_by_column:
        for player, value in overrides.items():
            result.loc[result['Player'] == player, column] = value

    return result

# Column typing step
def categorizar_columnas(df):
    """Return a copy of ``df`` with duplicate columns removed and stats numeric.

    * Columns whose label repeats an earlier one are dropped (first kept).
    * Every column that is not one of the categorical columns listed below
      (and is not 'Min') is converted with ``pd.to_numeric``; values that
      cannot be parsed become NaN.
    * 'Min' is cast to str, stripped of ',' separators and cast to float. If
      that fails the column is left as it was and a message is printed.

    Args:
        df: DataFrame to convert; it is not modified.

    Returns:
        The converted copy.
    """
    print("Categorizando columnas...")

    # Columns that hold text and must not be converted to numbers.
    categoricas = ['Player', 'Nation', 'Country', 'Pos', 'Squad', 'Comp', 'Born', 'Main Position', 'Position2']

    # Drop duplicated columns; the explicit copy makes the assignments below
    # operate on an independent frame rather than on a slice of the input.
    df = df.loc[:, ~df.columns.duplicated()].copy()

    # Convert numeric columns.
    for columna in df.columns:
        if columna in categoricas or columna == 'Min':
            continue
        try:
            df[columna] = pd.to_numeric(df[columna], errors='coerce')
        except Exception:
            # Best effort: leave the column untouched. ``Exception`` (not a
            # bare except) so that Ctrl-C / SystemExit still propagate.
            print(f"No se pudo convertir la columna {columna} a numérica")

    # Clean and convert Min.
    if 'Min' in df.columns:
        try:
            df['Min'] = df['Min'].astype(str).str.replace(',', '', regex=False).astype(float)
        except Exception:
            print("Error al convertir la columna Min")

    return df

# Full transformation pipeline
def procesar_dataframe(df, is_team_df=False):
    """Run the cleaning pipeline on ``df`` and rename its columns for output.

    Player frames go through ``limpieza_inicial``, ``transformar_columnas``,
    ``corregir_equipos``, ``limpiar_posiciones`` and
    ``corregir_datos_especificos``. Team frames only get nulls replaced by 0
    and squad names corrected. Both then go through ``categorizar_columnas``,
    have their float columns rounded to 2 decimals and are renamed according
    to ``column_map``.

    Args:
        df: DataFrame to process.
        is_team_df: True for the team-level frame, False for player frames.

    Returns:
        The processed DataFrame.
    """
    if is_team_df:
        # Team frames: null handling and squad-name correction only.
        df = corregir_equipos(df.copy().fillna(0))
    else:
        # Player frames: full cleaning sequence, in this order.
        player_steps = (
            limpieza_inicial,
            transformar_columnas,
            corregir_equipos,
            limpiar_posiciones,
            corregir_datos_especificos,
        )
        for step in player_steps:
            df = step(df)

    df = categorizar_columnas(df)

    # Round float columns to 2 decimals, leaving the text columns alone.
    text_columns = ['Player', 'Nation', 'Country', 'Pos', 'Squad', 'Comp', 'Born', 'Main Position', 'Position2']
    for col in df.columns:
        if col in text_columns:
            continue
        if df[col].dtype in ['float64', 'float32']:
            df[col] = df[col].round(2)

    # Rename only the columns that are actually present.
    rename_cols = {old_name: new_name for old_name, new_name in column_map.items() if old_name in df.columns}
    return df.rename(columns=rename_cols)

# Main data-extraction routine
def extraer_datos_fbref():
    """Scrape the Big-5 league tables from FBref and write three CSV files.

    The function downloads the player, goalkeeper and squad tables, combines
    them, derives per-90 and possession-adjusted metrics, runs
    ``procesar_dataframe`` on each result and writes the outfield, goalkeeper
    and team frames to ``root`` using the module-level file names.

    The individual stat tables are combined positionally: each one is sorted
    by Player/Squad, re-indexed from 0 and joined on that index, so every
    page is assumed to list the same players. Column slices are positional
    as well and depend on the current FBref table layout.

    Returns:
        tuple: ``(df_outfield_final, df_goalkeepers_final, df_teams)``.
    """
    print("Iniciando extracción de datos de FBRef...")

    # Table URLs
    standard = "https://fbref.com/en/comps/Big5/stats/players/Big-5-European-Leagues-Stats"
    shooting = "https://fbref.com/en/comps/Big5/shooting/players/Big-5-European-Leagues-Stats"
    passing = "https://fbref.com/en/comps/Big5/passing/players/Big-5-European-Leagues-Stats"
    pass_types = "https://fbref.com/en/comps/Big5/passing_types/players/Big-5-European-Leagues-Stats"
    gsca = "https://fbref.com/en/comps/Big5/gca/players/Big-5-European-Leagues-Stats"
    defense = "https://fbref.com/en/comps/Big5/defense/players/Big-5-European-Leagues-Stats"
    poss = "https://fbref.com/en/comps/Big5/possession/players/Big-5-European-Leagues-Stats"
    misc = "https://fbref.com/en/comps/Big5/misc/players/Big-5-European-Leagues-Stats"

    # Download the player tables
    print("Obteniendo datos de jugadores...")
    df_standard = get_df(standard)
    df_shooting = get_df(shooting)
    df_passing = get_df(passing)
    df_pass_types = get_df(pass_types)
    df_gsca = get_df(gsca)
    df_defense = get_df(defense)
    df_poss = get_df(poss)
    df_misc = get_df(misc)

    # Sort every table the same way and re-index so rows line up for the joins
    for df in [df_standard, df_shooting, df_passing, df_pass_types, df_gsca, df_defense, df_poss, df_misc]:
        df.sort_values(['Player', 'Squad'], ascending=[True, True], inplace=True)
        df.reset_index(drop=True, inplace=True)

    # Combine all tables. Each rename runs right after its join so that the
    # repeated generic headers (Cmp, Att, ...) get distinct names.
    df = df_standard.iloc[:, 0:10]
    df = df.join(df_standard.iloc[:, 13])
    df = df.join(df_standard.iloc[:, 26])
    df = df.rename(columns={'G-PK': 'npGoals', 'Gls':'Glsxx'})
    df = df.join(df_shooting.iloc[:,8:25])
    df = df.rename(columns={'Gls': 'Goals', 'Sh': 'Shots', 'SoT': 'SoT', 'SoT%': 'SoT%', 'Sh/90': 'Sh/90', 'SoT/90': 'SoT/90', 'G/Sh': 'G/Sh', 'G/SoT': 'G/SoT', 'Dist': 'AvgShotDistance', 'FK': 'FKShots', 'PK': 'PK', 'PKatt': 'PKatt', 'xG': 'xG', 'npxG': 'npxG', 'npxG/Sh': 'npxG/Sh', 'G-xG': 'G-xG', 'np:G-xG': 'npG-xG'})

    df = df.join(df_passing.iloc[:,8:13])
    df = df.rename(columns={'Cmp': 'PassesCompleted', 'Att': 'PassesAttempted', 'Cmp%': 'TotCmp%', 'TotDist': 'TotalPassDist', 'PrgDist': 'ProgPassDist', })
    df = df.join(df_passing.iloc[:,13:16])
    df = df.rename(columns={'Cmp': 'ShortPassCmp', 'Att': 'ShortPassAtt', 'Cmp%': 'ShortPassCmp%', })
    df = df.join(df_passing.iloc[:,16:19])
    df = df.rename(columns={'Cmp': 'MedPassCmp', 'Att': 'MedPassAtt', 'Cmp%': 'MedPassCmp%', })
    df = df.join(df_passing.iloc[:,19:22])
    df = df.rename(columns={'Cmp': 'LongPassCmp', 'Att': 'LongPassAtt', 'Cmp%': 'LongPassCmp%', })
    df = df.join(df_passing.iloc[:,22:31])
    df = df.rename(columns={'Ast': 'Assists', 'xAG':'xAG', 'xA': 'xA', 'A-xAG': 'A-xAG', 'KP': 'KeyPasses', '1/3': 'Final1/3Cmp', 'PPA': 'PenAreaCmp', 'CrsPA': 'CrsPenAreaCmp', 'PrgP': 'ProgPasses', })

    df = df.join(df_pass_types.iloc[:, 9:23])
    df = df.rename(columns={'Live': 'LivePass', 'Dead': 'DeadPass', 'FK': 'FKPasses', 'TB': 'ThruBalls', 'Sw': 'Switches', 'Crs': 'Crs', 'CK': 'CK', 'In': 'InSwingCK', 'Out': 'OutSwingCK', 'Str': 'StrCK', 'TI': 'ThrowIn', 'Off': 'PassesToOff', 'Blocks':'PassesBlocked', 'Cmp':'Cmpxxx'})

    df = df.join(df_gsca.iloc[:, 8:16].rename(columns={'SCA': 'SCA', 'SCA90': 'SCA90', 'PassLive': 'SCAPassLive', 'PassDead': 'SCAPassDead', 'TO': 'SCADrib', 'Sh': 'SCASh', 'Fld': 'SCAFld', 'Def': 'SCADef'}))
    df = df.join(df_gsca.iloc[:, 16:24].rename(columns={'GCA': 'GCA', 'GCA90': 'GCA90', 'PassLive': 'GCAPassLive', 'PassDead': 'GCAPassDead', 'TO': 'GCADrib', 'Sh': 'GCASh', 'Fld': 'GCAFld', 'Def': 'GCADef'}))

    df = df.join(df_defense.iloc[:,8:13].rename(columns={'Tkl': 'Tkl', 'TklW': 'TklWinPoss', 'Def 3rd': 'Def3rdTkl', 'Mid 3rd': 'Mid3rdTkl', 'Att 3rd': 'Att3rdTkl'}))
    df = df.join(df_defense.iloc[:,13:24].rename(columns={'Tkl': 'DrbTkl', 'Att': 'DrbPastAtt', 'Tkl%': 'DrbTkl%', 'Lost': 'DrbPast', 'Blocks': 'Blocks', 'Sh': 'ShBlocks', 'Pass': 'PassBlocks', 'Int': 'Int', 'Tkl+Int': 'Tkl+Int', 'Clr': 'Clr', 'Err': 'Err'}))

    df = df.join(df_poss.iloc[:,8:30])
    df = df.rename(columns={'Touches': 'Touches', 'Def Pen': 'DefPenTouch', 'Def 3rd': 'Def3rdTouch', 'Mid 3rd': 'Mid3rdTouch', 'Att 3rd': 'Att3rdTouch', 'Att Pen': 'AttPenTouch', 'Live': 'LiveTouch', 'Succ': 'SuccDrb', 'Att': 'AttDrb', 'Succ%': 'DrbSucc%', 'Tkld':'TimesTackled', 'Tkld%':'TimesTackled%', 'Carries':'Carries', 'TotDist':'TotalCarryDistance', 'PrgDist':'ProgCarryDistance', 'PrgC':'ProgCarries', '1/3':'CarriesToFinalThird', 'CPA':'CarriesToPenArea', 'Mis': 'CarryMistakes', 'Dis': 'Disposesed', 'Rec': 'ReceivedPass', 'PrgR':'ProgPassesRec'})

    df = df.join(df_misc.iloc[:, 8:14])
    df = df.rename(columns={'CrdY': 'Yellows', 'CrdR': 'Reds', '2CrdY': 'Yellow2', 'Fls': 'Fls', 'Fld': 'Fld', 'Off': 'Off', })
    df = df.join(df_misc.iloc[:,17:24])
    df = df.rename(columns={'PKwon': 'PKwon', 'PKcon': 'PKcon', 'OG': 'OG', 'Recov': 'Recov', 'Won': 'AerialWins', 'Lost': 'AerialLoss', 'Won%': 'AerialWin%', })

    # Drop rows without a player
    df.dropna(subset=["Player"], inplace=True)

    # Clean Min (column 9): strip thousands separators before the numeric
    # conversion. This must be a whole-column assignment; assigning through
    # df.iloc[i][9] writes to a temporary row and leaves the commas in place,
    # which turns every value >= 1,000 into NaN below.
    df.iloc[:, 9] = df.iloc[:, 9].astype(str).str.replace(',', '', regex=False)
    df.iloc[:, 9:] = df.iloc[:, 9:].apply(pd.to_numeric, errors='coerce')

    # Download the goalkeeper tables
    print("Obteniendo datos de porteros...")
    gk = "https://fbref.com/en/comps/Big5/keepers/players/Big-5-European-Leagues-Stats"
    advgk = "https://fbref.com/en/comps/Big5/keepersadv/players/Big-5-European-Leagues-Stats"

    df_gk = get_df(gk)
    df_advgk = get_df(advgk)

    df_gk.sort_values(['Player', 'Squad'], ascending=[True, True], inplace=True)
    df_advgk.sort_values(['Player', 'Squad'], ascending=[True, True], inplace=True)

    df_gk = df_gk.reset_index(drop=True)
    df_advgk = df_advgk.reset_index(drop=True)

    # Goalkeeper rows of the combined frame. Pos is cast to str first (as is
    # done for df_gk / df_advgk) so non-string cells cannot break the mask.
    is_goalkeeper = df['Pos'].astype(str).str.contains("GK")
    df_goalkeepers = df[is_goalkeeper].reset_index().iloc[:, 1:].copy()
    df_gk['Pos'] = df_gk['Pos'].astype(str)
    df_gk = df_gk[df_gk['Pos'].str.contains('GK')]
    df_gk = df_gk.reset_index().iloc[:, 1:]
    df_gk = df_gk.rename(columns={'PKatt': 'PKsFaced'})

    df_goalkeepers = df_goalkeepers.join(df_gk.iloc[:, 11:26].astype(float), lsuffix='.1', rsuffix='.2')
    df_goalkeepers = df_goalkeepers.rename(columns={'GA': 'GA', 'GA90': 'GA90', 'SoTA': 'SoTA', 'Saves': 'Saves', 'Save%.1': 'Save%', 'W': 'W', 'D': 'D', 'L': 'L', 'CS': 'CS', 'CS%': 'CS%', 'PKsFaced': 'PKsFaced', 'PKA': 'PKA', 'PKsv': 'PKsv', 'PKm': 'PKm', 'Save%.2': 'PKSave%', })

    df_advgk['Pos'] = df_advgk['Pos'].astype(str)
    df_advgk = df_advgk[df_advgk['Pos'].str.contains('GK')]
    df_advgk = df_advgk.reset_index().iloc[:, 1:]
    df_goalkeepers = df_goalkeepers.join(df_advgk.iloc[:, 9:20].astype(float).rename(columns={'PKA': 'PKGA', 'FK': 'FKGA', 'CK': 'CKGA', 'OG': 'OGA', 'PSxG': 'PSxG', 'PSxG/SoT': 'PSxG/SoT', 'PSxG+/-': 'PSxG+/-', '/90': 'PSxG+/- /90', 'Cmp': 'LaunchCmp', 'Att': 'LaunchAtt', 'Cmp%': 'LaunchPassCmp%'}))
    df_goalkeepers = df_goalkeepers.join(df_advgk.iloc[:, 20:24].astype(float).rename(columns={'Att': 'PassAtt', 'Thr': 'PassThr', 'Launch%': 'PassesLaunch%', 'AvgLen': 'AvgLenLaunch'}))
    df_goalkeepers = df_goalkeepers.join(df_advgk.iloc[:, 24:33].astype(float).rename(columns={'Att': 'GoalKicksAtt', 'Launch%': 'GoalKicksLaunch%', 'AvgLen': 'AvgLen', 'Opp': 'OppCrs', 'Stp': 'StpCrs', 'Stp%': 'CrsStp%', '#OPA': '#OPA', '#OPA/90': '#OPA/90', 'AvgDist': 'AvgDistOPA'}))

    # Download the team tables
    print("Obteniendo datos de equipos...")
    team_standard = "https://fbref.com/en/comps/Big5/stats/squads/Big-5-European-Leagues-Stats"
    team_poss = "https://fbref.com/en/comps/Big5/possession/squads/Big-5-European-Leagues-Stats"

    df_team_std = get_df(team_standard)
    df_team_poss = get_df(team_poss)

    df_team_std = df_team_std.reset_index(drop=True)
    df_team_poss = df_team_poss.reset_index(drop=True)

    # Team frame
    df_teams = df_team_std.iloc[:, 0:30].copy()

    # Team touches per 90: column 5 of the possession table divided by
    # column 4 (NOTE(review): assumed to be Touches and 90s — confirm against
    # the current table layout). Falls back to 0 when a value cannot be parsed.
    df_teams['TeamTouches90'] = float(0.0)
    touches90_col = df_teams.shape[1] - 1  # the column just added is the last one
    for i in range(len(df_teams)):
        try:
            df_teams.iloc[i, touches90_col] = float(df_team_poss.iloc[i, 5]) / float(df_team_poss.iloc[i, 4])
        except Exception:
            df_teams.iloc[i, touches90_col] = 0

    # Clean minutes (strip thousands separators); non-string cells are skipped
    for j in range(0, len(df_teams)):
        try:
            df_teams.at[j, 'Min'] = df_teams.at[j, 'Min'].replace(',', '')
        except Exception:
            pass

    df_teams.iloc[:, 7:] = df_teams.iloc[:, 7:].apply(pd.to_numeric, errors='coerce')

    # Download the opponent table
    print("Obteniendo datos de oponentes...")
    opp_poss = "https://fbref.com/en/comps/Big5/possession/squads/Big-5-European-Leagues-Stats"

    df_opp_poss = get_opp_df(opp_poss)
    df_opp_poss = df_opp_poss.reset_index(drop=True)

    # Opponent frame
    df_opp = df_opp_poss.iloc[:, 0:15].copy()
    df_opp = df_opp.rename(columns={'Touches': 'Opp Touches'})
    df_opp = df_opp.reset_index()

    # Add opponent touches to the team frame, row by row in table order.
    # Built as a list and assigned as a whole column: element-wise chained
    # assignment does not update the frame under pandas Copy-on-Write.
    # Rows without a counterpart keep the default of 1.
    opp_touches = [1] * len(df_teams)
    n_common = min(len(df_teams), len(df_opp))
    try:
        opp_touches[:n_common] = list(df_opp['Opp Touches'].iloc[:n_common])
    except Exception as e:
        print(f"No se pudieron añadir los toques de oponentes: {e}")
    df_teams['Opp Touches'] = opp_touches

    df_teams = df_teams.rename(columns={'Min': 'Team Min'})

    # Columns copied unchanged (never divided) into the per-90 frames
    not_per90 = ['90s', 'Country', 'Pos', 'Player', 'Nation', 'Squad', 'Comp', 'Born', 'Main Position', 'Position2']

    def _age_to_years(value):
        # Keep the leading two-digit year count of string ages; leave any
        # other value (or an unparsable string) unchanged.
        if isinstance(value, str):
            try:
                return int(value[:2])
            except ValueError:
                return value
        return value

    # Outfield players
    print("Procesando datos de jugadores de campo...")
    df_outfield = df[~is_goalkeeper].reset_index().iloc[:, 1:].copy()
    df_outfield = limpiar_posiciones(df_outfield)

    # Per-90 statistics for outfield players
    df_outfield['90s'] = df_outfield['Min'] / 90
    df_90s = df_outfield.copy()

    for i in range(10, df_90s.shape[1]):
        col = df_90s.columns[i]
        if col not in not_per90:
            try:
                df_90s.iloc[:, i] = df_90s.iloc[:, i] / df_90s['90s']
            except Exception:
                # Best effort: a column that cannot be divided is kept as is.
                pass

    df_90s = df_90s.iloc[:, 10:].add_suffix('Per90')
    df_outfield_final = df_outfield.join(df_90s)

    # Clean ages (whole-column assignment, see note on chained assignment)
    if 'Age' in df_outfield_final.columns:
        df_outfield_final['Age'] = df_outfield_final['Age'].map(_age_to_years)

    # Same for goalkeepers
    print("Procesando datos de porteros...")
    df_goalkeepers['90s'] = df_goalkeepers['Min'] / 90
    df_90s_gk = df_goalkeepers.copy()

    for i in range(10, df_90s_gk.shape[1]):
        col = df_90s_gk.columns[i]
        if col not in not_per90:
            try:
                df_90s_gk.iloc[:, i] = df_90s_gk.iloc[:, i] / df_90s_gk['90s']
            except Exception:
                pass

    df_90s_gk = df_90s_gk.iloc[:, 10:].add_suffix('Per90')
    df_goalkeepers_final = df_goalkeepers.join(df_90s_gk)

    # Clean ages for goalkeepers
    if 'Age' in df_goalkeepers_final.columns:
        df_goalkeepers_final['Age'] = df_goalkeepers_final['Age'].map(_age_to_years)

    # Add team-level metrics to the player frames
    print("Añadiendo métricas de equipo a jugadores...")

    def add_team_metrics(df, teams_df):
        """Attach team context to ``df`` and derive possession-adjusted metrics.

        ``df`` is modified in place and also returned. Rows are matched to
        ``teams_df`` by 'Squad'; rows without a match keep the defaults.
        """
        df['AvgTeamPoss'] = 0.0
        df['OppTouches'] = 1
        df['TeamMins'] = 1
        df['TeamTouches90'] = 0.0

        for i in range(len(df)):
            try:
                team_name = df['Squad'].iloc[i]
                team_row = teams_df[teams_df['Squad'] == team_name]
                if not team_row.empty:
                    df.at[i, 'AvgTeamPoss'] = float(team_row['Poss'].values[0])
                    df.at[i, 'OppTouches'] = float(team_row['Opp Touches'].values[0])
                    df.at[i, 'TeamMins'] = float(team_row['Team Min'].values[0])
                    df.at[i, 'TeamTouches90'] = float(team_row['TeamTouches90'].values[0])
            except Exception as e:
                print(f"Error al añadir métricas de equipo para {df['Player'].iloc[i]}: {e}")

        # Make sure every input of the calculations below is numeric
        numeric_cols = ['Tkl+IntPer90', 'ClrPer90', 'ShBlocksPer90', 'PassBlocksPer90', 'IntPer90',
                      'DrbTklPer90', 'TklWinPossPer90', 'DrbPastPer90', 'AerialWinsPer90', 'AerialLossPer90',
                      'DrbPastAttPer90', 'TouchesPer90', 'Tkl+Int', 'Min', 'AvgTeamPoss',
                      'OppTouches', 'TeamMins', 'TeamTouches90']

        for col in numeric_cols:
            if col in df.columns:
                df[col] = pd.to_numeric(df[col], errors='coerce')

        # Possession-adjusted metrics: value / (100 - team possession) * 50
        try:
            df['pAdjTkl+IntPer90'] = (df['Tkl+IntPer90'] / (100 - df['AvgTeamPoss'])) * 50
            df['pAdjClrPer90'] = (df['ClrPer90'] / (100 - df['AvgTeamPoss'])) * 50
            df['pAdjShBlocksPer90'] = (df['ShBlocksPer90'] / (100 - df['AvgTeamPoss'])) * 50
            df['pAdjPassBlocksPer90'] = (df['PassBlocksPer90'] / (100 - df['AvgTeamPoss'])) * 50
            df['pAdjIntPer90'] = (df['IntPer90'] / (100 - df['AvgTeamPoss'])) * 50
            df['pAdjDrbTklPer90'] = (df['DrbTklPer90'] / (100 - df['AvgTeamPoss'])) * 50
            # Uses TklWinPossPer90; it previously reused DrbTklPer90 by mistake.
            df['pAdjTklWinPossPer90'] = (df['TklWinPossPer90'] / (100 - df['AvgTeamPoss'])) * 50
            df['pAdjDrbPastPer90'] = (df['DrbPastPer90'] / (100 - df['AvgTeamPoss'])) * 50
            df['pAdjAerialWinsPer90'] = (df['AerialWinsPer90'] / (100 - df['AvgTeamPoss'])) * 50
            df['pAdjAerialLossPer90'] = (df['AerialLossPer90'] / (100 - df['AvgTeamPoss'])) * 50
            df['pAdjDrbPastAttPer90'] = (df['DrbPastAttPer90'] / (100 - df['AvgTeamPoss'])) * 50
            df['TouchCentrality'] = (df['TouchesPer90'] / df['TeamTouches90']) * 100
            df['Tkl+IntPer600OppTouch'] = df['Tkl+Int'] / (df['OppTouches'] * (df['Min'] / df['TeamMins'])) * 600
            df['pAdjTouchesPer90'] = (df['TouchesPer90'] / df['AvgTeamPoss']) * 50
        except Exception as e:
            print(f"Error al calcular métricas ajustadas: {e}")

        # Per-touch metrics
        try:
            cols_to_check = ['Carries', 'Touches', 'ProgCarries', 'PassesCompleted', 'ProgPasses']
            if all(col in df.columns for col in cols_to_check):
                df['Carries'] = pd.to_numeric(df['Carries'], errors='coerce')
                df['Touches'] = pd.to_numeric(df['Touches'], errors='coerce')
                df['ProgCarries'] = pd.to_numeric(df['ProgCarries'], errors='coerce')
                df['PassesCompleted'] = pd.to_numeric(df['PassesCompleted'], errors='coerce')
                df['ProgPasses'] = pd.to_numeric(df['ProgPasses'], errors='coerce')

                # Rows with a zero denominator get 0 instead of dividing
                df['CarriesPer50Touches'] = df.apply(lambda x: 50 * x['Carries'] / x['Touches'] if x['Touches'] > 0 else 0, axis=1)
                df['ProgCarriesPer50Touches'] = df.apply(lambda x: 50 * x['ProgCarries'] / x['Touches'] if x['Touches'] > 0 else 0, axis=1)
                df['ProgPassesPer50CmpPasses'] = df.apply(lambda x: 50 * x['ProgPasses'] / x['PassesCompleted'] if x['PassesCompleted'] > 0 else 0, axis=1)
        except Exception as e:
            print(f"Error al calcular métricas por toques: {e}")

        return df

    # Apply the team metrics
    df_outfield_final = add_team_metrics(df_outfield_final, df_teams)
    df_goalkeepers_final = add_team_metrics(df_goalkeepers_final, df_teams)

    # Goalkeeper-only possession-adjusted OPA metric
    if '#OPAPer90' in df_goalkeepers_final.columns and 'AvgTeamPoss' in df_goalkeepers_final.columns:
        df_goalkeepers_final['pAdj#OPAPer90'] = (df_goalkeepers_final['#OPAPer90'] / (100 - df_goalkeepers_final['AvgTeamPoss'])) * 50

    # Add main positions from an external file
    print("Añadiendo datos de posiciones principales...")
    try:
        # Try to load the main-position data (path is relative to the
        # current working directory)
        tm_pos = pd.read_csv('TransfermarktPositions-Jase_Ziv83.csv')

        # Merge the position data on the player name
        df_outfield_final = pd.merge(df_outfield_final, tm_pos, on='Player', how='left')
        df_goalkeepers_final = pd.merge(df_goalkeepers_final, tm_pos, on='Player', how='left')

        # Goalkeepers always get the goalkeeper main position
        df_goalkeepers_final['Main Position'] = 'Goalkeeper'
    except Exception as e:
        print(f"No se pudieron obtener datos de posiciones: {e}")
        # Derive Main Position and Position2 from Pos instead
        if 'Main Position' not in df_outfield_final.columns:
            df_outfield_final['Main Position'] = df_outfield_final['Pos'].apply(
                lambda x: x.split(',')[0].strip() if isinstance(x, str) and ',' in x else x
            )
        if 'Position2' not in df_outfield_final.columns:
            df_outfield_final['Position2'] = df_outfield_final['Pos'].apply(
                lambda x: x.split(',')[1].strip() if isinstance(x, str) and ',' in x and len(x.split(',')) > 1 else ''
            )
        df_goalkeepers_final['Main Position'] = 'Goalkeeper'
        df_goalkeepers_final['Position2'] = 'Goalkeeper'

    # Final processing of the three frames
    print("Aplicando transformaciones finales...")
    df_outfield_final = procesar_dataframe(df_outfield_final)
    df_goalkeepers_final = procesar_dataframe(df_goalkeepers_final)
    df_teams = procesar_dataframe(df_teams, is_team_df=True)

    # Save the final frames. The paths are built with the Path "/" operator:
    # f"{root}{name}" has no separator and would write "<...>/seasonstats_..."
    # next to the intended directory instead of inside it.
    print("Guardando archivos CSV finales...")
    root.mkdir(parents=True, exist_ok=True)
    df_outfield_final.to_csv(root / final_nongk, index=False, encoding='utf-8-sig')
    df_goalkeepers_final.to_csv(root / final_gk, index=False, encoding='utf-8-sig')
    df_teams.to_csv(root / final_teams, index=False, encoding='utf-8-sig')

    print(f"Proceso completado con éxito. Los archivos se han guardado en: {root}")
    print(f"1. {final_nongk} - Datos de jugadores de campo")
    print(f"2. {final_gk} - Datos de porteros")
    print(f"3. {final_teams} - Datos de equipos")

    return df_outfield_final, df_goalkeepers_final, df_teams

# Run the main extraction when this code is executed as the main module.
# The returned DataFrames are discarded; the results are the CSV files.
if __name__ == "__main__":
    extraer_datos_fbref()

Iniciando extracción de datos de FBRef...
Obteniendo datos de jugadores...


IndexError: list index out of range