# FBREF DATA GATHERING

# LIBRARIES

In [196]:
import pandas as pd
import warnings

# Silence every warning category for the rest of the kernel session.
# NOTE(review): this also hides pandas deprecation / chained-assignment
# warnings raised by the cells below - consider narrowing it with `category=`.
warnings.filterwarnings('ignore')

In [197]:
# Show every column when a DataFrame is displayed (None removes the cap).
pd.options.display.max_columns = None
# The row cap is left at the pandas default; to lift it as well:
# pd.options.display.max_rows = None

# BASE URLS - PLAYER DATA BIG 5 LEAGUES

In [198]:
# Competition identifiers interpolated into the URLs below: `lg_id` goes in the
# path segment after /comps/, `lg_str` is the prefix of the page name.
lg_id, lg_str = "Big5", "Big-5-European"

In [199]:
# One player-stats URL per table. Every URL follows the same template and only
# the table slug changes, so the slugs are listed once, in the same order as the
# names they are unpacked into.
standard, shooting, passing, pass_types, gsca, defense, misc = (
    f"https://fbref.com/en/comps/{lg_id}/{table_slug}/players/{lg_str}-Stats"
    for table_slug in (
        "stats",
        "shooting",
        "passing",
        "passing_types",
        "gca",
        "defense",
        "misc",
    )
)

In [207]:
# Display one resolved URL as a quick check of the f-string template above.
standard

'https://fbref.com/en/comps/Big5/stats/players/Big-5-European-Stats'

# RENAME DICTS

In [200]:

# Renames applied to every table before the table-specific mapping.
# The first nine columns are keyed by position: the flattened header of a column
# at position N with label X is 'Unnamed: N_level_0_X'.
base_rename_columns_dict = {
    f'Unnamed: {position}_level_0_{label}': label
    for position, label in enumerate(
        ('Rk', 'Player', 'Nation', 'Pos', 'Squad', 'Comp', 'Age', 'Born', '90s')
    )
}
# 'Matches' sits at position 25 in some tables; the other positions are covered
# by the per-table dictionaries.
base_rename_columns_dict['Unnamed: 25_level_0_Matches'] = 'Matches'

# Renames for the "standard" table. Keys are the flattened two-level headers
# ('<group>_<stat>'); entries that map a name to itself are kept so the table
# lists every column it expects.
standard_rename_columns_dict = {
    'Playing Time_MP': 'Playing Time_MP',
    'Playing Time_Starts': 'Playing Time_Starts',
    'Playing Time_Min': 'Playing Time_Min',
    'Playing Time_90s': 'Playing Time_90s',
    'Performance_Gls': 'Goals',
    'Performance_Ast': 'Assists',
    'Performance_G+A': 'Goals + Assists',
    'Performance_G-PK': 'Goals - PK',
    'Performance_PK': 'Penalty goals',
    'Performance_PKatt': 'Penalties attempted',
    'Performance_CrdY': 'Performance_CrdY',
    'Performance_CrdR': 'Performance_CrdR',
    'Expected_xG': 'xG',
    'Expected_npxG': 'npxG',
    'Expected_xAG': 'xAG',
    'Expected_npxG+xAG': 'npxG+xAG',
    'Progression_PrgC': 'Progressive carries',
    'Progression_PrgP': 'Progressive passes',
    # PrgR = progressive passes received; this replaces the former
    # placeholder label 'Progressive R?' that ended up in exported headers.
    'Progression_PrgR': 'Progressive passes received',
    'Unnamed: 37_level_0_Matches': 'Matches',
}


# Renames for the "shooting" table, grouped by the top-level header each
# flattened column name starts with ('Standard_' / 'Expected_').
shooting_rename_columns_dict = {
    'Unnamed: 8_level_0_90s': '90s',
    **{
        f'Standard_{stat}': label
        for stat, label in (
            ('Gls', 'Goals'),
            ('Sh', 'Shots'),
            ('SoT', 'Shots on target'),
            ('SoT%', 'Shots on target%'),
            ('Sh/90', 'Shots p90'),
            ('SoT/90', 'Shots on target p90'),
            ('G/Sh', 'Goals per shot'),
            ('G/SoT', 'Goals per shot on target'),
            ('Dist', 'Shot distance'),
            ('FK', 'Standard_FK'),
            ('PK', 'Standard_PK'),
            ('PKatt', 'Penalties attempted'),
        )
    },
    **{
        f'Expected_{stat}': label
        for stat, label in (
            ('xG', 'xG'),
            ('npxG', 'npxG'),
            ('npxG/Sh', 'npxG/Sh'),
            ('G-xG', 'G-xG'),
            ('np:G-xG', 'npG-npxG'),
        )
    },
    'Unnamed: 26_level_0_Matches': 'Matches',
}


# Renames for the "passing" table. Keys are the flattened two-level headers.
# Labelling convention: '<X>_Cmp' is a count ("completed"), '<X>_Cmp%' is the
# rate ("completion%").
passing_rename_columns_dict = {
    'Unnamed: 8_level_0_90s': '90s',
    'Total_Cmp': 'Passes completed',
    'Total_Att': 'Passes attempted',
    'Total_Cmp%': 'Pass completion%',
    'Total_TotDist': 'Passes total distance',
    'Total_PrgDist': 'Passes progressive distance',
    # Was 'Short passes completion', inconsistent with Total/Long and easy to
    # confuse with the percentage column below.
    'Short_Cmp': 'Short passes completed',
    'Short_Att': 'Short passes attempted',
    'Short_Cmp%': 'Short passes completion%',
    # Was 'Medium passes completion' (same inconsistency as Short_Cmp).
    'Medium_Cmp': 'Medium passes completed',
    'Medium_Att': 'Medium passes attempted',
    'Medium_Cmp%': 'Medium passes completion%',
    'Long_Cmp': 'Long passes completed',
    'Long_Att': 'Long passes attempted',
    'Long_Cmp%': 'Long passes completion%',
    'Unnamed: 23_level_0_Ast': 'Assists',
    'Unnamed: 24_level_0_xAG': 'xAG',
    'Expected_xA': 'xA',
    # The source column is A-xAG (assists minus xAG); the previous label
    # 'xA-xAG' described a different quantity.
    'Expected_A-xAG': 'A-xAG',
    'Unnamed: 27_level_0_KP': 'Key passes',
    'Unnamed: 28_level_0_1/3': 'P1/3',
    'Unnamed: 29_level_0_PPA': 'PPA',
    'Unnamed: 30_level_0_CrsPA': 'CrsPA',
    'Unnamed: 31_level_0_PrgP': 'PrgP',
    'Unnamed: 32_level_0_Matches': 'Matches'
}

# Renames for the "pass types" table. Only the attempts column and 'Matches'
# get new names; every grouped column maps to itself, so those entries are
# generated from (group, stats) pairs in their original order.
pass_types_rename_columns_dict = {
    'Unnamed: 9_level_0_Att': 'Att',
    **{
        f'{group}_{stat}': f'{group}_{stat}'
        for group, stats in (
            ('Pass Types', ('Live', 'Dead', 'FK', 'TB', 'Sw', 'Crs', 'TI', 'CK')),
            ('Corner Kicks', ('In', 'Out', 'Str')),
            ('Outcomes', ('Cmp', 'Off', 'Blocks')),
        )
        for stat in stats
    },
    'Unnamed: 24_level_0_Matches': 'Matches',
}

# Renames for the goal/shot-creation table. The SCA and GCA halves have the
# same layout (total, per-90, then six '<action> Types_*' columns), and all of
# those columns map to themselves; only '90s' and 'Matches' are renamed.
gsca_rename_columns_dict = {
    'Unnamed: 8_level_0_90s': '90s',
    **{
        column: column
        for action in ('SCA', 'GCA')
        for column in (
            f'{action}_{action}',
            f'{action}_{action}90',
            *(
                f'{action} Types_{kind}'
                for kind in ('PassLive', 'PassDead', 'TO', 'Sh', 'Fld', 'Def')
            ),
        )
    },
    'Unnamed: 25_level_0_Matches': 'Matches',
}

# Renames for the "defense" table: the four header-less columns at positions
# 21-24. All other defense columns keep their flattened names.
defense_rename_columns_dict = {
    f'Unnamed: {position}_level_0_{stat}': label
    for position, (stat, label) in enumerate(
        (
            ('Int', 'Interceptions'),
            ('Tkl+Int', 'Tkl+Int'),
            ('Clr', 'Clearances'),
            ('Err', 'Errors'),
        ),
        start=21,
    )
}


# Renames for the "misc" table. Most 'Performance_*' columns map to themselves
# and are generated from the stat codes; the entries that actually change a
# name are written out explicitly.
misc_rename_columns_dict = {
    'Unnamed: 8_level_0_90s': '90s',
    **{
        f'Performance_{stat}': f'Performance_{stat}'
        for stat in (
            'CrdY', 'CrdR', '2CrdY', 'Fls', 'Fld', 'Off',
            'Crs', 'Int', 'TklW', 'PKwon', 'PKcon', 'OG',
        )
    },
    'Performance_Recov': 'Recoveries',
    'Aerial Duels_Won': 'Aerial Duels Won',
    'Aerial Duels_Lost': 'Aerial Duels Lost',
    'Aerial Duels_Won%': 'Aerial Duels_Won%',
    'Unnamed: 25_level_0_Matches': 'Matches',
}

# FBREF STRUCTURE DICT

In [201]:
# Registry keyed by stats type. Each entry records the page URL of the table
# and the table-specific rename mapping to apply to it.
structure_dict = {
    stats_name: {'url': table_url, 'rename_dict': column_renames}
    for stats_name, table_url, column_renames in (
        ('standard', standard, standard_rename_columns_dict),
        ('shooting', shooting, shooting_rename_columns_dict),
        ('passing', passing, passing_rename_columns_dict),
        ('pass_types', pass_types, pass_types_rename_columns_dict),
        ('gsca', gsca, gsca_rename_columns_dict),
        ('defense', defense, defense_rename_columns_dict),
        ('misc', misc, misc_rename_columns_dict),
    )
}

# FUNCTIONS

In [202]:
def read_data_from_fbref(stat_type):
    """
    Read the first HTML table found at a source and flatten its header.

    Parameters
    ----------
    stat_type : str
        Despite the name, this is the source handed to ``pandas.read_html``
        (in this notebook: the URL of a stats page).

    Returns
    -------
    pandas.DataFrame
        The first table on the page with single-level string column names.
        Two-level headers are joined as '<top>_<bottom>'; headers that are
        already single-level are kept as they are.
    """
    df = pd.read_html(stat_type)[0]

    if isinstance(df.columns, pd.MultiIndex):
        # Each column label is a tuple of header levels; map(str, ...) keeps
        # the join from failing on non-string levels.
        df.columns = ['_'.join(map(str, col)).strip() for col in df.columns.values]
    else:
        # Joining a plain string would interleave '_' between its characters
        # ('Player' -> 'P_l_a_y_e_r'), so flat headers are only normalised.
        df.columns = [str(col).strip() for col in df.columns]

    return df

In [203]:
def _strip_leading_code(value):
    """Return the text after the first space of a string; pass anything else through."""
    if isinstance(value, str):
        # 'eng ENG' -> 'ENG'; a string without a space is returned unchanged.
        return value.split(' ', 1)[-1]
    return value


def transform_data(df, rename_dict):
    """
    Clean one raw player table produced by ``read_data_from_fbref``.

    Steps, in order:
      1. rename columns with ``base_rename_columns_dict`` and then ``rename_dict``;
      2. drop the 'Matches' column;
      3. keep only the part after the first space in 'Nation' and 'Comp'
         (missing nations become 'Unknown');
      4. remove the header rows repeated inside the table body
         (rows whose 'Player' value is the literal 'Player');
      5. drop duplicate (Player, Age, Nation) rows, keeping the last one;
      6. convert every column from position 7 onward to float, turning
         values that cannot be parsed into NaN.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw table with flattened column names. It is not modified.
    rename_dict : dict
        Table-specific column renames.

    Returns
    -------
    pandas.DataFrame
        A new, cleaned frame.

    Raises
    ------
    KeyError
        If 'Player', 'Age' or 'Nation' is missing after renaming.
    """
    df = df.rename(columns=base_rename_columns_dict).rename(columns=rename_dict)

    # Remove 'Matches' by name, including a still-prefixed
    # 'Unnamed: N_level_0_Matches' whose position is in neither rename table.
    # The previous code dropped 'Matches' and then also sliced off the last
    # column, which threw away a real stat column.
    matches_cols = [
        col for col in df.columns
        if str(col) == 'Matches'
        or (str(col).startswith('Unnamed:') and str(col).endswith('_Matches'))
    ]
    df = df.drop(columns=matches_cols)

    if 'Nation' in df.columns:
        df['Nation'] = df['Nation'].fillna('Unknown').astype(str).apply(_strip_leading_code)
    if 'Comp' in df.columns:
        # Missing competitions are passed through instead of raising TypeError.
        df['Comp'] = df['Comp'].apply(_strip_leading_code)

    # Header rows are repeated inside the table body; they carry the column
    # label as their value.
    df = df[df['Player'] != 'Player']

    # .copy() so the column assignment below writes to an independent frame
    # rather than to a filtered view.
    df = df.drop_duplicates(subset=['Player', 'Age', 'Nation'], keep='last').copy()

    # Columns 0-6 are left untouched; everything from position 7 onward is
    # treated as numeric.
    first_numeric_position = 7
    numeric_cols = df.columns[first_numeric_position:]
    if len(numeric_cols):
        # Coerce first, cast second: casting to float first raised on any
        # non-numeric cell and made errors='coerce' unreachable.
        df[numeric_cols] = (
            df[numeric_cols].apply(pd.to_numeric, errors='coerce').astype(float)
        )

    return df

In [204]:
def create_stats_data(stats_type='standard'):

    """
    Download one player stats table and return it cleaned.

    Looks up the URL and rename mapping for ``stats_type`` in
    ``structure_dict``, reads the table with ``read_data_from_fbref`` and
    passes it through ``transform_data``.

    Options for stats_type parameter (the keys of ``structure_dict``):

    'standard', 'shooting', 'passing',
    'pass_types', 'gsca', 'defense', 'misc'

    Returns the cleaned table produced by ``transform_data``.

    Raises KeyError (listing the valid options) for any other value.
    """

    if stats_type not in structure_dict:
        raise KeyError(
            f"Unknown stats_type {stats_type!r}; "
            f"valid options: {sorted(structure_dict)}"
        )

    table_config = structure_dict[stats_type]

    raw_table = read_data_from_fbref(table_config['url'])

    return transform_data(raw_table, table_config['rename_dict'])

# Example: extract every Big 5 player stats table and export each one to Excel

In [206]:
# Fetch, clean and export every table, keeping each cleaned DataFrame in memory
# for the merge step. DataFrame.to_excel returns None, so chaining it onto the
# assignment (as before) left all of these names bound to None.
export_order = ('standard', 'shooting', 'passing', 'pass_types', 'defense', 'misc', 'gsca')

stats_frames = {}
for stats_type in export_order:
    stats_frames[stats_type] = create_stats_data(stats_type)
    stats_frames[stats_type].to_excel(f'{stats_type}.xlsx', index=False)

# Short aliases, in the same order as `export_order`.
std, sh, pass_, pt, def_, miscc, gs = (stats_frames[name] for name in export_order)

# Next steps: (MERGE DATA AND SELECT USEFUL COLUMNS)

List of columns:

  -

  -
  
  -
