In [1]:
import pandas as pd
import requests

from selenium import webdriver
from seleniumbase import SB
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

from bs4 import BeautifulSoup
import codecs
import re
from webdriver_manager.chrome import ChromeDriverManager

In [2]:
def custom_join(columns):
    """Collapse a two-level column label into a single flat name.

    Args:
        columns: A pair ``(upper, lower)`` of string labels.

    Returns:
        ``upper`` when the lower label starts with "Unnamed" (including the
        case where both do), ``lower`` when only the upper label starts with
        "Unnamed", and ``"upper-lower"`` when neither does.
    """
    upper, lower = columns
    upper_is_placeholder = upper.startswith("Unnamed")
    lower_is_placeholder = lower.startswith("Unnamed")

    # A placeholder lower label always yields the upper one, whether or not
    # the upper label is itself a placeholder.
    if lower_is_placeholder:
        return upper
    if upper_is_placeholder:
        return lower
    return f"{upper}-{lower}"

In [3]:
def get_html_source(url, keyword="Premier League"):
    """Open ``url`` in a SeleniumBase session and return its page source.

    The page is opened once; if ``keyword`` is not visible it is opened a
    second time. ``keyword`` is then asserted to be present before the page
    source is read.

    Args:
        url: Address of the page to load.
        keyword: Text that must appear on the page.

    Returns:
        The page source with every ``<!--`` and ``-->`` marker removed.
    """
    with SB(uc=True, test=True) as browser:
        # Second argument is forwarded as-is (3, then 4 on the retry);
        # presumably a reconnect delay — confirm against SeleniumBase docs.
        browser.driver.uc_open_with_reconnect(url, 3)
        keyword_missing = not browser.is_text_visible(keyword)
        if keyword_missing:
            browser.driver.uc_open_with_reconnect(url, 4)
        browser.assert_text(keyword, timeout=3)

        page = browser.get_page_source()
        # Strip the comment delimiters so markup wrapped in HTML comments
        # becomes part of the regular document.
        for marker in ('<!--', '-->'):
            page = page.replace(marker, '')
    return page


In [4]:
def get_table_df(html_source, table_id):
    """Extract the ``<table>`` with the given id from HTML as a DataFrame.

    Args:
        html_source: HTML document as a string.
        table_id: Value of the ``id`` attribute of the table to extract.

    Returns:
        The first DataFrame that ``pd.read_html`` parses from that table.

    Raises:
        ValueError: If no ``<table>`` with ``table_id`` exists in the HTML.
    """
    # Local import so this function does not rely on a new top-level import.
    from io import StringIO

    soup = BeautifulSoup(html_source, 'html.parser')
    table = soup.find('table', id=table_id)
    if table is None:
        # Without this guard the literal text "None" would be handed to
        # pd.read_html, producing a confusing downstream error.
        raise ValueError(f"No <table> with id={table_id!r} found in the HTML source")

    # Wrap the markup in a file-like object: passing a literal HTML string
    # to pd.read_html is deprecated in recent pandas releases.
    return pd.read_html(StringIO(str(table)))[0]

In [5]:
def clean_table(df, cols_drop_list=['Rk', 'Nation', 'Born', 'Matches']):
    """Flatten the column labels and remove unwanted rows and columns.

    Steps:
        1. Map every column label through ``custom_join``.
        2. Drop rows whose ``Rk`` value is missing.
        3. Drop rows whose ``Player`` value is the literal string "Player"
           (presumably header rows repeated inside the table body — confirm
           against the source page).
        4. Drop the columns listed in ``cols_drop_list``.

    The input frame is left untouched; a cleaned copy is returned.

    Args:
        df: DataFrame whose column labels are pairs accepted by
            ``custom_join``.
        cols_drop_list: Column names (after flattening) to remove. The
            default list is only read, never mutated.

    Returns:
        A new DataFrame with flattened columns and the rows/columns above
        removed.

    Raises:
        KeyError: If ``Rk``, ``Player`` or any name in ``cols_drop_list`` is
            not a column after flattening.
    """
    # Work on a copy so reassigning the columns does not alter the caller's frame.
    cleaned = df.copy()
    cleaned.columns = cleaned.columns.map(custom_join)

    cleaned = cleaned.dropna(subset=['Rk'])
    cleaned = cleaned[cleaned["Player"] != "Player"]

    # The former `groupby(["Player", "Born"])` step was removed: without an
    # aggregation it yields a GroupBy object, which has no `.drop`, so the
    # function raised AttributeError. Add an explicit `.agg(...)` here if
    # per-player aggregation is wanted.
    return cleaned.drop(columns=list(cols_drop_list))

In [6]:
def save_df(filename, is_excel=None, frame=None):
    """Write a DataFrame to ``filename`` as an Excel workbook or a CSV file.

    Args:
        filename: Destination path.
        is_excel: ``True`` to write with ``to_excel``, ``False`` to write
            with ``to_csv``. When omitted (``None``) the format is inferred
            from the file extension: ``.xls``, ``.xlsx`` and ``.xlsm`` are
            written as Excel, anything else as CSV.
        frame: DataFrame to write. When omitted, the notebook-level variable
            ``df`` is used, which matches the previous behaviour.

    Raises:
        NameError: If ``frame`` is omitted and no global ``df`` exists.
    """
    if frame is None:
        # Backward-compatible fallback to the notebook's global DataFrame.
        frame = df

    if is_excel is None:
        is_excel = str(filename).lower().endswith((".xls", ".xlsx", ".xlsm"))

    # The row index is never written, for either format.
    if is_excel:
        frame.to_excel(filename, index=False)
    else:
        frame.to_csv(filename, index=False)

In [7]:
# Scrape configuration: page to load, text used to confirm the page loaded,
# id of the table to extract, and the columns to discard after cleaning.
url = "https://fbref.com/en/comps/Big5/misc/players/Big-5-European-Leagues-Stats"
keyword = "Big 5 European Leagues"  # asserted to be on the page by get_html_source
table_id = "stats_misc"  # id attribute of the <table> element passed to get_table_df
cols_drop_list=['Rk', 'Nation', 'Born', 'Matches']  # columns removed by clean_table

# Pipeline: fetch the page source, parse the table, clean it, preview the result.
# NOTE: save_df reads the notebook-level name `df`, so this variable name matters.
html_source = get_html_source(url, keyword)
df = get_table_df(html_source, table_id)
df = clean_table(df, cols_drop_list)
df.head()



Unnamed: 0,Player,Pos,Squad,Comp,Age,90s,Performance-CrdY,Performance-CrdR,Performance-2CrdY,Performance-Fls,...,Performance-Crs,Performance-Int,Performance-TklW,Performance-PKwon,Performance-PKcon,Performance-OG,Performance-Recov,Aerial Duels-Won,Aerial Duels-Lost,Aerial Duels-Won%
0,Max Aarons,DF,Bournemouth,eng Premier League,23,13.7,1,0,0,12,...,13,8,19,0,1,0,75,5,11,31.3
1,Brenden Aaronson,"MF,FW",Union Berlin,de Bundesliga,22,14.1,3,1,1,15,...,22,2,18,0,0,0,88,13,16,44.8
2,Paxten Aaronson,MF,Eint Frankfurt,de Bundesliga,19,1.1,0,0,0,6,...,0,0,2,0,0,0,5,3,0,100.0
3,Keyliane Abdallah,FW,Marseille,fr Ligue 1,17,0.0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0.0
4,Yunis Abdelhamid,DF,Reims,fr Ligue 1,35,30.9,5,0,0,26,...,3,39,35,0,0,1,149,61,37,62.2


In [9]:
# State the output format explicitly: the target file is an .xlsx workbook.
save_df("Big_5_23_24_Misc.xlsx", is_excel=True)