In [None]:
# Python ≥3.5 (ideally)
import platform
import sys, getopt
assert sys.version_info >= (3, 5)
import csv

# Import Dependencies
%matplotlib inline

# Math Operations
import numpy as np
from math import pi

# Datetime
import datetime
from datetime import date
import time

# Data Preprocessing
import pandas as pd
import os
import re
import random
import glob
from io import BytesIO
from pathlib import Path

# Reading directories
import glob
import os

# Working with JSON
import json
from selenium.webdriver.chrome.service import Service

# Web Scraping
from selenium import webdriver
from bs4 import BeautifulSoup
import requests
from bs4 import BeautifulSoup
import re

# Data Visualisation
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns


# Progress Bar


# Display in Jupyter
from IPython.display import Image, YouTubeVideo
from IPython.core.display import HTML

# Ignore Warnings
import warnings
warnings.filterwarnings(action="ignore", message="^internal gelsd")

print('Setup Complete')

In [None]:
import os
import time
import pandas as pd
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By

# List of stat categories and table IDs
stat_categories = {
    "standard": "stats_standard",
    "shooting": "stats_shooting",
    "passing": "stats_passing",
    "passing_types": "stats_passing_types",
    "possession": "stats_possession",
    "defense": "stats_defense",
    "playing_time": "stats_playing_time",
    "misc": "stats_misc",
    "gca": "stats_gca"
}

# Output folder
output_dir = "seriea_2024_2025_stats"
os.makedirs(output_dir, exist_ok=True)

# Base URL
base_url = "https://fbref.com/en/comps/11/10731/{category}/2024-2025-Serie-A-Stats"

# Configure Selenium headless Chrome
options = Options()
options.add_argument("--headless")
options.add_argument("--disable-gpu")
options.add_argument("--window-size=1920,1080")
driver = webdriver.Chrome(options=options)

# Scrape each table
for category, table_id in stat_categories.items():
    url = base_url.format(category=category)
    print(f"Scraping {category} from {url} ...")

    try:
        driver.get(url)
        time.sleep(5)
        element = driver.find_element(By.ID, f"div_{table_id}")
        table_html = element.get_attribute("innerHTML")
        df = pd.read_html(table_html)[0]

        # Remove duplicate header rows
        df = df[df[df.columns[0]] != df.columns[0]]

        # Save to CSV
        df.to_csv(os.path.join(output_dir, f"{category}.csv"), index=False)
        print(f"✅ Saved {category}.csv")
    except Exception as e:
        print(f"❌ Failed to scrape {category}: {e}")

driver.quit()
print("✅ Done scraping all tables.")


In [3]:
# Python ≥3.5 Check
import sys
assert sys.version_info >= (3, 5)

# --- Core Imports ---
import os
import re
import time
import warnings
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Display Settings
sns.set_theme(style="whitegrid")
warnings.filterwarnings(action="ignore", message="^internal gelsd")

# --- Web Scraping ---
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
from bs4 import BeautifulSoup

# --- Progress Bar ---
from tqdm import tqdm

# --- Define working directory explicitly ---
wd = '/Users/samueldworetzky/projects/Github/soccer-analytics'
out_dir = os.path.join(wd, 'seriea_2024_2025_stats')
os.makedirs(out_dir, exist_ok=True)

# --- WebDriver Options ---
options = webdriver.ChromeOptions()
options.add_argument('--headless')
options.add_argument('--no-sandbox')
options.add_argument('--disable-dev-shm-usage')

# --- Team & Season Setup ---
lst_teams_sa_2425 = [
    'ac-milan', 'atalanta', 'bologna', 'cagliari', 'como',
    'empoli', 'fiorentina', 'genoa', 'hellas-verona', 'inter-milan',
    'juventus', 'lazio', 'lecce', 'monza', 'napoli',
    'parma', 'roma', 'torino', 'udinese', 'venezia'
]
season = '2024-2025'
competition = 'serie-a'

# --- Scraping Function ---
def scrape_capology_season_prev(lst_teams, season, comp):
    print(f'\n📥 Scraping {comp} for the {season} season...\n')
    dfs_players = []

    for team in lst_teams:
        file_path = os.path.join(out_dir, f'{team}_{comp}_{season}.csv')

        if not os.path.exists(file_path):
            url = f'https://www.capology.com/club/{team}/salaries/{season}/'
            print(f'🌐 Scraping {team} ({season})...')

            service = Service(ChromeDriverManager().install())
            wd_driver = webdriver.Chrome(service=service, options=options)
            wd_driver.get(url)
            time.sleep(5)
            html = wd_driver.page_source
            wd_driver.quit()

            try:
                raw_df = pd.read_html(html, header=0)[1]
                raw_df.dropna(how='all', inplace=True)
                raw_df.columns = raw_df.columns.str.strip()
                df = raw_df.reset_index(drop=True)
                df.drop(columns=['Rank'], inplace=True, errors='ignore')
            except Exception as e:
                print(f'⚠️ Failed to parse {team}: {e}')
                continue

            df['Team'] = team.replace('-', ' ').title().replace('Fc', 'FC').replace('Ac', 'AC')
            df['League'] = comp.replace('-', ' ').title()
            df['Season'] = season

            df.to_csv(file_path, index=False)
            dfs_players.append(df)
            print(f'✅ Saved: {file_path}')
        else:
            print(f'📂 Already exists: {file_path}')
            df = pd.read_csv(file_path)
            dfs_players.append(df)

    if dfs_players:
        df_all = pd.concat(dfs_players, ignore_index=True)
        all_path = os.path.join(out_dir, f'all_{comp}_{season}.csv')
        df_all.to_csv(all_path, index=False)
        print(f'\n✅ All data saved: {all_path} — {len(df_all)} total rows\n')
        return df_all
    else:
        print('⚠️ No data was collected.')
        return pd.DataFrame()

# --- Run Scraper ---
df_players_all = scrape_capology_season_prev(lst_teams_sa_2425, season, competition)



📥 Scraping serie-a for the 2024-2025 season...

🌐 Scraping ac-milan (2024-2025)...


  raw_df = pd.read_html(html, header=0)[1]


✅ Saved: /Users/samueldworetzky/projects/Github/soccer-analytics/seriea_2024_2025_stats/ac-milan_serie-a_2024-2025.csv
🌐 Scraping atalanta (2024-2025)...


  raw_df = pd.read_html(html, header=0)[1]


✅ Saved: /Users/samueldworetzky/projects/Github/soccer-analytics/seriea_2024_2025_stats/atalanta_serie-a_2024-2025.csv
🌐 Scraping bologna (2024-2025)...


  raw_df = pd.read_html(html, header=0)[1]


✅ Saved: /Users/samueldworetzky/projects/Github/soccer-analytics/seriea_2024_2025_stats/bologna_serie-a_2024-2025.csv
🌐 Scraping cagliari (2024-2025)...


  raw_df = pd.read_html(html, header=0)[1]


✅ Saved: /Users/samueldworetzky/projects/Github/soccer-analytics/seriea_2024_2025_stats/cagliari_serie-a_2024-2025.csv
🌐 Scraping como (2024-2025)...


  raw_df = pd.read_html(html, header=0)[1]


✅ Saved: /Users/samueldworetzky/projects/Github/soccer-analytics/seriea_2024_2025_stats/como_serie-a_2024-2025.csv
🌐 Scraping empoli (2024-2025)...


  raw_df = pd.read_html(html, header=0)[1]


✅ Saved: /Users/samueldworetzky/projects/Github/soccer-analytics/seriea_2024_2025_stats/empoli_serie-a_2024-2025.csv
🌐 Scraping fiorentina (2024-2025)...


  raw_df = pd.read_html(html, header=0)[1]


✅ Saved: /Users/samueldworetzky/projects/Github/soccer-analytics/seriea_2024_2025_stats/fiorentina_serie-a_2024-2025.csv
🌐 Scraping genoa (2024-2025)...


  raw_df = pd.read_html(html, header=0)[1]


✅ Saved: /Users/samueldworetzky/projects/Github/soccer-analytics/seriea_2024_2025_stats/genoa_serie-a_2024-2025.csv
🌐 Scraping hellas-verona (2024-2025)...


  raw_df = pd.read_html(html, header=0)[1]


✅ Saved: /Users/samueldworetzky/projects/Github/soccer-analytics/seriea_2024_2025_stats/hellas-verona_serie-a_2024-2025.csv
🌐 Scraping inter-milan (2024-2025)...


  raw_df = pd.read_html(html, header=0)[1]


✅ Saved: /Users/samueldworetzky/projects/Github/soccer-analytics/seriea_2024_2025_stats/inter-milan_serie-a_2024-2025.csv
🌐 Scraping juventus (2024-2025)...


  raw_df = pd.read_html(html, header=0)[1]


✅ Saved: /Users/samueldworetzky/projects/Github/soccer-analytics/seriea_2024_2025_stats/juventus_serie-a_2024-2025.csv
🌐 Scraping lazio (2024-2025)...


  raw_df = pd.read_html(html, header=0)[1]


✅ Saved: /Users/samueldworetzky/projects/Github/soccer-analytics/seriea_2024_2025_stats/lazio_serie-a_2024-2025.csv
🌐 Scraping lecce (2024-2025)...


  raw_df = pd.read_html(html, header=0)[1]


✅ Saved: /Users/samueldworetzky/projects/Github/soccer-analytics/seriea_2024_2025_stats/lecce_serie-a_2024-2025.csv
🌐 Scraping monza (2024-2025)...


  raw_df = pd.read_html(html, header=0)[1]


✅ Saved: /Users/samueldworetzky/projects/Github/soccer-analytics/seriea_2024_2025_stats/monza_serie-a_2024-2025.csv
🌐 Scraping napoli (2024-2025)...


  raw_df = pd.read_html(html, header=0)[1]


✅ Saved: /Users/samueldworetzky/projects/Github/soccer-analytics/seriea_2024_2025_stats/napoli_serie-a_2024-2025.csv
🌐 Scraping parma (2024-2025)...


  raw_df = pd.read_html(html, header=0)[1]


✅ Saved: /Users/samueldworetzky/projects/Github/soccer-analytics/seriea_2024_2025_stats/parma_serie-a_2024-2025.csv
🌐 Scraping roma (2024-2025)...


  raw_df = pd.read_html(html, header=0)[1]


✅ Saved: /Users/samueldworetzky/projects/Github/soccer-analytics/seriea_2024_2025_stats/roma_serie-a_2024-2025.csv
🌐 Scraping torino (2024-2025)...


  raw_df = pd.read_html(html, header=0)[1]


✅ Saved: /Users/samueldworetzky/projects/Github/soccer-analytics/seriea_2024_2025_stats/torino_serie-a_2024-2025.csv
🌐 Scraping udinese (2024-2025)...


  raw_df = pd.read_html(html, header=0)[1]


✅ Saved: /Users/samueldworetzky/projects/Github/soccer-analytics/seriea_2024_2025_stats/udinese_serie-a_2024-2025.csv
🌐 Scraping venezia (2024-2025)...
✅ Saved: /Users/samueldworetzky/projects/Github/soccer-analytics/seriea_2024_2025_stats/venezia_serie-a_2024-2025.csv

✅ All data saved: /Users/samueldworetzky/projects/Github/soccer-analytics/seriea_2024_2025_stats/all_serie-a_2024-2025.csv — 740 total rows



  raw_df = pd.read_html(html, header=0)[1]
