In [1]:
import requests
import pandas as pd
from bs4 import BeautifulSoup
import numpy as np
import re
import time

# --- CONFIGURATION (All-round) ---
# Endpoint that serves the stats tables; query options are ';'-separated.
BASE_URL = "https://stats.espncricinfo.com/ci/engine/stats/index.html"
# Query selecting the all-round results view, ordered by runs.
ALLROUND_URL = BASE_URL + "?" + ";".join(
    ("class=1", "orderby=runs", "template=results", "type=allround")
)
# Destination of the cleaned table.
OUTPUT_FILE = 'AllRounder_Data.csv'
# Browser-like User-Agent sent with every request.
USER_AGENT = (
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
    'AppleWebKit/537.36 (KHTML, like Gecko) '
    'Chrome/91.0.4472.124 Safari/537.36'
)
# Fix NumPy's global RNG state for reproducibility.
np.random.seed(42)

# --- HELPER FUNCTIONS ---
def extract_headers_robustly(table):
    """Split an HTML table element into header labels and data rows.

    Args:
        table: A parsed table node exposing ``find`` / ``find_all`` (the
            caller passes a BeautifulSoup tag).

    Returns:
        A ``(headers, data_rows)`` tuple. ``headers`` is a list of stripped
        header-cell texts and ``data_rows`` a list of ``<tr>`` nodes.
        ``([], [])`` is returned when no header can be located.
    """
    table_head = table.find('thead')
    if table_head is not None:
        headers = [th.text.strip() for th in table_head.find_all('th')]
        if headers:
            table_body = table.find('tbody')
            if table_body is not None:
                return headers, table_body.find_all('tr')
            # No <tbody>: 'html.parser' does not synthesize one, so take every
            # row that is not part of the header instead of returning nothing.
            # Rows are compared by identity because equal-looking rows are
            # still distinct data rows.
            head_row_ids = {id(tr) for tr in table_head.find_all('tr')}
            body_rows = [
                tr for tr in table.find_all('tr') if id(tr) not in head_row_ids
            ]
            return headers, body_rows

    # Fallback: no usable <thead>. Treat the first of the leading five rows
    # that contains a <th> as the header row.
    all_rows = table.find_all('tr')
    for index, row in enumerate(all_rows[:5]):
        if row.find('th') is not None:
            # Header rows may mix <th> and <td> cells; keep both, in order.
            headers = [cell.text.strip() for cell in row.find_all(['th', 'td'])]
            return headers, all_rows[index + 1:]

    return [], []

def scrape_single_page(url, headers, timeout=30):
    """Fetch one results page and return its stats table as a DataFrame.

    Args:
        url: Full URL of the page to fetch.
        headers: HTTP request headers passed to ``requests.get``.
        timeout: Seconds to wait for the server before giving up, so a
            stalled connection cannot hang the scrape indefinitely.

    Returns:
        A DataFrame with one row per table row that contains ``<td>`` cells,
        with a ``Player_ID`` column inserted directly after the first column.
        ``Player_ID`` is taken from a ``/player/<digits>.html`` link in the
        first cell, or ``None`` when there is no such link. Returns ``None``
        when the request fails or no table with headers is found.
    """
    try:
        response = requests.get(url, headers=headers, timeout=timeout)
        response.raise_for_status()
    except requests.exceptions.RequestException as exc:
        # Report the reason instead of failing silently; the caller treats
        # None as "no data" and stops paginating.
        print(f"  Request failed for {url}: {exc}")
        return None

    soup = BeautifulSoup(response.content, 'html.parser')
    candidates = soup.find_all('table', class_=['engineTable', 'statsTable'])

    # Use the first candidate table that actually has header cells.
    table = next((t for t in candidates if t.find('th') is not None), None)
    if table is None:
        return None

    column_names, rows = extract_headers_robustly(table)
    if not column_names:
        return None

    # Copy before inserting so the helper's list is not modified.
    column_names = list(column_names)
    column_names.insert(1, "Player_ID")
    width = len(column_names)
    player_href = re.compile(r'/player/(\d+)\.html')

    data = []
    for row in rows:
        cells = row.find_all('td')
        if not cells:
            continue

        texts = [cell.text.strip() for cell in cells]

        # The first cell is the player column; its link carries the numeric id.
        link = cells[0].find('a', href=True)
        match = player_href.search(link['href']) if link is not None else None
        player_id = match.group(1) if match else None

        record = [texts[0], player_id, *texts[1:]]
        # Force every record to the header width. A single row with extra
        # cells would otherwise make the DataFrame constructor raise and
        # abort the whole scrape; short rows are padded with None.
        record = (record + [None] * width)[:width]
        data.append(record)

    return pd.DataFrame(data, columns=column_names)

def scrape_all_pages(base_url, scrape_type, page_size=50, delay=1, max_pages=None):
    """Scrape consecutive result pages and concatenate them.

    Pages are requested as ``<base_url>;page=<n>`` starting at 1. The scrape
    stops when a page yields no data, when a page holds fewer than
    ``page_size`` rows (taken to be the last page), or when ``max_pages``
    pages have been fetched.

    Args:
        base_url: Query URL without the ``page`` option.
        scrape_type: Label used only in the progress message.
        page_size: Row count of a full page; a shorter page ends the scrape.
        delay: Seconds to pause between consecutive requests.
        max_pages: Optional hard limit on the number of pages fetched;
            ``None`` means no limit.

    Returns:
        One DataFrame with all pages concatenated and re-indexed, or ``None``
        when no page produced data.
    """
    request_headers = {'User-Agent': USER_AGENT}
    frames = []
    page = 1

    print(f"Starting {scrape_type} pagination scrape...")

    while max_pages is None or page <= max_pages:
        paged_url = f"{base_url};page={page}"
        print(f"  Fetching page {page}...")
        df = scrape_single_page(paged_url, request_headers)

        if df is None or df.empty:
            print(f"  Page {page} returned no data. Ending scrape.")
            break

        frames.append(df)
        if len(df) < page_size:
            # Short page: nothing further to fetch, so skip the pause too.
            break

        page += 1
        time.sleep(delay)

    if frames:
        return pd.concat(frames, ignore_index=True)
    return None

# --- CLEANING FUNCTION ---
def clean_and_format_allround_data(df):
    """Rename, select and type-convert the scraped all-round table.

    The input frame is left untouched; all work happens on a copy.

    Steps:
        1. Strip column names and drop any '.' characters from them.
        2. Make sure a ``Player`` column exists (an unnamed column is used if
           present, otherwise the first column).
        3. Rename the scraped short names to descriptive ones and keep only
           the known output columns.
        4. Convert every column except ``Player``, ``Player_ID`` and
           ``Career_Span`` to numbers. A cell that is exactly ``-`` or that
           cannot be parsed becomes 0.
        5. Drop rows whose ``Matches_Played`` is not positive (when that
           column is present).

    Args:
        df: Raw scraped DataFrame, or ``None``.

    Returns:
        The cleaned DataFrame with a fresh index, or an empty DataFrame when
        ``df`` is ``None`` or empty.
    """
    if df is None or df.empty:
        return pd.DataFrame()

    # Copy first: the caller's frame must not be renamed or retyped in place.
    df = df.copy()

    names = [str(c).strip().replace('.', '') for c in df.columns]
    if 'Player' not in names:
        # Prefer an unnamed column; otherwise take the first one. Renaming a
        # blank column when 'Player' already exists would duplicate it.
        names[names.index('') if '' in names else 0] = 'Player'
    df.columns = names

    col_map = {
        'Span': 'Career_Span',
        'Mat': 'Matches_Played',
        'Runs': 'Total_Runs',
        'Bat Av': 'Batting_Average',
        '100': 'Centuries',
        'Wkts': 'Wickets_Total',
        'Bowl Av': 'Bowling_Average',
        '5': 'Five_Wickets',
        'Ct': 'Catches_Total',
        'St': 'Stumpings_Total',
        'Ave Diff': 'Average_Difference'
    }
    df = df.rename(columns=col_map, errors='ignore')

    # Keep the first of any repeated name so each selection below is a Series.
    df = df.loc[:, ~df.columns.duplicated()]

    final_cols = ['Player', 'Player_ID', 'Career_Span', 'Matches_Played', 'Total_Runs',
                  'Batting_Average', 'Wickets_Total', 'Bowling_Average', 'Average_Difference',
                  'Centuries', 'Five_Wickets', 'Catches_Total', 'Stumpings_Total']
    df = df[[c for c in final_cols if c in df.columns]].copy()

    df['Player'] = df['Player'].astype(str).str.strip()

    numeric_cols = [c for c in df.columns if c not in ['Player', 'Player_ID', 'Career_Span']]
    for col in numeric_cols:
        text = df[col].astype(str).str.strip()
        # Only a cell that is exactly '-' is a placeholder. A substring
        # replace would also eat the minus sign of negative values.
        text = text.replace('-', '0')
        df[col] = pd.to_numeric(text, errors='coerce').fillna(0)

    if 'Matches_Played' in df.columns:
        df = df[df['Matches_Played'] > 0]
    df_final = df.reset_index(drop=True)

    print(f"Total All-round Records Saved: {len(df_final)}")
    return df_final

# --- MAIN EXECUTION ---
if __name__ == '__main__':
    # Scrape every results page, clean the combined table and write it to
    # OUTPUT_FILE; report which stage failed otherwise.
    full_allround_df = scrape_all_pages(ALLROUND_URL, "Detailed All-round")
    if full_allround_df is not None and not full_allround_df.empty:
        print(f"\nTotal records scraped before cleaning: {len(full_allround_df)}")
        final_df = clean_and_format_allround_data(full_allround_df)
        if not final_df.empty:
            final_df.to_csv(OUTPUT_FILE, index=False)
            # Status markers restored from mis-decoded UTF-8 (mojibake).
            print(f"✅ All {len(final_df)} detailed all-round records saved successfully to {OUTPUT_FILE}")
        else:
            print("🛑 Cleaning resulted in an empty DataFrame.")
    else:
        print("🛑 Scraping failed or returned no data.")


Starting Detailed All-round pagination scrape...
  Fetching page 1...
  Fetching page 2...
  Fetching page 3...
  Fetching page 4...
  Fetching page 5...
  Fetching page 6...
  Fetching page 7...
  Fetching page 8...
  Fetching page 9...
  Fetching page 10...
  Fetching page 11...
  Fetching page 12...
  Fetching page 13...
  Fetching page 14...
  Fetching page 15...
  Fetching page 16...
  Fetching page 17...
  Fetching page 18...
  Fetching page 19...
  Fetching page 20...
  Fetching page 21...
  Fetching page 22...
  Fetching page 23...
  Fetching page 24...
  Fetching page 25...
  Fetching page 26...
  Fetching page 27...
  Fetching page 28...
  Fetching page 29...
  Fetching page 30...
  Fetching page 31...
  Fetching page 32...
  Fetching page 33...
  Fetching page 34...
  Fetching page 35...
  Fetching page 36...
  Fetching page 37...
  Fetching page 38...
  Fetching page 39...
  Fetching page 40...
  Fetching page 41...
  Fetching page 42...
  Fetching page 43...
  Fetching pag