In [1]:
import requests
import pandas as pd
from bs4 import BeautifulSoup
import numpy as np
import time
import re

# --- CONFIGURATION ---
# Endpoint that serves the stats tables.
# NOTE(review): presumably the ESPNcricinfo Statsguru engine — confirm.
BASE_URL = "https://stats.espncricinfo.com/ci/engine/stats/index.html"
# Batting results query ordered by runs; scrape_all_pages appends ";page=N" to it.
# NOTE(review): class=1 presumably selects Test matches (the startup message
# says "Test Batting") — confirm against the site's query parameters.
BAT_URL = f"{BASE_URL}?class=1;orderby=runs;template=results;type=batting"
# Path of the CSV file written by the main block.
OUTPUT_FILE = 'Batting_Data.csv'
# Browser-like User-Agent header sent with every request made by scrape_single_page.
USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
# Seeds NumPy's global random generator. Nothing in the visible code draws
# random numbers — TODO confirm whether this is still needed.
np.random.seed(42)

# --- HELPER FUNCTIONS ---

def extract_headers_robustly(table):
    """Return ``(headers, data_rows)`` for a parsed HTML table element.

    Two layouts are supported:

    1. The table has a ``<thead>`` containing ``<th>`` cells. The stripped
       text of those cells becomes the header list. Data rows come from the
       first ``<tbody>``; when there is no ``<tbody>`` (the parser used here
       does not have to create one), every ``<tr>`` that is not part of the
       ``<thead>`` is used instead, so the data rows are not silently lost.
    2. Otherwise the first of the leading five ``<tr>`` rows that contains a
       ``<th>`` is taken as the header row, and every row after it is data.

    Args:
        table: A table element exposing ``find`` / ``find_all`` (for example
            a BeautifulSoup ``Tag``).

    Returns:
        A ``(headers, data_rows)`` tuple; ``([], [])`` when no header row
        could be identified.
    """
    table_head = table.find('thead')
    if table_head:
        headers = [th.text.strip() for th in table_head.find_all('th')]
        if headers:
            table_body = table.find('tbody')
            if table_body:
                data_rows = table_body.find_all('tr')
            else:
                # No <tbody>: keep every row that does not belong to <thead>.
                # Compare by identity, because element equality may be
                # content-based and could drop data rows that look like
                # the header row.
                head_rows = table_head.find_all('tr')
                data_rows = [
                    row for row in table.find_all('tr')
                    if not any(row is head_row for head_row in head_rows)
                ]
            return headers, data_rows

    # Fallback: look for a header row among the first few rows of the table.
    all_rows = table.find_all('tr')
    for index, row in enumerate(all_rows[:5]):
        if row.find('th'):
            headers = [cell.text.strip() for cell in row.find_all(['th', 'td'])]
            return headers, all_rows[index + 1:]
    return [], []

def scrape_single_page(url, timeout=30):
    """Download one results page and return its stats table as a DataFrame.

    The page is fetched with the module's ``USER_AGENT``. The first table
    carrying the ``engineTable`` or ``statsTable`` class that contains a
    ``<th>`` is parsed. Each data row whose cell count matches the header
    count becomes one record; rows with a different cell count are skipped.
    A ``Player_ID`` column is appended, filled from the first link in the
    row whose href matches ``player/<digits>`` (``None`` when absent).

    Args:
        url: Full URL of the page to scrape.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout a stalled connection would block the scrape forever.

    Returns:
        A DataFrame with the table's columns plus ``Player_ID``, or ``None``
        when the request fails or no usable table/header is found.
    """
    request_headers = {'User-Agent': USER_AGENT}
    try:
        response = requests.get(url, headers=request_headers, timeout=timeout)
        response.raise_for_status()
    except requests.exceptions.RequestException as e:
        # Covers connection errors, timeouts and HTTP error statuses alike.
        print(f"Error fetching URL: {e}")
        return None

    soup = BeautifulSoup(response.content, 'html.parser')
    candidates = soup.find_all('table', class_=['engineTable', 'statsTable'])
    # Only a table that actually has header cells is of interest.
    table = next((t for t in candidates if t.find('th')), None)
    if table is None:
        return None

    columns, rows = extract_headers_robustly(table)
    if not columns:
        return None

    records = []
    for row in rows:
        cells = row.find_all('td')
        if not cells:
            continue
        values = [cell.text.strip() for cell in cells]
        if len(values) != len(columns):
            # Row does not line up with the header; skip it.
            continue

        # Extract Player ID from link
        link = row.find('a', href=True)
        match = re.search(r'player/(\d+)', link['href']) if link else None
        values.append(match.group(1) if match else None)
        records.append(values)

    return pd.DataFrame(records, columns=columns + ['Player_ID'])

def scrape_all_pages(base_url):
    """Scrape consecutive result pages until one yields no data.

    Pages are requested as ``<base_url>;page=N`` starting at N = 1. The loop
    stops at the first page for which ``scrape_single_page`` returns ``None``
    or an empty frame. A one-second pause follows every successful page.

    Args:
        base_url: Query URL without the ``page`` parameter.

    Returns:
        All page frames concatenated with a fresh index, or an empty
        DataFrame when not even the first page produced data.
    """
    frames = []
    page_number = 0
    keep_going = True
    while keep_going:
        page_number += 1
        target = f"{base_url};page={page_number}"
        print(f"Fetching page {page_number}...")
        page_frame = scrape_single_page(target)
        if page_frame is not None and not page_frame.empty:
            frames.append(page_frame)
            time.sleep(1)  # polite delay to not overload server
        else:
            print("No more data found. Ending pagination.")
            keep_going = False

    return pd.concat(frames, ignore_index=True) if frames else pd.DataFrame()

def clean_and_format_batting_data(df):
    """Normalise a scraped batting table into a tidy, typed DataFrame.

    Steps performed on a copy of ``df`` (the caller's frame is not modified):

    * strip column names and remove ``.`` characters from them;
    * make sure there is exactly one ``Player`` column: if none exists, the
      first blank-named column (or, failing that, the first column) is
      renamed to ``Player``; a blank column is left alone when ``Player``
      is already present, so no duplicate ``Player`` column is created;
    * rename the short scraped column names to descriptive ones and keep
      only the known output columns that are present;
    * convert the statistic columns to numbers, treating a cell that is just
      ``-`` (and anything unparseable) as 0;
    * strip ``*`` from ``Highest_Score`` and convert it to ``int``;
    * drop rows whose ``Matches_Played`` is not positive (skipped when that
      column is missing).

    Args:
        df: Raw scraped table, or ``None``.

    Returns:
        The cleaned DataFrame with a fresh index; an empty DataFrame when
        ``df`` is ``None`` or empty.
    """
    if df is None or df.empty:
        return pd.DataFrame()

    # Work on a copy so the caller's DataFrame keeps its original columns.
    df = df.copy()
    df.columns = df.columns.str.strip().str.replace('.', '', regex=False)

    names = df.columns.tolist()
    if 'Player' not in names:
        # Prefer a blank-named column, otherwise fall back to the first one.
        position = names.index('') if '' in names else 0
        names[position] = 'Player'
        df.columns = names
    # Duplicate labels would make df['Player'] a DataFrame; keep the first.
    df = df.loc[:, ~df.columns.duplicated()].copy()

    df['Player'] = df['Player'].astype(str).str.strip()

    col_map = {
        'Span': 'Career_Span',
        'Mat': 'Matches_Played',
        'Inns': 'Innings_Batted',
        'NO': 'Not_Outs',
        'Runs': 'Total_Runs',
        'HS': 'Highest_Score',
        'Ave': 'Batting_Average',
        '100': 'Centuries',
        '50': 'Fifties',
        '0': 'Ducks_Career',
        'Ct': 'Catches',
        'St': 'Stumpings',
    }
    df = df.rename(columns=col_map)

    final_cols = ['Player', 'Player_ID', 'Career_Span', 'Matches_Played', 'Innings_Batted',
                  'Not_Outs', 'Total_Runs', 'Highest_Score', 'Batting_Average',
                  'Centuries', 'Fifties', 'Ducks_Career', 'Catches', 'Stumpings']
    df = df[[col for col in final_cols if col in df.columns]].copy()

    text_cols = {'Player', 'Career_Span', 'Highest_Score', 'Player_ID'}
    numeric_cols = [c for c in df.columns if c not in text_cols]

    for col in numeric_cols:
        # Replace only cells that are exactly '-', not hyphens inside a value.
        cleaned = df[col].astype(str).str.strip().replace('-', '0')
        df[col] = pd.to_numeric(cleaned, errors='coerce').fillna(0)

    if 'Highest_Score' in df.columns:
        # Remove the '*' suffix so the score parses as a number.
        scores = df['Highest_Score'].astype(str).str.replace('*', '', regex=False)
        df['Highest_Score'] = pd.to_numeric(scores, errors='coerce').fillna(0).astype(int)

    if 'Matches_Played' in df.columns:
        df = df[df['Matches_Played'] > 0]
    df_final = df.reset_index(drop=True)
    print(f"Total Batting Records after cleaning: {len(df_final)}")
    return df_final

# --- MAIN EXECUTION ---

if __name__ == '__main__':
    # Script entry point: scrape every result page, clean the combined
    # table and write it to OUTPUT_FILE. Nothing is written when the
    # scrape returns no rows.
    print("Starting full Test Batting scrape with pagination...")
    all_bat_df = scrape_all_pages(BAT_URL)
    if not all_bat_df.empty:
        print(f"Scraped total columns: {all_bat_df.columns.tolist()}")
        final_df = clean_and_format_batting_data(all_bat_df)
        final_df.to_csv(OUTPUT_FILE, index=False)
        # The status emoji were stored as mojibake (UTF-8 bytes decoded as
        # cp1252); restored to the intended characters.
        print(f"✅ All pages data saved successfully with Player IDs to {OUTPUT_FILE}")
    else:
        print("🛑 No data was scraped.")


Starting full Test Batting scrape with pagination...
Fetching page 1...
Fetching page 2...
Fetching page 3...
Fetching page 4...
Fetching page 5...
Fetching page 6...
Fetching page 7...
Fetching page 8...
Fetching page 9...
Fetching page 10...
Fetching page 11...
Fetching page 12...
Fetching page 13...
Fetching page 14...
Fetching page 15...
Fetching page 16...
Fetching page 17...
Fetching page 18...
Fetching page 19...
Fetching page 20...
Fetching page 21...
Fetching page 22...
Fetching page 23...
Fetching page 24...
Fetching page 25...
Fetching page 26...
Fetching page 27...
Fetching page 28...
Fetching page 29...
Fetching page 30...
Fetching page 31...
Fetching page 32...
Fetching page 33...
Fetching page 34...
Fetching page 35...
Fetching page 36...
Fetching page 37...
Fetching page 38...
Fetching page 39...
Fetching page 40...
Fetching page 41...
Fetching page 42...
Fetching page 43...
Fetching page 44...
Fetching page 45...
Fetching page 46...
Fetching page 47...
Fetching page 48