In [None]:
import logging
import os
import ssl
import time
import warnings
from io import StringIO
from pathlib import Path

import pandas as pd
import requests
from bs4 import BeautifulSoup

# Silence every Python warning for the rest of the kernel session.
# NOTE(review): a blanket "ignore" may also hide pandas deprecation / copy
# warnings raised by the scraping cells — consider narrowing the filter.
warnings.filterwarnings("ignore")
# Swap the stdlib's default HTTPS context factory for the unverified one, so
# code relying on that default no longer checks server certificates.
# NOTE(review): this uses private ssl hooks and weakens TLS security; the
# scraping cells visible here download pages via requests — confirm whether
# this override is still needed.
ssl._create_default_https_context = ssl._create_unverified_context


Table: 'career_avg_batsman'

In [None]:
def career_avg_batsman(player_ids):
    """Scrape the career batting summary row(s) for each player and save them as CSV.

    For every id the "class=2 / type=batting" results page is downloaded, the
    third HTML table on the page (index 2) is read, a fixed set of columns is
    kept and a leading ``P_id`` column is added. All per-player frames are
    stacked, written to ``Csv/Career_avg_batsman.csv`` (index labelled
    ``row_id``) and printed.

    Players whose page cannot be fetched, or whose page does not contain the
    expected table/columns, are reported on stdout and skipped instead of
    aborting the whole run.

    Args:
        player_ids: Iterable of player ids (strings or ints) used to build the
            page URL; each id is stored unchanged in the ``P_id`` column.

    Returns:
        None. The result is written to disk and printed.
    """
    columns = ["Span", "Inns", "Runs", "HS", "Ave", "SR", "100", "50", "0", "4s", "6s"]
    headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3"}
    request_timeout = 30  # seconds; without a timeout requests.get can block forever
    frames = []

    for p_id in player_ids:
        url = f"https://stats.espncricinfo.com/ci/engine/player/{p_id}.html?class=2;template=results;type=batting"

        try:
            response = requests.get(url, headers=headers, timeout=request_timeout)
        except requests.RequestException as exc:
            print(f"Failed to retrieve data for player ID {p_id}: {exc}")
            continue

        if response.status_code != 200:
            print(f"Failed to retrieve data for player ID {p_id}, status code: {response.status_code}")
            continue

        soup = BeautifulSoup(response.content, 'html.parser')
        try:
            # read_html expects a file-like object; literal HTML strings are deprecated.
            tables = pd.read_html(StringIO(str(soup)))
        except ValueError:
            # pandas raises ValueError when the document contains no <table>.
            tables = []

        if len(tables) > 2 and set(columns).issubset(tables[2].columns):
            # .copy() so that insert() works on an independent frame, not a slice.
            career_rows = tables[2][columns].copy()
            career_rows.insert(0, "P_id", p_id, allow_duplicates=True)
            frames.append(career_rows)
        else:
            print(f"No career averages table found for player ID {p_id}; skipping")

        # Small pause between successful requests to avoid hammering the site.
        time.sleep(0.1)

    # Concatenate once at the end; pd.concat([]) raises, hence the guard.
    career_avg_batsman_merged = pd.concat(frames, ignore_index=True) if frames else pd.DataFrame()

    filepath = Path('Csv/Career_avg_batsman.csv')
    # to_csv does not create missing directories.
    filepath.parent.mkdir(parents=True, exist_ok=True)
    career_avg_batsman_merged.to_csv(filepath, index_label="row_id")
    print(career_avg_batsman_merged)


Table: 'vs_country_batsman'

In [None]:
def vs_country_batsman(player_ids):
    """Scrape each player's batting breakdown by opposition and save it as CSV.

    For every id the "class=2 / type=batting" results page is downloaded and
    the fourth HTML table on the page (index 3) is read. The ``Grouping``
    column is renamed to ``Country``, a leading ``P_id`` column is added and
    only rows whose ``Country`` is one of the hard-coded ``v <team>`` labels
    are kept. The stacked result is written to ``Csv/Vs_country_batsman.csv``
    (index labelled ``row_id``) and printed.

    Players whose page cannot be fetched, or whose page does not contain the
    expected table/columns, are reported on stdout and skipped.

    Args:
        player_ids: Iterable of player ids (strings or ints) used to build the
            page URL; each id is stored unchanged in the ``P_id`` column.

    Returns:
        None. The result is written to disk and printed.
    """
    columns = ["Grouping", "Inns", "Runs", "HS", "Ave", "SR"]
    countries = ['v Pakistan', 'v Australia', 'v England', 'v New Zealand', 'v West Indies', 'v Sri Lanka', 'v Bangladesh', 'v South Africa']
    headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3"}
    request_timeout = 30  # seconds; without a timeout requests.get can block forever
    frames = []

    for p_id in player_ids:
        url = f"https://stats.espncricinfo.com/ci/engine/player/{p_id}.html?class=2;template=results;type=batting"

        try:
            response = requests.get(url, headers=headers, timeout=request_timeout)
        except requests.RequestException as exc:
            print(f"Failed to retrieve data for player ID {p_id}: {exc}")
            continue

        if response.status_code != 200:
            print(f"Failed to retrieve data for player ID {p_id}, status code: {response.status_code}")
            continue

        soup = BeautifulSoup(response.content, 'html.parser')
        try:
            # read_html expects a file-like object; literal HTML strings are deprecated.
            tables = pd.read_html(StringIO(str(soup)))
        except ValueError:
            # pandas raises ValueError when the document contains no <table>.
            tables = []

        if len(tables) > 3 and set(columns).issubset(tables[3].columns):
            # rename() returns a new frame, so insert() never touches a slice.
            breakdown = tables[3][columns].rename(columns={'Grouping': 'Country'})
            breakdown.insert(0, "P_id", p_id, allow_duplicates=True)
            frames.append(breakdown.loc[breakdown['Country'].isin(countries)])
        else:
            print(f"No grouping table found for player ID {p_id}; skipping")

        # Small pause between successful requests to avoid hammering the site.
        time.sleep(0.1)

    # Concatenate once at the end; pd.concat([]) raises, hence the guard.
    vs_country_batsman_merged = pd.concat(frames, ignore_index=True) if frames else pd.DataFrame()

    filepath = Path('Csv/Vs_country_batsman.csv')
    # to_csv does not create missing directories.
    filepath.parent.mkdir(parents=True, exist_ok=True)
    vs_country_batsman_merged.to_csv(filepath, index_label="row_id")
    print(vs_country_batsman_merged)

Table: 'home_vs_away_batsman'

In [None]:
def home_vs_away_batsman(player_ids):
    """Scrape each player's batting breakdown by venue type and save it as CSV.

    For every id the "class=2 / type=batting" results page is downloaded and
    the fourth HTML table on the page (index 3) is read. The ``Grouping``
    column is renamed to ``Venue``, a leading ``P_id`` column is added and
    only the ``home`` / ``away`` / ``neutral`` rows are kept. The stacked
    result is written to ``Csv/Home_vs_away_batsman.csv`` (index labelled
    ``row_id``) and printed.

    Players whose page cannot be fetched, or whose page does not contain the
    expected table/columns, are reported on stdout and skipped.

    Args:
        player_ids: Iterable of player ids (strings or ints) used to build the
            page URL; each id is stored unchanged in the ``P_id`` column.

    Returns:
        None. The result is written to disk and printed.
    """
    columns = ["Grouping", "Inns", "Runs", "HS", "Ave", "SR"]
    venues = ['home', 'away', 'neutral']
    headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3"}
    request_timeout = 30  # seconds; without a timeout requests.get can block forever
    frames = []

    for p_id in player_ids:
        url = f"https://stats.espncricinfo.com/ci/engine/player/{p_id}.html?class=2;template=results;type=batting"

        try:
            response = requests.get(url, headers=headers, timeout=request_timeout)
        except requests.RequestException as exc:
            print(f"Failed to retrieve data for player ID {p_id}: {exc}")
            continue

        if response.status_code != 200:
            print(f"Failed to retrieve data for player ID {p_id}, status code: {response.status_code}")
            continue

        soup = BeautifulSoup(response.content, 'html.parser')
        try:
            # read_html expects a file-like object; literal HTML strings are deprecated.
            tables = pd.read_html(StringIO(str(soup)))
        except ValueError:
            # pandas raises ValueError when the document contains no <table>.
            tables = []

        if len(tables) > 3 and set(columns).issubset(tables[3].columns):
            # rename() returns a new frame, so insert() never touches a slice.
            breakdown = tables[3][columns].rename(columns={'Grouping': 'Venue'})
            breakdown.insert(0, "P_id", p_id, allow_duplicates=True)
            frames.append(breakdown.loc[breakdown['Venue'].isin(venues)])
        else:
            print(f"No grouping table found for player ID {p_id}; skipping")

        # Small pause between successful requests to avoid hammering the site.
        time.sleep(0.1)

    # Concatenate once at the end; pd.concat([]) raises, hence the guard.
    home_vs_away_batsman_merged = pd.concat(frames, ignore_index=True) if frames else pd.DataFrame()

    filepath = Path('Csv/Home_vs_away_batsman.csv')
    # to_csv does not create missing directories.
    filepath.parent.mkdir(parents=True, exist_ok=True)
    home_vs_away_batsman_merged.to_csv(filepath, index_label="row_id")
    print(home_vs_away_batsman_merged)

Table: 'yearly_stats_batsman'

In [None]:
def yearly_stats_batsman(player_ids):
    """Scrape each player's per-year batting breakdown and save it as CSV.

    For every id the "class=2 / type=batting" results page is downloaded and
    the fourth HTML table on the page (index 3) is read. The ``Grouping``
    column is renamed to ``Year``, a leading ``P_id`` column is added and only
    rows whose ``Year`` value starts with ``"year "`` are kept. The stacked
    result is written to ``Csv/Yearly_stats_batsman.csv`` (index labelled
    ``row_id``) and printed.

    Players whose page cannot be fetched, or whose page does not contain the
    expected table/columns, are reported on stdout and skipped.

    Args:
        player_ids: Iterable of player ids (strings or ints) used to build the
            page URL; each id is stored unchanged in the ``P_id`` column.

    Returns:
        None. The result is written to disk and printed.
    """
    columns = ["Grouping", "Inns", "Runs", "HS", "Ave", "SR"]
    headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3"}
    request_timeout = 30  # seconds; without a timeout requests.get can block forever
    frames = []

    for p_id in player_ids:
        url = f"https://stats.espncricinfo.com/ci/engine/player/{p_id}.html?class=2;template=results;type=batting"

        try:
            response = requests.get(url, headers=headers, timeout=request_timeout)
        except requests.RequestException as exc:
            print(f"Failed to retrieve data for player ID {p_id}: {exc}")
            continue

        if response.status_code != 200:
            print(f"Failed to retrieve data for player ID {p_id}, status code: {response.status_code}")
            continue

        soup = BeautifulSoup(response.content, 'html.parser')
        try:
            # read_html expects a file-like object; literal HTML strings are deprecated.
            tables = pd.read_html(StringIO(str(soup)))
        except ValueError:
            # pandas raises ValueError when the document contains no <table>.
            tables = []

        if len(tables) > 3 and set(columns).issubset(tables[3].columns):
            # rename() returns a new frame, so insert() never touches a slice.
            breakdown = tables[3][columns].rename(columns={'Grouping': 'Year'})
            breakdown.insert(0, "P_id", p_id, allow_duplicates=True)
            # Cast to str first: the .str accessor raises on a non-string column
            # (e.g. all-NaN); NaN becomes "nan" and is still filtered out.
            is_year_row = breakdown['Year'].astype(str).str.startswith('year ', na=False)
            frames.append(breakdown.loc[is_year_row])
        else:
            print(f"No grouping table found for player ID {p_id}; skipping")

        # Small pause between successful requests to avoid hammering the site.
        time.sleep(0.1)

    # Concatenate once at the end; pd.concat([]) raises, hence the guard.
    yearly_stats_batsman_merged = pd.concat(frames, ignore_index=True) if frames else pd.DataFrame()

    filepath = Path('Csv/Yearly_stats_batsman.csv')
    # to_csv does not create missing directories.
    filepath.parent.mkdir(parents=True, exist_ok=True)
    yearly_stats_batsman_merged.to_csv(filepath, index_label="row_id")
    print(yearly_stats_batsman_merged)

# Main driver below: runs every scraper for the selected players

In [None]:
# Player ids used in the page URLs, with the player each one refers to:
#253802: Virat Kohli (BAT)
#34102: Rohit Gurunath Sharma (BAT)
#422108: Kannaur Lokesh Rahul (BAT)
#28235: Shikhar Dhawan (BAT)
#28081: Mahendra Singh Dhoni (BAT)
#234675: Ravindrasinh Anirudhsinh Jadeja (ALL)
#625371: Hardik Himanshu Pandya (ALL)
#26421: Ravichandran Ashwin (ALL)

player_ids=['253802', '34102', '422108', '28235', '28081', '234675', '625371', '26421']

# Remove CSVs left over from a previous run. Done with pathlib instead of
# shelling out to `rm`, so it also works where no POSIX shell is available.
output_dir = Path('Csv')
# Make sure the folder the scrapers write into exists before they run.
output_dir.mkdir(parents=True, exist_ok=True)
for stale_csv in output_dir.glob('*_batsman.csv'):
    if stale_csv.is_file():
        stale_csv.unlink()

print("\ncareer_avg_batsman:")
career_avg_batsman(player_ids)

print("\nvs_country_batsman:")
vs_country_batsman(player_ids)

print("\nhome_vs_away_batsman:")
home_vs_away_batsman(player_ids)

print("\nyearly_stats_batsman:")
yearly_stats_batsman(player_ids)


