In [1]:
import pandas as pd
import ssl 
import os
import logging
from pathlib import Path
import time

import requests
from bs4 import BeautifulSoup


ssl._create_default_https_context = ssl._create_unverified_context


Table: 'career_avg_batsman'

In [2]:
def career_avg_batsman(player_ids):

    career_avg_batsman_merged = pd.DataFrame()

    for p_id in player_ids:
        url = "https://stats.espncricinfo.com/ci/engine/player/" + p_id + ".html?class=2;template=results;type=batting"
        headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3"}
        
        response = requests.get(url, headers=headers)
        if response.status_code == 200:
            soup = BeautifulSoup(response.content, 'html.parser')
            tables = pd.read_html(str(soup))

            if len(tables) > 2:
                career_avg_batsman = tables[2]
                career_avg_batsman = career_avg_batsman[["Span","Inns","Runs","HS","Ave","SR","100","50","0","4s","6s"]]
                career_avg_batsman.insert(0, "P_id", p_id, True)
                career_avg_batsman_merged = pd.concat([career_avg_batsman_merged, career_avg_batsman], ignore_index=True)
                
            time.sleep(0.1)

        else:
            print(f"Failed to retrieve data for player ID {p_id}, status code: {response.status_code}")

    filepath = Path('Csv/Career_avg_batsman.csv')
    career_avg_batsman_merged.to_csv(filepath, index_label="row_id")  
    print(career_avg_batsman_merged)


Table: 'vs_country_batsman'

In [3]:
def vs_country_batsman(player_ids):

    vs_country_batsman_merged = pd.DataFrame()
    headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3"}

    for p_id in player_ids:
        url = "https://stats.espncricinfo.com/ci/engine/player/" + p_id + ".html?class=2;template=results;type=batting"
        response = requests.get(url, headers=headers)
        
        if response.status_code == 200:
            soup = BeautifulSoup(response.content, 'html.parser')
            tables = pd.read_html(str(soup))

            if len(tables) > 3:
                vs_country_batsman = tables[3]
                vs_country_batsman = vs_country_batsman[["Grouping","Inns","Runs","HS","Ave","SR"]]
                vs_country_batsman.rename(columns={'Grouping': 'Country'}, inplace=True)
                vs_country_batsman.insert(0, "P_id", p_id, True)
                
                countries = ['v Pakistan', 'v Australia', 'v England', 'v New Zealand', 'v West Indies', 'v Sri Lanka', 'v Bangladesh', 'v South Africa']
                select_vs_country_batsman = vs_country_batsman.loc[vs_country_batsman['Country'].isin(countries)]
                
                vs_country_batsman_merged = pd.concat([vs_country_batsman_merged, select_vs_country_batsman], ignore_index=True)
                
            time.sleep(0.1)
        else:
            print(f"Failed to retrieve data for player ID {p_id}, status code: {response.status_code}")

    filepath = Path('Csv/Vs_country_batsman.csv')
    vs_country_batsman_merged.to_csv(filepath, index_label="row_id")
    print(vs_country_batsman_merged)

Table: 'home_vs_away_batsman'

In [4]:
def home_vs_away_batsman(player_ids):

    home_vs_away_batsman_merged = pd.DataFrame()
    headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3"}

    for p_id in player_ids:
        url = "https://stats.espncricinfo.com/ci/engine/player/" + p_id + ".html?class=2;template=results;type=batting"
        response = requests.get(url, headers=headers)
        
        if response.status_code == 200:
            soup = BeautifulSoup(response.content, 'html.parser')
            tables = pd.read_html(str(soup))

            if len(tables) > 3:
                home_vs_away_batsman = tables[3]
                home_vs_away_batsman = home_vs_away_batsman[["Grouping","Inns","Runs","HS","Ave","SR"]]
                home_vs_away_batsman.rename(columns={'Grouping': 'Venue'}, inplace=True)
                home_vs_away_batsman.insert(0, "P_id", p_id, True)
                
                venues = ['home', 'away', 'neutral']
                select_home_vs_away_batsman = home_vs_away_batsman.loc[home_vs_away_batsman['Venue'].isin(venues)]
                
                home_vs_away_batsman_merged = pd.concat([home_vs_away_batsman_merged, select_home_vs_away_batsman], ignore_index=True)
                
            time.sleep(0.1)
        else:
            print(f"Failed to retrieve data for player ID {p_id}, status code: {response.status_code}")

    filepath = Path('Csv/Home_vs_away_batsman.csv')
    home_vs_away_batsman_merged.to_csv(filepath, index_label="row_id")
    print(home_vs_away_batsman_merged)

Table: 'yearly_stats_batsman'

In [5]:
def yearly_stats_batsman(player_ids):

    yearly_stats_batsman_merged = pd.DataFrame()
    headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3"}

    for p_id in player_ids:
        url = "https://stats.espncricinfo.com/ci/engine/player/" + p_id + ".html?class=2;template=results;type=batting"
        response = requests.get(url, headers=headers)
        
        if response.status_code == 200:
            soup = BeautifulSoup(response.content, 'html.parser')
            tables = pd.read_html(str(soup))

            if len(tables) > 3:
                yearly_stats_batsman = tables[3]
                yearly_stats_batsman = yearly_stats_batsman[["Grouping","Inns","Runs","HS","Ave","SR"]]
                yearly_stats_batsman.rename(columns={'Grouping': 'Year'}, inplace=True)
                yearly_stats_batsman.insert(0, "P_id", p_id, True)
                
                select_yearly_stats_batsman = yearly_stats_batsman.loc[yearly_stats_batsman['Year'].str.startswith('year ', na=False)]
                yearly_stats_batsman_merged = pd.concat([yearly_stats_batsman_merged, select_yearly_stats_batsman], ignore_index=True)
                
            time.sleep(0.1)
        else:
            print(f"Failed to retrieve data for player ID {p_id}, status code: {response.status_code}")

    filepath = Path('Csv/Yearly_stats_batsman.csv')
    yearly_stats_batsman_merged.to_csv(filepath, index_label="row_id")
    print(yearly_stats_batsman_merged)

#Main function below:

In [6]:
#253802: Virat Kohli (BAT)
#34102: Rohit Gurunath Sharma (BAT)
#422108: Kannaur Lokesh Rahul (BAT)
#28235: Shikhar Dhawan (BAT)
#28081 : Mahendra Singh Dhoni (BAT)
#234675: Ravindrasinh Anirudhsinh Jadeja (ALL)
#625371: Hardik Himanshu Pandya (ALL)
#26421: Ravichandran Ashwin (ALL)

player_ids=['253802', '34102', '422108', '28235', '28081', '234675', '625371', '26421']

#Removes garbage files
os.system("rm -f Csv/*_batsman.csv")

career_avg_batsman(player_ids)
vs_country_batsman(player_ids)
home_vs_away_batsman(player_ids)
yearly_stats_batsman(player_ids)




  tables = pd.read_html(str(soup))
  tables = pd.read_html(str(soup))
  tables = pd.read_html(str(soup))
  tables = pd.read_html(str(soup))
  tables = pd.read_html(str(soup))
  tables = pd.read_html(str(soup))
  tables = pd.read_html(str(soup))
  tables = pd.read_html(str(soup))


     P_id       Span  Inns   Runs    HS    Ave      SR  100  50   0    4s   6s
0  253802  2008-2023   280  13848   183  58.67   93.58   50  72  16  1294  151
1   34102  2007-2023   254  10709   264  49.12   91.97   31  55  16   994  323
2  422108  2016-2023    70   2820   112  50.35   87.82    7  18   2   224   61
3   28235  2010-2022   164   6793   143  44.11   91.35   17  39   5   842   79
4   28081  2004-2019   297  10773  183*  50.57   87.56   10  73  10   826  229
5  234675  2009-2023   132   2756    87  32.42   85.06    0  13   6   199   54
6  625371  2016-2023    61   1769   92*  34.01  110.35    0  11   4   132   67
7   26421  2010-2023    63    707    65  16.44   86.96    0   1   6    60    7


  tables = pd.read_html(str(soup))
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  vs_country_batsman.rename(columns={'Grouping': 'Country'}, inplace=True)
  tables = pd.read_html(str(soup))
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  vs_country_batsman.rename(columns={'Grouping': 'Country'}, inplace=True)
  tables = pd.read_html(str(soup))
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  vs_country_batsman.rename(columns={'Grouping': 'Country'}, inplace=True)
  tables = pd.read_html(str(soup))
A value is trying

      P_id         Country Inns  Runs    HS    Ave      SR
0   253802     v Australia   47  2367   123  53.79   94.00
1   253802    v Bangladesh   16   910   136  75.83  101.78
2   253802       v England   36  1340   122  41.87   88.15
3   253802   v New Zealand   31  1645  154*  58.75   95.69
4   253802      v Pakistan   16   678   183  52.15  100.29
..     ...             ...  ...   ...   ...    ...     ...
59   26421   v New Zealand    6    94    65  15.66  123.68
60   26421      v Pakistan    6    45   31*  11.25   61.64
61   26421  v South Africa    6    81   25*  20.25   83.50
62   26421     v Sri Lanka   11   132    38  16.50   94.28
63   26421   v West Indies    9   135    31  33.75   72.58

[64 rows x 7 columns]


  tables = pd.read_html(str(soup))
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  home_vs_away_batsman.rename(columns={'Grouping': 'Venue'}, inplace=True)
  tables = pd.read_html(str(soup))
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  home_vs_away_batsman.rename(columns={'Grouping': 'Venue'}, inplace=True)
  tables = pd.read_html(str(soup))
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  home_vs_away_batsman.rename(columns={'Grouping': 'Venue'}, inplace=True)
  tables = pd.read_html(str(soup))
A value is trying

      P_id    Venue  Inns    Runs    HS    Ave      SR
0   253802     home   119    6268  166*  60.85   96.69
1   253802     away   112    5336  160*  55.58   90.87
2   253802  neutral    49    2244   183  60.64   91.85
3    34102     home    90    4745   264  57.86  103.28
4    34102     away   106    3546  171*  39.40   82.65
5    34102  neutral    58    2418   140  52.56   87.60
6   422108     home  33.0  1421.0   108  56.84   91.32
7   422108     away  26.0   848.0   112  38.54   85.91
8   422108  neutral  11.0   551.0  111*  61.22   82.48
9    28235     home  48.0  1890.0   143  41.08   93.75
10   28235     away  85.0  3141.0  132*  40.26   89.18
11   28235  neutral  31.0  1762.0   137  58.73   92.83
12   28081     home   113    4351  183*  53.71   91.60
13   28081     away   124    4520  101*  50.78   83.98
14   28081  neutral    60    1902  139*  44.23   87.60
15  234675     home    54    1028   61*  30.23   83.30
16  234675     away    52    1163    87  36.34   87.70
17  234675

  tables = pd.read_html(str(soup))
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  yearly_stats_batsman.rename(columns={'Grouping': 'Year'}, inplace=True)
  tables = pd.read_html(str(soup))
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  yearly_stats_batsman.rename(columns={'Grouping': 'Year'}, inplace=True)
  tables = pd.read_html(str(soup))
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  yearly_stats_batsman.rename(columns={'Grouping': 'Year'}, inplace=True)
  tables = pd.read_html(str(soup))
A value is trying to

       P_id       Year Inns  Runs   HS    Ave      SR
0    253802  year 2008    5   159   54  31.80   66.52
1    253802  year 2009    8   325  107  54.16   84.41
2    253802  year 2010   24   995  118  47.38   85.11
3    253802  year 2011   34  1381  117  47.62   85.56
4    253802  year 2012   17  1026  183  68.40   93.78
..      ...        ...  ...   ...  ...    ...     ...
96    26421  year 2015    8    48  16*  16.00   56.47
97    26421  year 2016    1     1    1   1.00   50.00
98    26421  year 2017    3    17  15*   8.50  106.25
99    26421  year 2022    2    32  25*  32.00   86.48
100   26421  year 2023    -     -    -      -       -

[101 rows x 7 columns]


  tables = pd.read_html(str(soup))
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  yearly_stats_batsman.rename(columns={'Grouping': 'Year'}, inplace=True)
