In [1]:
import pandas as pd
import numpy as np
from bs4 import BeautifulSoup
from tqdm import tqdm
import requests
import warnings
warnings.filterwarnings('ignore')

In [2]:
url = 'https://www.espncricinfo.com/records/tournament/team-match-results/icc-cricket-world-cup-2023-24-15338'
response = requests.get(url)

In [3]:
soup = BeautifulSoup(response.content, 'html.parser')

In [4]:
table = soup.find('table')

In [5]:
team1 = []
team2 = []
winner = []
margin = []
ground = []
match_day = []
match_id = []

for rows in tqdm(table.find_all('tr'), desc = 'Process'):
    cells = rows.find_all('td')
    team1.append(cells[0].text)
    team2.append(cells[1].text)
    winner.append(cells[2].text)
    margin.append(cells[3].text)
    ground.append(cells[4].text)
    match_day.append(cells[5].text)
    match_id.append(cells[6].text)

Process: 100%|███████████████████████████████| 49/49 [00:00<00:00, 41319.04it/s]


In [6]:
col_index = 6
scorecard = []

for row in table.find_all('tr'):
    cells = row.find_all('td')
    
    cell = cells[col_index]
    link = cell.find('a')
    if link:
        scorecard.append('https://www.espncricinfo.com' + link.get('href'))
    else:
        pass

In [7]:
match_df = pd.DataFrame(data = [team1,team2,winner,margin,ground,match_day,match_id])
match_df = match_df.T
match_df.drop(0,axis=0, inplace = True)
match_df.columns = ['Team_1','Team_2', 'Winner', 'Margin', 'Ground','Match_day', 'Match_id']
match_df.reset_index(inplace = True, drop = True)
match_df.head()

Unnamed: 0,Team_1,Team_2,Winner,Margin,Ground,Match_day,Match_id
0,India,Australia,Australia,6 wickets,Ahmedabad,"Nov 19, 2023",ODI # 4705
1,Australia,South Africa,Australia,3 wickets,Eden Gardens,"Nov 16, 2023",ODI # 4704
2,India,New Zealand,India,70 runs,Wankhede,"Nov 15, 2023",ODI # 4703
3,India,Netherlands,India,160 runs,Bengaluru,"Nov 12, 2023",ODI # 4702
4,England,Pakistan,England,93 runs,Eden Gardens,"Nov 11, 2023",ODI # 4701


In [8]:
match_df['Match_day'] = pd.to_datetime(match_df['Match_day'])

In [9]:
match_df.dtypes

Team_1               object
Team_2               object
Winner               object
Margin               object
Ground               object
Match_day    datetime64[ns]
Match_id             object
dtype: object

## Batting Dataframe

In [10]:
def scrape_scorecard_and_match_details(url,table_index):
    response = requests.get(url)
    soup = BeautifulSoup(response.content, 'html.parser')

    # Find the table
    table = soup.find_all('table')

    # Extract table headers (column names)
    match = soup.find_all('div', class_='ds-flex ds-items-center ds-min-w-0 ds-mr-1')
    team1 = match[-2].text.strip()
    team2 = match[-1].text.strip()
    match_details = team1 + ' vs ' + team2
    
    batting_postition = {team1 : 0, team2: 0}
    
    # Extract table rows and data
    data_rows = []
    for tr in table[table_index].find_all('tr'):
        row = [td.text.strip() for td in tr.find_all('td')]
        if row:
            row.append(match_details)
            if (table_index == 0 or table_index == 3):
                row.append(team1)
                batting_postition[team1] += 1
                row.append(batting_postition[team1])
            else:
                row.append(team2)
                batting_postition[team2] += 1
                row.append(batting_postition[team2])
            data_rows.append(row)
            
    df = pd.DataFrame(data_rows)
    return df

In [11]:
first_inn_df = pd.DataFrame()
for links in tqdm(scorecard, desc = 'Processing'):
    each_df = scrape_scorecard_and_match_details(links, 0)
    if each_df is not None and isinstance(each_df, pd.DataFrame):
        if first_inn_df.empty:
            first_inn_df = each_df
        else:
            first_inn_df = pd.merge(first_inn_df, each_df, how='outer')

Processing: 100%|███████████████████████████████| 48/48 [00:23<00:00,  2.08it/s]


In [12]:
first_inn_df.dropna(inplace = True)
first_inn_df.reset_index(inplace=True, drop=True)

In [13]:
first_inn_df[10] = first_inn_df[10].apply(lambda x: x+1 if x%2 == 0 else x)
first_inn_df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10
0,Rohit Sharma (c),c Head b Maxwell,47,31,44,4,3,151.61,India vs Australia,India,1.0
1,Shubman Gill,c Zampa b Starc,4,7,21,0,0,57.14,India vs Australia,India,3.0
2,Virat Kohli,b Cummins,54,63,99,4,0,85.71,India vs Australia,India,5.0
3,Shreyas Iyer,c †Inglis b Cummins,4,3,3,1,0,133.33,India vs Australia,India,7.0
4,KL Rahul †,c †Inglis b Starc,66,107,133,1,0,61.68,India vs Australia,India,9.0
...,...,...,...,...,...,...,...,...,...,...,...
468,Liam Livingstone,c Henry b Boult,20,22,23,3,0,90.90,England vs New Zealand,England,13.0
469,Sam Curran,c †Latham b Henry,14,19,30,0,0,73.68,England vs New Zealand,England,15.0
470,Chris Woakes,c Young b Santner,11,12,15,1,0,91.66,England vs New Zealand,England,17.0
471,Adil Rashid,not out,15,13,23,0,1,115.38,England vs New Zealand,England,19.0


In [14]:
first_inn_df[10].replace(3,2, inplace=True)
first_inn_df[10].replace(5,3, inplace=True)
first_inn_df[10].replace(7,4, inplace=True)
first_inn_df[10].replace(9,5, inplace=True)
first_inn_df[10].replace(11,6, inplace=True)
first_inn_df[10].replace(13,7, inplace=True)
first_inn_df[10].replace(15,8, inplace=True)
first_inn_df[10].replace(17,9, inplace=True)
first_inn_df[10].replace(19,10, inplace=True)
first_inn_df[10].replace(21,11, inplace=True)

In [15]:
sec_inn_df = pd.DataFrame()
for links in tqdm(scorecard, desc = 'Processing'):
    each_df = scrape_scorecard_and_match_details(links,2)
    if each_df is not None and isinstance(each_df, pd.DataFrame):
        if sec_inn_df.empty:
            sec_inn_df = each_df
        else:
            sec_inn_df = pd.merge(sec_inn_df, each_df, how = 'outer')

Processing: 100%|███████████████████████████████| 48/48 [00:20<00:00,  2.30it/s]


In [16]:
sec_inn_df.dropna(inplace = True)
sec_inn_df.reset_index(inplace=True, drop=True)

In [17]:
sec_inn_df[10] = sec_inn_df[10].apply(lambda x: x+1 if x%2 == 0 else x)
sec_inn_df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10
0,David Warner,c Kohli b Mohammed Shami,7,3,6,1,0,233.33,India vs Australia,Australia,1.0
1,Travis Head,c Shubman Gill b Mohammed Siraj,137,120,166,15,4,114.16,India vs Australia,Australia,3.0
2,Mitchell Marsh,c †Rahul b Bumrah,15,15,15,1,1,100.00,India vs Australia,Australia,5.0
3,Steven Smith,lbw b Bumrah,4,9,11,1,0,44.44,India vs Australia,Australia,7.0
4,Marnus Labuschagne,not out,58,110,133,4,0,52.72,India vs Australia,Australia,9.0
...,...,...,...,...,...,...,...,...,...,...,...
398,Aryan Dutt,b Hasan Ali,1,2,6,0,0,50.00,Pakistan vs Netherlands,Netherlands,19.0
399,Paul van Meekeren,b Haris Rauf,7,12,17,1,0,58.33,Pakistan vs Netherlands,Netherlands,21.0
400,Devon Conway,not out,152,121,147,19,3,125.61,England vs New Zealand,New Zealand,1.0
401,Will Young,c †Buttler b Curran,0,1,5,0,0,0.00,England vs New Zealand,New Zealand,3.0


In [18]:
sec_inn_df[10].replace(3,2, inplace=True)
sec_inn_df[10].replace(5,3, inplace=True)
sec_inn_df[10].replace(7,4, inplace=True)
sec_inn_df[10].replace(9,5, inplace=True)
sec_inn_df[10].replace(11,6, inplace=True)
sec_inn_df[10].replace(13,7, inplace=True)
sec_inn_df[10].replace(15,8, inplace=True)
sec_inn_df[10].replace(17,9, inplace=True)
sec_inn_df[10].replace(19,10, inplace=True)
sec_inn_df[10].replace(21,11, inplace=True)

In [19]:
batting_df = pd.concat([first_inn_df,sec_inn_df], ignore_index=True)
batting_df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10
0,Rohit Sharma (c),c Head b Maxwell,47,31,44,4,3,151.61,India vs Australia,India,1.0
1,Shubman Gill,c Zampa b Starc,4,7,21,0,0,57.14,India vs Australia,India,2.0
2,Virat Kohli,b Cummins,54,63,99,4,0,85.71,India vs Australia,India,3.0
3,Shreyas Iyer,c †Inglis b Cummins,4,3,3,1,0,133.33,India vs Australia,India,4.0
4,KL Rahul †,c †Inglis b Starc,66,107,133,1,0,61.68,India vs Australia,India,5.0


In [20]:
batting_df.columns = ['Batsman', 'Out/Not_out', 'Runs', 'Balls', 'Minuts_of_play', '4s', '6s', 'Strike_Rate','Match','Team_Innings','Batting_position']
batting_df.head()

Unnamed: 0,Batsman,Out/Not_out,Runs,Balls,Minuts_of_play,4s,6s,Strike_Rate,Match,Team_Innings,Batting_position
0,Rohit Sharma (c),c Head b Maxwell,47,31,44,4,3,151.61,India vs Australia,India,1.0
1,Shubman Gill,c Zampa b Starc,4,7,21,0,0,57.14,India vs Australia,India,2.0
2,Virat Kohli,b Cummins,54,63,99,4,0,85.71,India vs Australia,India,3.0
3,Shreyas Iyer,c †Inglis b Cummins,4,3,3,1,0,133.33,India vs Australia,India,4.0
4,KL Rahul †,c †Inglis b Starc,66,107,133,1,0,61.68,India vs Australia,India,5.0


In [21]:
batting_df.isna().sum()

Batsman             0
Out/Not_out         0
Runs                0
Balls               0
Minuts_of_play      0
4s                  0
6s                  0
Strike_Rate         0
Match               0
Team_Innings        0
Batting_position    0
dtype: int64

In [22]:
batting_df['Out/Not_out'].unique()

array(['c Head b Maxwell', 'c Zampa b Starc', 'b Cummins',
       'c †Inglis b Cummins', 'c †Inglis b Starc',
       'c †Inglis b Hazlewood', 'lbw b Zampa',
       'run out (Labuschagne/Cummins)', 'not out',
       'c Cummins b Hazlewood', 'c Smith b Hazlewood', 'c Warner b Starc',
       'b Head', 'c Head b Cummins', 'lbw b Head', 'c Smith b Starc',
       'c Maxwell b Cummins', 'c Williamson b Southee',
       'c Conway b Southee', 'c Mitchell b Boult', 'c Phillips b Southee',
       'c Barresi b de Leede', 'c Nidamanuru b van Meekeren',
       'b van der Merwe', 'c Engelbrecht b de Leede',
       'c †Mohammad Rizwan b Iftikhar Ahmed',
       'c Agha Salman b Haris Rauf',
       'c Shadab Khan b Shaheen Shah Afridi', 'b Shaheen Shah Afridi',
       'run out (Haris Rauf)', 'c Shaheen Shah Afridi b Haris Rauf',
       'b Haris Rauf', 'c Iftikhar Ahmed b Mohammad Wasim',
       'b Mohammad Wasim', 'c & b Abbott', 'c Labuschagne b Zampa',
       'run out (Labuschagne/†Inglis)', 'c Labusc

In [23]:
batting_df['Out/Not_out'] = batting_df['Out/Not_out'].apply(lambda x: x if x == 'not out' else 'out')
batting_df.head()

Unnamed: 0,Batsman,Out/Not_out,Runs,Balls,Minuts_of_play,4s,6s,Strike_Rate,Match,Team_Innings,Batting_position
0,Rohit Sharma (c),out,47,31,44,4,3,151.61,India vs Australia,India,1.0
1,Shubman Gill,out,4,7,21,0,0,57.14,India vs Australia,India,2.0
2,Virat Kohli,out,54,63,99,4,0,85.71,India vs Australia,India,3.0
3,Shreyas Iyer,out,4,3,3,1,0,133.33,India vs Australia,India,4.0
4,KL Rahul †,out,66,107,133,1,0,61.68,India vs Australia,India,5.0


In [24]:
batting_df['Out/Not_out'].unique()

array(['out', 'not out'], dtype=object)

In [25]:
batting_df[batting_df['Runs'] == '-']

Unnamed: 0,Batsman,Out/Not_out,Runs,Balls,Minuts_of_play,4s,6s,Strike_Rate,Match,Team_Innings,Batting_position
722,Reece Topley,out,-,-,-,-,-,-,South Africa vs England,England,11.0


In [26]:
batting_df[batting_df['Strike_Rate'] == '-']

Unnamed: 0,Batsman,Out/Not_out,Runs,Balls,Minuts_of_play,4s,6s,Strike_Rate,Match,Team_Innings,Batting_position
100,Angelo Mathews,out,0,0,4,0,0,-,Sri Lanka vs Bangladesh,Sri Lanka,6.0
211,Josh Hazlewood,not out,0,0,1,0,0,-,Australia vs New Zealand,Australia,11.0
589,Mark Wood,not out,0,0,1,0,0,-,Australia vs England,England,11.0
651,Shoriful Islam,not out,0,0,1,0,0,-,Netherlands vs Bangladesh,Bangladesh,11.0
722,Reece Topley,out,-,-,-,-,-,-,South Africa vs England,England,11.0


In [27]:
batting_df = batting_df[batting_df['Runs'] != '-']

In [28]:
batting_df['Strike_Rate'].replace('-',0, inplace=True)

In [29]:
batting_df[batting_df['Strike_Rate'] == '-']

Unnamed: 0,Batsman,Out/Not_out,Runs,Balls,Minuts_of_play,4s,6s,Strike_Rate,Match,Team_Innings,Batting_position


In [30]:
batting_df['Runs'] = pd.to_numeric(batting_df['Runs'])
batting_df['Balls'] = pd.to_numeric(batting_df['Balls'])
batting_df['Minuts_of_play'] = pd.to_numeric(batting_df['Minuts_of_play'])
batting_df['4s'] = pd.to_numeric(batting_df['4s'])
batting_df['6s'] = pd.to_numeric(batting_df['6s'])
batting_df['Strike_Rate'] = pd.to_numeric(batting_df['Strike_Rate'])

In [31]:
batting_df.dtypes

Batsman              object
Out/Not_out          object
Runs                  int64
Balls                 int64
Minuts_of_play        int64
4s                    int64
6s                    int64
Strike_Rate         float64
Match                object
Team_Innings         object
Batting_position    float64
dtype: object

In [32]:
batting_df.groupby('Batsman')['Runs'].sum().sort_values(ascending = False)

Batsman
Virat Kohli          765
Rohit Sharma (c)     597
Quinton de Kock †    594
Rachin Ravindra      578
Daryl Mitchell       552
                    ... 
Josh Hazlewood         4
Dushan Hemantha        4
Fazalhaq Farooqi       2
Usama Mir              0
Alex Carey †           0
Name: Runs, Length: 151, dtype: int64

In [33]:
clean_names = []
for i in batting_df['Batsman']:
    neat_name = i.replace('\xa0(c)','').replace('\xa0†','')
    neat_name
    clean_names.append(neat_name)
    
clean_names

['Rohit Sharma',
 'Shubman Gill',
 'Virat Kohli',
 'Shreyas Iyer',
 'KL Rahul',
 'Ravindra Jadeja',
 'Suryakumar Yadav',
 'Mohammed Shami',
 'Jasprit Bumrah',
 'Kuldeep Yadav',
 'Mohammed Siraj',
 'Quinton de Kock',
 'Temba Bavuma',
 'Rassie van der Dussen',
 'Aiden Markram',
 'Heinrich Klaasen',
 'David Miller',
 'Marco Jansen',
 'Gerald Coetzee',
 'Keshav Maharaj',
 'Kagiso Rabada',
 'Tabraiz Shamsi',
 'Rohit Sharma',
 'Shubman Gill',
 'Virat Kohli',
 'Shreyas Iyer',
 'KL Rahul',
 'Suryakumar Yadav',
 'Rohit Sharma',
 'Shubman Gill',
 'Virat Kohli',
 'Shreyas Iyer',
 'KL Rahul',
 'Suryakumar Yadav',
 'Dawid Malan',
 'Jonny Bairstow',
 'Joe Root',
 'Ben Stokes',
 'Jos Buttler†',
 'Harry Brook',
 'Moeen Ali',
 'Chris Woakes',
 'David Willey',
 'Gus Atkinson',
 'Adil Rashid',
 'Tanzid Hasan',
 'Litton Das',
 'Najmul Hossain Shanto',
 'Towhid Hridoy',
 'Mahmudullah',
 'Mushfiqur Rahim',
 'Mehidy Hasan Miraz',
 'Nasum Ahmed',
 'Mahedi Hasan',
 'Taskin Ahmed',
 'Rahmanullah Gurbaz',
 'Ibra

In [34]:
batting_df['Batsman'] = clean_names
batting_df.head()

Unnamed: 0,Batsman,Out/Not_out,Runs,Balls,Minuts_of_play,4s,6s,Strike_Rate,Match,Team_Innings,Batting_position
0,Rohit Sharma,out,47,31,44,4,3,151.61,India vs Australia,India,1.0
1,Shubman Gill,out,4,7,21,0,0,57.14,India vs Australia,India,2.0
2,Virat Kohli,out,54,63,99,4,0,85.71,India vs Australia,India,3.0
3,Shreyas Iyer,out,4,3,3,1,0,133.33,India vs Australia,India,4.0
4,KL Rahul,out,66,107,133,1,0,61.68,India vs Australia,India,5.0


## Bowling Dataframe

In [35]:
first_inn_bow = pd.DataFrame()
for link in tqdm(scorecard, desc = 'Processing'):
    each_df = scrape_scorecard_and_match_details(link, 1)
    if each_df is not None and isinstance(each_df, pd.DataFrame):
        if first_inn_bow.empty:
            first_inn_bow = each_df
        else:
            first_inn_bow = pd.merge(first_inn_bow, each_df, how = 'outer')

Processing: 100%|███████████████████████████████| 48/48 [00:21<00:00,  2.23it/s]


In [36]:
first_inn_bow.dropna(inplace = True)
first_inn_bow.reset_index(inplace=True, drop = True)
first_inn_bow.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13
0,Mitchell Starc,10,0,55,3,5.5,30,4,1,3,0,India vs Australia,Australia,1.0
1,Josh Hazlewood,10,0,60,2,6.0,22,4,1,1,0,India vs Australia,Australia,3.0
2,Glenn Maxwell,6,0,35,1,5.83,19,4,1,0,0,India vs Australia,Australia,5.0
3,Pat Cummins,10,0,34,2,3.4,30,0,0,2,0,India vs Australia,Australia,7.0
4,Adam Zampa,10,0,44,1,4.4,22,1,0,1,0,India vs Australia,Australia,9.0


In [37]:
sec_inn_bow = pd.DataFrame()
for link in tqdm(scorecard, desc = 'Processing'):
    each_df = scrape_scorecard_and_match_details(link, 3)
    if each_df is not None and isinstance(each_df, pd.DataFrame):
        if sec_inn_bow.empty:
            sec_inn_bow = each_df
        else:
            sec_inn_bow = pd.merge(sec_inn_bow, each_df, how = 'outer')

Processing: 100%|███████████████████████████████| 48/48 [00:22<00:00,  2.17it/s]


In [38]:
sec_inn_bow.dropna(inplace = True)
sec_inn_bow.reset_index(inplace=True, drop = True)
sec_inn_bow.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13
0,Jasprit Bumrah,9,2,43,2,4.77,37,8,0,0,0,India vs Australia,India,1.0
1,Mohammed Shami,7,1,47,1,6.71,28,6,1,3,0,India vs Australia,India,3.0
2,Ravindra Jadeja,10,0,43,0,4.3,29,1,1,1,0,India vs Australia,India,5.0
3,Kuldeep Yadav,10,0,56,0,5.6,25,3,2,0,0,India vs Australia,India,6.0
4,Mohammed Siraj,7,0,45,1,6.42,16,4,1,0,0,India vs Australia,India,7.0


In [39]:
bowling_df = pd.concat([first_inn_bow, sec_inn_bow], ignore_index=True)
bowling_df.reset_index(inplace = True, drop = True)
bowling_df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13
0,Mitchell Starc,10,0,55,3,5.5,30,4,1,3,0,India vs Australia,Australia,1.0
1,Josh Hazlewood,10,0,60,2,6.0,22,4,1,1,0,India vs Australia,Australia,3.0
2,Glenn Maxwell,6,0,35,1,5.83,19,4,1,0,0,India vs Australia,Australia,5.0
3,Pat Cummins,10,0,34,2,3.4,30,0,0,2,0,India vs Australia,Australia,7.0
4,Adam Zampa,10,0,44,1,4.4,22,1,0,1,0,India vs Australia,Australia,9.0


In [40]:
bowling_df.shape

(574, 14)

In [41]:
bowling_df.dtypes

0      object
1      object
2      object
3      object
4      object
5      object
6      object
7      object
8      object
9      object
10     object
11     object
12     object
13    float64
dtype: object

In [42]:
bowling_df.drop(13, axis = 1, inplace = True)
bowling_df.columns = ['Bowling', 'Overs', 'Maiden', 'Runs', 'Wickets', 
                      'Economy', 'Dots', '4s', '6s', 'Wide', 'No_balls',
                      'Match', 'Team_Innings']
bowling_df.head()

Unnamed: 0,Bowling,Overs,Maiden,Runs,Wickets,Economy,Dots,4s,6s,Wide,No_balls,Match,Team_Innings
0,Mitchell Starc,10,0,55,3,5.5,30,4,1,3,0,India vs Australia,Australia
1,Josh Hazlewood,10,0,60,2,6.0,22,4,1,1,0,India vs Australia,Australia
2,Glenn Maxwell,6,0,35,1,5.83,19,4,1,0,0,India vs Australia,Australia
3,Pat Cummins,10,0,34,2,3.4,30,0,0,2,0,India vs Australia,Australia
4,Adam Zampa,10,0,44,1,4.4,22,1,0,1,0,India vs Australia,Australia


In [43]:
bowling_df.Overs.unique()

array(['10', '6', '2', '8', '9.4', '7', '5', '3', '4', '8.3', '4.3',
       '7.4', '1', '9', '9.3', '8.2', '5.3', '8.1', '6.3', '6.2', '8.5',
       '6.1', '0.3', '9.5', '4.2', '0.5', '7.1', '5.1', '9.1', '0.4',
       '9.2', '6.5', '7.2', '4.4', '8.4', '7.3', '7.5', '5.4'],
      dtype=object)

In [44]:
bowling_df['Overs'] = pd.to_numeric(bowling_df['Overs'])
bowling_df['Maiden'] = pd.to_numeric(bowling_df['Maiden'])
bowling_df['Runs'] = pd.to_numeric(bowling_df['Runs'])
bowling_df['Wickets'] = pd.to_numeric(bowling_df['Wickets'])
bowling_df['Economy'] = pd.to_numeric(bowling_df['Economy'])
bowling_df['Dots'] = pd.to_numeric(bowling_df['Dots'])
bowling_df['4s'] = pd.to_numeric(bowling_df['4s'])
bowling_df['6s'] = pd.to_numeric(bowling_df['6s'])
bowling_df['Wide'] = pd.to_numeric(bowling_df['Wide'])
bowling_df['No_balls'] = pd.to_numeric(bowling_df['No_balls'])

In [45]:
bowling_df.dtypes

Bowling          object
Overs           float64
Maiden            int64
Runs              int64
Wickets           int64
Economy         float64
Dots              int64
4s                int64
6s                int64
Wide              int64
No_balls          int64
Match            object
Team_Innings     object
dtype: object

## Players data frame

In [46]:
player_url = 'https://www.espncricinfo.com/records/tournament/averages-batting/icc-cricket-world-cup-2023-24-15338'
player_request = requests.get(player_url)

In [47]:
if player_request.status_code == 200:
    player_soup = BeautifulSoup(player_request.content.decode('utf-8'), 'html.parser')
else:
    print("Failed to fetch player data.")

In [48]:
player_tables = player_soup.find('table')

In [49]:
player_link = []
for rows in tqdm(player_tables.find_all('tr'), desc = 'Process'):
    cells = rows.find_all('td')
    cell = cells[0]
    link = cell.find('a')
    if link:
        player_link.append('https://www.espncricinfo.com' + link.get('href'))
    else:
        pass

Process: 100%|████████████████████████████| 303/303 [00:00<00:00, 126631.54it/s]


In [50]:
len(player_link)

151

In [51]:
player_link[0:5]

['https://www.espncricinfo.com/cricketers/sean-abbott-398666',
 'https://www.espncricinfo.com/cricketers/abdullah-shafique-922941',
 'https://www.espncricinfo.com/cricketers/colin-ackermann-379774',
 'https://www.espncricinfo.com/cricketers/agha-salman-623977',
 'https://www.espncricinfo.com/cricketers/moeen-ali-8917']

In [52]:
Name = [] 
Country = [] 
Batting = [] 
Bowling = [] 
Role = []

In [53]:
for links in tqdm(player_link, desc="Processing URLs", unit="links"):
    request = requests.get(links)
    soup = BeautifulSoup(request.content, 'html.parser')
    name_tag = soup.find('h1', class_ = 'ds-text-title-l ds-font-bold')
    p_tag = soup.find_all('p', class_ = 'ds-text-tight-m ds-font-regular ds-uppercase ds-text-typo-mid3')
    most = soup.find_all('span', class_ = 'ds-text-title-s ds-font-bold ds-text-typo')
    last = soup.find_all('span', class_ = 'ds-cursor-pointer ds-inline-flex ds-items-start ds-leading-none')
   
    batting, bowling, role = '', '', ''
    
    for i,title in enumerate(p_tag):
        if title.text == 'Batting Style':
            batting_index = i
        elif title.text == 'Bowling Style':
            bowling_index = i
        elif title.text == 'Playing Role':
            role_index = i
    
    try:
        name = name_tag.text
        country = last[0].text
        batting = most[batting_index].text
        bowling = most[bowling_index].text
        role = most[role_index].text
        
    except IndexError:
        role = ''
    
    Name.append(name)
    Country.append(country)
    Batting.append(batting)
    Bowling.append(bowling)
    Role.append(role)

Processing URLs: 100%|█████████████████████| 151/151 [01:50<00:00,  1.37links/s]


In [54]:
player_df = pd.DataFrame(data = [Name,Country,Batting,Bowling,Role]).T
player_df.columns = ['Name', 'Country', 'Batting', 'Bowling', 'Role']

In [55]:
player_df.head()

Unnamed: 0,Name,Country,Batting,Bowling,Role
0,Sean Abbott,Australia,Right hand Bat,Right arm Fast medium,Bowling Allrounder
1,Abdullah Shafique,Pakistan,Right hand Bat,Right arm Offbreak,Top order Batter
2,Colin Ackermann,Netherlands,Right hand Bat,Right arm Offbreak,Batting Allrounder
3,Agha Salman,Pakistan,Right hand Bat,Right arm Offbreak,Allrounder
4,Moeen Ali,England,Left hand Bat,Right arm Offbreak,Batting Allrounder


In [56]:
player_df[player_df['Name'] == 'Rohit Sharma']

Unnamed: 0,Name,Country,Batting,Bowling,Role
119,Rohit Sharma,India,Right hand Bat,Right arm Offbreak,Top order Batter


In [57]:
player_df.shape    

(151, 5)

In [58]:
player_df.isnull().sum()

Name       0
Country    0
Batting    0
Bowling    0
Role       0
dtype: int64

In [59]:
player_df['Role'].unique()

array(['Bowling Allrounder', 'Top order Batter', 'Batting Allrounder',
       'Allrounder', 'Bowler', 'Batter', 'Wicketkeeper Batter',
       'Middle order Batter', 'Opening Batter'], dtype=object)

### creating connection

In [60]:
match_df.head()

Unnamed: 0,Team_1,Team_2,Winner,Margin,Ground,Match_day,Match_id
0,India,Australia,Australia,6 wickets,Ahmedabad,2023-11-19,ODI # 4705
1,Australia,South Africa,Australia,3 wickets,Eden Gardens,2023-11-16,ODI # 4704
2,India,New Zealand,India,70 runs,Wankhede,2023-11-15,ODI # 4703
3,India,Netherlands,India,160 runs,Bengaluru,2023-11-12,ODI # 4702
4,England,Pakistan,England,93 runs,Eden Gardens,2023-11-11,ODI # 4701


In [61]:
match_dict = {}

for index, row in match_df.iterrows():
    key1 = row['Team_1'] + ' vs ' + row['Team_2']
    key2 = row['Team_2'] + ' vs ' + row['Team_1']
    
    match_dict[key1] = row['Match_id']
    match_dict[key2] = row['Match_id']

match_dict

{'India vs Australia': 'ODI # 4662',
 'Australia vs India': 'ODI # 4662',
 'Australia vs South Africa': 'ODI # 4667',
 'South Africa vs Australia': 'ODI # 4667',
 'India vs New Zealand': 'ODI # 4678',
 'New Zealand vs India': 'ODI # 4678',
 'India vs Netherlands': 'ODI # 4702',
 'Netherlands vs India': 'ODI # 4702',
 'England vs Pakistan': 'ODI # 4701',
 'Pakistan vs England': 'ODI # 4701',
 'Australia vs Bangladesh': 'ODI # 4700',
 'Bangladesh vs Australia': 'ODI # 4700',
 'Afghanistan vs South Africa': 'ODI # 4699',
 'South Africa vs Afghanistan': 'ODI # 4699',
 'New Zealand vs Sri Lanka': 'ODI # 4698',
 'Sri Lanka vs New Zealand': 'ODI # 4698',
 'England vs Netherlands': 'ODI # 4697',
 'Netherlands vs England': 'ODI # 4697',
 'Afghanistan vs Australia': 'ODI # 4696',
 'Australia vs Afghanistan': 'ODI # 4696',
 'Bangladesh vs Sri Lanka': 'ODI # 4695',
 'Sri Lanka vs Bangladesh': 'ODI # 4695',
 'India vs South Africa': 'ODI # 4694',
 'South Africa vs India': 'ODI # 4694',
 'Australia 

### Batting

In [62]:
batting_df['Match_id'] = batting_df['Match'].map(match_dict)
batting_df.head()

Unnamed: 0,Batsman,Out/Not_out,Runs,Balls,Minuts_of_play,4s,6s,Strike_Rate,Match,Team_Innings,Batting_position,Match_id
0,Rohit Sharma,out,47,31,44,4,3,151.61,India vs Australia,India,1.0,ODI # 4662
1,Shubman Gill,out,4,7,21,0,0,57.14,India vs Australia,India,2.0,ODI # 4662
2,Virat Kohli,out,54,63,99,4,0,85.71,India vs Australia,India,3.0,ODI # 4662
3,Shreyas Iyer,out,4,3,3,1,0,133.33,India vs Australia,India,4.0,ODI # 4662
4,KL Rahul,out,66,107,133,1,0,61.68,India vs Australia,India,5.0,ODI # 4662


In [63]:
batting_df.head(30)

Unnamed: 0,Batsman,Out/Not_out,Runs,Balls,Minuts_of_play,4s,6s,Strike_Rate,Match,Team_Innings,Batting_position,Match_id
0,Rohit Sharma,out,47,31,44,4,3,151.61,India vs Australia,India,1.0,ODI # 4662
1,Shubman Gill,out,4,7,21,0,0,57.14,India vs Australia,India,2.0,ODI # 4662
2,Virat Kohli,out,54,63,99,4,0,85.71,India vs Australia,India,3.0,ODI # 4662
3,Shreyas Iyer,out,4,3,3,1,0,133.33,India vs Australia,India,4.0,ODI # 4662
4,KL Rahul,out,66,107,133,1,0,61.68,India vs Australia,India,5.0,ODI # 4662
5,Ravindra Jadeja,out,9,22,33,0,0,40.9,India vs Australia,India,6.0,ODI # 4662
6,Suryakumar Yadav,out,18,28,57,1,0,64.28,India vs Australia,India,7.0,ODI # 4662
7,Mohammed Shami,out,6,10,9,1,0,60.0,India vs Australia,India,8.0,ODI # 4662
8,Jasprit Bumrah,out,1,3,5,0,0,33.33,India vs Australia,India,9.0,ODI # 4662
9,Kuldeep Yadav,out,10,18,28,0,0,55.55,India vs Australia,India,10.0,ODI # 4662


In [64]:
batting_df[(batting_df['Match'] == 'India vs Australia')]

Unnamed: 0,Batsman,Out/Not_out,Runs,Balls,Minuts_of_play,4s,6s,Strike_Rate,Match,Team_Innings,Batting_position,Match_id
0,Rohit Sharma,out,47,31,44,4,3,151.61,India vs Australia,India,1.0,ODI # 4662
1,Shubman Gill,out,4,7,21,0,0,57.14,India vs Australia,India,2.0,ODI # 4662
2,Virat Kohli,out,54,63,99,4,0,85.71,India vs Australia,India,3.0,ODI # 4662
3,Shreyas Iyer,out,4,3,3,1,0,133.33,India vs Australia,India,4.0,ODI # 4662
4,KL Rahul,out,66,107,133,1,0,61.68,India vs Australia,India,5.0,ODI # 4662
5,Ravindra Jadeja,out,9,22,33,0,0,40.9,India vs Australia,India,6.0,ODI # 4662
6,Suryakumar Yadav,out,18,28,57,1,0,64.28,India vs Australia,India,7.0,ODI # 4662
7,Mohammed Shami,out,6,10,9,1,0,60.0,India vs Australia,India,8.0,ODI # 4662
8,Jasprit Bumrah,out,1,3,5,0,0,33.33,India vs Australia,India,9.0,ODI # 4662
9,Kuldeep Yadav,out,10,18,28,0,0,55.55,India vs Australia,India,10.0,ODI # 4662


In [65]:
batting_df['Match_id'].iloc[0:11] = 'ODI # 4705'
batting_df['Match_id'].iloc[473:479] = 'ODI # 4705'

In [66]:
batting_df[(batting_df['Match'] == 'South Africa vs Australia')]

Unnamed: 0,Batsman,Out/Not_out,Runs,Balls,Minuts_of_play,4s,6s,Strike_Rate,Match,Team_Innings,Batting_position,Match_id
11,Quinton de Kock,out,3,14,27,0,0,21.42,South Africa vs Australia,South Africa,1.0,ODI # 4667
12,Temba Bavuma,out,0,4,5,0,0,0.0,South Africa vs Australia,South Africa,2.0,ODI # 4667
13,Rassie van der Dussen,out,6,31,53,0,0,19.35,South Africa vs Australia,South Africa,3.0,ODI # 4667
14,Aiden Markram,out,10,20,24,2,0,50.0,South Africa vs Australia,South Africa,4.0,ODI # 4667
15,Heinrich Klaasen,out,47,48,76,4,2,97.91,South Africa vs Australia,South Africa,5.0,ODI # 4667
16,David Miller,out,101,116,138,8,5,87.06,South Africa vs Australia,South Africa,6.0,ODI # 4667
17,Marco Jansen,out,0,1,1,0,0,0.0,South Africa vs Australia,South Africa,7.0,ODI # 4667
18,Gerald Coetzee,out,19,39,42,2,0,48.71,South Africa vs Australia,South Africa,8.0,ODI # 4667
19,Keshav Maharaj,out,4,8,13,0,0,50.0,South Africa vs Australia,South Africa,9.0,ODI # 4667
20,Kagiso Rabada,out,10,12,18,0,1,83.33,South Africa vs Australia,South Africa,10.0,ODI # 4667


In [67]:
batting_df['Match_id'].iloc[11:22] = 'ODI # 4704'
batting_df['Match_id'].iloc[479:488] = 'ODI # 4704'

In [68]:
batting_df[(batting_df['Match'] == 'India vs New Zealand')]

Unnamed: 0,Batsman,Out/Not_out,Runs,Balls,Minuts_of_play,4s,6s,Strike_Rate,Match,Team_Innings,Batting_position,Match_id
22,Rohit Sharma,out,47,29,40,4,4,162.06,India vs New Zealand,India,1.0,ODI # 4678
23,Shubman Gill,not out,80,66,116,8,3,121.21,India vs New Zealand,India,2.0,ODI # 4678
24,Virat Kohli,out,117,113,149,9,2,103.53,India vs New Zealand,India,3.0,ODI # 4678
25,Shreyas Iyer,out,105,70,114,4,8,150.0,India vs New Zealand,India,4.0,ODI # 4678
26,KL Rahul,not out,39,20,29,5,2,195.0,India vs New Zealand,India,5.0,ODI # 4678
27,Suryakumar Yadav,out,1,2,2,0,0,50.0,India vs New Zealand,India,5.0,ODI # 4678
488,Devon Conway,out,13,15,26,3,0,86.66,India vs New Zealand,New Zealand,1.0,ODI # 4678
489,Rachin Ravindra,out,13,22,39,3,0,59.09,India vs New Zealand,New Zealand,2.0,ODI # 4678
490,Kane Williamson,out,69,73,116,8,1,94.52,India vs New Zealand,New Zealand,3.0,ODI # 4678
491,Daryl Mitchell,out,134,119,180,9,7,112.6,India vs New Zealand,New Zealand,4.0,ODI # 4678


In [69]:
batting_df['Match_id'].iloc[22:28] = 'ODI # 4703'
batting_df['Match_id'].iloc[488:499] = 'ODI # 4703'

### Bowling

In [70]:
bowling_df['Match_id'] = bowling_df['Match'].map(match_dict)
bowling_df.head()

Unnamed: 0,Bowling,Overs,Maiden,Runs,Wickets,Economy,Dots,4s,6s,Wide,No_balls,Match,Team_Innings,Match_id
0,Mitchell Starc,10.0,0,55,3,5.5,30,4,1,3,0,India vs Australia,Australia,ODI # 4662
1,Josh Hazlewood,10.0,0,60,2,6.0,22,4,1,1,0,India vs Australia,Australia,ODI # 4662
2,Glenn Maxwell,6.0,0,35,1,5.83,19,4,1,0,0,India vs Australia,Australia,ODI # 4662
3,Pat Cummins,10.0,0,34,2,3.4,30,0,0,2,0,India vs Australia,Australia,ODI # 4662
4,Adam Zampa,10.0,0,44,1,4.4,22,1,0,1,0,India vs Australia,Australia,ODI # 4662


In [71]:
bowling_df[(bowling_df['Match'] == 'India vs Australia')]

Unnamed: 0,Bowling,Overs,Maiden,Runs,Wickets,Economy,Dots,4s,6s,Wide,No_balls,Match,Team_Innings,Match_id
0,Mitchell Starc,10.0,0,55,3,5.5,30,4,1,3,0,India vs Australia,Australia,ODI # 4662
1,Josh Hazlewood,10.0,0,60,2,6.0,22,4,1,1,0,India vs Australia,Australia,ODI # 4662
2,Glenn Maxwell,6.0,0,35,1,5.83,19,4,1,0,0,India vs Australia,Australia,ODI # 4662
3,Pat Cummins,10.0,0,34,2,3.4,30,0,0,2,0,India vs Australia,Australia,ODI # 4662
4,Adam Zampa,10.0,0,44,1,4.4,22,1,0,1,0,India vs Australia,Australia,ODI # 4662
5,Mitchell Marsh,2.0,0,5,0,2.5,7,0,0,0,0,India vs Australia,Australia,ODI # 4662
6,Travis Head,2.0,0,4,0,2.0,8,0,0,0,0,India vs Australia,Australia,ODI # 4662
298,Jasprit Bumrah,9.0,2,43,2,4.77,37,8,0,0,0,India vs Australia,India,ODI # 4662
299,Mohammed Shami,7.0,1,47,1,6.71,28,6,1,3,0,India vs Australia,India,ODI # 4662
300,Ravindra Jadeja,10.0,0,43,0,4.3,29,1,1,1,0,India vs Australia,India,ODI # 4662


In [72]:
bowling_df['Match_id'].iloc[0:7] = 'ODI # 4705'
bowling_df['Match_id'].iloc[298:303] = 'ODI # 4705'

In [73]:
bowling_df[(bowling_df['Match'] == 'South Africa vs Australia')]

Unnamed: 0,Bowling,Overs,Maiden,Runs,Wickets,Economy,Dots,4s,6s,Wide,No_balls,Match,Team_Innings,Match_id
7,Mitchell Starc,10.0,1,34,3,3.4,46,3,1,4,0,South Africa vs Australia,Australia,ODI # 4667
8,Josh Hazlewood,8.0,3,12,2,1.5,39,1,0,0,0,South Africa vs Australia,Australia,ODI # 4667
9,Pat Cummins,9.4,0,51,3,5.27,32,5,1,2,0,South Africa vs Australia,Australia,ODI # 4667
10,Adam Zampa,7.0,0,55,0,7.85,21,1,6,0,0,South Africa vs Australia,Australia,ODI # 4667
11,Glenn Maxwell,10.0,0,35,0,3.5,35,3,0,0,0,South Africa vs Australia,Australia,ODI # 4667
12,Travis Head,5.0,0,21,2,4.2,19,3,0,1,0,South Africa vs Australia,Australia,ODI # 4667
234,Mitchell Starc,9.0,1,53,2,5.88,29,3,3,1,0,South Africa vs Australia,Australia,ODI # 4667
235,Josh Hazlewood,9.0,0,60,1,6.66,30,7,2,3,0,South Africa vs Australia,Australia,ODI # 4667
236,Glenn Maxwell,10.0,1,34,2,3.4,32,0,0,2,0,South Africa vs Australia,Australia,ODI # 4667
237,Pat Cummins,9.0,0,71,1,7.88,19,6,3,0,0,South Africa vs Australia,Australia,ODI # 4667


In [74]:
bowling_df['Match_id'].iloc[7:13] = 'ODI # 4704'
bowling_df['Match_id'].iloc[303:309] = 'ODI # 4704'

In [75]:
bowling_df[(bowling_df['Match'] == 'India vs New Zealand')]

Unnamed: 0,Bowling,Overs,Maiden,Runs,Wickets,Economy,Dots,4s,6s,Wide,No_balls,Match,Team_Innings,Match_id
13,Trent Boult,10.0,0,86,1,8.6,26,9,4,2,0,India vs New Zealand,New Zealand,ODI # 4678
14,Tim Southee,10.0,0,100,3,10.0,20,8,6,1,0,India vs New Zealand,New Zealand,ODI # 4678
15,Mitchell Santner,10.0,1,51,0,5.1,29,3,2,0,0,India vs New Zealand,New Zealand,ODI # 4678
16,Lockie Ferguson,8.0,0,65,0,8.12,15,7,1,1,0,India vs New Zealand,New Zealand,ODI # 4678
17,Rachin Ravindra,7.0,0,60,0,8.57,13,2,4,2,0,India vs New Zealand,New Zealand,ODI # 4678
18,Glenn Phillips,5.0,0,33,0,6.6,11,1,2,0,0,India vs New Zealand,New Zealand,ODI # 4678
309,Jasprit Bumrah,10.0,1,64,1,6.4,32,6,1,6,0,India vs New Zealand,India,ODI # 4678
310,Mohammed Siraj,9.0,0,78,1,8.66,26,10,3,3,0,India vs New Zealand,India,ODI # 4678
311,Mohammed Shami,9.5,0,57,7,5.79,37,3,4,2,0,India vs New Zealand,India,ODI # 4678
312,Ravindra Jadeja,10.0,0,63,0,6.3,29,3,3,0,1,India vs New Zealand,India,ODI # 4678


In [76]:
bowling_df['Match_id'].iloc[13:19] = 'ODI # 4703'
bowling_df['Match_id'].iloc[309:314] = 'ODI # 4703'

### Downloading the Data

In [77]:
# match_df.to_csv('/Users/sachinkumar/Documents/Projects/Worldcup23/Data/match.csv')
# batting_df.to_csv('/Users/sachinkumar/Documents/Projects/Worldcup23/Data/batting.csv')
# bowling_df.to_csv('/Users/sachinkumar/Documents/Projects/Worldcup23/Data/bowling.csv')
# player_df.to_csv('/Users/sachinkumar/Documents/Projects/Worldcup23/Data/player.csv')