In [1]:
# Import libraries
import requests
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np

In [2]:
# Create an URL object
url = "https://www.espncricinfo.com/records/tournament/team-match-results/icc-men-s-t20-world-cup-2022-23-14450"
# Create object page
page = requests.get(url)

In [3]:
# parser-lxml = Change html to Python friendly format
# Obtain page's information
soup = BeautifulSoup(page.text, 'lxml')

## DataFrame to be created
    --> 1st page
      #1. t20_wc_match_results

    --> Link in 1st page(score card) leads to the following two:(2nd page)
      #2. t20_wc_batting_summary
      #3. t20_wc_bowling_summary
    
    --> Link(Name) in the 2nd page leads to the following:
      #4. t20_wc_player_info



#1. t20_wc_match_results

In [4]:
headers = soup.table.thead.find_all('span')
match_results_headers = [header.text for header in headers]


In [5]:
table_data = soup.table.tbody.find_all('td')
match_results_data = [data.text for data in table_data]

In [6]:
cols = len(match_results_headers)
# rows as many as available,so '-1'
rows = -1

match_data_arr = np.array(match_results_data).reshape(rows,cols)

In [7]:
t20_wc_match_results = pd.DataFrame(data=match_data_arr, columns=match_results_headers)
t20_wc_match_results.to_csv('t20_wc_match_results.csv', index=False)

In [8]:
t20_wc_match_results

Unnamed: 0,Team 1,Team 2,Winner,Margin,Ground,Match Date,Scorecard
0,Namibia,Sri Lanka,Namibia,55 runs,Geelong,"Oct 16, 2022",T20I # 1823
1,Netherlands,U.A.E.,Netherlands,3 wickets,Geelong,"Oct 16, 2022",T20I # 1825
2,Scotland,West Indies,Scotland,42 runs,Hobart,"Oct 17, 2022",T20I # 1826
3,Ireland,Zimbabwe,Zimbabwe,31 runs,Hobart,"Oct 17, 2022",T20I # 1828
4,Namibia,Netherlands,Netherlands,5 wickets,Geelong,"Oct 18, 2022",T20I # 1830
5,Sri Lanka,U.A.E.,Sri Lanka,79 runs,Geelong,"Oct 18, 2022",T20I # 1832
6,Ireland,Scotland,Ireland,6 wickets,Hobart,"Oct 19, 2022",T20I # 1833
7,West Indies,Zimbabwe,West Indies,31 runs,Hobart,"Oct 19, 2022",T20I # 1834
8,Netherlands,Sri Lanka,Sri Lanka,16 runs,Geelong,"Oct 20, 2022",T20I # 1835
9,Namibia,U.A.E.,U.A.E.,7 runs,Geelong,"Oct 20, 2022",T20I # 1836


#2. t20_wc_batting_summary,
#3. t20_wc_bowling_summary and
#4. t20_wc_player_info.


In [9]:
def match_info(bs4_obj):
  '''
  returns --> team1, team2 and match_vs
  '''
  teams = bs4_obj.find_all('span',class_ = "ds-text-tight-l ds-font-bold ds-text-typo hover:ds-text-typo-primary ds-block ds-truncate")
  team1 = teams[0].text
  team2 = teams[1].text
  match_teams = team1 +  ' Vs ' + team2

  return team1,team2,match_teams


In [10]:
def collecting_row_wise_data(bs4_obj):
  '''
    gathering the necessary data from the url.
    .................................................................
    returns--> first_inning_rows_batting, second_inning_rows_batting,
                first_inning_rows_bowling, second_inning_rows_bowling
  '''
  # getting batting and bowling table only.
  batting_table = bs4_obj.find_all('table', class_= "ds-w-full ds-table ds-table-md ds-table-auto ci-scorecard-table")
  bowling_table =  bs4_obj.find_all('table', class_="ds-w-full ds-table ds-table-md ds-table-auto")

  # extracting  row wise batting data from table:
  first_inning_rows_batting = batting_table[0].tbody.find_all('tr',class_='')
  second_inning_rows_batting = batting_table[1].tbody.find_all('tr',class_='')

  # extracting  row wise bowling data from table:
  first_inning_rows_bowling = bowling_table[0].tbody.find_all('tr',class_='')
  second_inning_rows_bowling= bowling_table[1].tbody.find_all('tr',class_='')


  return first_inning_rows_batting, second_inning_rows_batting, first_inning_rows_bowling, second_inning_rows_bowling

In [11]:
def  generate_batting_data(row_data,match_id, match_teams, playing_team):
  '''
  row_data = first_inning_rows_batting and second_inning_rows_batting.
  match_id = match id from  match_summary.
  match_teams =  which teams plays.
  playing_team = current batting team

  '''
  batting_summary = []
  player_link = []

  for index,row in enumerate(row_data):
    tds = row.find_all('td')
    if len(tds) >= 8:
      link = tds[0].find('a')['href']
      summary_data = {
        'match_id': match_id,
        "match": match_teams,
        "team": playing_team,
        "batting_pos": index+1,
        "batsman_name": tds[0].text,
        "dismissal": tds[1].text,
        "runs": tds[2].text,
        "balls": tds[3].text,
        "4s": tds[5].text,
        "6s": tds[6].text,
      "SR": tds[7].text
      }

      batting_summary.append(summary_data)
      player_link.append(link)

  return batting_summary,player_link

  # pd.DataFrame(batting_data)

In [12]:
def  generate_bowling_data(row_data,match_id, match_teams, playing_team):
  '''
  row_data = first_inning_rows_bowling and second_inning_rows_bowling.
  match_id = match id from  match_summary.
  match_teams =  which teams plays.
  playing_team = current bowling team

  '''
  bowling_summary = []
  player_link = []

  for index,row in enumerate(row_data):
    tds = row.find_all('td')
    if len(tds) >= 11:
      link = tds[0].find('a')['href']
      summary_data = {'match_id': match_id,
                      "match": match_teams,
                      "team": playing_team,
                      "bowler_name": tds[0].text,
                      "overs": tds[1].text,
                      "maiden": tds[2].text,
                      "runs": tds[3].text,
                      "wickets": tds[4].text,
                      "economy": tds[5].text,
                    "0s": tds[6].text,
                      "4s": tds[7].text,
                      "6s": tds[8].text,
                      "wides": tds[9].text,
                      "no_balls": tds[10].text
                    }


      bowling_summary.append(summary_data)
      player_link.append(link)

  return bowling_summary,player_link


In [13]:
def player_info_generator(bs4_obj):

  details = bs4_obj.find('div',class_ ='ds-grid lg:ds-grid-cols-3 ds-grid-cols-2 ds-gap-4 ds-mb-8')
  detail = {detail.p.text : detail.span.text for detail in details}

  description = bs4_obj.find('div',class_='ci-player-bio-content')
  if description:
    description = description.p.text

  team = bs4_obj.find('span', class_="ds-text-comfortable-s").text

  img = {"image":soup.find('div',class_='ds-bg-cover ds-bg-center')['style'][21:-1]}

  # sometimes,name in detail doesn't match batting/bowling summary name ,
  # hence use title name which is same as batting/bowling summary name
  name = soup.find('h1', class_="ds-text-title-l ds-font-bold").text

  player_details = {
                  "name": name,
                  "image": img.get("image",None),
                  "team": team,
                  "batting_style": detail.get('Batting Style', None),
                  "bowling_style": detail.get('Bowling Style', None),
                  "playing_role": detail.get('Playing Role', None),
                  "description": description,
                  }

  return player_details

In [14]:
# ----------------------------------------------------------batting/bowling summary------------------------------------------------------------------------

batting_summary_data = []
bowling_summary_data = []
player_links=[]


# every 7th element is the link to the batting and bowling summary
for i in range(6,len(table_data), 7):

  url = "https://www.espncricinfo.com"+ table_data[i].a['href']

  match_id = table_data[i].text

  page = requests.get(url)
  soup = BeautifulSoup(page.text, 'lxml')

  # get the match info:
  team1, team2, match_teams = match_info(bs4_obj = soup)

  #get row wise data and player links:
  first_inning_rows_batting, second_inning_rows_batting, \
  first_inning_rows_bowling,second_inning_rows_bowling = collecting_row_wise_data(bs4_obj = soup)

 #get batting and bowling summary of match:
  batting_summary_team1,player_link_bat_team1 = generate_batting_data(first_inning_rows_batting, match_id=match_id, match_teams=match_teams,
                                                                  playing_team = team1)
  batting_summary_team2,player_link_bat_team2 = generate_batting_data(second_inning_rows_batting, match_id=match_id, match_teams=match_teams,
                                                                  playing_team = team2)
  bowling_summary_team1,player_link_bowl_team1= generate_bowling_data(first_inning_rows_bowling, match_id=match_id, match_teams=match_teams,
                                                                  playing_team = team1)
  bowling_summary_team2,player_link_bowl_team2= generate_bowling_data(second_inning_rows_bowling, match_id=match_id, match_teams=match_teams,
                                                                  playing_team = team1)

  # store the collected data in the respective  list object:
  batting_summary_data.extend(batting_summary_team1 + batting_summary_team2)
  bowling_summary_data.extend(bowling_summary_team1 + bowling_summary_team2)

  player_link = player_link_bat_team1 + player_link_bat_team2 + player_link_bowl_team1 + player_link_bowl_team2
  player_links.extend(player_link)

# To avoid dulpicate links:
player_links = list(set(player_links))


#create data frame objects:
t20_wc_batting_summary = pd.DataFrame(data=batting_summary_data)
t20_wc_bowling_summary = pd.DataFrame(data=bowling_summary_data)



In [15]:
# ----------------------------------------------------------player detail------------------------------------------------------------------------
player_data = []

for link in player_links:
  url = "https://www.espncricinfo.com"+ link

  page = requests.get(url)
  soup = BeautifulSoup(page.text, 'lxml')

  player_details =  player_info_generator(bs4_obj = soup)

  player_data.append(player_details)


t20_wc_player_info = pd.DataFrame(data=player_data)


In [16]:
t20_wc_batting_summary.to_csv("t20_wc_batting_summary.csv", index=False)

In [17]:
t20_wc_bowling_summary.to_csv("t20_wc_bowling_summary.csv", index=False)

In [18]:
t20_wc_player_info.to_csv("t20_wc_player_info.csv", index=False)

In [19]:
t20_wc_batting_summary


Unnamed: 0,match_id,match,team,batting_pos,batsman_name,dismissal,runs,balls,4s,6s,SR
0,T20I # 1823,Namibia Vs Sri Lanka,Namibia,1,Michael van Lingen,c Pramod Madushan b Chameera,3,6,0,0,50.00
1,T20I # 1823,Namibia Vs Sri Lanka,Namibia,2,Divan la Cock,c Shanaka b Pramod Madushan,9,9,1,0,100.00
2,T20I # 1823,Namibia Vs Sri Lanka,Namibia,3,Jan Nicol Loftie-Eaton,c †Mendis b Karunaratne,20,12,1,2,166.66
3,T20I # 1823,Namibia Vs Sri Lanka,Namibia,4,Stephan Baard,c DM de Silva b Pramod Madushan,26,24,2,0,108.33
4,T20I # 1823,Namibia Vs Sri Lanka,Namibia,5,Gerhard Erasmus (c),c Gunathilaka b PWH de Silva,20,24,0,0,83.33
...,...,...,...,...,...,...,...,...,...,...,...
694,T20I # 1879,Pakistan Vs England,England,3,Phil Salt,c Iftikhar Ahmed b Haris Rauf,10,9,2,0,111.11
695,T20I # 1879,Pakistan Vs England,England,4,Ben Stokes,not out,52,49,5,1,106.12
696,T20I # 1879,Pakistan Vs England,England,5,Harry Brook,c Shaheen Shah Afridi b Shadab Khan,20,23,1,0,86.95
697,T20I # 1879,Pakistan Vs England,England,6,Moeen Ali,b Mohammad Wasim,19,13,3,0,146.15


In [20]:
t20_wc_bowling_summary


Unnamed: 0,match_id,match,team,bowler_name,overs,maiden,runs,wickets,economy,0s,4s,6s,wides,no_balls
0,T20I # 1823,Namibia Vs Sri Lanka,Namibia,Maheesh Theekshana,4,0,23,1,5.75,7,0,0,2,0
1,T20I # 1823,Namibia Vs Sri Lanka,Namibia,Dushmantha Chameera,4,0,39,1,9.75,6,3,1,2,0
2,T20I # 1823,Namibia Vs Sri Lanka,Namibia,Pramod Madushan,4,0,37,2,9.25,6,3,1,0,0
3,T20I # 1823,Namibia Vs Sri Lanka,Namibia,Chamika Karunaratne,4,0,36,1,9.00,7,3,1,1,0
4,T20I # 1823,Namibia Vs Sri Lanka,Namibia,Wanindu Hasaranga,4,0,27,1,6.75,8,1,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
495,T20I # 1879,Pakistan Vs England,Pakistan,Naseem Shah,4,0,30,0,7.50,15,3,1,1,0
496,T20I # 1879,Pakistan Vs England,Pakistan,Haris Rauf,4,0,23,2,5.75,13,3,0,1,0
497,T20I # 1879,Pakistan Vs England,Pakistan,Shadab Khan,4,0,20,1,5.00,10,1,0,0,0
498,T20I # 1879,Pakistan Vs England,Pakistan,Mohammad Wasim,4,0,38,1,9.50,5,5,0,2,0


In [21]:
t20_wc_player_info

Unnamed: 0,name,image,team,batting_style,bowling_style,playing_role,description
0,Yasir Ali,"https://img1.hscicdn.com/image/upload/f_auto,t...",Bangladesh,Right hand Bat,Right arm Offbreak,Middle order Batter,
1,Mosaddek Hossain,"https://img1.hscicdn.com/image/upload/f_auto,t...",Bangladesh,Right hand Bat,Right arm Offbreak,Middle order Batter,"Mosaddek Hossain, who comes from a family of c..."
2,Muhammad Waseem,"https://img1.hscicdn.com/image/upload/f_auto,t...",U.A.E.,Right hand Bat,Right arm Medium,Opening Batter,
3,Josh Davey,"https://img1.hscicdn.com/image/upload/f_auto,t...",Scotland,Right hand Bat,Right arm Medium fast,Bowler,Josh Davey's seam-bowling talent was recognise...
4,Temba Bavuma,"https://img1.hscicdn.com/image/upload/f_auto,t...",South Africa,Right hand Bat,Right arm Medium,Middle order Batter,"From the same street in Langa, a township outs..."
...,...,...,...,...,...,...,...
208,Mark Wood,"https://img1.hscicdn.com/image/upload/f_auto,t...",England,Right hand Bat,Right arm Fast,Bowler,"It looked, for a while, as if Mark Wood would ..."
209,Mustafizur Rahman,"https://img1.hscicdn.com/image/upload/f_auto,t...",Bangladesh,Left hand Bat,Left arm Fast medium,Bowler,Mustafizur Rahman is a left-arm pace bowler wh...
210,Timm van der Gugten,"https://img1.hscicdn.com/image/upload/f_auto,t...",Netherlands,Right hand Bat,Right arm Fast medium,Bowler,"Timm van der Gugten, born in Sydney, made his ..."
211,Paul Stirling,"https://img1.hscicdn.com/image/upload/f_auto,t...",Ireland,Right hand Bat,Right arm Offbreak,Batting Allrounder,Paul Stirling has long been recognised as a fo...
