In [1]:
#Importing Required Libraries
import pandas as pd
import requests
from bs4 import BeautifulSoup
import os
from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException, ElementClickInterceptedException
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
import time

In [2]:
#Setting Options for Selenium Webdriver
options = webdriver.ChromeOptions()
options.add_argument('--ignore-certificate-errors')
options.add_argument('--incognito')

In [3]:
#Base Url is main page of the website we need to scrape
base_url = "https://www.espncricinfo.com/records/season/team-match-results/2022to23-2022to23?trophy=89"

In [4]:
# this function will check whether the player image is available and if yes return its url
def player_image(driver):
    try:
        player_image = driver.find_element(By.XPATH, '//*[@id="main-container"]/div[5]/div[1]/div[2]/div[1]/div[1]/div/div/div/div[1]/div/div[2]/div/div/img')
    except NoSuchElementException:
        return 'nan'
    else:
        return player_image.get_attribute('src')

In [5]:
#this function will give all the credentials of the player, including bowling and batting style
def get_profile(driver):
    image_url = player_image(driver)
    team =  driver.find_element(By.XPATH, '//*[@id="main-container"]/div[5]/div[1]/div[2]/div[2]/div[2]/div/div/div[2]/div/a[1]/span/span').text
    name = driver.find_element(By.XPATH, '//*[@id="main-container"]/div[5]/div[1]/div[2]/div[1]/div[1]/div/div/div/div[1]/div/div[1]/h1').text
    skills = driver.find_elements(By.XPATH, '//*[@id="main-container"]/div[5]/div[1]/div[2]/div[2]/div[2]/div/div/div[1]')
    skill_entries = skills[0].text.split('\n')[::2]
    skill_values = skills[0].text.split('\n')[1::2]
    bating = 'nan'
    bowling = 'nan'
    role = 'nan'
    for i in range(len(skill_entries)):
        if skill_entries[i] == "BATTING STYLE":
            bating = skill_values[i] 
        elif skill_entries[i] == "BOWLING STYLE":
            bowling = skill_values[i] 
        elif skill_entries[i] == "PLAYING ROLE":
            role = skill_values[i]
    driver.quit()
    return name, team, bating, bowling, role, image_url
    

In [6]:
#this function will open the player profile in selenium and check for pop-up and proceed basis the outcome
def player_profile(url):
    driver = webdriver.Chrome(options=options)
    driver.get(url)
    time.sleep(15)
    try:
        pop_up =driver.find_element(By.CSS_SELECTOR, 'button#wzrk-cancel')
    except NoSuchElementException:
        return get_profile(driver)
    else:
        pop_up.click()
        time.sleep(1)
        return get_profile(driver)

In [7]:
#get_content will give the html page data of the website
def get_content(url):
    response = requests.get(url)
    # if status code is other than 200 the process will not move further
    if response.status_code == 200:
        return BeautifulSoup(response.text, 'html.parser')
    else:
        print(f"The URL {url} is invalid")

In [8]:
#match url is the web address of any particular match, match data will provide all the summary of the match
def match_data(match_url):
    match_content = get_content(match_url)
    #team between which the match is going
    teams = match_content.find_all('span', {"class": "ds-text-title-xs ds-font-bold ds-capitalize"})
    team1 = teams[0].text
    team2 = teams[1].text
    #match_summary content the whole score card of the match for both the innings, we use 'tbody' as the selector for selecting this part of the website
    match_summary = match_content.find_all('tbody')[:4]
    
    match = f"{team1} Vs {team2}"
    
    #Inning list arranged in a way how the team going to bat and bowl in order
    innings = [team1, team2, team2, team1]
    
    #looping in match_summary so the we can extract data from each part(batting and bowling from both side)
    for i in range(len(match_summary)):  
        if i % 2 == 0:
            #Batting Inning
            bat_pos = 1
            for player in match_summary[i]: 
                if player.contents[0].text.strip().lower() == 'extras' or player.contents[0].text.strip().lower() == 'total':
                    break
                else:
                    # Using try and except block as it have hidden tag which have value which is unrelated to our data
                    try:
                        int(player.text[0])
                    except ValueError:
                        batting_dict['match'].append(match)
                        batting_dict['team_inning'].append(innings[i])
                        batting_dict['batting_pos'].append(bat_pos)
                        batting_dict['player_name'].append(player.contents[0].text.strip())
                        batting_dict['player_profile'].append(f"https://www.espncricinfo.com{player.contents[0].a['href']}")
                        player_profile_dict['profile_url'].append(f"https://www.espncricinfo.com{player.contents[0].a['href']}")
                        batting_dict['out/not_out'].append(player.contents[1].text)
                        batting_dict['run'].append(player.contents[2].text)
                        batting_dict['balls'].append(player.contents[3].text)
                        batting_dict['maiden'].append(player.contents[4].text)
                        batting_dict['4s'].append(player.contents[5].text)
                        batting_dict['6s'].append(player.contents[6].text)
                        batting_dict['strike_rate'].append(player.contents[7].text)
                        player.contents[0].text
                        bat_pos += 1
                    except IndexError:
                        continue
        else:
            #Bowling Inning
            for bowler in match_summary[i]:
                try:
                    int(bowler.text[0])
                except ValueError:     
                    bowling_dict['match'].append(match)
                    bowling_dict['bowling_team'].append(innings[i])
                    bowling_dict['bowler'].append(bowler.contents[0].text.strip())
                    bowling_dict['bowler_profile'].append(f"https://www.espncricinfo.com{bowler.contents[0].a['href']}")
                    player_profile_dict['profile_url'].append(f"https://www.espncricinfo.com{bowler.contents[0].a['href']}")
                    bowling_dict['over'].append(bowler.contents[1].text)
                    bowling_dict['maiden'].append(bowler.contents[2].text)
                    bowling_dict['runs'].append(bowler.contents[3].text)
                    bowling_dict['wickets'].append(bowler.contents[4].text)
                    bowling_dict['economy'].append(bowler.contents[5].text)
                    bowling_dict['0s'].append(bowler.contents[6].text)
                    bowling_dict['4s'].append(bowler.contents[7].text)
                    bowling_dict['6s'].append(bowler.contents[8].text)
                    bowling_dict['wides'].append(bowler.contents[9].text)
                    bowling_dict['no_balls'].append(bowler.contents[10].text)

In [9]:
all_match_content = get_content(base_url)

In [10]:
matches = {"Team1":[], "Team2":[], "Winner":[], "Margin":[], "Ground":[], "Date":[],"Match_id":[], "URL":[],}
for item in all_match_content.find_all('tr'):
    matches['Team1'].append(item.contents[0].text)
    matches['Team2'].append(item.contents[1].text)
    matches['Winner'].append(item.contents[2].text)
    matches['Margin'].append(item.contents[3].text)
    matches['Ground'].append(item.contents[4].text)
    matches['Date'].append(item.contents[5].text)
    if item.contents[6].text != 'Scorecard':
        matches['Match_id'].append(item.contents[6].text)
        matches['URL'].append(f"https://www.espncricinfo.com{item.contents[6].a['href']}")
    else:
        matches['Match_id'].append('NA')
        matches['URL'].append('NA')

In [11]:
# Making dataframe from mathces dictionary
all_matches_df =pd.DataFrame(matches)

In [12]:
# Dropping the first row
all_matches_df.drop(0, axis=0, inplace=True)

In [13]:
all_matches_df

Unnamed: 0,Team1,Team2,Winner,Margin,Ground,Date,Match_id,URL
1,Namibia,Sri Lanka,Namibia,55 runs,Geelong,"Oct 16, 2022",T20I # 1823,https://www.espncricinfo.com/series/icc-men-s-...
2,Netherlands,U.A.E.,Netherlands,3 wickets,Geelong,"Oct 16, 2022",T20I # 1825,https://www.espncricinfo.com/series/icc-men-s-...
3,Scotland,West Indies,Scotland,42 runs,Hobart,"Oct 17, 2022",T20I # 1826,https://www.espncricinfo.com/series/icc-men-s-...
4,Ireland,Zimbabwe,Zimbabwe,31 runs,Hobart,"Oct 17, 2022",T20I # 1828,https://www.espncricinfo.com/series/icc-men-s-...
5,Namibia,Netherlands,Netherlands,5 wickets,Geelong,"Oct 18, 2022",T20I # 1830,https://www.espncricinfo.com/series/icc-men-s-...
6,Sri Lanka,U.A.E.,Sri Lanka,79 runs,Geelong,"Oct 18, 2022",T20I # 1832,https://www.espncricinfo.com/series/icc-men-s-...
7,Ireland,Scotland,Ireland,6 wickets,Hobart,"Oct 19, 2022",T20I # 1833,https://www.espncricinfo.com/series/icc-men-s-...
8,West Indies,Zimbabwe,West Indies,31 runs,Hobart,"Oct 19, 2022",T20I # 1834,https://www.espncricinfo.com/series/icc-men-s-...
9,Netherlands,Sri Lanka,Sri Lanka,16 runs,Geelong,"Oct 20, 2022",T20I # 1835,https://www.espncricinfo.com/series/icc-men-s-...
10,Namibia,U.A.E.,U.A.E.,7 runs,Geelong,"Oct 20, 2022",T20I # 1836,https://www.espncricinfo.com/series/icc-men-s-...


In [14]:
# Empty Dictionary for the data which needed to be fetched
batting_dict = {'match': [], 'team_inning': [], 'batting_pos': [], 'player_name': [], 'player_profile': [], 'out/not_out': [],'run': [], 'balls': [], 'maiden': [], '4s': [], '6s': [], 'strike_rate': [],}

bowling_dict = {'match': [], 'bowling_team': [], 'bowler': [], 'bowler_profile': [], 'over': [], 'maiden': [], 'runs': [], 'wickets': [], 'economy': [], '0s': [], '4s': [], '6s': [], 'wides': [], 'no_balls': [] }

player_profile_dict = {'name': [], 'team': [], 'batting_style': [], 'bowling_style': [], 'playing_role': [], 'profile_url':[], 'profile_image' : [] }

In [15]:
# Looping in all_matches_df dataframe to get the url for all the matches, so that their content can be fetched using get_content function and data using match_data function
for index, row in all_matches_df.iterrows():
    match_data(row['URL'])

In [21]:
# player profile dictionary get the profile url using match_data function, we can get these url to be used with player_profile function to get their data
# Removing all the duplicates

player_profile_dict['profile_url'] = list(set(player_profile_dict['profile_url']))
for url in player_profile_dict['profile_url']:
    player_tuple = player_profile(url)
    time.sleep(2)
    #getting all the information from tuple to palyer dictionary
    player_profile_dict['name'].append(player_tuple[0])
    player_profile_dict['team'].append(player_tuple[1])
    player_profile_dict['batting_style'].append(player_tuple[2])
    player_profile_dict['bowling_style'].append(player_tuple[3])
    player_profile_dict['playing_role'].append(player_tuple[4])
    player_profile_dict['profile_image'].append(player_tuple[5])
    

In [22]:
# Batting Data Data}frame
batting_df = pd.DataFrame(batting_dict)
batting_df

Unnamed: 0,match,team_inning,batting_pos,player_name,player_profile,out/not_out,run,balls,maiden,4s,6s,strike_rate
0,Namibia Vs Sri Lanka,Namibia,1,Michael van Lingen,https://www.espncricinfo.com/cricketers/michae...,c Pramod Madushan b Chameera,3,6,7,0,0,50.00
1,Namibia Vs Sri Lanka,Namibia,2,Divan la Cock,https://www.espncricinfo.com/cricketers/divan-...,c Shanaka b Pramod Madushan,9,9,15,1,0,100.00
2,Namibia Vs Sri Lanka,Namibia,3,Jan Nicol Loftie-Eaton,https://www.espncricinfo.com/cricketers/jan-ni...,c †Mendis b Karunaratne,20,12,18,1,2,166.66
3,Namibia Vs Sri Lanka,Namibia,4,Stephan Baard,https://www.espncricinfo.com/cricketers/stepha...,c DM de Silva b Pramod Madushan,26,24,49,2,0,108.33
4,Namibia Vs Sri Lanka,Namibia,5,Gerhard Erasmus (c),https://www.espncricinfo.com/cricketers/gerhar...,c Gunathilaka b PWH de Silva,20,24,30,0,0,83.33
...,...,...,...,...,...,...,...,...,...,...,...,...
694,Pakistan Vs England,England,3,Phil Salt,https://www.espncricinfo.com/cricketers/phil-s...,c Iftikhar Ahmed b Haris Rauf,10,9,16,2,0,111.11
695,Pakistan Vs England,England,4,Ben Stokes,https://www.espncricinfo.com/cricketers/ben-st...,not out,52,49,81,5,1,106.12
696,Pakistan Vs England,England,5,Harry Brook,https://www.espncricinfo.com/cricketers/harry-...,c Shaheen Shah Afridi b Shadab Khan,20,23,36,1,0,86.95
697,Pakistan Vs England,England,6,Moeen Ali,https://www.espncricinfo.com/cricketers/moeen-...,b Mohammad Wasim,19,13,30,3,0,146.15


In [23]:
# Bowling Data Dataframe
bowling_df = pd.DataFrame(bowling_dict)
bowling_df

Unnamed: 0,match,bowling_team,bowler,bowler_profile,over,maiden,runs,wickets,economy,0s,4s,6s,wides,no_balls
0,Namibia Vs Sri Lanka,Sri Lanka,Maheesh Theekshana,https://www.espncricinfo.com/cricketers/mahees...,4,0,23,1,5.75,7,0,0,2,0
1,Namibia Vs Sri Lanka,Sri Lanka,Dushmantha Chameera,https://www.espncricinfo.com/cricketers/dushma...,4,0,39,1,9.75,6,3,1,2,0
2,Namibia Vs Sri Lanka,Sri Lanka,Pramod Madushan,https://www.espncricinfo.com/cricketers/pramod...,4,0,37,2,9.25,6,3,1,0,0
3,Namibia Vs Sri Lanka,Sri Lanka,Chamika Karunaratne,https://www.espncricinfo.com/cricketers/chamik...,4,0,36,1,9.00,7,3,1,1,0
4,Namibia Vs Sri Lanka,Sri Lanka,Wanindu Hasaranga,https://www.espncricinfo.com/cricketers/wanind...,4,0,27,1,6.75,8,1,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
495,Pakistan Vs England,Pakistan,Naseem Shah,https://www.espncricinfo.com/cricketers/naseem...,4,0,30,0,7.50,15,3,1,1,0
496,Pakistan Vs England,Pakistan,Haris Rauf,https://www.espncricinfo.com/cricketers/haris-...,4,0,23,2,5.75,13,3,0,1,0
497,Pakistan Vs England,Pakistan,Shadab Khan,https://www.espncricinfo.com/cricketers/shadab...,4,0,20,1,5.00,10,1,0,0,0
498,Pakistan Vs England,Pakistan,Mohammad Wasim,https://www.espncricinfo.com/cricketers/mohamm...,4,0,38,1,9.50,5,5,0,2,0


In [24]:
# Player Profile Data Dataframe
player_df = pd.DataFrame(player_profile_dict)
player_df

Unnamed: 0,name,team,batting_style,bowling_style,playing_role,profile_url,profile_image
0,Simi Singh,Ireland,Right hand Bat,"Right arm Offbreak, Legbreak Googly",Bowling Allrounder,https://www.espncricinfo.com/cricketers/simi-s...,"https://img1.hscicdn.com/image/upload/f_auto,t..."
1,Mosaddek Hossain,Bangladesh,Right hand Bat,Right arm Offbreak,Middle order Batter,https://www.espncricinfo.com/cricketers/mosadd...,"https://img1.hscicdn.com/image/upload/f_auto,t..."
2,Tim Southee,New Zealand,Right hand Bat,Right arm Medium fast,Bowler,https://www.espncricinfo.com/cricketers/tim-so...,"https://img1.hscicdn.com/image/upload/f_auto,t..."
3,Mark Adair,Ireland,Right hand Bat,Right arm Fast medium,Bowling Allrounder,https://www.espncricinfo.com/cricketers/mark-a...,"https://img1.hscicdn.com/image/upload/f_auto,t..."
4,Axar Patel,India,Left hand Bat,Slow Left arm Orthodox,Bowling Allrounder,https://www.espncricinfo.com/cricketers/axar-p...,"https://img1.hscicdn.com/image/upload/f_auto,t..."
...,...,...,...,...,...,...,...
208,Fionn Hand,Ireland,Right hand Bat,Right arm Medium,Bowling Allrounder,https://www.espncricinfo.com/cricketers/fionn-...,"https://img1.hscicdn.com/image/upload/f_auto,t..."
209,Barry McCarthy,Ireland,Right hand Bat,Right arm Fast medium,Bowler,https://www.espncricinfo.com/cricketers/barry-...,"https://img1.hscicdn.com/image/upload/f_auto,t..."
210,Gareth Delany,Ireland,Right hand Bat,Legbreak Googly,Batting Allrounder,https://www.espncricinfo.com/cricketers/gareth...,"https://img1.hscicdn.com/image/upload/f_auto,t..."
211,James Neesham,New Zealand,Left hand Bat,Right arm Medium fast,Batting Allrounder,https://www.espncricinfo.com/cricketers/james-...,"https://img1.hscicdn.com/image/upload/f_auto,t..."


In [25]:
# Saving all the datafram into csv files.
all_matches_df.to_csv('data/matches_data.csv')
batting_df.to_csv('data/batting_data.csv')
bowling_df.to_csv('data/bowling_data.csv')
player_df.to_csv('data/players_data.csv')