In [44]:
# importing required libraries
import pandas as pd
from selenium import webdriver
from selenium.webdriver.firefox.options import Options as FirefoxOptions
from selenium.webdriver.common.by import By

# exceptions
from selenium.common.exceptions import NoSuchElementException

In [45]:
## function to initialize and configure webdriver
def configure_firefox_driver():
    # add additional Options to the webdriver
    firefox_options = FirefoxOptions()
    # add the argument and make the browser Headless.
    firefox_options.add_argument("--headless")
    # instantiate the Webdriver
    driver = webdriver.Firefox(executable_path=r"geckodriver.exe", options = firefox_options)
    driver.maximize_window()
    
    return driver

In [46]:
# create the driver object

In [47]:
## function to get profile data from the cricinfo website
def get_profile_data(driver, url):
    # make a GET request to fetch the raw HTML content
    driver.get(url)
    
    return driver

In [48]:
## PERSONAL INFORMATION

# empty lists to store the required data
player_id = []
full_name = []
born = []
age = [] 
batting_style = []
bowling_style = []
playing_role = []
education = []
height = []
relations = []
fielding_position = []
nick_name = []
died = []
teams = []

"""
## function to scrape players' personal information 

input:
driver object

output: 
lists containing personal information
""" 

def get_personal_data(driver):

    # empty dict to store profile details
    profile_details = {}
    
    # locate personal info div
    personal_div = driver.find_element(By.XPATH, '//*[@id="main-container"]/div[5]/div[1]/div[2]/div[2]/div[2]/div/div/div[1]')
    # locate attributes divs
    player_attributes = personal_div.find_elements(By.TAG_NAME, "p")

    # iterate through player_attributes 
    for attr in player_attributes:
        # handle exception regarding 'RELATIONS' attribute
        if attr.get_attribute("class") == "ds-text-tight-m ds-font-regular ds-text-uppercase ds-text-ui-typo-mid" and attr.text == 'RELATIONS':
            # locate parent div 
            parent_div = attr.find_element(By.XPATH, "..")
            p_text = parent_div.text
            p_text = p_text.split("RELATIONS")[1]
            t_text = p_text.split("\n")
            relation = ""
            for k in range(0, len(p_text), 2):
                relation = relation + p_text[k]

                # if relation type exists
                if k+1 < len(p_text):
                    relation = relation + p_text[k+1]
                
            profile_details['RELATIONS'] = relation
                
        else:
            parent_div = attr.find_element(By.XPATH, "..")
            # locate attribute label
            label = parent_div.find_element(By.TAG_NAME, "span")
            profile_details[attr.text] = label.text   
            
    # teams
    team_names = ""
    # handle no teams exception
    try:
        # locate teams div
        teams_div = driver.find_element(By.XPATH, '//div/p[text() = "TEAMS"]')
        parent_div = teams_div.find_element(By.XPATH, "..")
        innerdivs = parent_div.find_elements(By.TAG_NAME, "div")
        for div in range(len(innerdivs)):
            if innerdivs[div].get_attribute("class") == "ds-flex ds-items-center ds-space-x-4":
                team_names = team_names + innerdivs[div].text + ","
        # trim last comma        
        team_names = team_names[:-1] 
    except NoSuchElementException:
        team_names = " "                 
            
    # add all the personal_data in the respective lists column-wise
    # PLAYER_ID
    player_id.append((driver.current_url).split('-')[-1])

    # FULL NAME             
    if "FULL NAME" in profile_details:
        full_name.append(profile_details['FULL NAME'])
    else:
        full_name.append("-")

    # BORN    
    if "BORN" in profile_details:
        born.append(profile_details['BORN'])
    else:
        born.append("-")

    # AGE    
    if "AGE" in profile_details:
        age.append(profile_details["AGE"])
    else:
        age.append("-")

    # BATTING STYLE    
    if "BATTING STYLE" in profile_details:
        batting_style.append(profile_details["BATTING STYLE"])
    else:
        batting_style.append("-")

    # BOWLING STYLE    
    if "BOWLING STYLE" in profile_details:
        bowling_style.append(profile_details["BOWLING STYLE"])
    else:
        bowling_style.append("-")

    # RELATIONS
    if "RELATIONS" in profile_details:
        relations.append(profile_details["RELATIONS"])
    else:
        relations.append("-")

    # PLAYING ROLE    
    if "PLAYING ROLE" in profile_details:
        playing_role.append(profile_details["PLAYING ROLE"])
    else:
        playing_role.append("-")

    # FIELDING POSITION
    if "FIELDING POSITION" in profile_details:
        fielding_position.append(profile_details["FIELDING POSITION"])
    else:
        fielding_position.append("-")

    # EDUCATION    
    if "EDUCATION" in profile_details:
        education.append(profile_details["EDUCATION"])
    else:
        education.append("-")

    # HEIGHT    
    if "HEIGHT" in profile_details:
        height.append(profile_details["HEIGHT"])
    else:
        height.append("-")
        
    # NICK NAME   
    if "NICKNAMES" in profile_details:
        nick_name.append(profile_details["NICKNAMES"])
    else:
        nick_name.append("-")    

    # DIED    
    if "DIED" in profile_details:
        died.append(profile_details["DIED"])
    else:
        died.append("-")
        
    # TEAMS
    teams.append(team_names)    

In [49]:
## BATTING AND FIELDING

# empty lists to store the required data
# odi_bat
odi_bat_matches = []
odi_bat_innings = []
odi_bat_not_outs = []
odi_bat_runs = []
odi_bat_hs = []
odi_bat_averages = []
odi_bat_balls_faced = []
odi_bat_strike_rates = []
odi_bat_centuries = []
odi_bat_fifties = []
odi_bat_fours = []
odi_bat_sixes = []
odi_bat_catches_taken = []
odi_bat_stumps_broken = []

# t20i_bat
t20i_bat_matches = []
t20i_bat_innings = []
t20i_bat_not_outs = []
t20i_bat_runs = []
t20i_bat_hs = []
t20i_bat_averages = []
t20i_bat_balls_faced = []
t20i_bat_strike_rates = []
t20i_bat_centuries = []
t20i_bat_fifties = []
t20i_bat_fours = []
t20i_bat_sixes = []
t20i_bat_catches_taken = []
t20i_bat_stumps_broken = []

"""
## function to scrape players' batting and fielding data 

input:
driver object

output: 
lists containing batting and fielding data
""" 
def get_batting_data(driver):
    # flags to check the existance of certain element
    odi_flag = False
    t20_flag = False
    # locate 'Batting & Fielding' div
    # Batting & Fielding
    bat_div = driver.find_element(By.XPATH, "//div/h5[text()='Batting & Fielding']/..")
    # locate batting table body
    bat_table_body = bat_div.find_element(By.XPATH, "./div/table/tbody")
    # locate batting table rows
    bat_table_rows = bat_table_body.find_elements(By.XPATH, "./tr")
    
    # iterate through the table rows
    for row in bat_table_rows:
        # for ODIs
        if 'ODI' not in row.text: 
            continue
        elif 'ODI' in row.text:
            # set 't20_flag' to True i.e. t20 row exists
            odi_flag = True 

            tds = row.find_elements(By.TAG_NAME, "td")
            if len(tds)==15:
                
            # add all the odi_bat_data in the respective lists column-wise
                odi_bat_matches.append(row.find_element(By.XPATH, './td[2]/span').text)
                odi_bat_innings.append(row.find_element(By.XPATH, './td[3]/span').text)
                odi_bat_not_outs.append(row.find_element(By.XPATH, './td[4]/span').text)
                odi_bat_runs.append(row.find_element(By.XPATH, './td[5]/span').text)
                odi_bat_hs.append(row.find_element(By.XPATH, './td[6]/span').text)
                odi_bat_averages.append(row.find_element(By.XPATH, './td[7]/span').text)
                odi_bat_balls_faced.append(row.find_element(By.XPATH, './td[8]/span').text)
                odi_bat_strike_rates.append(row.find_element(By.XPATH, './td[9]/span').text)
                odi_bat_centuries.append(row.find_element(By.XPATH, './td[10]/span').text)
                odi_bat_fifties.append(row.find_element(By.XPATH, './td[11]/span').text)
                odi_bat_fours.append(row.find_element(By.XPATH, './td[12]/span').text)
                odi_bat_sixes.append(row.find_element(By.XPATH, './td[13]/span').text)
                odi_bat_catches_taken.append(row.find_element(By.XPATH, './td[14]/span').text)
                odi_bat_stumps_broken.append(row.find_element(By.XPATH, './td[15]/span').text)
            
                break
            elif len(tds)==14:
                odi_bat_matches.append(row.find_element(By.XPATH, './td[2]/span').text)
                odi_bat_innings.append(row.find_element(By.XPATH, './td[3]/span').text)
                odi_bat_not_outs.append(row.find_element(By.XPATH, './td[4]/span').text)
                odi_bat_runs.append(row.find_element(By.XPATH, './td[5]/span').text)
                odi_bat_hs.append(row.find_element(By.XPATH, './td[6]/span').text)
                odi_bat_averages.append(row.find_element(By.XPATH, './td[7]/span').text)
                odi_bat_balls_faced.append(row.find_element(By.XPATH, './td[8]/span').text)
                odi_bat_strike_rates.append(row.find_element(By.XPATH, './td[9]/span').text)
                odi_bat_centuries.append(row.find_element(By.XPATH, './td[10]/span').text)
                odi_bat_fifties.append(row.find_element(By.XPATH, './td[11]/span').text)
                odi_bat_fours.append('-')
                odi_bat_sixes.append(row.find_element(By.XPATH, './td[12]/span').text)
                odi_bat_catches_taken.append(row.find_element(By.XPATH, './td[13]/span').text)
                odi_bat_stumps_broken.append(row.find_element(By.XPATH, './td[14]/span').text)
                break
            elif len(tds)<=13:
                odi_bat_matches.append(row.find_element(By.XPATH, './td[2]/span').text)
                odi_bat_innings.append(row.find_element(By.XPATH, './td[3]/span').text)
                odi_bat_not_outs.append(row.find_element(By.XPATH, './td[4]/span').text)
                odi_bat_runs.append(row.find_element(By.XPATH, './td[5]/span').text)
                odi_bat_hs.append(row.find_element(By.XPATH, './td[6]/span').text)
                odi_bat_averages.append(row.find_element(By.XPATH, './td[7]/span').text)
                odi_bat_balls_faced.append(row.find_element(By.XPATH, './td[8]/span').text)
                odi_bat_strike_rates.append(row.find_element(By.XPATH, './td[9]/span').text)
                odi_bat_centuries.append(row.find_element(By.XPATH, './td[10]/span').text)
                odi_bat_fifties.append(row.find_element(By.XPATH, './td[11]/span').text)
                odi_bat_fours.append('-')
                odi_bat_sixes.append('-')
                odi_bat_catches_taken.append(row.find_element(By.XPATH, './td[12]/span').text)
                odi_bat_stumps_broken.append(row.find_element(By.XPATH, './td[13]/span').text)
                break


    if odi_flag != True:
        
        
        # add empty strings in the respective lists column-wise
        odi_bat_matches.append("-")
        odi_bat_innings.append("-")
        odi_bat_not_outs.append("-")
        odi_bat_runs.append("-")
        odi_bat_hs.append("-")
        odi_bat_averages.append("-")
        odi_bat_balls_faced.append("-")
        odi_bat_strike_rates.append("-")
        odi_bat_centuries.append("-")
        odi_bat_fifties.append("-")
        odi_bat_fours.append("-")
        odi_bat_sixes.append("-")
        odi_bat_catches_taken.append("-")
        odi_bat_stumps_broken.append("-")    
    
    # iterate through the table rows
    for row in bat_table_rows:    
        # for T20Is
        if 'T20I' not in row.text: 
            continue
        elif 'T20I' in row.text: 
            # set 't20_flag' to True i.e. t20 row exists
            t20_flag = True
            tds = row.find_elements(By.TAG_NAME, "td")
            if len(tds)==15:
                
            # add all the t20i_bat_data in the respective lists column-wise
                t20i_bat_matches.append(row.find_element(By.XPATH, './td[2]/span').text)
                t20i_bat_innings.append(row.find_element(By.XPATH, './td[3]/span').text)
                t20i_bat_not_outs.append(row.find_element(By.XPATH, './td[4]/span').text)
                t20i_bat_runs.append(row.find_element(By.XPATH, './td[5]/span').text)
                t20i_bat_hs.append(row.find_element(By.XPATH, './td[6]/span').text)
                t20i_bat_averages.append(row.find_element(By.XPATH, './td[7]/span').text)
                t20i_bat_balls_faced.append(row.find_element(By.XPATH, './td[8]/span').text)
                t20i_bat_strike_rates.append(row.find_element(By.XPATH, './td[9]/span').text)
                t20i_bat_centuries.append(row.find_element(By.XPATH, './td[10]/span').text)
                t20i_bat_fifties.append(row.find_element(By.XPATH, './td[11]/span').text)
                t20i_bat_fours.append(row.find_element(By.XPATH, './td[12]/span').text)
                t20i_bat_sixes.append(row.find_element(By.XPATH, './td[13]/span').text)
                t20i_bat_catches_taken.append(row.find_element(By.XPATH, './td[14]/span').text)
                t20i_bat_stumps_broken.append(row.find_element(By.XPATH, './td[15]/span').text)
                
                break   
            elif len(tds)==14:
                t20i_bat_matches.append(row.find_element(By.XPATH, './td[2]/span').text)
                t20i_bat_innings.append(row.find_element(By.XPATH, './td[3]/span').text)
                t20i_bat_not_outs.append(row.find_element(By.XPATH, './td[4]/span').text)
                t20i_bat_runs.append(row.find_element(By.XPATH, './td[5]/span').text)
                t20i_bat_hs.append(row.find_element(By.XPATH, './td[6]/span').text)
                t20i_bat_averages.append(row.find_element(By.XPATH, './td[7]/span').text)
                t20i_bat_balls_faced.append(row.find_element(By.XPATH, './td[8]/span').text)
                t20i_bat_strike_rates.append(row.find_element(By.XPATH, './td[9]/span').text)
                t20i_bat_centuries.append(row.find_element(By.XPATH, './td[10]/span').text)
                t20i_bat_fifties.append(row.find_element(By.XPATH, './td[11]/span').text)
                t20i_bat_fours.append('-')
                t20i_bat_sixes.append(row.find_element(By.XPATH, './td[12]/span').text)
                t20i_bat_catches_taken.append(row.find_element(By.XPATH, './td[13]/span').text)
                t20i_bat_stumps_broken.append(row.find_element(By.XPATH, './td[14]/span').text)
                break
            elif len(tds)<=13:
                t20i_bat_matches.append(row.find_element(By.XPATH, './td[2]/span').text)
                t20i_bat_innings.append(row.find_element(By.XPATH, './td[3]/span').text)
                t20i_bat_not_outs.append(row.find_element(By.XPATH, './td[4]/span').text)
                t20i_bat_runs.append(row.find_element(By.XPATH, './td[5]/span').text)
                t20i_bat_hs.append(row.find_element(By.XPATH, './td[6]/span').text)
                t20i_bat_averages.append(row.find_element(By.XPATH, './td[7]/span').text)
                t20i_bat_balls_faced.append(row.find_element(By.XPATH, './td[8]/span').text)
                t20i_bat_strike_rates.append(row.find_element(By.XPATH, './td[9]/span').text)
                t20i_bat_centuries.append(row.find_element(By.XPATH, './td[10]/span').text)
                t20i_bat_fifties.append(row.find_element(By.XPATH, './td[11]/span').text)
                t20i_bat_fours.append('-')
                t20i_bat_sixes.append('-')
                t20i_bat_catches_taken.append(row.find_element(By.XPATH, './td[12]/span').text)
                t20i_bat_stumps_broken.append(row.find_element(By.XPATH, './td[13]/span').text)
                break
                   
    
    if t20_flag != True:
        
        
        # add empty strings in the respective lists column-wise
        t20i_bat_matches.append("-")
        t20i_bat_innings.append("-")
        t20i_bat_not_outs.append("-")
        t20i_bat_runs.append("-")
        t20i_bat_hs.append("-")
        t20i_bat_averages.append("-")
        t20i_bat_balls_faced.append("-")
        t20i_bat_strike_rates.append("-")
        t20i_bat_centuries.append("-")
        t20i_bat_fifties.append("-")
        t20i_bat_fours.append("-")
        t20i_bat_sixes.append("-")
        t20i_bat_catches_taken.append("-")
        t20i_bat_stumps_broken.append("-") 
        

In [50]:
## BOWLING

# empty lists to store the required data
# odi_bowl
odi_bowl_matches = []
odi_bowl_innings = []
odi_bowl_balls_bowled = []
odi_bowl_runs_conceeded = []
odi_bowl_wickets = []
odi_bowl_bbi = []
odi_bowl_bbm_faced = []
odi_bowl_average = []
odi_bowl_economy = []
odi_bowl_strike_rates = []
odi_bowl_four_wickets = []
odi_bowl_five_wickets = []
odi_bowl_10_wickets = []

# t20i_bowl
t20i_bowl_matches = []
t20i_bowl_innings = []
t20i_bowl_balls_bowled = []
t20i_bowl_runs_conceeded = []
t20i_bowl_wickets = []
t20i_bowl_bbi = []
t20i_bowl_bbm_faced = []
t20i_bowl_average = []
t20i_bowl_economy = []
t20i_bowl_strike_rates = []
t20i_bowl_four_wickets = []
t20i_bowl_five_wickets = []
t20i_bowl_10_wickets = []

"""
## function to scrape players' bowling data 

input:
driver object

output: 
lists containing bowling data
""" 
def get_bowling_data(driver):
    # flags to check the existance of certain element
    odi_flag = False
    t20_flag = False
    # locate 'Bowling' div
    bowl_div = driver.find_element(By.XPATH, "//div/h5[text()='Bowling']/..")
    # locate bowling table body
    bowl_table_body = bowl_div.find_element(By.XPATH, "./div/table/tbody")
    # locate bowling table rows
    bowl_table_rows = bowl_table_body.find_elements(By.XPATH, "./tr")
    
    # iterate through the table rows
    for row in bowl_table_rows:
        # for ODIs
        if 'ODI' not in row.text: 
            continue
        elif 'ODI' in row.text: 
            # set 't20_flag' to True i.e. t20 row exists
            odi_flag = True

            # add all the odi_bowl_data in the respective lists column-wise
            odi_bowl_matches.append(row.find_element(By.XPATH, './td[2]/span').text)
            odi_bowl_innings.append(row.find_element(By.XPATH, './td[3]/span').text)
            odi_bowl_balls_bowled.append(row.find_element(By.XPATH, './td[4]/span').text)
            odi_bowl_runs_conceeded.append(row.find_element(By.XPATH, './td[5]/span').text)
            odi_bowl_wickets.append(row.find_element(By.XPATH, './td[6]/span').text)
            odi_bowl_bbi.append(row.find_element(By.XPATH, './td[7]/span').text)
            odi_bowl_bbm_faced.append(row.find_element(By.XPATH, './td[8]/span').text)
            odi_bowl_average.append(row.find_element(By.XPATH, './td[9]/span').text)
            odi_bowl_economy.append(row.find_element(By.XPATH, './td[10]/span').text)
            odi_bowl_strike_rates.append(row.find_element(By.XPATH, './td[11]/span').text)
            odi_bowl_four_wickets.append(row.find_element(By.XPATH, './td[12]/span').text)
            odi_bowl_five_wickets.append(row.find_element(By.XPATH, './td[13]/span').text)
            odi_bowl_10_wickets.append(row.find_element(By.XPATH, './td[14]/span').text)
           
            break

    if odi_flag != True:

        # add empty strings in the respective lists column-wise
        odi_bowl_matches.append("-")
        odi_bowl_innings.append("-")
        odi_bowl_balls_bowled.append("-")
        odi_bowl_runs_conceeded.append("-")
        odi_bowl_wickets.append("-")
        odi_bowl_bbi.append("-")
        odi_bowl_bbm_faced.append("-")
        odi_bowl_average.append("-")
        odi_bowl_economy.append("-")
        odi_bowl_strike_rates.append("-")
        odi_bowl_four_wickets.append("-")
        odi_bowl_five_wickets.append("-")
        odi_bowl_10_wickets.append("-")
    
    # iterate through the table rows
    for row in bowl_table_rows:    
        # for T20Is
        if 'T20I' not in row.text: 
            continue
        elif 'T20I' in row.text: 
            # set 't20_flag' to True i.e. t20 row exists
            t20_flag = True
            
            # add all the t20i_bowl_data in the respective lists column-wise
            t20i_bowl_matches.append(row.find_element(By.XPATH, './td[2]/span').text)
            t20i_bowl_innings.append(row.find_element(By.XPATH, './td[3]/span').text)
            t20i_bowl_balls_bowled.append(row.find_element(By.XPATH, './td[4]/span').text)
            t20i_bowl_runs_conceeded.append(row.find_element(By.XPATH, './td[5]/span').text)
            t20i_bowl_wickets.append(row.find_element(By.XPATH, './td[6]/span').text)
            t20i_bowl_bbi.append(row.find_element(By.XPATH, './td[7]/span').text)
            t20i_bowl_bbm_faced.append(row.find_element(By.XPATH, './td[8]/span').text)
            t20i_bowl_average.append(row.find_element(By.XPATH, './td[9]/span').text)
            t20i_bowl_economy.append(row.find_element(By.XPATH, './td[10]/span').text)
            t20i_bowl_strike_rates.append(row.find_element(By.XPATH, './td[11]/span').text)
            t20i_bowl_four_wickets.append(row.find_element(By.XPATH, './td[12]/span').text)
            t20i_bowl_five_wickets.append(row.find_element(By.XPATH, './td[13]/span').text)
            t20i_bowl_10_wickets.append(row.find_element(By.XPATH, './td[14]/span').text)
        
            break

    if t20_flag != True:
        

        # add empty strings in the respective lists column-wise
        t20i_bowl_matches.append("-")
        t20i_bowl_innings.append("-")
        t20i_bowl_balls_bowled.append("-")
        t20i_bowl_runs_conceeded.append("-")
        t20i_bowl_wickets.append("-")
        t20i_bowl_bbi.append("-")
        t20i_bowl_bbm_faced.append("-")
        t20i_bowl_average.append("-")
        t20i_bowl_economy.append("-")
        t20i_bowl_strike_rates.append("-")
        t20i_bowl_four_wickets.append("-")
        t20i_bowl_five_wickets.append("-")
        t20i_bowl_10_wickets.append("-")

In [51]:
## RECORDS

# empty lists to store the required data
records = []

"""
## function to scrape players' records data

input:
driver object

output: 
list containing records data
""" 

def get_records(driver):

    rec_details = "-"
    # locate 'Records' div
    inner_divs = driver.find_elements(By.XPATH, "//div/span[text() = 'Records']/..")

    # if records div exists
    if len(inner_divs) > 0:
        rec_details = ""
        # locate parent div of 'inner_divs' using '..' notation
        inner_divs = inner_divs[0].find_element(By.XPATH, "..")
        parent_divs = inner_divs.find_element(By.XPATH, "..")

        # locate header string 'h1'
        record = parent_divs.find_elements(By.TAG_NAME, "h1")

        # if record's header string exists
        if len(record) > 0:
            # locate record's details
            details = parent_divs.find_elements(By.TAG_NAME, "p")
            k = 0

            for rec in range(len(record)):     
                rec_details = rec_details + record[rec].text + " : " + details[k].text + "," + details[k+1].text + "\n"
                    
                k = k + 2
               
            records.append(rec_details) 
    else:
        # else append " " string 
        records.append(rec_details) 
                

In [52]:

## DEBUT/LAST MATCHES - PLAYER

# empty lists to store the required data
odi_debut_date = []
odi_retirement_date = []
t20i_debut_date = []
t20i_retirement_date = []

"""
## function to find debut and retirement details of the player

input:
driver object

output: 
list containing debut/last matches data
"""

def get_debut_details(driver):

    # for ODIs
    odi_debut_rows = []
    
    # locate 'ODI Matches' text div
    inner_divs = driver.find_elements(By.XPATH,"//*[text() = 'ODI Matches']")
    
    # if ODI debut records exist
    if len(inner_divs) > 0:
        # locate parent divs
        parent_divs = inner_divs[0].find_element(By.XPATH, "..")
        format_divs = parent_divs.find_element(By.XPATH, '..')
        debut_divs = format_divs.find_elements(By.TAG_NAME, "div")

        # iterate through each <span> in the debut_divs
        for span in debut_divs:
            if span.get_attribute("class") == "ds-grid lg:ds-grid-cols-3 ds-border-b ds-border-line last:ds-border-0":
                odi_debut_rows.append(span)
    # end if

    # case-1: both debut and last
    if len(odi_debut_rows) > 1:
        odi_debut_date.append(odi_debut_rows[0].text.split("\n")[1])
        odi_retirement_date.append(odi_debut_rows[1].text.split("\n")[1])
    # case-2: only debut
    elif len(odi_debut_rows) == 1:
        odi_debut_date.append(odi_debut_rows[0].text.split("\n")[1])
        odi_retirement_date.append(odi_debut_rows[0].text.split("\n")[1])     
    # case-3: none 
    else:
        odi_debut_date.append("-")
        odi_retirement_date.append("-")

                
    # for T20Is
    t20i_debut_rows = []
    
    # locate 't20i Matches' text div
    inner_divs = driver.find_elements(By.XPATH,"//*[text() = 'T20I Matches']")
    
    # if t20i debut records exist
    if len(inner_divs) > 0:
        # locate parent divs
        parent_divs = inner_divs[0].find_element(By.XPATH, "..")
        format_divs = parent_divs.find_element(By.XPATH, '..')
        debut_divs = format_divs.find_elements(By.TAG_NAME, "div")

        # iterate through each <span> in the debut_divs
        for span in debut_divs:
            if span.get_attribute("class") == "ds-grid lg:ds-grid-cols-3 ds-border-b ds-border-line last:ds-border-0":
                t20i_debut_rows.append(span)
    
    # case-1: both debut and last
    if len(t20i_debut_rows) > 1:
        t20i_debut_date.append(t20i_debut_rows[0].text.split("\n")[1])
        t20i_retirement_date.append(t20i_debut_rows[1].text.split("\n")[1])
    # case-2: only debut    
    elif len(t20i_debut_rows) == 1:
        t20i_debut_date.append(t20i_debut_rows[0].text.split("\n")[1])
        t20i_retirement_date.append(t20i_debut_rows[0].text.split("\n")[1])
    # case-3: none   
    else:
        t20i_debut_date.append("-")
        t20i_retirement_date.append("-")
    # end if    

In [60]:

# player_id.pop()
# full_name.pop ()
# born.pop()
# age.pop()
# died.pop()
# batting_style.pop()
# bowling_style.pop()
# fielding_position.pop() 
# playing_role.pop()
# education.pop()
# relations.pop()
# nick_name.pop()
# height.pop()
# teams.pop()
# full_name.pop ()
# born.pop()
# age.pop()
# died.pop()
# batting_style.pop()
# bowling_style.pop()
# fielding_position.pop() 
# playing_role.pop()
# education.pop()
# relations.pop()
# nick_name.pop()
# height.pop()
# teams.pop()

#2532,1572,

''

In [54]:
def create_file():
    player_df = pd.DataFrame({
                        'player_id':player_id,
                        'full_name':full_name,
                        'born':born,
                        'age':age, 
                        'died':died,
                        'batting_style':batting_style,
                        'bowling_style':bowling_style,
                        'fielding_position':fielding_position, 
                        'playing_role':playing_role, 
                        'education':education, 
                        'relations':relations,
                        'nick_name':nick_name,
                        'height':height,
                        'teams':teams,
                        # odi_bat
                        'odi_bat_mat':odi_bat_matches,
                        'odi_bat_inns':odi_bat_innings,
                        'odi_bat_no':odi_bat_not_outs,
                        'odi_bat_runs':odi_bat_runs,
                        'odi_bat_hs':odi_bat_hs,
                        'odi_bat_ave':odi_bat_averages,
                        'odi_bat_bf':odi_bat_balls_faced,
                        'odi_bat_sr':odi_bat_strike_rates,
                        'odi_bat_100s':odi_bat_centuries,
                        'odi_bat_50s':odi_bat_fifties,
                        'odi_bat_4s':odi_bat_fours,
                        'odi_bat_6s':odi_bat_sixes,
                        'odi_bat_ct':odi_bat_catches_taken,
                        'odi_bat_st':odi_bat_stumps_broken,
                        # t20i_bat
                        't20i_bat_mat':t20i_bat_matches,
                        't20i_bat_inns':t20i_bat_innings,
                        't20i_bat_no':t20i_bat_not_outs,
                        't20i_bat_runs':t20i_bat_runs,
                        't20i_bat_hs':t20i_bat_hs,
                        't20i_bat_ave':t20i_bat_averages,
                        't20i_bat_bf':t20i_bat_balls_faced,
                        't20i_bat_sr':t20i_bat_strike_rates,
                        't20i_bat_100s':t20i_bat_centuries,
                        't20i_bat_50s':t20i_bat_fifties,
                        't20i_bat_4s':t20i_bat_fours,
                        't20i_bat_6s':t20i_bat_sixes,
                        't20i_bat_ct':t20i_bat_catches_taken,
                        't20i_bat_st':t20i_bat_stumps_broken,
                        # odi_bowl
                        'odi_bowl_matches':odi_bowl_matches,
                        'odi_bowl_innings':odi_bowl_innings,
                        'odi_bowl_balls_bowled':odi_bowl_balls_bowled,
                        'odi_bowl_runs_conceeded':odi_bowl_runs_conceeded,
                        'odi_bowl_wickets':odi_bowl_wickets,
                        'odi_bowl_bbi':odi_bowl_bbi,
                        'odi_bowl_bbm_faced':odi_bowl_bbm_faced,
                        'odi_bowl_average':odi_bowl_average,
                        'odi_bowl_economy':odi_bowl_economy,
                        'odi_bowl_strike_rates':odi_bowl_strike_rates,
                        'odi_bowl_four_wickets':odi_bowl_four_wickets,
                        'odi_bowl_five_wickets':odi_bowl_five_wickets,
                        'odi_bowl_10_wickets':odi_bowl_10_wickets,
                        # t20i_bowl
                        't20i_bowl_matches':t20i_bowl_matches,
                        't20i_bowl_innings':t20i_bowl_innings,
                        't20i_bowl_balls_bowled':t20i_bowl_balls_bowled,
                        't20i_bowl_runs_conceeded':t20i_bowl_runs_conceeded,
                        't20i_bowl_wickets':t20i_bowl_wickets,
                        't20i_bowl_bbi':t20i_bowl_bbi,
                        't20i_bowl_bbm_faced':t20i_bowl_bbm_faced,
                        't20i_bowl_average':t20i_bowl_average,
                        't20i_bowl_economy':t20i_bowl_economy,
                        't20i_bowl_strike_rates':t20i_bowl_strike_rates,
                        't20i_bowl_four_wickets':t20i_bowl_four_wickets,
                        't20i_bowl_five_wickets':t20i_bowl_five_wickets,
                        't20i_bowl_10_wickets':t20i_bowl_10_wickets,
                        'records':records,
                        'odi_debut_date':odi_debut_date,
                        'odi_retirement_date':odi_retirement_date,
                        't20i_debut_date':t20i_debut_date,
                        't20i_retirement_date':t20i_retirement_date
                        })
    player_df.head()
    player_df.to_csv("player_profiles_parsed.csv", index=False)

134

In [61]:
## Main Controller
import sys
"""
The main controller reads odi and t20 profile_urls csv files, containg profile_urls of odi and t20 players.
A single list is prepared from both files in which no url is repeated.For each iteration, it scrapes and parses
respective player's profile.
"""

profile_urls = pd.read_csv('players_profiles.csv')
profile_urls = profile_urls['url'].to_list()
count = len(profile_urls)
count
driver = configure_firefox_driver()
# loop through all profile_urls
for k in range(2666,count):
    # test-case
    
    print(k)
   
        
        
    data_driver = get_profile_data(driver, profile_urls[k])
    get_personal_data(data_driver)
    get_batting_data(data_driver)
    get_bowling_data(data_driver)
    get_records(data_driver)
    get_debut_details(data_driver)
    create_file()
    
# close the driver
driver.close()

  driver = webdriver.Firefox(executable_path=r"geckodriver.exe", options = firefox_options)


2666
2667
2668
2669
2670
2671
2672
2673
2674
2675
2676
2677
2678
2679
2680
2681


In [63]:
df_1 = pd.read_csv('player_profiles_parsed_Copy_1.csv')
df_2 = pd.read_csv('player_profiles_parsed_Copy_2.csv')
df_1 = pd.concat([df_1,df_2])
df_2 = pd.read_csv('player_profiles_parsed_Copy_3.csv')
df_1 = pd.concat([df_1,df_2])
df_2 = pd.read_csv('player_profiles_parsed_Copy_4.csv')
df_1 = pd.concat([df_1,df_2])
df_2 = pd.read_csv('player_profiles_parsed_Copy_5.csv')
df_1 = pd.concat([df_1,df_2])
df_2 = pd.read_csv('player_profiles_parsed_Copy_6.csv')
df_1 = pd.concat([df_1,df_2])
df_2 = pd.read_csv('player_profiles_parsed_Copy_7.csv')
df_1 = pd.concat([df_1,df_2])
df_2 = pd.read_csv('player_profiles_parsed_Copy_8.csv')
df_1 = pd.concat([df_1,df_2])
df_2 = pd.read_csv('player_profiles_parsed_Copy_9.csv')
df_1 = pd.concat([df_1,df_2])
df_2 = pd.read_csv('player_profiles_parsed_Copy_10.csv')
df_1 = pd.concat([df_1,df_2])
df_2 = pd.read_csv('player_profiles_parsed_Copy_11.csv')
df_1 = pd.concat([df_1,df_2])
df_2 = pd.read_csv('player_profiles_parsed_Copy_12.csv')
df_1 = pd.concat([df_1,df_2])
df_2 = pd.read_csv('player_profiles_parsed_Copy_13.csv')
df_1 = pd.concat([df_1,df_2])
df_2 = pd.read_csv('player_profiles_parsed_Copy_14.csv')
df_1 = pd.concat([df_1,df_2])
df_2 = pd.read_csv('player_profiles_parsed_Copy_15.csv')
df_1 = pd.concat([df_1,df_2])
df_2 = pd.read_csv('player_profiles_parsed_Copy_16.csv')
df_1 = pd.concat([df_1,df_2])
df_2 = pd.read_csv('player_profiles_parsed_Copy_17.csv')
df_1 = pd.concat([df_1,df_2])
df_2 = pd.read_csv('player_profiles_parsed_Copy_18.csv')
df_1 = pd.concat([df_1,df_2])
df_2 = pd.read_csv('player_profiles_parsed_Copy_19.csv')
df_1 = pd.concat([df_1,df_2])

df_1.to_csv('parsed_profiles.csv')


In [None]:
#print(len(player_id),len(full_name),len(born),len(age),len(batting_style),len(bowling_style),len(fielding_position),len(playing_role))
#print(len(education),len(relations),len(nick_name),len(height),len(teams))

In [None]:

# main_dict = {
#                         'player_id':player_id,
#                         'full_name':full_name,
#                         'born':born,
#                         'age':age, 
#                         'died':died,
#                         'batting_style':batting_style,
#                         'bowling_style':bowling_style,
#                         'fielding_position':fielding_position, 
#                         'playing_role':playing_role, 
#                         'education':education, 
#                         'relations':relations,
#                         'nick_name':nick_name,
#                         'height':height,
#                         'teams':teams,
#                         # odi_bat
#                         'odi_bat_mat':odi_bat_matches,
#                         'odi_bat_inns':odi_bat_innings,
#                         'odi_bat_no':odi_bat_not_outs,
#                         'odi_bat_runs':odi_bat_runs,
#                         'odi_bat_hs':odi_bat_hs,
#                         'odi_bat_ave':odi_bat_averages,
#                         'odi_bat_bf':odi_bat_balls_faced,
#                         'odi_bat_sr':odi_bat_strike_rates,
#                         'odi_bat_100s':odi_bat_centuries,
#                         'odi_bat_50s':odi_bat_fifties,
#                         'odi_bat_4s':odi_bat_fours,
#                         'odi_bat_6s':odi_bat_sixes,
#                         'odi_bat_ct':odi_bat_catches_taken,
#                         'odi_bat_st':odi_bat_stumps_broken,
#                         # t20i_bat
#                         't20i_bat_mat':t20i_bat_matches,
#                         't20i_bat_inns':t20i_bat_innings,
#                         't20i_bat_no':t20i_bat_not_outs,
#                         't20i_bat_runs':t20i_bat_runs,
#                         't20i_bat_hs':t20i_bat_hs,
#                         't20i_bat_ave':t20i_bat_averages,
#                         't20i_bat_bf':t20i_bat_balls_faced,
#                         't20i_bat_sr':t20i_bat_strike_rates,
#                         't20i_bat_100s':t20i_bat_centuries,
#                         't20i_bat_50s':t20i_bat_fifties,
#                         't20i_bat_4s':t20i_bat_fours,
#                         't20i_bat_6s':t20i_bat_sixes,
#                         't20i_bat_ct':t20i_bat_catches_taken,
#                         't20i_bat_st':t20i_bat_stumps_broken,
#                         # odi_bowl
#                         'odi_bowl_matches':odi_bowl_matches,
#                         'odi_bowl_innings':odi_bowl_innings,
#                         'odi_bowl_balls_bowled':odi_bowl_balls_bowled,
#                         'odi_bowl_runs_conceeded':odi_bowl_runs_conceeded,
#                         'odi_bowl_wickets':odi_bowl_wickets,
#                         'odi_bowl_bbi':odi_bowl_bbi,
#                         'odi_bowl_bbm_faced':odi_bowl_bbm_faced,
#                         'odi_bowl_average':odi_bowl_average,
#                         'odi_bowl_economy':odi_bowl_economy,
#                         'odi_bowl_strike_rates':odi_bowl_strike_rates,
#                         'odi_bowl_four_wickets':odi_bowl_four_wickets,
#                         'odi_bowl_five_wickets':odi_bowl_five_wickets,
#                         'odi_bowl_10_wickets':odi_bowl_10_wickets,
#                         # t20i_bowl
#                         't20i_bowl_matches':t20i_bowl_matches,
#                         't20i_bowl_innings':t20i_bowl_innings,
#                         't20i_bowl_balls_bowled':t20i_bowl_balls_bowled,
#                         't20i_bowl_runs_conceeded':t20i_bowl_runs_conceeded,
#                         't20i_bowl_wickets':t20i_bowl_wickets,
#                         't20i_bowl_bbi':t20i_bowl_bbi,
#                         't20i_bowl_bbm_faced':t20i_bowl_bbm_faced,
#                         't20i_bowl_average':t20i_bowl_average,
#                         't20i_bowl_economy':t20i_bowl_economy,
#                         't20i_bowl_strike_rates':t20i_bowl_strike_rates,
#                         't20i_bowl_four_wickets':t20i_bowl_four_wickets,
#                         't20i_bowl_five_wickets':t20i_bowl_five_wickets,
#                         't20i_bowl_10_wickets':t20i_bowl_10_wickets,
#                         'records':records,
#                         'odi_debut_date':odi_debut_date,
#                         'odi_retirement_date':odi_retirement_date,
#                         't20i_debut_date':t20i_debut_date,
#                         't20i_retirement_date':t20i_retirement_date
#                         }
# main_list = []
# for key in main_dict.keys():
#     main_list.append(str(key)+'.csv')
    
# for value in main_list:
#     name = value.split('.')[0]
#     df = pd.DataFrame({name:main_dict[name]})
#     df.to_csv(value)