In [51]:
import zipfile
import os
import shutil
import json
import os
import pandas as pd
import requests
from bs4 import BeautifulSoup
from concurrent.futures import ThreadPoolExecutor, as_completed
from threading import Lock
# Base for every relative data path in this notebook: the kernel's current
# working directory, with a trailing slash so suffixes can be appended directly.
this_file_dir = f"{os.getcwd()}/"

# Directory path for the raw JSON data, expressed relative to the base above.
json_dir = f"{this_file_dir}../data/raw/cricksheet/json/"

# Shared progress state for the scraper: worker threads increment `counter`
# while holding `counter_lock`.
counter_lock = Lock()
counter = 0

# CSS class strings of the player-page markup this scraper looks for.
_LABEL_CLASS = "ds-text-tight-m ds-font-regular ds-uppercase ds-text-typo-mid3"
_VALUE_CLASS = "ds-text-title-s ds-font-bold ds-text-typo"
_TEAM_PROFILE_SUFFIX = " team profile"

# Label text on the page -> name of the profile field it fills.
_PROFILE_FIELDS = {
    "Batting Style": "batting_style",
    "Bowling Style": "bowling_style",
    "Playing Role": "playing_role",
}

# Upper bound (seconds) for one HTTP request so a stalled connection cannot
# block a worker thread indefinitely.
_REQUEST_TIMEOUT_SECONDS = 30


def _labelled_value(section):
    """Return the stripped text of the first value <span> in ``section``, or None if there is none."""
    value = section.find('span', class_=_VALUE_CLASS)
    return value.text.strip() if value is not None else None


def _team_name_from_title(title):
    """Return the text after the first "'s " in ``title`` minus a trailing " team profile", or None."""
    _, separator, remainder = title.partition("'s ")
    if not separator:
        # Title is missing or not in the expected "<player>'s <team> ..." shape.
        return None
    team_name = remainder.strip()
    if team_name.endswith(_TEAM_PROFILE_SUFFIX):
        team_name = team_name[:-len(_TEAM_PROFILE_SUFFIX)]
    return team_name or None


def _report_progress(total_players):
    """Increment the shared counter under the lock and print the running progress line."""
    global counter
    with counter_lock:
        counter += 1
        print(f"Progress: {counter}/{total_players} players processed.")


def _scrape_player(cricinfo_id):
    """Fetch and parse one player page; return the 6-tuple described in ``get_player_details``."""
    cricinfo_id = int(cricinfo_id)
    print(f"Processing player with cricinfo_id: {cricinfo_id}")
    url = f"https://www.espncricinfo.com/cricketers/player-{cricinfo_id}"
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/123.0.0.0 Safari/537.36",
        "Accept-Language": "en-US,en;q=0.9",
        "Referer": "https://www.google.com/",
    }
    response = requests.get(url, headers=headers, timeout=_REQUEST_TIMEOUT_SECONDS)
    print(response.status_code)
    if response.status_code != 200:
        return cricinfo_id, None, None, None, None, []

    soup = BeautifulSoup(response.text, 'html.parser')

    full_name = None
    full_name_section = soup.find('div', class_="ds-col-span-2 lg:ds-col-span-1")
    if full_name_section is not None:
        name_label = full_name_section.find('p', class_=_LABEL_CLASS)
        if name_label is not None and name_label.text == "Full Name":
            full_name = _labelled_value(full_name_section)

    profile = {field: None for field in _PROFILE_FIELDS.values()}
    for section in soup.find_all('div'):
        label = section.find('p', class_=_LABEL_CLASS)
        if label is None:
            continue
        field = _PROFILE_FIELDS.get(label.text)
        if field is None:
            continue
        value = _labelled_value(section)
        # A later <div> with the label but no value span must not erase a value
        # that was already found.
        if value is not None:
            profile[field] = value

    teams = []
    teams_section = soup.find('div', class_="ds-grid lg:ds-grid-cols-3 ds-grid-cols-2 ds-gap-y-4")
    if teams_section is not None:
        for team_link in teams_section.find_all('a', class_="ds-flex ds-items-center ds-space-x-4"):
            team_name = _team_name_from_title(team_link.get('title', ''))
            if team_name:
                teams.append(team_name)

    return (
        cricinfo_id,
        full_name,
        profile["batting_style"],
        profile["bowling_style"],
        profile["playing_role"],
        teams,
    )


def get_player_details(cricinfo_id, total_players):
    """Scrape one player's profile page and return the extracted fields.

    Args:
        cricinfo_id: Player ID used to build the page URL; converted with ``int()``.
        total_players: Denominator shown in the printed progress line.

    Returns:
        A tuple ``(cricinfo_id, full_name, batting_style, bowling_style,
        playing_role, teams)``. Any field that is not found on the page is
        ``None`` and ``teams`` is a (possibly empty) list of strings. A non-200
        response yields ``(cricinfo_id, None, None, None, None, [])``.

    Raises:
        ValueError: If ``cricinfo_id`` cannot be converted to ``int``.
        requests.RequestException: If the request fails or exceeds the timeout.

    The shared progress counter is advanced once per call, whatever the outcome,
    so the printed count can reach ``total_players`` even when some requests
    fail.
    """
    try:
        return _scrape_player(cricinfo_id)
    finally:
        _report_progress(total_players)

def run_scraper_parallel(data, total_players, max_workers):
    """Scrape every player in ``data`` concurrently and collect the results.

    Args:
        data: DataFrame with a ``key_cricinfo`` column; one task is submitted per row.
        total_players: Passed through to ``get_player_details`` for progress output.
        max_workers: Size of the thread pool.

    Returns:
        List of the tuples returned by ``get_player_details``, in completion
        order. A player whose task raised is reported on stdout and left out of
        the list.
    """
    results = []

    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        # Only the ID column is needed, so iterate it directly rather than
        # building a full row object per player with iterrows().
        future_to_id = {
            executor.submit(get_player_details, player_id, total_players): player_id
            for player_id in data['key_cricinfo']
        }

        for future in as_completed(future_to_id):
            player_id = future_to_id[future]
            try:
                results.append(future.result())
            except Exception as e:
                # Name the failing player so the error can be followed up.
                print(f"Error occurred for cricinfo_id {player_id}: {e}")

    return results

In [52]:


def adding_names(limit=1, max_workers=30):
    """Scrape player profiles and merge them onto the interim match data.

    Reads ``people.csv``, scrapes a profile for each selected row, joins the
    scraped columns back on ``key_cricinfo``, renames ``identifier`` to
    ``player_id`` and left-joins the result onto ``mw_pw.csv`` by ``player_id``.

    Args:
        limit: Number of leading rows of ``people.csv`` to scrape. Defaults to 1
            (a quick smoke run); pass ``None`` to scrape every row.
        max_workers: Thread-pool size handed to ``run_scraper_parallel``.

    Returns:
        The merged DataFrame. Nothing is written to disk.
    """
    global counter

    people = pd.read_csv(this_file_dir + '../data/raw/cricksheet/people.csv')
    if limit is not None:
        people = people.iloc[:limit]

    # Count only the rows that will actually be scraped so the printed
    # "n/total" progress is accurate.
    total_players = len(people)
    counter = 0  # restart progress for this run; no worker threads exist yet

    scraped_data = run_scraper_parallel(people, total_players, max_workers=max_workers)
    scraped_df = pd.DataFrame(
        scraped_data,
        columns=['key_cricinfo', 'full_name', 'batting_style', 'bowling_style', 'playing_role', 'teams'],
    )

    profiles = (
        people
        .merge(scraped_df, on='key_cricinfo', how='left')
        .rename(columns={"identifier": "player_id"})
    )

    input_data = pd.read_csv(this_file_dir + '../data/interim/mw_pw.csv')
    final_data = input_data.merge(profiles, on='player_id', how='left')

    # Persisting the result is intentionally left disabled:
    # final_data.to_csv(this_file_dir + '../data/interim/mw_pw_profiles.csv', index=False)

    print("Player data updated successfully with parallel scraping.")
    return final_data

In [53]:
# Run the scrape-and-merge step and keep the merged frame for inspection below.
d = adding_names()

Processing player with cricinfo_id: 772407
200
Progress: 1/16111 players processed.
Player data updated successfully with parallel scraping.


In [54]:
# Show the merged rows for one player to spot-check the scraped profile columns.
d.loc[d['player_id'] == 'b4a23876']

Unnamed: 0.1,Unnamed: 0,player_id,match_id,gender,balls_per_over,start_date,series_name,match_type,name_x,runs_scored,...,key_nvplay_2,key_opta,key_opta_2,key_pulse,key_pulse_2,full_name,batting_style,bowling_style,playing_role,teams
281938,281938,b4a23876,1062502,male,6,2016-10-30,ICC World Cricket League Division Four,ODM,AAA Amsterdam,27,...,,,,,,Alex Adrian Anthony Amsterdam,Left hand Bat,Right arm Offbreak,Top order Batter,"[United States of America cricket, New York Re..."
281939,281939,b4a23876,1062504,male,6,2016-11-01,ICC World Cricket League Division Four,ODM,AAA Amsterdam,12,...,,,,,,Alex Adrian Anthony Amsterdam,Left hand Bat,Right arm Offbreak,Top order Batter,"[United States of America cricket, New York Re..."
281940,281940,b4a23876,1062506,male,6,2016-11-02,ICC World Cricket League Division Four,ODM,AAA Amsterdam,22,...,,,,,,Alex Adrian Anthony Amsterdam,Left hand Bat,Right arm Offbreak,Top order Batter,"[United States of America cricket, New York Re..."
281941,281941,b4a23876,1062511,male,6,2016-11-04,ICC World Cricket League Division Four,ODM,AAA Amsterdam,102,...,,,,,,Alex Adrian Anthony Amsterdam,Left hand Bat,Right arm Offbreak,Top order Batter,"[United States of America cricket, New York Re..."
281942,281942,b4a23876,1062514,male,6,2016-11-05,ICC World Cricket League Division Four,ODM,AAA Amsterdam,50,...,,,,,,Alex Adrian Anthony Amsterdam,Left hand Bat,Right arm Offbreak,Top order Batter,"[United States of America cricket, New York Re..."
281943,281943,b4a23876,1090943,male,6,2017-05-26,ICC World Cricket League Division Three,ODM,AAA Amsterdam,41,...,,,,,,Alex Adrian Anthony Amsterdam,Left hand Bat,Right arm Offbreak,Top order Batter,"[United States of America cricket, New York Re..."
281944,281944,b4a23876,1090953,male,6,2017-05-30,ICC World Cricket League Division Three,ODM,AAA Amsterdam,24,...,,,,,,Alex Adrian Anthony Amsterdam,Left hand Bat,Right arm Offbreak,Top order Batter,"[United States of America cricket, New York Re..."
281945,281945,b4a23876,875477,male,6,2015-07-12,ICC World Twenty20 Qualifier,IT20,AAA Amsterdam,16,...,,,,,,Alex Adrian Anthony Amsterdam,Left hand Bat,Right arm Offbreak,Top order Batter,"[United States of America cricket, New York Re..."
281946,281946,b4a23876,875503,male,6,2015-07-15,ICC World Twenty20 Qualifier,IT20,AAA Amsterdam,0,...,,,,,,Alex Adrian Anthony Amsterdam,Left hand Bat,Right arm Offbreak,Top order Batter,"[United States of America cricket, New York Re..."
281947,281947,b4a23876,875531,male,6,2015-07-18,ICC World Twenty20 Qualifier,IT20,AAA Amsterdam,43,...,,,,,,Alex Adrian Anthony Amsterdam,Left hand Bat,Right arm Offbreak,Top order Batter,"[United States of America cricket, New York Re..."
