## Scraping Players' Details for the T20 Men's Cricket World Cup 2022

In [1]:
from bs4 import BeautifulSoup as bs
import requests
import pandas as pd
from tqdm import tqdm

In [2]:
def page_content_getter(URL:str, timeout:float=30)-> bs:
    """Download a page and parse it with BeautifulSoup's ``html.parser``.

    Parameters
    ----------
    URL : str
        Address of the page to download.
    timeout : float, optional
        Seconds to wait for the server before giving up. Without a timeout
        ``requests.get`` can block indefinitely, which would stall the long
        scraping loops in this notebook.

    Returns
    -------
    bs or None
        The parsed page, or ``None`` when downloading/parsing raised; the
        error is printed rather than propagated, so callers must be ready
        for ``None``.
    """
    try:
        page_content = requests.get(URL, timeout=timeout).content
        soup = bs(page_content,"html.parser")
        return soup
    except Exception as e:
        # Best-effort: report the problem and let the caller decide what to do.
        print("Error occured: ",e)
        return None

In [3]:
def image_retriever(player_name:str):
  """Return the source URL of an image found by a Google image search.

  The search phrase is ``"<player_name> : cricbuzz profile"``.

  The query string is built with ``urlencode`` so the player name is escaped
  properly. The previous hand-written URL used backslash line continuations
  inside the string literal, which embedded the indentation of each
  continuation line into the URL, and it carried browser-session parameters
  (including an ``oq`` value hard-coded to another player).

  Parameters
  ----------
  player_name : str
      Player name as shown on the profile page.

  Returns
  -------
  str
      The ``src`` attribute of the second ``<img>`` on the result page, or
      ``''`` when the page could not be fetched or has fewer than two images.
  """
  from urllib.parse import urlencode  # local import keeps this cell self-contained

  # Collapse repeated whitespace exactly like the original "+".join(split()).
  search_phrase = f"{' '.join(player_name.split())} : cricbuzz profile"
  # tbm=isch selects the image-search results page.
  query = urlencode({"q": search_phrase, "tbm": "isch"})
  URL = f"https://www.google.com/search?{query}"

  page = page_content_getter(URL)
  if page is None:
    return ''

  images = page.findAll("img")
  # Index 1 is used as in the original code; presumably the first <img> is
  # the search page's own logo - TODO confirm.
  if len(images) < 2:
    return ''
  return images[1].get("src")

In [4]:
def get_players_attributes(player:bs)->dict:
  """Read the label/value pairs from the info grid of a parsed player page.

  Every ``<div>`` inside the first info grid is visited in document order;
  the text of its ``<p>`` becomes the key and the text of the ``<h5>`` inside
  its ``<span>`` becomes the value. Reading stops right after the
  "Playing Role" entry has been stored.

  Parameters
  ----------
  player : bs
      Parsed player profile page.

  Returns
  -------
  dict
      Mapping of label text to value text.
  """
  grid_class = "ds-grid lg:ds-grid-cols-3 ds-grid-cols-2 ds-gap-4 ds-mb-8"
  info_grid = player.findAll("div",{"class":grid_class})[0]
  attributes = {}
  for cell in info_grid.findAll("div"):
    value = cell.span.h5.text
    label = cell.p.text
    attributes[label] = value
    # Nothing after the playing role is collected.
    if label == "Playing Role":
      break
  return attributes

In [5]:
# Squads index page of the tournament on ESPNcricinfo.
URL = "https://www.espncricinfo.com/series/icc-men-s-t20-world-cup-2022-23-1298134/squads"
# Parsed squads page; page_content_getter prints the error and yields no page
# if the download fails.
home_page = page_content_getter(URL)

#### Getting links for each team squad

In [6]:
# Take every anchor inside the first "ds-mb-4" container of the squads page;
# presumably one anchor per team squad - verify against the live page.
_squad_container = home_page.findAll("div",{"class":"ds-mb-4"})[0]
squad_links = _squad_container.findAll("a")

In [7]:
# Links on the site are relative, so every href is prefixed with the site root.
prefix = "https://www.espncricinfo.com"
players_url=[]
print(f"{'>'*20} Getting links for each squads in the WorldCup {'<'*20}")
print(f"{'_'*60}")
for url in tqdm(squad_links):
    squad_href = url.get("href")
    if not squad_href:
        # Anchor without a target: nothing to follow.
        continue
    ur = prefix+squad_href
    squad_page = page_content_getter(ur)
    if squad_page is None:
        # The getter already printed the error; skip this squad instead of
        # aborting the whole collection with an AttributeError on None.
        continue
    squad = squad_page.findAll("div",{"class":"ds-relative ds-flex ds-flex-row ds-space-x-4 ds-p-4 lg:ds-px-6"})
    for link in squad:
        # Each matching card is expected to wrap the player's profile link;
        # cards without an anchor or href are ignored.
        player_href = link.a.get("href") if link.a is not None else None
        if player_href:
            players_url.append(prefix+player_href)
        

>>>>>>>>>>>>>>>>>>>> Getting links for each squads in the WorldCup <<<<<<<<<<<<<<<<<<<<
____________________________________________________________


100%|██████████| 16/16 [00:06<00:00,  2.40it/s]


#### Collecting details for each player in the WorldCup Squad

In [8]:
 def details_fetcher(player:bs)->tuple:
     """Extract the profile fields of one player from a parsed player page.

     Parameters
     ----------
     player : bs
         Parsed player profile page.

     Returns
     -------
     tuple
         ``(name, team, batting style, bowling style, playing role,
         description)``. Bowling style and description are ``''`` when the
         page does not provide them.
     """
     heading = player.findAll("h1",{"class":"ds-text-title-l ds-font-bold"})[0]
     name1 = heading.text
     team_span = player.findAll("span", {"class":"ds-text-comfortable-s"})[0]
     team1 = team_span.text

     attr = get_players_attributes(player)
     play_role1 = attr["Playing Role"]
     bat_style1 = attr["Batting Style"]
     # Bowling style is the only attribute treated as optional.
     bowl_style1 = attr.get("Bowling Style", '')

     # The first paragraph of the bio block, when a bio block exists.
     bio_blocks = player.findAll("div",{"class":"ci-player-bio-content"})
     description1 = bio_blocks[0].p.text if len(bio_blocks) != 0 else ''

     return (name1, team1, bat_style1, bowl_style1, play_role1, description1)

In [None]:
# One list per output column; all lists must stay the same length.
name=[]
team=[]
image=[]
bat_style=[]
bowl_style=[]
play_role=[]
description=[]

print(f"{'>'*20} Scrapping players details {'<'*20}")
print(f"{'_'*60}")
for link in tqdm(players_url):
    try:
        # Gather everything for this player first; any failure skips the
        # player before a single column has been touched.
        player = page_content_getter(link)
        details = details_fetcher(player)
        player_image = image_retriever(details[0])
    except Exception as e:
        print(f"Error in {link} \nError:{e}")
        continue
    # Append to every column together so the lists never drift out of
    # alignment (previously the name was appended before the image lookup,
    # so a failing lookup left `name` one entry longer than the others).
    name.append(details[0])
    image.append(player_image)
    team.append(details[1])
    bat_style.append(details[2])
    bowl_style.append(details[3])
    play_role.append(details[4])
    description.append(details[5])
print("Data Scarpped successfully!!!")
data_dict ={
    "Name": name,
    "Image": image,
    "Team": team,
    "Batting_Style": bat_style,
    "Bowling_Style": bowl_style,
    "Playing_Role": play_role,
    "Description": description
}

print("Saving data into Dataframe........")
players_details = pd.DataFrame(data_dict)

>>>>>>>>>>>>>>>>>>>> Scrapping players details <<<<<<<<<<<<<<<<<<<<
____________________________________________________________


100%|██████████| 254/254 [03:37<00:00,  1.17it/s]

Data Scarpped successfully!!!
Saving data into Dataframe........





In [None]:
# Write the scraped player table to a CSV file in the working directory.
# NOTE(review): index=None is presumably meant to leave the row index out of
# the file; pandas documents this flag as a bool - confirm.
players_details.to_csv("dim_players_details.csv",index=None)

In [None]:
# Show the rows of a single team as a quick check of the scraped table.
players_details.loc[players_details["Team"].eq("Australia")]

Unnamed: 0,Name,Image,Team,Batting_Style,Bowling_Style,Playing_Role,Description
16,Aaron Finch,https://encrypted-tbn0.gstatic.com/images?q=tb...,Australia,Right hand Bat,Slow Left arm Orthodox,Top order Batter,"A solidly built, aggressive batter from a coun..."
17,Pat Cummins,https://encrypted-tbn0.gstatic.com/images?q=tb...,Australia,Right hand Bat,Right arm Fast,Bowler,Earmarked as a potential star from a young age...
18,Ashton Agar,https://encrypted-tbn0.gstatic.com/images?q=tb...,Australia,Left hand Bat,Slow Left arm Orthodox,Bowler,Ashton Agar initially turned almost as many he...
19,Tim David,https://encrypted-tbn0.gstatic.com/images?q=tb...,Australia,Right hand Bat,Right arm Offbreak,Middle order Batter,Mumbai Indians' decision to sign Tim David for...
20,Cameron Green,https://encrypted-tbn0.gstatic.com/images?q=tb...,Australia,Right hand Bat,Right arm Fast medium,Batting Allrounder,Contracted to Western Australia as a schoolboy...
21,Josh Hazlewood,https://encrypted-tbn0.gstatic.com/images?q=tb...,Australia,Left hand Bat,Right arm Fast medium,Bowler,A tall and accurate fast bowler who is unrelen...
22,Mitchell Marsh,https://encrypted-tbn0.gstatic.com/images?q=tb...,Australia,Right hand Bat,Right arm Medium,Allrounder,Part of one of the most well-known family name...
23,Glenn Maxwell,https://encrypted-tbn0.gstatic.com/images?q=tb...,Australia,Right hand Bat,Right arm Offbreak,Batting Allrounder,"One of the fastest scorers in world cricket, G..."
24,Kane Richardson,https://encrypted-tbn0.gstatic.com/images?q=tb...,Australia,Right hand Bat,Right arm Fast medium,Bowler,A pace bowler who has often been on the fringe...
25,Steven Smith,https://encrypted-tbn0.gstatic.com/images?q=tb...,Australia,Right hand Bat,Legbreak Googly,Middle order Batter,"In a career of twists and turns, Steven Smith ..."


## Player Career Details Scraping

In [9]:
def career_details(URL:str)->tuple:
  """Fetch a player's page and pull the T20I row of its two career tables.

  Parameters
  ----------
  URL : str
      Address of the player's profile page.

  Returns
  -------
  tuple
      ``(name, stats)`` where ``stats`` maps the first word of each table
      heading (the caller reads "Batting" and "Bowling") to the T20I row as a
      dict, or to the string ``"Debut"`` when that table has no T20I row.
  """
  from io import StringIO  # literal HTML strings passed to read_html are deprecated

  final_dict = dict()
  player=page_content_getter(URL)
  box1 = player.findAll("div",{"class":"ds-p-0"})
  # NOTE(review): the sixth "ds-p-0" container is assumed to hold the career
  # statistics; this index depends on the page layout - confirm.
  box2 = box1[5].findAll("div")
  name = player.findAll("h1",{"class":"ds-text-title-l ds-font-bold"})[0].text

  # (div providing the heading, div providing the table) for each section,
  # using the same indices as the original code.
  sections = ((box2[0], box2[1]), (box2[2], box2[2]))
  for heading_box, table_box in sections:
    stats_type = heading_box.h5.text.split()[0]
    stats_df = pd.read_html(StringIO(str(table_box)))[0]
    by_format = stats_df.set_index("Format").T.to_dict()
    # Each section is checked against its OWN table; the second section used
    # to test the first table's formats, giving a KeyError or a wrong "Debut".
    final_dict[stats_type] = by_format["T20I"] if "T20I" in by_format else "Debut"
  return name, final_dict

In [10]:
# Player name -> T20I batting / bowling record (or "Debut").
career_bat = dict()
career_bowl = dict()
for player in tqdm(players_url):
  # Fetch and parse each profile once; the previous version called
  # career_details four times per player, i.e. four downloads of one page.
  player_name, player_stats = career_details(player)
  career_bat[player_name] = player_stats["Batting"]
  career_bowl[player_name] = player_stats["Bowling"]

100%|██████████| 254/254 [04:03<00:00,  1.04it/s]


In [11]:
# One row per player: transpose the name-keyed dict, expose the name as a
# column, and leave out the Ct, St and BF columns.
bat_career_df = (
    pd.DataFrame(career_bat)
    .T
    .reset_index()
    .rename(columns={"index":"Name"})
    .drop(columns=["Ct","St","BF"])
)

In [12]:
# One row per player: transpose the name-keyed dict, expose the name as a
# column, and leave out the BBI and 10w columns.
bowl_career_df = (
    pd.DataFrame(career_bowl)
    .T
    .reset_index()
    .rename(columns={"index":"Name"})
    .drop(columns=["BBI","10w"])
)

In [13]:
# Write both career tables to CSV files, batting first and bowling second.
for _csv_name, _career_frame in (
    ("player_bat_career.csv", bat_career_df),
    ("player_bowl_career.csv", bowl_career_df),
):
    _career_frame.to_csv(_csv_name, index=None)