# Data Scraping

In [236]:
from bs4 import BeautifulSoup as bs
import requests
import pandas as pd
from tqdm import tqdm

### Scraping fixtures and winners data

In [2]:
# ESPNcricinfo match-results listing for tournament id 14450; source page for the fixtures/results table scraped below.
URL = r"https://stats.espncricinfo.com/ci/engine/records/team/match_results.html?id=14450;type=tournament"

In [3]:
# Fetch the results page. The timeout keeps the notebook from hanging on a
# stalled connection, and raise_for_status() stops here on an HTTP error
# instead of letting later cells parse an error page.
page = requests.get(URL, timeout=30)
page.raise_for_status()
page

<Response [200]>

In [4]:
# Raw response body, handed to the HTML parser in the next cell.
page_content = page.content

In [5]:
# Parse the downloaded markup with Python's built-in HTML parser.
soup = bs(page_content, features="html.parser")

In [6]:
# Every match in the results table is a <tr> carrying the CSS class "data1".
table_data = soup.find_all("tr", class_="data1")

In [7]:
# One independent, initially empty list per scraped column; the generator
# yields a fresh list for every name, so none of them share storage.
(
    team1,
    team2,
    winner,
    margin,
    ground,
    match_date,
    match_id,
    score_card_url,
) = ([] for _ in range(8))

In [8]:
# Empty the column lists first so that re-running this cell rebuilds them
# instead of appending a second copy of every match.
for column in (team1, team2, winner, margin, ground, match_date, match_id, score_card_url):
    column.clear()

for row in table_data:
    # The row text is split on newlines and the fields are read by position
    # (1..7); index 0 is presumably the empty string before the first newline.
    data = row.text.split("\n")
    team1.append(data[1])
    team2.append(data[2])
    winner.append(data[3])
    margin.append(data[4])
    ground.append(data[5])
    match_date.append(data[6])
    match_id.append(data[7])
    # The last cell of the row holds the link to the match scorecard.
    score_card_url.append(row.find_all("td")[-1].a.get("href"))

In [9]:
# Sanity check on the first result row: the scorecard link is read from the
# anchor inside its last cell.
first_row_cells = table_data[0].find_all("td")
first_row_cells[-1].a.get("href")

'/ci/engine/match/1298135.html'

In [10]:
# Output column names paired, in display order, with the lists filled by the
# row-parsing loop above.
column_names = ["Match_Date", "Match_Id", "Team_1", "Team_2", "Winner", "Margin", "Ground"]
column_values = [match_date, match_id, team1, team2, winner, margin, ground]
data_dict = dict(zip(column_names, column_values))

# One row per match.
t20_df = pd.DataFrame(data_dict)

In [11]:
# Display the assembled fixtures/results table.
t20_df

Unnamed: 0,Match_Date,Match_Id,Team_1,Team_2,Winner,Margin,Ground
0,"Oct 16, 2022",T20I # 1823,Namibia,Sri Lanka,Namibia,55 runs,Geelong
1,"Oct 16, 2022",T20I # 1825,Netherlands,U.A.E.,Netherlands,3 wickets,Geelong
2,"Oct 17, 2022",T20I # 1826,Scotland,West Indies,Scotland,42 runs,Hobart
3,"Oct 17, 2022",T20I # 1828,Ireland,Zimbabwe,Zimbabwe,31 runs,Hobart
4,"Oct 18, 2022",T20I # 1830,Namibia,Netherlands,Netherlands,5 wickets,Geelong
5,"Oct 18, 2022",T20I # 1832,Sri Lanka,U.A.E.,Sri Lanka,79 runs,Geelong
6,"Oct 19, 2022",T20I # 1833,Ireland,Scotland,Ireland,6 wickets,Hobart
7,"Oct 19, 2022",T20I # 1834,West Indies,Zimbabwe,West Indies,31 runs,Hobart
8,"Oct 20, 2022",T20I # 1835,Netherlands,Sri Lanka,Sri Lanka,16 runs,Geelong
9,"Oct 20, 2022",T20I # 1836,Namibia,U.A.E.,U.A.E.,7 runs,Geelong


In [35]:
# Persist the fixtures/results table; the positional row index is not written.
t20_df.to_csv("t20_df.csv", index=False)

In [40]:
# Cross-check: let pandas parse the same page directly. read_html returns a
# list of DataFrames, one per table it finds.
df = pd.read_html(URL)

In [44]:
# First parsed table: the match-results listing.
df[0]

Unnamed: 0,Team 1,Team 2,Winner,Margin,Ground,Match Date,Scorecard
0,Namibia,Sri Lanka,Namibia,55 runs,Geelong,"Oct 16, 2022",T20I # 1823
1,Netherlands,U.A.E.,Netherlands,3 wickets,Geelong,"Oct 16, 2022",T20I # 1825
2,Scotland,West Indies,Scotland,42 runs,Hobart,"Oct 17, 2022",T20I # 1826
3,Ireland,Zimbabwe,Zimbabwe,31 runs,Hobart,"Oct 17, 2022",T20I # 1828
4,Namibia,Netherlands,Netherlands,5 wickets,Geelong,"Oct 18, 2022",T20I # 1830
5,Sri Lanka,U.A.E.,Sri Lanka,79 runs,Geelong,"Oct 18, 2022",T20I # 1832
6,Ireland,Scotland,Ireland,6 wickets,Hobart,"Oct 19, 2022",T20I # 1833
7,West Indies,Zimbabwe,West Indies,31 runs,Hobart,"Oct 19, 2022",T20I # 1834
8,Netherlands,Sri Lanka,Sri Lanka,16 runs,Geelong,"Oct 20, 2022",T20I # 1835
9,Namibia,U.A.E.,U.A.E.,7 runs,Geelong,"Oct 20, 2022",T20I # 1836


### Scorecard scraping for each match

In [12]:
# The scraped scorecard links are site-relative (see the value shown below),
# so this host prefix is prepended to build absolute URLs.
prefix = "https://stats.espncricinfo.com"
score_card_url[0]

'/ci/engine/match/1298135.html'

In [48]:
# Absolute URL of one sample scorecard (the second entry in the list), used to
# prototype the scrapers below.
scorecard_url = prefix+score_card_url[1]

In [49]:
# Parse every table on the sample scorecard page into a list of DataFrames.
sd_df = pd.read_html(scorecard_url)

In [50]:
# Download the sample scorecard. The timeout avoids hanging on a stalled
# connection and raise_for_status() surfaces HTTP errors before parsing.
score_response = requests.get(scorecard_url, timeout=30)
score_response.raise_for_status()
score_page = score_response.content
soup2 = bs(score_page, "html.parser")

#### Batting scorecard scraper

In [339]:
#d = soup2.findAll("div",{"class":"ds-w-full ds-bg-fill-content-prime ds-overflow-hidden ds-rounded-xl ds-border ds-border-line ds-mb-4"}
def get_match_scores(div_tag):
    """Build the batting scorecard of one match from its innings containers.

    Parameters
    ----------
    div_tag : sequence of bs4 tags
        Innings containers in the order they appear on the page. Only the
        first two are read. For each container the team name is taken from
        ``.div.div.span.span.text`` and every ``<tr>`` after the first one is
        considered as a possible batter row.

    Returns
    -------
    pandas.DataFrame
        One row per batter with the columns ``Match``, ``Team_Innings``,
        ``Batting_Pos``, ``Batsman``, ``Dismissal``, ``Runs``, ``Balls``,
        ``4s``, ``6s`` and ``SR``. Cell values are kept as the raw text found
        in the page; ``Batting_Pos`` is a 1-based counter per innings.
        ``Match`` is the team names joined with " vs ". When ``div_tag`` is
        empty an empty frame with the same columns is returned instead of
        raising.
    """
    columns = ["Match", "Team_Innings", "Batting_Pos", "Batsman", "Dismissal",
               "Runs", "Balls", "4s", "6s", "SR"]

    innings = list(div_tag[:2])
    # Read each team name once rather than once per batter row.
    team_names = [inn.div.div.span.span.text for inn in innings]
    # Label the match from the innings headers so that a match with a single
    # innings still gets a label instead of an IndexError.
    match_label = " vs ".join(team_names)

    records = []
    for team_name, inn in zip(team_names, innings):
        pos = 0
        for idx, batter in enumerate(inn.find_all("tr")):
            if idx == 0:
                # The first row is skipped (treated as the table header).
                continue
            details = batter.find_all("td")
            if not details or details[0].text == '':
                # Spacer rows or rows without data cells carry no batter.
                continue
            if details[0].text == 'Extras':
                # Nothing after the "Extras" row is a batter row.
                break
            if len(details) < 8:
                # Not a full batter row; skip it rather than fail the match.
                continue
            pos += 1
            records.append({
                "Match": match_label,
                "Team_Innings": team_name,
                "Batting_Pos": pos,
                "Batsman": details[0].text,
                "Dismissal": details[1].text,
                "Runs": details[2].text,
                "Balls": details[3].text,
                # Cell 4 is not collected (presumably minutes batted - TODO confirm).
                "4s": details[5].text,
                "6s": details[6].text,
                "SR": details[7].text,
            })

    return pd.DataFrame(records, columns=columns)

In [343]:
def get_bowl_stats(url, div_tag):
    """Collect the bowling figures of both innings of one match.

    Parameters
    ----------
    url : str
        Scorecard page address; it is passed to ``pandas.read_html``, which
        downloads and parses the page's tables.
    div_tag : sequence of bs4 tags
        Innings containers; the team names are read from
        ``div_tag[0]`` and ``div_tag[1]`` via ``.div.div.span.span.text``.

    Returns
    -------
    pandas.DataFrame
        Rows of the parsed tables at positions 1 and 3 stacked on a fresh
        0..n-1 index, with two extra columns: ``Bowling_Team`` and ``Match``
        (``"<team 0> vs <team 1>"``).

    Notes
    -----
    Tables 1 and 3 are assumed to be the bowling cards of the first and second
    innings - TODO confirm against the page layout. Rows whose first cell is a
    string longer than 30 characters are dropped (presumably wicket commentary
    lines rather than bowler rows). Non-string first cells (e.g. NaN) are kept
    instead of raising ``TypeError``.
    """
    max_name_len = 30  # longer first-cell text is treated as a non-bowler row

    # Table 1 is labelled with the team from div_tag[1], table 3 with div_tag[0].
    team1 = div_tag[1].div.div.span.span.text
    team2 = div_tag[0].div.div.span.span.text

    stats = pd.read_html(url)

    def _bowler_rows(table, bowling_team):
        # Measure the first column by position, not by label, so this does not
        # depend on the header text or on positional Series lookups.
        first_cell_len = table.iloc[:, 0].map(
            lambda value: len(value) if isinstance(value, str) else 0
        )
        card = table.loc[first_cell_len <= max_name_len].copy()
        card["Bowling_Team"] = bowling_team
        return card

    card_1 = _bowler_rows(stats[1], team1)
    card_2 = _bowler_rows(stats[3], team2)

    data = pd.concat([card_1, card_2], axis=0, ignore_index=True)
    data["Match"] = f"{team2} vs {team1}"

    return data

In [344]:
prefix = "https://stats.espncricinfo.com"

# CSS class string of the <div> that wraps each innings on a scorecard page.
SCORECARD_DIV_CLASS = "ds-w-full ds-bg-fill-content-prime ds-overflow-hidden ds-rounded-xl ds-border ds-border-line ds-mb-4"
# Raw bowling-table headers -> column names used in the output file.
BOWL_COLUMN_NAMES = {"BOWLING":"bowler","O":"overs","M":"maidens","R":"runs","W":"wickets","ECON":"economy"}
BOWL_COLUMNS = ["Match","Bowling_Team","bowler","overs", "maidens","runs","wickets","economy","0s","4s","6s","WD","NB"]
REQUEST_TIMEOUT = 30  # seconds; keeps one stalled request from blocking the run

# Per-match frames are collected in lists and concatenated once at the end.
bat_frames = []
bowl_frames = []
# (url, exception) for every scorecard that could not be fully scraped.
failed_scorecards = []

print(f"{'>'*10} Collecting All Scorecards for T20 Men's Cricket Worldcup {'<'*10}")
print("-"*30)
for url in tqdm(score_card_url):
    scorecard_url = prefix+url
    try:
        response = requests.get(scorecard_url, timeout=REQUEST_TIMEOUT)
        response.raise_for_status()
        score_page = response.content
        soup2 = bs(score_page, "html.parser")

        div_tag = soup2.find_all("div",{"class":SCORECARD_DIV_CLASS})

        df = get_match_scores(div_tag)
        if not df.empty:
            bat_frames.append(df)

        bowl_df = get_bowl_stats(scorecard_url, div_tag)
        # Rename and select on the single-match frame, before it is combined
        # with the others, so every collected frame has the same columns.
        bowl_df = bowl_df.rename(columns=BOWL_COLUMN_NAMES).loc[:, BOWL_COLUMNS]
        bowl_frames.append(bowl_df)
    except Exception as e:
        # Best effort: keep going, but remember what failed so it is reported.
        failed_scorecards.append((scorecard_url, e))
        continue

bat_fact_df = pd.concat(bat_frames, axis=0, ignore_index=True) if bat_frames else pd.DataFrame()
bowl_fact_df = pd.concat(bowl_frames, axis=0, ignore_index=True) if bowl_frames else pd.DataFrame()

if failed_scorecards:
    print(f"Scraped {len(score_card_url) - len(failed_scorecards)} of {len(score_card_url)} scorecards; {len(failed_scorecards)} failed:")
    for failed_url, error in failed_scorecards:
        print(f"  {failed_url}: {error!r}")
else:
    print("Scrapping Successfull!!! \n Saved DataFrame!!!")

>>>>>>>>>> Collecting All Scorecards for T20 Men's Cricket Worldcup <<<<<<<<<<
------------------------------


100%|██████████████████████████████████████████████████████████████████████████████████| 45/45 [02:07<00:00,  2.84s/it]

Scrapping Successfull!!! 
 Saved DataFrame!!!





In [295]:
# Persist both fact tables; index=False is the documented way to leave the
# positional row index out of the files.
bat_fact_df.to_csv("t20_bat_summary.csv", index=False)
bowl_fact_df.to_csv("t20_bowl_fact_summary.csv", index=False)