# **WEB SCRAPING**

In [None]:
import pandas as pd
import re
import requests
from bs4 import BeautifulSoup
import csv

## **Scraping Match Results**

In [None]:
# Match-results table of the tournament being scraped (tournament id 14450).
url = 'https://stats.espncricinfo.com/ci/engine/records/team/match_results.html?id=14450;type=tournament'
# A timeout keeps the cell from hanging forever on a stalled connection, and
# raise_for_status() stops an HTTP error page from being parsed as if it were data.
req = requests.get(url, timeout=30)
req.raise_for_status()

In [None]:
# Parse the raw response bytes of the match-results page with Python's built-in HTML parser.
soup1 = BeautifulSoup(req.content, "html.parser")

### Getting Header

In [None]:
# The first <thead> of the page carries the column titles of the results table.
header = soup1.thead
print(type(header))
print(header.get_text())

<class 'bs4.element.Tag'>


Team 1
Team 2
Winner
Margin
Ground
Match Date
Scorecard




In [None]:
# Break the header text into one string per line. The blank lines between the
# cells come through as empty strings, which still have to be filtered out.
header_list = header.get_text().splitlines()
print(header_list)

['', '', 'Team 1', 'Team 2', 'Winner', 'Margin', 'Ground', 'Match Date', 'Scorecard', '']


In [None]:
# Keep only the real column names; the empty strings produced by blank lines are dropped.
header_list = [column_name for column_name in header_list if column_name != ""]

print(header_list)

['Team 1', 'Team 2', 'Winner', 'Margin', 'Ground', 'Match Date', 'Scorecard']


### Getting Table

In [None]:
# The first <tbody> of the page holds one row per match.
body = soup1.tbody

# strip('\n') removes every leading and trailing newline from the extracted text
data = body.get_text().strip('\n')
print(data)
print(type(data))

Namibia
Sri Lanka
Namibia
55 runs
Geelong
Oct 16, 2022
T20I # 1823


Netherlands
U.A.E.
Netherlands
3 wickets
Geelong
Oct 16, 2022
T20I # 1825


Scotland
West Indies
Scotland
42 runs
Hobart
Oct 17, 2022
T20I # 1826


Ireland
Zimbabwe
Zimbabwe
31 runs
Hobart
Oct 17, 2022
T20I # 1828


Namibia
Netherlands
Netherlands
5 wickets
Geelong
Oct 18, 2022
T20I # 1830


Sri Lanka
U.A.E.
Sri Lanka
79 runs
Geelong
Oct 18, 2022
T20I # 1832


Ireland
Scotland
Ireland
6 wickets
Hobart
Oct 19, 2022
T20I # 1833


West Indies
Zimbabwe
West Indies
31 runs
Hobart
Oct 19, 2022
T20I # 1834


Netherlands
Sri Lanka
Sri Lanka
16 runs
Geelong
Oct 20, 2022
T20I # 1835


Namibia
U.A.E.
U.A.E.
7 runs
Geelong
Oct 20, 2022
T20I # 1836


Ireland
West Indies
Ireland
9 wickets
Hobart
Oct 21, 2022
T20I # 1837


Scotland
Zimbabwe
Zimbabwe
5 wickets
Hobart
Oct 21, 2022
T20I # 1838


Australia
New Zealand
New Zealand
89 runs
Sydney
Oct 22, 2022
T20I # 1839


Afghanistan
England
England
5 wickets
Perth
Oct 22, 2022
T20I # 18

In [None]:
# In the extracted text consecutive matches are separated by three newlines,
# so splitting on that separator yields one string per match.
row_list = data.split('\n\n\n')
row_list

['Namibia\nSri Lanka\nNamibia\n55 runs\nGeelong\nOct 16, 2022\nT20I # 1823',
 'Netherlands\nU.A.E.\nNetherlands\n3 wickets\nGeelong\nOct 16, 2022\nT20I # 1825',
 'Scotland\nWest Indies\nScotland\n42 runs\nHobart\nOct 17, 2022\nT20I # 1826',
 'Ireland\nZimbabwe\nZimbabwe\n31 runs\nHobart\nOct 17, 2022\nT20I # 1828',
 'Namibia\nNetherlands\nNetherlands\n5 wickets\nGeelong\nOct 18, 2022\nT20I # 1830',
 'Sri Lanka\nU.A.E.\nSri Lanka\n79 runs\nGeelong\nOct 18, 2022\nT20I # 1832',
 'Ireland\nScotland\nIreland\n6 wickets\nHobart\nOct 19, 2022\nT20I # 1833',
 'West Indies\nZimbabwe\nWest Indies\n31 runs\nHobart\nOct 19, 2022\nT20I # 1834',
 'Netherlands\nSri Lanka\nSri Lanka\n16 runs\nGeelong\nOct 20, 2022\nT20I # 1835',
 'Namibia\nU.A.E.\nU.A.E.\n7 runs\nGeelong\nOct 20, 2022\nT20I # 1836',
 'Ireland\nWest Indies\nIreland\n9 wickets\nHobart\nOct 21, 2022\nT20I # 1837',
 'Scotland\nZimbabwe\nZimbabwe\n5 wickets\nHobart\nOct 21, 2022\nT20I # 1838',
 'Australia\nNew Zealand\nNew Zealand\n89 runs

In [None]:
# Trim stray newlines around each match string, then split it into its
# individual cell values (one value per line).
final_rows = [row.strip('\n').split('\n') for row in row_list]

print(final_rows)

[['Namibia', 'Sri Lanka', 'Namibia', '55 runs', 'Geelong', 'Oct 16, 2022', 'T20I # 1823'], ['Netherlands', 'U.A.E.', 'Netherlands', '3 wickets', 'Geelong', 'Oct 16, 2022', 'T20I # 1825'], ['Scotland', 'West Indies', 'Scotland', '42 runs', 'Hobart', 'Oct 17, 2022', 'T20I # 1826'], ['Ireland', 'Zimbabwe', 'Zimbabwe', '31 runs', 'Hobart', 'Oct 17, 2022', 'T20I # 1828'], ['Namibia', 'Netherlands', 'Netherlands', '5 wickets', 'Geelong', 'Oct 18, 2022', 'T20I # 1830'], ['Sri Lanka', 'U.A.E.', 'Sri Lanka', '79 runs', 'Geelong', 'Oct 18, 2022', 'T20I # 1832'], ['Ireland', 'Scotland', 'Ireland', '6 wickets', 'Hobart', 'Oct 19, 2022', 'T20I # 1833'], ['West Indies', 'Zimbabwe', 'West Indies', '31 runs', 'Hobart', 'Oct 19, 2022', 'T20I # 1834'], ['Netherlands', 'Sri Lanka', 'Sri Lanka', '16 runs', 'Geelong', 'Oct 20, 2022', 'T20I # 1835'], ['Namibia', 'U.A.E.', 'U.A.E.', '7 runs', 'Geelong', 'Oct 20, 2022', 'T20I # 1836'], ['Ireland', 'West Indies', 'Ireland', '9 wickets', 'Hobart', 'Oct 21, 2022

### Saving this data to a CSV file

In [None]:
# newline='' lets the csv module control the line endings itself.
with open('MatchResults.csv', mode='w', newline='') as file:
  writer = csv.writer(file)

  # Column titles first, then every match row in a single call.
  writer.writerow(header_list)
  writer.writerows(final_rows)
    

## **Scraping Match Summary**

In [None]:
# Start Match_score.csv from scratch with only its header row;
# the data rows are appended by the scraping cell that follows.
with open('Match_score.csv', mode='w', newline='') as file:
    csv.writer(file).writerow(['Team', 'Score'])

Getting data

In [None]:
# The scorecard pages of this tournament have consecutive numeric ids, so the
# page belonging to the n-th row of the match-results table is first_match_id + n.
first_match_id = 1298135

# Scorecard column (last cell) of every results row, e.g. "T20I # 1823".
# An id that ends in a letter (e.g. "T20I # 1846a") marks an abandoned match,
# which has no scorecard to scrape.
Links = [row.find_all('td')[-1].text for row in soup1.tbody.find_all('tr')]
print(len(Links))
print(Links)

# Iterate over the rows that were actually scraped instead of a hard-coded 45.
for match_index, match_id in enumerate(Links):
  if re.search('[a-z]$', match_id):
    continue

  link = 'https://stats.espncricinfo.com/ci/engine/match/' + str(first_match_id + match_index) + '.html'
  req = requests.get(link, timeout=30)
  req.raise_for_status()
  soup_a = BeautifulSoup(req.text, "lxml")

  # Innings totals: the <strong> tags inside the score summary box.
  score_box = soup_a.find('div', class_='ds-flex ds-flex-col ds-mt-3 md:ds-mt-0 ds-mt-0 ds-mb-1')
  rec1 = [tag.text for tag in score_box.find_all('strong')] if score_box is not None else []
  print(rec1)

  # Team names: the first two matching spans on the page.
  team_spans = soup_a.find_all('span', class_ = 'ds-text-tight-l ds-font-bold ds-text-typo hover:ds-text-typo-primary ds-block ds-truncate')
  rec2 = [tag.text for tag in team_spans[:2]]
  print(rec2)

  # Fail with a message that names the page instead of an opaque
  # AttributeError / IndexError when the expected elements are missing.
  if len(rec1) < 2 or len(rec2) < 2:
    raise RuntimeError(f"Could not find two teams and two scores for {match_id} at {link}")

  with open('Match_score.csv', mode='a', newline='') as file:
    writer = csv.writer(file)

    # One row per team: team name, innings total.
    writer.writerow([rec2[0], rec1[0]])
    writer.writerow([rec2[1], rec1[1]])
    


45
['T20I # 1823', 'T20I # 1825', 'T20I # 1826', 'T20I # 1828', 'T20I # 1830', 'T20I # 1832', 'T20I # 1833', 'T20I # 1834', 'T20I # 1835', 'T20I # 1836', 'T20I # 1837', 'T20I # 1838', 'T20I # 1839', 'T20I # 1840', 'T20I # 1841', 'T20I # 1842', 'T20I # 1843', 'T20I # 1844', 'T20I # 1845', 'T20I # 1846', 'T20I # 1846a', 'T20I # 1847', 'T20I # 1848', 'T20I # 1849', 'T20I # 1849a', 'T20I # 1849b', 'T20I # 1850', 'T20I # 1851', 'T20I # 1852', 'T20I # 1853', 'T20I # 1855', 'T20I # 1856', 'T20I # 1858', 'T20I # 1859', 'T20I # 1860', 'T20I # 1861', 'T20I # 1862', 'T20I # 1864', 'T20I # 1867', 'T20I # 1871', 'T20I # 1872', 'T20I # 1873', 'T20I # 1877', 'T20I # 1878', 'T20I # 1879']
['163/7', '108']
['Namibia', 'Sri Lanka']
['111/8', '112/7']
['United Arab Emirates', 'Netherlands']
['160/5', '118']
['Scotland', 'West Indies']
['174/7', '143/9']
['Zimbabwe', 'Ireland']
['121/6', '122/5']
['Namibia', 'Netherlands']
['152/8', '73']
['Sri Lanka', 'United Arab Emirates']
['176/5', '180/4']
['Scotland

In [None]:
# Full scorecard of one specific match (South Africa vs Zimbabwe, match id 1298152).
url = 'https://www.espncricinfo.com/series/icc-men-s-t20-world-cup-2022-23-1298134/south-africa-vs-zimbabwe-18th-match-group-2-1298152/full-scorecard'
# Bound the wait and stop early on an HTTP error instead of parsing an error page.
req = requests.get(url, timeout=30)
req.raise_for_status()


In [None]:
soup_a = BeautifulSoup(req.text, "lxml")

# Innings totals: the <strong> tags inside the score summary box.
score_box = soup_a.find('div', class_='ds-flex ds-flex-col ds-mt-3 md:ds-mt-0 ds-mt-0 ds-mb-1')
if score_box is None:
  raise RuntimeError("Score summary box not found on the scorecard page")
rec1 = [tag.text for tag in score_box.find_all('strong')]
print(rec1)

# Team names: the first two matching spans on the page.
team_spans = soup_a.find_all('span', class_ = 'ds-text-tight-l ds-font-bold ds-text-typo hover:ds-text-typo-primary ds-block ds-truncate')
rec2 = [tag.text for tag in team_spans[:2]]
print(rec2)


with open('Match_score.csv', mode='a', newline='') as file:
  writer = csv.writer(file)

  # One row per team. The second row previously repeated index 0, which wrote
  # the first team twice and never recorded the second team.
  writer.writerow([rec2[0], rec1[0]])
  writer.writerow([rec2[1], rec1[1]])
  


['79/5', '51/0']
['Zimbabwe', 'South Africa']


AttributeError: ignored

## **Scraping Batting Summary**

In [None]:
# Number of rows in the match-results table. The scraping loops below visit one
# scorecard page per row, so this is how many iterations they have to make.
print(len(final_rows))

45


In [None]:
# Scorecard page of the first match id (1298135); used to read the batting table header.
url1 = 'https://stats.espncricinfo.com/ci/engine/match/1298135.html'
# Bound the wait and stop early on an HTTP error instead of parsing an error page.
req = requests.get(url1, timeout=30)
req.raise_for_status()

In [None]:
# Parse the scorecard page fetched above with the lxml parser.
soup3 = BeautifulSoup(req.text, "lxml")

### Getting Header

In [None]:
# Column titles (<th>) of the first table on the scorecard page.
header = soup3.table.find_all("th")
scraped_columns = [cell.text for cell in header]

# Give the first two scraped titles the names used in the CSV.
scraped_columns[0] = 'batsman'
scraped_columns[1] = 'dismissal'

# Three identifying columns in front of the scraped ones, the match id at the end.
header_list = ['match', 'teamInning', 'battingPosition'] + scraped_columns + ['match_id']

# Create (or overwrite) the batting file with only its header row.
with open('Batsman_innings.csv', mode='w', newline='') as file:
    csv.writer(file).writerow(header_list)

### Getting Body and storing it in a **CSV file**

In [None]:
# The scorecard pages of this tournament have consecutive numeric ids, so the
# page belonging to the n-th row of the match-results table is first_match_id + n.
first_match_id = 1298135

# Scorecard column (last cell) of every results row. An id that ends in a
# letter marks an abandoned match, which has no scorecard to scrape.
Links = [row.find_all('td')[-1].text for row in soup1.tbody.find_all('tr')]
print(len(Links))
print(Links)

# Iterate over the rows that were actually scraped instead of a hard-coded 45.
# Comprehensions are used below so that no loop variable overwrites the
# notebook-level names `data` or the outer loop index.
for match_index, match_id in enumerate(Links):
  if re.search('[a-z]$', match_id):
    continue

  link = 'https://stats.espncricinfo.com/ci/engine/match/' + str(first_match_id + match_index) + '.html'
  req = requests.get(link, timeout=30)
  req.raise_for_status()
  soup4 = BeautifulSoup(req.text, "lxml")

  # The first two matching spans are the team batting first and the team batting second.
  title = soup4.find_all('span', class_ = 'ds-text-title-xs ds-font-bold ds-capitalize')
  match_name = title[0].text + ' Vs ' + title[1].text

  tables = soup4.find_all('table')

  # 1st innings batting table; its last 4 rows are not batting records.
  inning1_data = [[cell.get_text() for cell in row] for row in tables[0].tbody.find_all('tr')[:-4]]
  inning1_data = [cells for cells in inning1_data if cells != ['']]

  # 2nd innings batting table; its last 3 rows are not batting records.
  inning2_data = [[cell.get_text().strip() for cell in row] for row in tables[2].tbody.find_all('tr')[:-3]]
  inning2_data = [cells for cells in inning2_data if cells != ['']]

  # A trailing "Extras" row is not a batter. The emptiness checks avoid an
  # IndexError when the second innings has no rows at all.
  if inning2_data and inning2_data[-1] and re.search("^Extras", inning2_data[-1][0]):
    inning2_data.pop()

  # Prefix every record with match name, batting team and batting position
  # (1-based, restarting for each innings) and suffix it with the match id.
  final_batting_data = []
  for team_name, innings in ((title[0].text, inning1_data), (title[1].text, inning2_data)):
    for position, cells in enumerate(innings, start=1):
      final_batting_data.append([match_name, team_name, position] + cells + [match_id])

  with open('Batsman_innings.csv', mode='a', newline='') as file:
    csv.writer(file).writerows(final_batting_data)
  

45
['T20I # 1823', 'T20I # 1825', 'T20I # 1826', 'T20I # 1828', 'T20I # 1830', 'T20I # 1832', 'T20I # 1833', 'T20I # 1834', 'T20I # 1835', 'T20I # 1836', 'T20I # 1837', 'T20I # 1838', 'T20I # 1839', 'T20I # 1840', 'T20I # 1841', 'T20I # 1842', 'T20I # 1843', 'T20I # 1844', 'T20I # 1845', 'T20I # 1846', 'T20I # 1846a', 'T20I # 1847', 'T20I # 1848', 'T20I # 1849', 'T20I # 1849a', 'T20I # 1849b', 'T20I # 1850', 'T20I # 1851', 'T20I # 1852', 'T20I # 1853', 'T20I # 1855', 'T20I # 1856', 'T20I # 1858', 'T20I # 1859', 'T20I # 1860', 'T20I # 1861', 'T20I # 1862', 'T20I # 1864', 'T20I # 1867', 'T20I # 1871', 'T20I # 1872', 'T20I # 1873', 'T20I # 1877', 'T20I # 1878', 'T20I # 1879']


### Parts of above code to understand

In [None]:
# Finding the teams in batting order. Namibia batted first and Sri Lanka second.
# NOTE(review): at this point soup1 holds the match-results page, not a scorecard;
# presumably a scorecard soup (e.g. soup3) was intended here — confirm.
title = soup1.find_all('span', class_ = 'ds-text-title-xs ds-font-bold ds-capitalize')
# title[0].text
for i in title:
  print(i.text)

In [None]:
# Finding the titles, i.e. the team name from each inning table
title2 = soup1.find('span', class_ = 'ds-text-title-xs ds-font-bold ds-capitalize')
title2.text

In [None]:
# Scorecard page of the first match id (1298135), used for the step-by-step walkthrough.
url1 = 'https://stats.espncricinfo.com/ci/engine/match/1298135.html'
# Bound the wait and stop early on an HTTP error instead of parsing an error page.
req = requests.get(url1, timeout=30)
req.raise_for_status()

In [None]:
# Parse the scorecard page fetched above with the lxml parser.
soup4 = BeautifulSoup(req.text, "lxml")

In [None]:
# Column titles (<th>) of the first table on the page.
header = soup4.table.find_all("th")
header_list = [name.text for name in header]
# Drop the title cell that contains only a non-breaking space.
# list.remove() deletes the first occurrence and raises ValueError if there is none.
header_list.remove('\xa0')
header_list

In [None]:
# Rows of the first table; the last 4 rows are skipped.
rows = soup4.find_all('table')[0].tbody.find_all('tr')
# One list of cell texts per remaining row.
inning1_data = [[cell.get_text() for cell in row] for row in rows[:-4]]
print(inning1_data)

In [None]:
# Rows of the third table; the last 3 rows are skipped.
rows = soup4.find_all('table')[2].tbody.find_all('tr')
# One list of whitespace-trimmed cell texts per remaining row.
inning2_data = [[cell.get_text().strip() for cell in row] for row in rows[:-3]]
print(inning2_data)


In [None]:
# Drop the placeholder rows whose only cell is an empty string.
inning1_data = [cells for cells in inning1_data if cells != [""]]
inning2_data = [cells for cells in inning2_data if cells != [""]]

In [None]:
# Both innings in one list: first-innings rows followed by second-innings rows.
final_batting_data = inning1_data + inning2_data
print(final_batting_data)

In [None]:
# One list of <td> texts per <tr> of the first <tbody> on the scorecard page,
# loaded into a DataFrame for a quick look.
table_rows = soup3.tbody.find_all('tr')
cell_texts = [[td.text for td in row.find_all('td')] for row in table_rows]
df = pd.DataFrame(cell_texts)
df

In [None]:
# Match-results table of the tournament (same page as at the top of the notebook).
url = 'https://stats.espncricinfo.com/ci/engine/records/team/match_results.html?id=14450;type=tournament'
# Bound the wait and stop early on an HTTP error instead of parsing an error page.
req = requests.get(url, timeout=30)
req.raise_for_status()

In [None]:
# Re-parse the match-results page with Python's built-in HTML parser.
soup1 = BeautifulSoup(req.content, "html.parser")

In [None]:
import re

In [None]:
# Print every Scorecard id (last cell of a results row) that contains a
# lowercase letter anywhere in its text.
for row in soup1.tbody.findAll('tr'):
    data = row.find_all('td')[-1].text
    if re.search('[a-z]', data):
      print(data)

In [None]:
# Sanity check of the pattern used to detect abandoned matches:
# '[a-z]$' matches only when the string ends with a lowercase letter.
a = '24321a'
ends_with_letter = re.search('[a-z]$', a) is not None
if ends_with_letter:
  print('Yes')

## **Scraping Bowling Summary**

### Getting Header

In [None]:
# Scorecard page of the first match id (1298135); used to read the bowling table header.
url1 = 'https://stats.espncricinfo.com/ci/engine/match/1298135.html'
# Bound the wait and stop early on an HTTP error instead of parsing an error page.
req = requests.get(url1, timeout=30)
req.raise_for_status()

soup5 = BeautifulSoup(req.text, "lxml")

# Column titles (<th>) of the second table on the page, which the scraping
# cell below treats as the first-innings bowling table.
header = soup5.find_all('table')[1].find_all("th")
header_list = [name.text for name in header]

# Same identifying column name as Batsman_innings.csv ('match'; it was misspelled 'natch').
header_list.insert(0, 'match')
header_list.insert(1, 'bowlingTeam')
header_list.append('match_id')

# Create (or overwrite) the bowling file with only its header row.
with open('Bowling_innings.csv', mode='w', newline='') as file:

    # Create a writer object
    writer = csv.writer(file)

    # Write header to the file
    writer.writerow(header_list)

### Getting Body and storing it in a **CSV file**

In [None]:
# The scorecard pages of this tournament have consecutive numeric ids, so the
# page belonging to the n-th row of the match-results table is first_match_id + n.
first_match_id = 1298135

# Scorecard column (last cell) of every results row. An id that ends in a
# letter marks an abandoned match, which has no scorecard to scrape.
Links = [row.find_all('td')[-1].text for row in soup1.tbody.find_all('tr')]
print(len(Links))

# A bowling record has exactly this many cells; commentary rows have fewer.
BOWLING_CELLS = 11

# Iterate over the rows that were actually scraped instead of a hard-coded 45.
for match_index, match_id in enumerate(Links):
  if re.search('[a-z]$', match_id):
    continue

  link = 'https://stats.espncricinfo.com/ci/engine/match/' + str(first_match_id + match_index) + '.html'
  req = requests.get(link, timeout=30)
  req.raise_for_status()
  soup5 = BeautifulSoup(req.text, "lxml")

  # The first two matching spans are the team batting first and the team batting second.
  title = soup5.find_all('span', class_ = 'ds-text-title-xs ds-font-bold ds-capitalize')
  match_name = title[0].text + ' Vs ' + title[1].text

  tables = soup5.find_all('table')

  # Table 1 is the first-innings bowling table, table 3 the second-innings one.
  # Non-record rows are filtered into a new list: removing items from a list
  # while iterating over it skips the element after each removal, so
  # consecutive commentary rows used to survive.
  Inning1_bowl = [[cell.text for cell in row] for row in tables[1].tbody.find_all('tr')]
  Inning1_bowl = [cells for cells in Inning1_bowl if len(cells) == BOWLING_CELLS]

  Inning2_bowl = [[cell.text for cell in row] for row in tables[3].tbody.find_all('tr')]
  Inning2_bowl = [cells for cells in Inning2_bowl if len(cells) == BOWLING_CELLS]

  # The bowling side is the team that is NOT batting: the second team bowls in
  # the first innings and the first team bowls in the second innings.
  final_bowling_data = []
  for bowling_team, innings in ((title[1].text, Inning1_bowl), (title[0].text, Inning2_bowl)):
    for cells in innings:
      final_bowling_data.append([match_name, bowling_team] + cells + [match_id])

  with open('Bowling_innings.csv', mode='a', newline='') as file:
    csv.writer(file).writerows(final_bowling_data)
    


45


### Part of the above code to understand

In [None]:
# Scorecard page of the first match id (1298135), used for the step-by-step walkthrough.
url1 = 'https://stats.espncricinfo.com/ci/engine/match/1298135.html'
# Bound the wait and stop early on an HTTP error instead of parsing an error page.
req = requests.get(url1, timeout=30)
req.raise_for_status()

In [None]:
# Parse the scorecard page fetched above with the lxml parser.
soup5 = BeautifulSoup(req.text, "lxml")

In [None]:
# Rows of the 1st innings bowling table (the second table on the page).
rows = soup5.find_all('table')[1].tbody.find_all('tr')

# One list of cell texts per row.
Inning1_bowl = [[cell.text for cell in row] for row in rows]

# A bowling record has exactly 11 cells; commentary rows have fewer. Build a
# filtered list instead of calling remove() inside a loop over the same list:
# removing while iterating skips the element after each removal, so two
# consecutive commentary rows would not both be dropped.
Inning1_bowl = [cells for cells in Inning1_bowl if len(cells) == 11]


[['Maheesh Theekshana', '4', '0', '23', '1', '5.75', '7', '0', '0', '2', '0'], ['Dushmantha Chameera', '4', '0', '39', '1', '9.75', '6', '3', '1', '2', '0'], ['Pramod Madushan', '4', '0', '37', '2', '9.25', '6', '3', '1', '0', '0'], ['Chamika Karunaratne', '4', '0', '36', '1', '9.00', '7', '3', '1', '1', '0'], ['Wanindu Hasaranga de Silva', '4', '0', '27', '1', '6.75', '8', '1', '1', '0', '0']]


In [None]:
# Rows of the 2nd innings bowling table (the fourth table on the page).
rows = soup5.find_all('table')[3].tbody.find_all('tr')

# One list of cell texts per row.
Inning2_bowl = [[cell.text for cell in row] for row in rows]

# Keep only real bowling records (11 cells). Filtering into a new list avoids
# the skipped-element problem of removing items from a list while iterating it.
Inning2_bowl = [cells for cells in Inning2_bowl if len(cells) == 11]
print(Inning2_bowl)

[['Gerhard Erasmus', '1', '0', '8', '0', '8.00', '1', '1', '0', '0', '0'], ['David Wiese', '4', '0', '16', '2', '4.00', '13', '1', '0', '0', '0'], ['Bernard Scholtz', '4', '0', '18', '2', '4.50', '10', '1', '0', '0', '0'], ['Ben Shikongo', '3', '1', '22', '2', '7.33', '6', '3', '0', '0', '0'], ['JJ Smit', '3', '0', '16', '1', '5.33', '7', '0', '0', '1', '0'], ['Jan Frylinck', '4', '0', '26', '2', '6.50', '10', '0', '2', '1', '0']]


## **Scraping Player Info**

### Getting Header

In [None]:
# Columns of Player_info.csv.
header_list = ['name', 'team','battingStyle', 'bowlingStyle', 'playingRole']

# Create (or overwrite) the player file with only its header row;
# the data rows are appended by the scraping cell that follows.
with open('Player_info.csv', mode='w', newline='') as file:
    csv.writer(file).writerow(header_list)

### Getting Body and storing it in a **CSV file**

In [None]:
# The scorecard pages of this tournament have consecutive numeric ids, so the
# page belonging to the n-th row of the match-results table is first_match_id + n.
first_match_id = 1298135

# Scorecard column (last cell) of every results row. An id that ends in a
# letter marks an abandoned match, which has no scorecard to scrape.
Links = [row.find_all('td')[-1].text for row in soup1.tbody.find_all('tr')]
print(len(Links))

# <td> classes that wrap a player link on a scorecard page, in the order they
# are searched: dismissed batters, not-out batters, bowlers.
player_cell_classes = (
  'ds-w-0 ds-whitespace-nowrap ds-min-w-max ds-flex ds-items-center',
  'ds-w-0 ds-whitespace-nowrap ds-min-w-max ds-flex ds-items-center ds-border-line-primary ci-scorecard-player-notout',
  'ds-min-w-max ds-flex ds-items-center',
)

# Players already written to the CSV, tracked by displayed name and by profile
# link. The link is the reliable key: the same player can be displayed with
# different text in different tables, and a player who bats and bowls in one
# match is linked twice on the same page.
submitted_players_name = []
submitted_player_links = set()

for count, match_id in enumerate(Links, start=1):
  if re.search('[a-z]$', match_id):
    continue

  match_url = 'https://stats.espncricinfo.com/ci/engine/match/' + str(first_match_id + count - 1) + '.html'
  req = requests.get(match_url, timeout=30)
  req.raise_for_status()
  soup7 = BeautifulSoup(req.text, "lxml")

  # Profile links of the not-yet-written players of this match (each link once),
  # and the name displayed for each link.
  player_info_links = []
  name_using_href = {}

  for cell_class in player_cell_classes:
    for cell in soup7.find_all('td', class_=cell_class):
      for anchor in cell.find_all('a', href=True):
        href = anchor['href']
        if anchor.text in submitted_players_name or href in submitted_player_links:
          continue
        if href not in name_using_href:
          player_info_links.append(href)
        # As before, the text of the last occurrence on the page is the name that gets written.
        name_using_href[href] = anchor.text

  for href in player_info_links:
    url2 = 'https://stats.espncricinfo.com' + href
    req2 = requests.get(url2, timeout=30)
    req2.raise_for_status()
    soup8 = BeautifulSoup(req2.text, "lxml")

    # Field labels are the <p> tags of the details grid; values are the page's
    # <h5> tags, paired with the labels by position. zip() stops at the shorter
    # list, so a page with fewer values than labels cannot raise IndexError.
    fields = soup8.find('div',class_ = 'ds-grid lg:ds-grid-cols-3 ds-grid-cols-2 ds-gap-4 ds-mb-8').find_all('p')
    values = soup8.find_all('h5')
    player_record = {field.text: value.text for field, value in zip(fields, values)}

    team = soup8.find('div', class_='ds-grid lg:ds-grid-cols-3 ds-grid-cols-2 ds-gap-y-4').find('span')

    # Same column order as the header: name, team, battingStyle, bowlingStyle,
    # playingRole. A field missing from the profile becomes an empty cell
    # (csv writes None as '') instead of relying on a bare except.
    player_name = name_using_href[href]
    player_info = [
      player_name,
      team.text,
      player_record.get('Batting Style'),
      player_record.get('Bowling Style'),
      player_record.get('Playing Role'),
    ]

    # Append immediately so the rows written so far survive a failure on a later page.
    with open('Player_info.csv', mode='a', newline='') as file:
      csv.writer(file).writerow(player_info)

    submitted_players_name.append(player_name)
    submitted_player_links.add(href)

  print(f"Match {count} Record written")
    

45
Match 1 Record written
Match 2 Record written
Match 3 Record written
Match 4 Record written
Match 5 Record written
Match 6 Record written
Match 7 Record written
Match 8 Record written
Match 9 Record written
Match 10 Record written
Match 11 Record written
Match 12 Record written
Match 13 Record written
Match 14 Record written
Match 15 Record written
Match 16 Record written
Match 17 Record written
Match 18 Record written
Match 19 Record written
Match 20 Record written
Match 22 Record written
Match 23 Record written
Match 24 Record written
Match 27 Record written
Match 28 Record written
Match 29 Record written
Match 30 Record written
Match 31 Record written
Match 32 Record written
Match 33 Record written
Match 34 Record written
Match 35 Record written
Match 36 Record written
Match 37 Record written
Match 38 Record written
Match 39 Record written
Match 40 Record written
Match 41 Record written
Match 42 Record written
Match 43 Record written
Match 44 Record written
Match 45 Record writte

### Parts of above code to understand

In [None]:
# Full scorecard of the final (England vs Pakistan, match id 1298179).
url1 = 'https://www.espncricinfo.com/series/icc-men-s-t20-world-cup-2022-23-1298134/england-vs-pakistan-final-1298179/full-scorecard'
# Bound the wait and stop early on an HTTP error instead of parsing an error page.
req = requests.get(url1, timeout=30)
req.raise_for_status()

soup7 = BeautifulSoup(req.text, "lxml")


In [None]:
# <td> classes that wrap a player link, in the order they are searched:
# dismissed batters, not-out batters, bowlers.
player_cell_classes = (
  'ds-w-0 ds-whitespace-nowrap ds-min-w-max ds-flex ds-items-center',
  'ds-w-0 ds-whitespace-nowrap ds-min-w-max ds-flex ds-items-center ds-border-line-primary ci-scorecard-player-notout',
  'ds-min-w-max ds-flex ds-items-center',
)

# href of every link found inside those cells, in page order per class.
player_info_links = [
  link['href']
  for cell_class in player_cell_classes
  for data in soup7.find_all('td', class_=cell_class)
  for link in data.find_all('a', href=True)
]

print(player_info_links)

['/cricketers/mohammad-rizwan-323389', '/cricketers/babar-azam-348144', '/cricketers/mohammad-haris-1205559', '/cricketers/shan-masood-233901', '/cricketers/iftikhar-ahmed-480603', '/cricketers/shadab-khan-922943', '/cricketers/mohammad-nawaz-348148', '/cricketers/mohammad-wasim-1185538', '/cricketers/jos-buttler-308967', '/cricketers/alex-hales-249866', '/cricketers/phil-salt-669365', '/cricketers/harry-brook-911707', '/cricketers/moeen-ali-8917', '/cricketers/shaheen-shah-afridi-1072470', '/cricketers/haris-rauf-1161606', '/cricketers/ben-stokes-311158', '/cricketers/liam-livingstone-403902', '/cricketers/ben-stokes-311158', '/cricketers/chris-woakes-247235', '/cricketers/sam-curran-662973', '/cricketers/adil-rashid-244497', '/cricketers/chris-jordan-288992', '/cricketers/liam-livingstone-403902', '/cricketers/shaheen-shah-afridi-1072470', '/cricketers/naseem-shah-1158088', '/cricketers/haris-rauf-1161606', '/cricketers/shadab-khan-922943', '/cricketers/mohammad-wasim-1185538', '/cri

In [None]:
# Profile page of a single player (Kusal Mendis), used to explore the page layout.
url1 = 'https://www.espncricinfo.com/cricketers/kusal-mendis-629074'
# Bound the wait and stop early on an HTTP error instead of parsing an error page.
req = requests.get(url1, timeout=30)
req.raise_for_status()

In [None]:
# Parse the player profile page fetched above with the lxml parser.
soup6 = BeautifulSoup(req.text, "lxml")

In [None]:
# Collecting the data for the player
player_info = []

# All needed information is in h5 tag, so we find that h5 tag.
rows = soup6.find('div', class_='ds-p-4').find_all('h5')
for row in rows:
  player_info.append(row.text)

team = soup6.find('span', class_='ds-text-title-s ds-font-bold ds-text-typo ds-underline ds-decoration-ui-stroke hover:ds-text-typo-primary hover:ds-decoration-ui-stroke-primary ds-block')
player_info.insert(1, team.text)
print(player_info)


['Balapuwaduge Kusal Gimhan Mendis', 'BOL Mendis', 'February 02, 1995, Moratuwa', '28y 21d', 'Right hand Bat', 'Legbreak', 'Wicketkeeper']


In [None]:
# Membership test against a list of lists: the string 'Hello' is compared with
# each inner list as a whole, so nothing matches and nothing is printed.
a = {'a':'Hello','b':'Bye', 'c':'Tata'}
l = [['Hello','Bye'], ['Hey','Hi']]
found = any(item == a['a'] for item in l)
if found:
  print('yes')

In [None]:
# Collecting the data for the player
player_info = []

# All needed information is in h5 tag, so we find that h5 tag.
rows = soup6.find_all('h5')
for row in rows[:7]:
  
  print(row.text)
# for row in rows:

team = soup6.find('div', class_='ds-text-title-s ds-font-bold ds-text-typo ds-underline ds-decoration-ui-stroke hover:ds-text-typo-primary hover:ds-decoration-ui-stroke-primary ds-block')
# player_info.insert(1, team.text)
# print(player_info)


Balapuwaduge Kusal Gimhan Mendis
February 02, 1995, Moratuwa
28y 21d
Right hand Bat
Legbreak
Wicketkeeper
Wicketkeeper Batter


In [None]:
# The team name is the first <span> inside the 'ds-gap-y-4' grid div.
team = soup6.find('div', class_='ds-grid lg:ds-grid-cols-3 ds-grid-cols-2 ds-gap-y-4').find('span')
print(team.text)
    

Sri Lanka


In [None]:
# Collecting the data for the player
player_record = {}

fields = soup6.find('div',class_ = 'ds-grid lg:ds-grid-cols-3 ds-grid-cols-2 ds-gap-4 ds-mb-8').find_all('p')
values = soup6.find_all('h5')

fields_list = [field.text for field in fields]
values_list = [value.text for value in values]

for i in range(len(fields_list)):
  player_record[fields_list[i]] = values_list[i]
# for row in rows:

print(player_info)
team = soup6.find('div', class_='ds-text-title-s ds-font-bold ds-text-typo ds-underline ds-decoration-ui-stroke hover:ds-text-typo-primary hover:ds-decoration-ui-stroke-primary ds-block')
# player_info.insert(1, team.text)
# print(player_info)


{'Full Name': 'Balapuwaduge Kusal Gimhan Mendis', 'Born': 'February 02, 1995, Moratuwa', 'Age': '28y 21d', 'Batting Style': 'Right hand Bat', 'Bowling Style': 'Legbreak', 'Fielding Position': 'Wicketkeeper', 'Playing Role': 'Wicketkeeper Batter', 'Education': 'Prince of Wales College, Moratuwa', 'RELATIONS': 'Batting & Fielding', ' (brother)': 'Bowling'}


# **Data Preprocessing**

### Match summary

In [None]:
# Read the file from the same relative location the scraping section wrote it
# to ('MatchResults.csv') instead of a Colab-only absolute path.
df_match = pd.read_csv('MatchResults.csv')
df_match.head()

Unnamed: 0,Team 1,Team 2,Winner,Margin,Ground,Match Date,Scorecard
0,Namibia,Sri Lanka,Namibia,55 runs,Geelong,"Oct 16, 2022",T20I # 1823
1,Netherlands,U.A.E.,Netherlands,3 wickets,Geelong,"Oct 16, 2022",T20I # 1825
2,Scotland,West Indies,Scotland,42 runs,Hobart,"Oct 17, 2022",T20I # 1826
3,Ireland,Zimbabwe,Zimbabwe,31 runs,Hobart,"Oct 17, 2022",T20I # 1828
4,Namibia,Netherlands,Netherlands,5 wickets,Geelong,"Oct 18, 2022",T20I # 1830


In [None]:
# (rows, columns) of the match table.
df_match.shape

(45, 7)

In [None]:
# 'Scorecard' becomes 'match_id': it is the key used to connect the tables.
df_match.rename(columns={'Scorecard': 'match_id'}, inplace=True)
df_match.head()

Unnamed: 0,Team 1,Team 2,Winner,Margin,Ground,Match Date,match_id
0,Namibia,Sri Lanka,Namibia,55 runs,Geelong,"Oct 16, 2022",T20I # 1823
1,Netherlands,U.A.E.,Netherlands,3 wickets,Geelong,"Oct 16, 2022",T20I # 1825
2,Scotland,West Indies,Scotland,42 runs,Hobart,"Oct 17, 2022",T20I # 1826
3,Ireland,Zimbabwe,Zimbabwe,31 runs,Hobart,"Oct 17, 2022",T20I # 1828
4,Namibia,Netherlands,Netherlands,5 wickets,Geelong,"Oct 18, 2022",T20I # 1830


**Exporting the file to csv**

In [None]:
# Write the match table to the Drive folder without the index column (mount the drive before running).
match_summary_csv = '/content/drive/MyDrive/Projects/Cricket Data Analysis/Transformed csv files/Match_summary.csv'
df_match.to_csv(match_summary_csv, index=False)

### Batting summary

In [None]:
# Load the per-innings batting CSV and preview its first rows.
batting_innings_csv = '/content/Batsman_innings.csv'
df_batting = pd.read_csv(batting_innings_csv)
df_batting.head()

Unnamed: 0,match,teamInning,battingPosition,batsman,dismissal,R,B,M,4s,6s,SR,match_id
0,Namibia Vs Sri Lanka,Namibia,1,Michael van Lingen,c Pramod Madushan b Chameera,3,6,7,0,0,50.0,T20I # 1823
1,Namibia Vs Sri Lanka,Namibia,2,Divan la Cock,c Shanaka b Pramod Madushan,9,9,15,1,0,100.0,T20I # 1823
2,Namibia Vs Sri Lanka,Namibia,3,Jan Nicol Loftie-Eaton,c †Mendis b Karunaratne,20,12,18,1,2,166.66,T20I # 1823
3,Namibia Vs Sri Lanka,Namibia,4,Stephan Baard,c DM de Silva b Pramod Madushan,26,24,49,2,0,108.33,T20I # 1823
4,Namibia Vs Sri Lanka,Namibia,5,Gerhard Erasmus (c),c Gunathilaka b PWH de Silva,20,24,30,0,0,83.33,T20I # 1823


In [None]:
# Derive an 'out/not_out' flag from the dismissal text: 'not out' (ignoring
# surrounding whitespace) maps to 'not_out', everything else to 'out'.
# The vectorised .str accessor passes missing values through (they compare
# unequal and end up as 'out') instead of raising AttributeError, which calling
# .strip() on a NaN inside apply() would do.
# NOTE(review): confirm that a missing dismissal should count as 'out'.
is_not_out = df_batting['dismissal'].str.strip().eq('not out')
df_batting['out/not_out'] = is_not_out.map({True: 'not_out', False: 'out'})
df_batting.head(10)

Unnamed: 0,match,teamInning,battingPosition,batsman,dismissal,R,B,M,4s,6s,SR,match_id,out/not_out
0,Namibia Vs Sri Lanka,Namibia,1,Michael van Lingen,c Pramod Madushan b Chameera,3,6,7,0,0,50.0,T20I # 1823,out
1,Namibia Vs Sri Lanka,Namibia,2,Divan la Cock,c Shanaka b Pramod Madushan,9,9,15,1,0,100.0,T20I # 1823,out
2,Namibia Vs Sri Lanka,Namibia,3,Jan Nicol Loftie-Eaton,c †Mendis b Karunaratne,20,12,18,1,2,166.66,T20I # 1823,out
3,Namibia Vs Sri Lanka,Namibia,4,Stephan Baard,c DM de Silva b Pramod Madushan,26,24,49,2,0,108.33,T20I # 1823,out
4,Namibia Vs Sri Lanka,Namibia,5,Gerhard Erasmus (c),c Gunathilaka b PWH de Silva,20,24,30,0,0,83.33,T20I # 1823,out
5,Namibia Vs Sri Lanka,Namibia,6,Jan Frylinck,run out (Gunathilaka/†Mendis),44,28,44,4,0,157.14,T20I # 1823,out
6,Namibia Vs Sri Lanka,Namibia,7,David Wiese,c †Mendis b Theekshana,0,1,4,0,0,0.0,T20I # 1823,out
7,Namibia Vs Sri Lanka,Namibia,8,JJ Smit,not out,31,16,29,2,2,193.75,T20I # 1823,not_out
8,Namibia Vs Sri Lanka,Sri Lanka,1,Pathum Nissanka,c Smit b Shikongo,9,10,16,1,0,90.0,T20I # 1823,out
9,Namibia Vs Sri Lanka,Sri Lanka,2,Kusal Mendis †,c †Green b Wiese,6,6,8,0,0,100.0,T20I # 1823,out


In [None]:
# Drop the dismissal column.
# errors='ignore' keeps the cell re-runnable: on a second run in the same kernel
# the column is already gone and drop() would otherwise raise KeyError.
df_batting.drop(columns=['dismissal'], inplace=True, errors='ignore')
df_batting.head(10)

Unnamed: 0,match,teamInning,battingPosition,batsman,R,B,M,4s,6s,SR,match_id,out/not_out
0,Namibia Vs Sri Lanka,Namibia,1,Michael van Lingen,3,6,7,0,0,50.0,T20I # 1823,out
1,Namibia Vs Sri Lanka,Namibia,2,Divan la Cock,9,9,15,1,0,100.0,T20I # 1823,out
2,Namibia Vs Sri Lanka,Namibia,3,Jan Nicol Loftie-Eaton,20,12,18,1,2,166.66,T20I # 1823,out
3,Namibia Vs Sri Lanka,Namibia,4,Stephan Baard,26,24,49,2,0,108.33,T20I # 1823,out
4,Namibia Vs Sri Lanka,Namibia,5,Gerhard Erasmus (c),20,24,30,0,0,83.33,T20I # 1823,out
5,Namibia Vs Sri Lanka,Namibia,6,Jan Frylinck,44,28,44,4,0,157.14,T20I # 1823,out
6,Namibia Vs Sri Lanka,Namibia,7,David Wiese,0,1,4,0,0,0.0,T20I # 1823,out
7,Namibia Vs Sri Lanka,Namibia,8,JJ Smit,31,16,29,2,2,193.75,T20I # 1823,not_out
8,Namibia Vs Sri Lanka,Sri Lanka,1,Pathum Nissanka,9,10,16,1,0,90.0,T20I # 1823,out
9,Namibia Vs Sri Lanka,Sri Lanka,2,Kusal Mendis †,6,6,8,0,0,100.0,T20I # 1823,out


In [None]:
# Remove the '†' marker from batsman names (e.g. 'Kusal Mendis †') and trim the
# whitespace it leaves behind. This mirrors the replace + strip clean-up applied
# to df_players['name'], so the same player is spelled identically in both tables.
# regex=False treats '†' as a literal string rather than a pattern.
df_batting['batsman'] = df_batting['batsman'].str.replace('†', '', regex=False).str.strip()
df_batting.head(10)

Unnamed: 0,match,teamInning,battingPosition,batsman,R,B,M,4s,6s,SR,match_id,out/not_out
0,Namibia Vs Sri Lanka,Namibia,1,Michael van Lingen,3,6,7,0,0,50.0,T20I # 1823,out
1,Namibia Vs Sri Lanka,Namibia,2,Divan la Cock,9,9,15,1,0,100.0,T20I # 1823,out
2,Namibia Vs Sri Lanka,Namibia,3,Jan Nicol Loftie-Eaton,20,12,18,1,2,166.66,T20I # 1823,out
3,Namibia Vs Sri Lanka,Namibia,4,Stephan Baard,26,24,49,2,0,108.33,T20I # 1823,out
4,Namibia Vs Sri Lanka,Namibia,5,Gerhard Erasmus (c),20,24,30,0,0,83.33,T20I # 1823,out
5,Namibia Vs Sri Lanka,Namibia,6,Jan Frylinck,44,28,44,4,0,157.14,T20I # 1823,out
6,Namibia Vs Sri Lanka,Namibia,7,David Wiese,0,1,4,0,0,0.0,T20I # 1823,out
7,Namibia Vs Sri Lanka,Namibia,8,JJ Smit,31,16,29,2,2,193.75,T20I # 1823,not_out
8,Namibia Vs Sri Lanka,Sri Lanka,1,Pathum Nissanka,9,10,16,1,0,90.0,T20I # 1823,out
9,Namibia Vs Sri Lanka,Sri Lanka,2,Kusal Mendis,6,6,8,0,0,100.0,T20I # 1823,out


**Exporting the file to csv**

In [None]:
# Save the batting table as CSV at the path below, leaving out the index column.
# (The drive must be mounted before this cell is run.)
batting_summary_csv = '/content/drive/MyDrive/Projects/Cricket Data Analysis/Transformed csv files/Batting_summary.csv'
df_batting.to_csv(batting_summary_csv, index=False)

### Bowling summary

In [None]:
# Load the per-innings bowling CSV and preview its first rows.
# The file's header spells the match column 'natch'; rename it to 'match' so it
# agrees with the batting table. rename() ignores labels that are absent, so this
# becomes a no-op once the source file itself is corrected.
df_bowling = pd.read_csv('/content/Bowling_innings.csv').rename(columns={'natch': 'match'})
df_bowling.head()

Unnamed: 0,natch,bowlingTeam,BOWLING,O,M,R,W,ECON,0s,4s,6s,WD,NB,match_id
0,Namibia Vs Sri Lanka,Sri Lanka,Maheesh Theekshana,4.0,0,23,1,5.75,7,0,0,2,0,T20I # 1823
1,Namibia Vs Sri Lanka,Sri Lanka,Dushmantha Chameera,4.0,0,39,1,9.75,6,3,1,2,0,T20I # 1823
2,Namibia Vs Sri Lanka,Sri Lanka,Pramod Madushan,4.0,0,37,2,9.25,6,3,1,0,0,T20I # 1823
3,Namibia Vs Sri Lanka,Sri Lanka,Chamika Karunaratne,4.0,0,36,1,9.0,7,3,1,1,0,T20I # 1823
4,Namibia Vs Sri Lanka,Sri Lanka,Wanindu Hasaranga de Silva,4.0,0,27,1,6.75,8,1,1,0,0,T20I # 1823


In [None]:
# (rows, columns) of the bowling table.
df_bowling.shape

(500, 14)

**Exporting the file to csv**

In [None]:
# Write the bowling table to the Drive folder without the index column (mount the drive before running).
bowling_summary_csv = '/content/drive/MyDrive/Projects/Cricket Data Analysis/Transformed csv files/Bowling_summary.csv'
df_bowling.to_csv(bowling_summary_csv, index=False)

### Player Info

In [None]:
# Load the player-info CSV and preview its first rows.
player_info_csv = '/content/Player_info.csv'
df_players = pd.read_csv(player_info_csv)
df_players.head()

Unnamed: 0,name,team,battingStyle,bowlingStyle,playingRole
0,Michael van Lingen,Namibia,Left hand Bat,Left arm Medium,Bowling Allrounder
1,Divan la Cock,Namibia,Right hand Bat,Legbreak,Opening Batter
2,Jan Nicol Loftie-Eaton,Namibia,Left hand Bat,"Right arm Medium, Legbreak",Batter
3,Stephan Baard,Namibia,Right hand Bat,Right arm Medium fast,Batter
4,Gerhard Erasmus,Namibia,Right hand Bat,Right arm Offbreak,Allrounder


In [None]:
# Number of player rows as loaded from the CSV.
len(df_players)

389

In [None]:
# Normalise player names in a single pass: remove the '†' marker first,
# then trim leading/trailing whitespace.
df_players['name'] = df_players['name'].apply(
    lambda raw_name: raw_name.replace('†', '').strip()
)

In [None]:
# Remove rows that are exact duplicates across all columns, modifying df_players in place.
# The index is not reset, so the surviving rows keep their original labels.
df_players.drop_duplicates(inplace=True)

In [None]:
# Row count at this point in the notebook, i.e. after the drop_duplicates cell.
len(df_players)

219

In [None]:
# Spot-check the records of a single team.
is_india = df_players['team'] == 'India'
df_players.loc[is_india]

Unnamed: 0,name,team,battingStyle,bowlingStyle,playingRole
247,KL Rahul,India,Right hand Bat,,Opening Batter
248,Rohit Sharma (c),India,Right hand Bat,Right arm Offbreak,Top order Batter
249,Suryakumar Yadav,India,Right hand Bat,"Right arm Medium, Right arm Offbreak",Batter
250,Axar Patel,India,Left hand Bat,Slow Left arm Orthodox,Bowling Allrounder
251,Hardik Pandya,India,Right hand Bat,Right arm Medium fast,Allrounder
252,Dinesh Karthik,India,Right hand Bat,Right arm Offbreak,Wicketkeeper Batter
255,Virat Kohli,India,Right hand Bat,Right arm Medium,Top order Batter
256,Ravichandran Ashwin,India,Right hand Bat,Right arm Offbreak,Bowling Allrounder
257,Bhuvneshwar Kumar,India,Right hand Bat,Right arm Medium,Bowler
258,Arshdeep Singh,India,Left hand Bat,Left arm Medium fast,Bowler


**Exporting the file to csv**

In [None]:
# Write the player table to the Drive folder without the index column (mount the drive before running).
player_info_out_csv = '/content/drive/MyDrive/Projects/Cricket Data Analysis/Transformed csv files/Player_info.csv'
df_players.to_csv(player_info_out_csv, index=False)