In [5]:
#importing essential libraries
import json
import requests
from bs4 import BeautifulSoup
from collections import Counter
import pandas as pd
import time
import pickle
import os
import re

In [7]:
'''
extracting all the international t-20 matches website-links held between 2008 to 2019 year wise
'''

match_url_list={}
for year in range(2008,2020):
    print(f'Getting all the matches link for year: {year}')
    url_match_lists=f"https://stats.espncricinfo.com/ci/engine/records/team/match_results.html?class=3;id={year};type=year"
    respone=requests.get(url_match_lists)
    if respone.status_code!=200:
        print("Request unsuccessful!")
        continue
        
    soupm=BeautifulSoup(respone.content)
    table=soupm.find_all("tbody")[0]
    
    #get all the links of matches for a particular year
    year_wise_match_list=[]
    for tr in table.find_all("tr"):
        year_wise_match_list.append(tr.find_all("td")[-1].a.attrs['href'].split("/")[-1].split(".")[0])
    
    #store the link of matches in a dictionary object year-wise
    match_url_list[year]=year_wise_match_list

Getting all the matches link for year: 2008
Getting all the matches link for year: 2009
Getting all the matches link for year: 2010
Getting all the matches link for year: 2011
Getting all the matches link for year: 2012
Getting all the matches link for year: 2013
Getting all the matches link for year: 2014
Getting all the matches link for year: 2015
Getting all the matches link for year: 2016
Getting all the matches link for year: 2017
Getting all the matches link for year: 2018
Getting all the matches link for year: 2019


In [8]:
def get_inning_json_data(match_id,inning_no):
    '''
    funtion to extract all the data[commentaries and other informations] of an match innings.
    '''
    
    inning_data=[]
    time.sleep(1)
    for from_over in range(2,21,2):
        commentary_url=f'https://hs-consumer-api.espncricinfo.com/v1/pages/match/comments?lang=en&seriesId={series_id}&matchId={match_id}&inningNumber={inning_no}&commentType=ALL&fromInningOver={from_over}&sortDirection=DESC'

        try:
            json_response=requests.get(commentary_url)
        except:
            print(commentary_url)
            continue

        if match_response.status_code!=200:
            print("Request unsuccessful!",commentary_url)
            continue  

        json_data=json_response.json()['comments']
        inning_data.extend(json_data)
    return inning_data

In [12]:
'''
fetching and storing the json data of all the matches commentaries and their basics info year-wise.
It's an intermediate step to first get all the data[relevant or irrelavant] and later parse it and 
processing it to extract only the relevant things.
'''

for year in match_url_list:
    yearly_commentary_data={}
    year_wise_list=match_url_list[year]
    for match_id in year_wise_list:     
        match_url=f"https://www.espncricinfo.com/matches/engine/match/{match_id}.html"
        try:
            match_response=requests.get(match_url)
        except:
            print(match_id)
            continue
        
        if match_response.status_code!=200:
            print("Request unsuccessful!",match_id)
            continue
        
        #getting full url of a given match to extract the series and match ids for reference purpose
        new_match_url=match_response.url.split("/")
        series_id=new_match_url[-3].split("-")[-1]
        match_id=new_match_url[-2].split("-")[-1]
        
        #storing some information about the matches for refernce purpoes
        events_id=series_id+"_"+match_id
        yearly_commentary_data[events_id]={}
        yearly_commentary_data[events_id]['match_name']=new_match_url[-2]
        yearly_commentary_data[events_id]['series_name']=new_match_url[-3]
        yearly_commentary_data[events_id]['year']=year
        
        
        for inning_no in [1,2]:
            print(year,events_id,inning_no)
            
            #extracting the inning json data and storing to the year-wise dictionary
            inning_data=get_inning_json_data(match_id,inning_no)
            yearly_commentary_data[events_id][inning_no]=inning_data
            
    # now the yearly json data of all the matches stored in pickle format for future use cases
    with open(f'./data/raw data/{year}.pickle', 'wb') as handle:
        pickle.dump(yearly_commentary_data, handle, protocol=pickle.HIGHEST_PROTOCOL)
        print("Done!")
        


In [15]:
def util_func(x,arg_):
    #a helper function
    for i in arg_:
        if x in i:
            return i
    return x
    
    
def fix_name_issue(match_df):
    '''
    function to get the full name of a player from it's short name
    '''
    unique_batsman=match_df.batsman_name.unique()
    unique_bowler=match_df.bowler_name.unique()
    mask1=match_df.batsman_name==match_df.batsman_short_name
    mask2=match_df.bowler_name==match_df.bowler_short_name
    match_df.loc[mask1,'batsman_name']=match_df[mask1].batsman_short_name.apply(util_func,arg_=unique_batsman)
    match_df.loc[mask2,'bowler_name']=match_df[mask2].bowler_short_name.apply(util_func,arg_=unique_bowler)
    return match_df   
    
    
def get_match_data(match_wise_data,year):
    '''
    We parse the json file to  extract the commentaries and other important data for a given match.
    '''
    match_csv_data=[]
    
    for inning_no in [1,2]:
        
        Bowlers={}
        Batsmen={}
        team_name=None
        
        for each_ball in match_wise_data[inning_no]:             
            #getting the batsman and bowler name for a given over
            if each_ball['over']:
                Bowlers=[]
                Batsmen=[]
                team_name=each_ball['over']['team']['longName']
                       
                for i in range(len(each_ball['over']['overEndBatsmen'])):
                    player=each_ball['over']['overEndBatsmen'][i]['player']
                    Batsmen.append(player['longName'])

                for i in range(len(each_ball['over']['overEndBowlers'])):
                    player=each_ball['over']['overEndBowlers'][i]['player']
                    Bowlers.append(player['longName'])

                    
            #dictionary database to store match info, players info and commentary of each ball 
            #along with the score
            per_ball_data={}
            per_ball_data['match_id']=eventid.split("_")[1]
            per_ball_data['match_info']=match_wise_data['match_name']

            per_ball_data['year']=year
            per_ball_data['batting_team']=team_name
            per_ball_data['inning_number']=inning_no

            for attributes in ['oversActual','totalRuns','batsmanRuns','isFour', 'isSix', 'isWicket','byes','wides','legbyes','noballs','title']:
                per_ball_data[attributes]=each_ball[attributes]


            bowler,batter=per_ball_data['title'].split(' to ')
            batter=batter.strip()
            bowler=bowler.strip()


            per_ball_data["bowler_short_name"]=bowler
            per_ball_data["batsman_short_name"]=batter

            batterid=None
            bowlerid=None

            for bt in Batsmen:
                if batter in bt:
                    batter=bt
                    break

            for bw in Bowlers:
                if bowler in bw:
                    bowler=bw 
                    break

            per_ball_data["batsman_name"]=batter
            per_ball_data["bowler_name"]=bowler

            per_ball_data["comment_text"]=None
            if each_ball['commentTextItems']:
                per_ball_data["comment_text"]=each_ball['commentTextItems'][0]['html']

            match_csv_data.append(per_ball_data)
            
        print(year,eventid,inning_no,"Done!")
        return match_csv_data   

In [18]:
'''
Here we extracted the relevant data from the json files like commentary, player's name, team names etc for each ball.
For each match we extracted the data, did some preporcessing and stored it to a csv file.
later we would combined all the match-wise csv file to a single csv file.
'''

import os
if not os.path.isdir("./csv_data"):
    os.mkdir("./csv_data")

    
file_names = [
    '2008.pickle', '2009.pickle', '2010.pickle', '2011.pickle', '2012.pickle', '2013.pickle',
    '2014.pickle', '2015.pickle', '2016.pickle', '2017.pickle', '2018.pickle', '2019.pickle'
]
    
for file in file_names:
    print(file)
    
    with open(f"./data/raw data/{file}", 'rb') as handle:
        year_wise_data = pickle.load(handle)
        
        for eventid in year_wise_data:
            match_wise_data=year_wise_data[eventid]
            year=match_wise_data['year']
            
            #getting the processed match-wise data and coveritng it to data-frame to store it in a csv file
            match_csv_data=get_match_data(match_wise_data,year)
            match_df=pd.DataFrame(match_csv_data)
            if len(match_csv_data)==0:
                continue
            
            #fixing name issue for a few players
            match_df=fix_name_issue(match_df)
            
            #storing the match-wise csv file which would be merged to a single file.
            match_df=pd.DataFrame(match_df).sort_values(["inning_number","oversActual"])
            match_df.to_csv(f"./csv_data/{year}_{eventid}.csv",index=False)
            break

In [20]:
'''
merging all the match-wise csv data to a single csv file 
'''

all_data=pd.DataFrame()

for csv_file in os.listdir("./csv_data/"):
    match_df=pd.read_csv(f"./csv_data/{csv_file}")
    all_data=pd.concat([all_data,match_df],axis=0)
    
all_data.to_csv("./data/all_data.csv",index=False)  


# a glimpse of the whole data
all_data.head()

Unnamed: 0,match_id,match_info,year,batting_team,inning_number,oversActual,totalRuns,batsmanRuns,isFour,isSix,...,byes,wides,legbyes,noballs,title,bowler_short_name,batsman_short_name,batsman_name,bowler_name,comment_text
0,298804,south-africa-vs-west-indies-2nd-t20i-298804,2008,West Indies,1,0.1,1,1,False,False,...,0,0,0,0,Pollock to Smith,Pollock,Smith,Devon Smith,Shaun Pollock,"outside the off stump, Smith whips it with wri..."
1,298804,south-africa-vs-west-indies-2nd-t20i-298804,2008,West Indies,1,0.2,0,0,False,False,...,0,0,0,0,Pollock to Ramdin,Pollock,Ramdin,Denesh Ramdin,Shaun Pollock,a swing and a miss from Ramdin! A lovely outsw...
2,298804,south-africa-vs-west-indies-2nd-t20i-298804,2008,West Indies,1,0.3,0,0,False,False,...,0,0,0,0,Pollock to Ramdin,Pollock,Ramdin,Denesh Ramdin,Shaun Pollock,and again he's beaten with a slightly more hor...
3,298804,south-africa-vs-west-indies-2nd-t20i-298804,2008,West Indies,1,0.4,2,2,False,False,...,0,0,0,0,Pollock to Ramdin,Pollock,Ramdin,Denesh Ramdin,Shaun Pollock,ping! Where's this gone? He drives through the...
4,298804,south-africa-vs-west-indies-2nd-t20i-298804,2008,West Indies,1,0.5,0,0,False,False,...,0,0,0,0,Pollock to Ramdin,Pollock,Ramdin,Denesh Ramdin,Shaun Pollock,"neater stroke, guiding this behind square but ..."


In [22]:
'''
creating a seperate data-frame a.k.a csv file for only the top-10 batsman in t20 format
Ranking was taken form year-2019 as given in the reference paper.
'''


top10=[
    'Babar Azam', 'Colin Munro', 'Ross Taylor', 'Glenn Maxwell', 'Rohit Sharma','Aaron Finch', 'Kane Williamson', 'Fakhar Zaman', 
    'Virat Kohli','Quinton de Kock', 'de Kock', 'Kohli', 'Zaman', 'Williamson', 'Finch', 'Maxwell', 'Taylor', 'Munro', 'Babar'
        ]

their_countries=['New Zealand','India','Australia','South Africa','Pakistan']

top10_df=all_data[all_data.batsman_name.isin(top10)&all_data.batting_team.isin(their_countries)]

top10_df['batsman_name']=top10_df.batsman_name.replace({
    "Babar":'Babar Azam','Zaman':'Fakhar Zaman', 'Taylor':'Ross Taylor',
    'Williamson':'Kane Williamson','Munro':'Colin Munro','Kohli':'Virat Kohli',
    'Finch':'Aaron Finch','de Kock':'Quinton de Kock','Maxwell':'Glenn Maxwell'
})


# a glimpse of the top10 batsman data
top10_df.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  top10_df['batsman_name']=top10_df.batsman_name.replace({


Unnamed: 0,match_id,match_info,year,batting_team,inning_number,oversActual,totalRuns,batsmanRuns,isFour,isSix,...,byes,wides,legbyes,noballs,title,bowler_short_name,batsman_short_name,batsman_name,bowler_name,comment_text
270,291356,australia-vs-india-only-t20i-291356,2008,India,1,4.1,0,0,False,False,...,0,0,0,0,Lee to Rohit Sharma,Lee,Rohit Sharma,Rohit Sharma,Brett Lee,"up there on off, defended on the front foot"
271,291356,australia-vs-india-only-t20i-291356,2008,India,1,4.2,1,1,False,False,...,0,0,0,0,Lee to Rohit Sharma,Lee,Rohit Sharma,Rohit Sharma,Brett Lee,bangs a bouncer down off and Sharma's off the ...
278,291356,australia-vs-india-only-t20i-291356,2008,India,1,5.3,4,4,True,False,...,0,0,0,0,Noffke to Rohit Sharma,Noffke,Rohit Sharma,Rohit Sharma,Ashley Noffke,"short ball, hooked off the nose through a very..."
279,291356,australia-vs-india-only-t20i-291356,2008,India,1,5.4,1,1,False,False,...,0,0,0,0,Noffke to Rohit Sharma,Noffke,Rohit Sharma,Rohit Sharma,Ashley Noffke,"short ball again, played down off the back foot"
281,291356,australia-vs-india-only-t20i-291356,2008,India,1,5.6,1,1,False,False,...,0,0,0,0,Noffke to Rohit Sharma,Noffke,Rohit Sharma,Rohit Sharma,Ashley Noffke,"outside off, the batsman rocks onto the back f..."


In [23]:
# No of commentatries for our top10 batsman
top10_df.batsman_name.value_counts()

Virat Kohli        1960
Rohit Sharma       1897
Ross Taylor        1359
Kane Williamson    1253
Aaron Finch        1236
Babar Azam         1138
Glenn Maxwell      1029
Colin Munro         991
Quinton de Kock     810
Fakhar Zaman        566
Name: batsman_name, dtype: int64

In [114]:
# writing the csv data of top10 batsman to a csv file for later use case
top10_df[['match_id', 'match_info', 'year', 'batting_team', 'inning_number','totalRuns', 'batsmanRuns', 'isFour', 'isSix',
       'isWicket', 'byes', 'wides', 'legbyes', 'noballs', 'batsman_name',
       'bowler_name', 'comment_text']].sort_values("batsman_name").to_csv("./data/top10_batsman_data.csv",index=False)