In [1]:
# import necessary libraries
import pandas as pd
import numpy as np
import requests
import wikipedia
import re
import json
import matplotlib.pyplot as plt
from bs4 import BeautifulSoup
from urllib.request import urlopen
from scrapy import selector
import datetime as dt
import pickle
from skimage import io
from IPython.display import clear_output
import pdb
plt.style.use('ggplot')
%matplotlib inline

# Functions

In [2]:
def get_squad(batting_sheet):
    """Gets team members, captain and keeper
    """
    team = []
    
    # batsmen who batted
    batsmen = batting_sheet.find_all('div', class_ = "wrap batsmen")
    for bat in batsmen:
        player = bat.find('div', class_ = "cell batsmen").text
        if player.find('(c)')>-1:
            captain = player.replace('(c)','').replace('†','').strip().lower()
        if player.find('†')>-1:
            keeper = player.replace('†','').replace('(c)','').strip().lower()
        player = player.replace('†','').replace('(c)','').strip()
        team.append(player.lower())
    did_not_bat = batting_sheet.find('div', class_ = "wrap dnb")
    
    
    # Now batsmen who did not bat
    for dnb in did_not_bat.find_all('a'):
        dnb = dnb.text.lower()
        if dnb.find('(c)')>-1:
            captain = dnb.replace('(c)','').replace('†','').strip().lower()
        if dnb.find('†')>-1:
            keeper = dnb.replace('†','').replace('(c)','').strip().lower()
        team.append(dnb.replace('†','').replace('(c)','').strip().lower())
    return team, captain, keeper


def get_full_name(name, team):
    """Accepts last name and returns players name
    """
    
    if len(team)==0:
        return name
    name = name.lower()
    name = name.replace('†','').replace('(c)','')
    for member in team:
        if member.lower().find(name) > -1:
            return member
        
def get_innings(innings, teams):
    """Returns batting and bowling innings
    """
    
    for i,inning in enumerate(innings):
        for team in teams:
            if inning.find(team)>-1:
                innings[i] = team
    
    return innings
                

        
def get_toss_outcome(toss_string):
    pass

In [13]:

outs = ['b', 'c', 'lbw', 'st', 'hw', 'ro', 'hand', 'obs']

def how_out(dismissal, team):
    """Returns triplet tuple as (how, fielder, bowler)
    """
    
    dismissal = dismissal.lower().strip()
    print(dismissal)
    if len(dismissal)==0:
        return '','',''
    
    
    
    if dismissal == 'not out':
        return 'not out', '', ''
    
    if dismissal.find('retired ')>-1:
        return 'retired hurt', '', ''
    
    if dismissal.find('absent') > -1 and dismissal.find('hurt') > -1:
        return 'absent hurt', '', ''
    
    bowler = re.findall(r"b\s(.+)", dismissal)
    fielder = re.findall(r"c\s(.+)\sb", dismissal)
    
    ## No fielder involved
    # run out
    if dismissal.find('run out')>-1:
        how = 'run out'
        fielder = ''
        bowler = ''
        return how, fielder, bowler

    # handled
    if dismissal.find('handled')>-1:
        how = 'handled the ball'
        fielder = ''
        bowler = ''
        return how, fielder, bowler

    # obstructing the fieldsman
    if dismissal.find('obstruct') > -1:
        how = 'obstruct'
        fielder = ''
        blower = ''
        return how, fielder, bowler

    if len(bowler)>0:
        bowler = bowler[0]
    if len(fielder)>0:
        fielder = fielder[0]
        
    # caught and bowled
    if dismissal.find('&') > -1:
        how = 'caught'
        bowler = get_full_name(bowler, team)
        # fielder = bowler
        return how, bowler, bowler
    
    if len(fielder) == 0:
        keeper = re.findall(r"st\s†(.+)\sb", dismissal)
        if len(keeper)>0:
            keeper = keeper[0]
    
    # bowled
    if dismissal == 'b ' + bowler:
        how = 'bowled'
        fielder = ''
        bowler = get_full_name(bowler, team)
        return how, fielder, bowler
    
    # caught
    if len(fielder) > 0:
        how = 'caught'
        fielder = get_full_name(fielder, team)
        bowler = get_full_name(bowler, team)
        return how, fielder, bowler
    
    # lbw
    if dismissal.find('lbw')>-1 or dismissal.find('leg before')>-1:
        how = 'lbw'
        fielder = ''
        bowler = get_full_name(bowler, team)
        return how, fielder, bowler
    
    # hit wicket
    if dismissal.find('hit wicket')>-1:
        how = 'hit wicket'
        fielder = ''
        bowler = get_full_name(bowler, team)
        return how, fielder, bowler

    
    # Stumped
    if dismissal.find('st †') > -1:
        how = 'stumped'
        fielder = get_full_name(keeper, team)
        bowler = get_full_name(bowler, team)
        return 'stumped',fielder, bowler


In [14]:
dismissal = 'obstructing the field'
# bowler = re.findall(r"b\s(.+)", dismissal)
# fielder = re.findall(r"c\s(.+)\sb", dismissal)
# print(bowler, fielder)
# dismissal.find('&')
how_out(dismissal, [])

obstructing the field


('obstruct', '', [])

In [26]:
%%time
def get_scoresheet(matchID, year, link, team_1, team_2, winner, match_format):
    """Get all information about a single match
    MATCH_DETAILS: Stadium, toss, umpires, etc
    TEAM_INFO: Teams, batting team, bowling team, squad, captyain, keeper
    BATTING_INFO: Batting scorecards of all innings
    BOWLING_INFO: Bowling Scorecards of all innings
    """


    # Initialize dictionary to be returned
    all_info = {}
    

    # Define opponents
    team_names = [team_1, team_2]
    opponents = {}
    opponents[team_1] = team_2
    opponents[team_2] = team_1

    # Send get request and get HTML
    r = requests.get(link)
    soup = BeautifulSoup(r.content,'lxml')
    if soup.find('span',class_ = "cscore_notes_game").text.lower().strip().find('no result') > -1:
        return all_info

    # Number of score sheets = number of innings
    # Each scoresheet contains both batting and bowling information
    scoresheets = soup.find_all('article', class_ = "sub-module scorecard")
    match_details = soup.find('div', class_ = "match-detail-container")

    ##############################################################################################
    ############################# .   Get MATCH DETAILS   ############################################
    ##############################################################################################


    MATCH_DETAILS = {}
    MATCH_DETAILS['stadium'] = match_details.find('div', class_ = "stadium-details").text.lower().strip()
    for detail in match_details.find_all('div', class_ = "match-detail--item"):
        left = detail.find('div', class_ = "match-detail--left").text.lower().strip()
        MATCH_DETAILS[left] = []
        for r in detail.find('div', class_ = "match-detail--right").find_all('span'):
            MATCH_DETAILS[left].append(r.text.lower().strip())
    MATCH_DETAILS
    toss = re.findall(r'(.+),',MATCH_DETAILS['toss'][0])[0].strip()
    stadium = MATCH_DETAILS['stadium']


    ##############################################################################################
    ############################      Get TEAM INFO    ###########################################
    ##############################################################################################


    num_innings = len(scoresheets)

    bat_innings = []
    ball_innings = []

    squad = {}
    squad[team_1] = []
    squad[team_2] = []

    captain = {}
    captain[team_1] = []
    captain[team_2] = []

    keeper = {}
    keeper[team_1] = []
    keeper[team_2] = []

    # Get innings Info, team, captain, keeper
    for i,sheet in enumerate(scoresheets):


        if match_format == 'odi':
            my_str = sheet.find('div', class_ = "accordion-header").text.lower().strip()
            this_innings = re.findall(r'(.+)\sinnings', my_str)[0]
        elif match_format == 'test':
            my_str = sheet.find('div', class_ = "accordion-header").text.lower().strip()
            this_innings = re.findall(r'(.+)\s\d', my_str)[0]

        bat_innings.append(this_innings)

    ball_innings = [opponents[x] for x in bat_innings]

    # Get squad for team batting first
    team_batting_first = bat_innings[0]
    batting_sheet = scoresheets[0].find("div", class_="scorecard-section batsmen")
    squad[team_batting_first], captain[team_batting_first], keeper[team_batting_first] = get_squad(batting_sheet)

    # Get squad for team batting second
    team_batting_second = bat_innings[1]
    batting_sheet = scoresheets[1].find("div", class_="scorecard-section batsmen")
    squad[team_batting_second], captain[team_batting_second], keeper[team_batting_second] = get_squad(batting_sheet)


    TEAM_INFO = {}
    TEAM_INFO['num_innings'] = num_innings

    TEAM_INFO['bat_innings'] = bat_innings
    TEAM_INFO['ball_innings'] = ball_innings

    TEAM_INFO['team_batting_first'] = team_batting_first
    TEAM_INFO['team_batting_second'] = team_batting_second

    TEAM_INFO['squad'] = squad
    TEAM_INFO['captain'] = captain
    TEAM_INFO['keeper'] = keeper




    #############################################################################################
    ####################    Get Scoresheets/ Batting and Bowling        #########################
    #############################################################################################


    # Now get batting and bowling info
    BATTING = {}
    BOWLING = {}
    for s,sheet in enumerate(scoresheets):

        # batting_sheet = sheet.find("div", class_="scorecard-section batsmen")
        # bowling_sheet = sheet.find("div", class_ = "scorecard-section bowling")
        inning = 'inning_' + str(s+1)
        print("In:", inning)

        batting_team = bat_innings[s]
        bowling_team = ball_innings[s]

        #####################################################
        ############### Batting Info  #######################
        #####################################################


        batting_sheet = sheet.find("div", class_="scorecard-section batsmen")

        ## DEFINE HEADERS
        # get all the header.
        # Some games have SR and balls, some not, so important to get it on a game-by-game basis
        all_headers = []
        for header in batting_sheet.find('div', class_ = "wrap header").find_all('div'):
            all_headers.append(header.text.lower())

        # The commentary header is usually emptty: just add commentary
        if len(all_headers[1])==0:
            all_headers[1] = 'commentary'

        # Create dataframe
        all_headers =  all_headers + ['how', 'fielder', 'bowler'] + ['matchID', 'year', 'team', 'innings', 'toss', 'result', 'stadium']
        score_card_batting = pd.DataFrame(columns = all_headers)

        # Iterate over batesmen
        all_batsmen  = batting_sheet.find_all('div', class_ = "wrap batsmen")
        for i, batsman in enumerate(all_batsmen):

            # iterate over cells
            for j,info in enumerate(batsman.find_all('div')):
                # print(j,info.text, type(info.text))

                if j == 0: #name
                    player = info.text.lower().replace('(c)','').replace('†','').strip()
                    score_card_batting.loc[i, all_headers[j]] = player
                    continue
                if j == 1: #commentary
                    # print(info.text.lower().strip())
                    how,fielder,bowler = how_out(info.text.lower().strip(), squad[bowling_team])


                score_card_batting.loc[i, 'how'] = how
                score_card_batting.loc[i, 'fielder'] = fielder
                score_card_batting.loc[i, 'bowler'] = bowler
                score_card_batting.loc[i, all_headers[j]] = info.text.lower().strip()

        did_not_bat = set(squad[batting_team]) - set(score_card_batting.batsmen.values)
        #print(did_not_bat)

        l = len(score_card_batting)
        for dnb in did_not_bat:
            score_card_batting.loc[l, all_headers[0]] = dnb
            l = l+1

        # Now get extras
        extras = sheet.find('div', class_ = "wrap extras").find_all('div')[1].text
        l = len(score_card_batting)
        score_card_batting.loc[l, all_headers[0]] = 'extras'
        score_card_batting.loc[l, all_headers[1]] =  extras

        # Total
        total = sheet.find('div', class_ = "wrap total").find_all('div')[1].text
        l = len(score_card_batting)
        score_card_batting.loc[l, all_headers[0]] = 'total'
        score_card_batting.loc[l, all_headers[1]] =  total


        # Now fill match-cells: common to all batsmen
        score_card_batting.loc[:, 'matchID'] = matchID
        score_card_batting.loc[:, 'year'] = year
        score_card_batting.loc[:, 'innings'] = s+1
        score_card_batting.loc[:, 'team'] = batting_team

        score_card_batting.loc[:, 'stadium'] = stadium
        score_card_batting.loc[:, 'toss'] = toss
        score_card_batting.loc[:, 'result'] = winner

        BATTING[inning] = score_card_batting.fillna(0).replace('-','0')

        #####################################################
        ############### Bowling Info  #######################
        #####################################################


        bowling_sheet = sheet.find("div", class_ = "scorecard-section bowling")
        bowling_headers = []
        for header in bowling_sheet.find('thead').find_all('th'):
            bowling_headers.append(header.text.lower().strip())

        bowling_headers = bowling_headers +  ['matchID', 'year', 'team', 'innings']
        score_card_bowling = pd.DataFrame(columns = bowling_headers)  

        # bowlers who bowled
        for b,bowler in enumerate(bowling_sheet.find('tbody').find_all('tr')): 
            for c,cell in enumerate(bowler.find_all('td')):
                score_card_bowling.loc[b,bowling_headers[c]]= cell.text.lower().strip()
        # bowlers who did not bowl
        dnb = set(squad[bowling_team]) - set(score_card_bowling[bowling_headers[0]].values)
        l = len(score_card_bowling)
        for d in dnb:
            score_card_bowling.loc[l,bowling_headers[0]] = d
            l = l+1

        # Now fill match-cells: common to all bowlers
        score_card_bowling.loc[:, 'matchID'] = matchID
        score_card_bowling.loc[:, 'year'] = year
        score_card_bowling.loc[:, 'innings'] = s+1
        score_card_bowling.loc[:, 'team'] = bowling_team

        score_card_bowling.loc[:, 'stadium'] = stadium
        score_card_bowling.loc[:, 'toss'] = toss
        score_card_bowling.loc[:, 'result'] = winner

        BOWLING[inning] = score_card_bowling.fillna(0).replace('-','0')



    ## SAVE EVERYTHING IN 1 DICTIONARY
    all_info = {}
    all_info['link'] = link
    all_info['format'] = match_format
    all_info['team_info'] = TEAM_INFO
    all_info['match_details'] = MATCH_DETAILS
    all_info['batting'] = BATTING
    all_info['bowling'] = BOWLING

    return all_info

CPU times: user 7 µs, sys: 1e+03 ns, total: 8 µs
Wall time: 20 µs


# Get ODI Match Data

In [9]:
odis = pickle.load(open('odi_list', "rb" ))
info = {}

In [27]:
skip_odis = []
match_format = 'odi'
for year in odis.keys():
    clear_output()
#     if year != 2019:
#         continue
    table = odis[year]
    table[table.values == 'u.a.e.'] = 'united arab emirates'
    table[table.values == 'u.s.a.'] = 'united states of america'
    table[table.values == 'p.n.g.'] = 'papua new guinea'
    
    for row in table.iterrows():
        clear_output()
        matchID = row[0]
        team1 = row[1].team_1
        team2 = row[1].team_2
        winner = row[1].winner
        link = row[1].scorecard_link
        ground = row[1].ground
        
        print(matchID)
        print(link)
        
        if matchID in skip_odis:
            continue
        if matchID not in info:
            info[matchID] = get_scoresheet(matchID, year, link, team1, team2, winner, match_format)
            if len(info[matchID])==0:
                skip_odis.append(matchID)


odi # 4233
http://stats.espncricinfo.com//ci/engine/match/1187029.html
In: inning_1
c †rahul b mohammed shami
run out (jadeja/iyer/mohammed shami)
c iyer b mohammed shami
c kohli b jadeja
c sub (ys chahal) b jadeja
c iyer b kuldeep yadav
c †rahul b saini
not out
b mohammed shami
b mohammed shami
not out
In: inning_2
c starc b zampa
lbw b agar
b hazlewood
not out
not out


In [32]:
info['missed_matches'] = skip_odis

In [33]:
info.keys()

dict_keys(['odi # 1', 'odi # 2', 'odi # 3', 'odi # 4', 'odi # 5', 'odi # 6', 'odi # 7', 'odi # 8', 'odi # 9', 'odi # 10', 'odi # 11', 'odi # 12', 'odi # 13', 'odi # 14', 'odi # 15', 'odi # 16', 'odi # 17', 'odi # 18', 'odi # 19', 'odi # 20', 'odi # 21', 'odi # 22', 'odi # 23', 'odi # 24', 'odi # 25', 'odi # 26', 'odi # 27', 'odi # 28', 'odi # 29', 'odi # 30', 'odi # 31', 'odi # 32', 'odi # 33', 'odi # 34', 'odi # 35', 'odi # 36', 'odi # 37', 'odi # 38', 'odi # 39', 'odi # 40', 'odi # 41', 'odi # 42', 'odi # 43', 'odi # 44', 'odi # 45', 'odi # 46', 'odi # 47', 'odi # 48', 'odi # 49', 'odi # 50', 'odi # 51', 'odi # 52', 'odi # 53', 'odi # 54', 'odi # 55', 'odi # 56', 'odi # 57', 'odi # 58', 'odi # 59', 'odi # 60', 'odi # 61', 'odi # 62', 'odi # 63', 'odi # 64', 'odi # 65', 'odi # 66', 'odi # 67', 'odi # 68', 'odi # 69', 'odi # 70', 'odi # 71', 'odi # 72', 'odi # 73', 'odi # 74', 'odi # 75', 'odi # 76', 'odi # 77', 'odi # 78', 'odi # 79', 'odi # 80', 'odi # 81', 'odi # 82', 'odi # 83', 'o

In [34]:
pickle.dump(info,open('odi_info_new', "wb" ))

In [22]:
table[table.values == 'u.a.e.'] = 'united arab emirates'

In [23]:
table[table.values == 'united arab emirates']

Unnamed: 0_level_0,team_1,team_2,winner,margin,ground,match_date,scorecard_link
odi_index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
odi # 904,united arab emirates,india,india,71 runs,sharjah,"apr 13, 1994",http://stats.espncricinfo.com//ci/engine/match...
odi # 908,united arab emirates,pakistan,pakistan,9 wickets,sharjah,"apr 17, 1994",http://stats.espncricinfo.com//ci/engine/match...


In [35]:
skip_odis

['odi # 2141',
 'odi # 2159',
 'odi # 2208',
 'odi # 2225',
 'odi # 2256',
 'odi # 2271',
 'odi # 2292',
 'odi # 2351',
 'odi # 2372',
 'odi # 2401',
 'odi # 2405',
 'odi # 2408',
 'odi # 2416',
 'odi # 2472',
 'odi # 2514',
 'odi # 2517',
 'odi # 2602',
 'odi # 2603',
 'odi # 2621',
 'odi # 2656',
 'odi # 2670',
 'odi # 2672',
 'odi # 2701',
 'odi # 2709',
 'odi # 2741',
 'odi # 2743',
 'odi # 2753',
 'odi # 2761',
 'odi # 2786',
 'odi # 2789',
 'odi # 2820',
 'odi # 2822',
 'odi # 2855',
 'odi # 2877',
 'odi # 2901',
 'odi # 2936',
 'odi # 2954',
 'odi # 3037',
 'odi # 3088',
 'odi # 3092',
 'odi # 3119',
 'odi # 3186',
 'odi # 3274',
 'odi # 3279',
 'odi # 3296',
 'odi # 3308',
 'odi # 3324',
 'odi # 3351',
 'odi # 3369',
 'odi # 3405',
 'odi # 3414',
 'odi # 3422',
 'odi # 3432',
 'odi # 3444',
 'odi # 3499',
 'odi # 3538',
 'odi # 3580',
 'odi # 3584',
 'odi # 3592',
 'odi # 3650',
 'odi # 3653',
 'odi # 3718',
 'odi # 3750',
 'odi # 3755',
 'odi # 3759',
 'odi # 3778',
 'odi # 38