In [1]:
#importing the necessary libraries
import pandas as pd
import os
import re

import nltk
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')
from nltk.stem import PorterStemmer, SnowballStemmer
from nltk.tokenize import word_tokenize
from nltk.stem.wordnet import WordNetLemmatizer

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\ASUS\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\ASUS\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\ASUS\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [82]:
from nltk.corpus import stopwords
import string

# listing all the stopwords to reduce the dimension of the commentary text
# Also making sure not to remove some words and character need in text minning
stop_words=set(stopwords.words("english"))|set(string.punctuation)
stop_words=stop_words-set(['off','on','-'])


def comment_processing(x):
    '''
    x -> a single text commentary
    
    Applying the processing technique to increase the quality of our data,
    like replacing mulptiple phrase of region/pitch with the standard phrase
    based on the phrases statistics of the commentary 
    
    Then applying word tokenizing to remove the stop-words further enhancing the qualitiy of the text.
    '''
    
    x=re.sub("(midwicket|mid wicket)","mid-wicket",x.lower())
    x=re.sub("third man","third-man",x)
    x=re.sub('(squareleg|square leg|square-leg)',"square-leg",x)
    x=re.sub('fine leg',"fine-leg",x)
    x=re.sub('extra cover',"extra-cover",x)
    x=re.sub('cover point',"cover-point",x)
    x=re.sub('points',"point",x)
    x=re.sub('covers',"cover",x)
    x=re.sub("mid on","mid-on",x)
    x=re.sub("mid off","mid-off",x)
    x=re.sub("long off","long-off",x)
    x=re.sub("long on","long-on",x)
    x=re.sub('slips',"slip",x)
    x=word_tokenize(x)
    
    return " ".join([word.strip() for word in x if word not in stop_words])

In [24]:
def rem_technique(batsman_commentary,pitch_key=None):
    '''
    The text mining algorithm used in the reference paper
    
    '''
    ground_region=['mid-wicket', 'mid-on', 'mid-off', 'long-on', 'long-off', 'third-man', 'square-leg', 'fine-leg', 'extra-cover', 'cover-point', 'point', 'cover', 'slip', 'gully']

    pitch_region={
    'bouncer':'bouncer',
    'short-length':'short and|short delivery|ball short|short ball|short-ball|short-length|short length|shorter',
    'back of length':'back of length|back-of-length',
    'good-length':'good delivery|good ball|good-ball|good-length|good length',
    'full-length':'full and|full delivery|full ball|full-ball|full-length|full length|fuller',
    'yorker':'yorker'
    }
    
    #pattern to check if a text contains edge or not
    pattern_edge='(inside|outside|bottom|leading|top|an)(\s|-)edges?'
    
    #pattern to extract all the ground region from the text
    ground_region_pattern="|".join(ground_region)
    
    #dictionary database to store the region wise score, balls -> for strong region
    #along with counts of th enumber of times a btsman played edge or he is given out. -> for weak region
    region_wise_score={rg:0 for rg in ground_region}
    region_wise_balls={rg:0 for rg in ground_region}
    out_edge_region_count={rg:0 for rg in ground_region}
    
    weak_region=None
    strong_region=None  
    
    for each_line in batsman_commentary.iterrows():
        out=each_line[1].isWicket
        score=each_line[1].batsmanRuns
        text=each_line[1].comment_text.lower()
        
        region=set(re.findall(ground_region_pattern,text))
        edge=re.findall(pattern_edge,text)
        
        if not pitch_key:
            if (edge and not score) or out:
                for gr in region:
                    out_edge_region_count[gr]+=1
            else:
                for gr in region:
                    if score:
                        region_wise_score[gr]+=score
                    region_wise_balls[gr]+=1
                
        else:
            pitch=re.findall(pitch_region[pitch_key],text)
            if pitch:
                if (edge and not score) or out:
                    for gr in region:
                        out_edge_region_count[gr]+=1
                else:
                    for gr in region:
                        if score:
                            region_wise_score[gr]+=score
                        region_wise_balls[gr]+=1
                    
        strong_regions=sorted(region_wise_score.items(),key=lambda x:x[1],reverse=True)
        weak_regions=sorted(out_edge_region_count.items(),key=lambda x:x[1],reverse=True)
            
    return strong_regions,weak_regions,region_wise_balls

In [78]:
'''
reading the top10 batsman data and choosing relevant columns
and then applyin the text processing technique on the text
'''
top10_df=pd.read_csv("./data/top10_batsman_data.csv").dropna()
top10_df=top10_df[['batting_team','batsmanRuns', 'isWicket', 'batsman_name','comment_text']]

top10_df['comment_text']=top10_df.comment_text.apply(comment_processing)

In [88]:
topk_regions=3        #top strong or weak region hereI have taken as 3
result_table={}       #store the strong and weak region of each player
strike_rate_data={}   # to store the runs and balls region wise

for batsman in top10_df.batsman_name.unique():
    result_table[batsman]={}
    
    #segment the batsman data
    batsman_data=top10_df[top10_df.batsman_name==batsman]
    
    #call the text mining algorithm
    strong_regions,weak_regions,balls_per_region=rem_technique(batsman_data)
    
    
    #you can skip this code it is just to show the output of Kohli
    if batsman=="Virat Kohli":
        print("Virat Kohli Result")
        print(f"Strong regions:\n{strong_regions[:5]}")
        print(f"Weak regions:\n{weak_regions[:5]}")
        
        
    team_name=batsman_data.batting_team.mode()[0]
    result_table[batsman]['Team']=team_name
    result_table[batsman]['Strong Region'] = [i for i, j in strong_regions[:topk_regions]]
    result_table[batsman]['Weak Region'] = [i for i, j in weak_regions[:topk_regions]]
    strike_rate_data[batsman]=[batsman,team_name,strong_regions,balls_per_region]
 

'''
Here we can see the result that algorithm has calculated
AS you can see the strong region is the region in which a player score maximum runs
and weak region is one in which he get dismissed or played edge
So we only need to choose top 2/3 region as per the paper
'''

Virat Kohli Result
Strong regions:
[('mid-wicket', 443), ('long-on', 384), ('cover', 284), ('point', 244), ('long-off', 242)]
Weak regions:
[('mid-wicket', 13), ('point', 6), ('cover', 6), ('long-on', 5), ('long-off', 3)]


In [84]:
'''
Here we printed the strong and weak region of each batsman from the above results
I have shown top-3 region for both the case since mid-wicket area is common for all batsman
'''
strong_weak_table=pd.DataFrame(result_table).T
strong_weak_table.index.name = 'Batsman Name'
strong_weak_table

Unnamed: 0_level_0,Team,Strong Region,Weak Region
Batsman Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Aaron Finch,Australia,"[mid-wicket, cover, long-on]","[mid-wicket, cover, mid-on]"
Babar Azam,Pakistan,"[mid-wicket, cover, long-on]","[mid-wicket, long-on, third-man]"
Colin Munro,New Zealand,"[mid-wicket, long-on, square-leg]","[long-on, mid-wicket, point]"
Fakhar Zaman,Pakistan,"[point, mid-wicket, cover]","[cover, mid-wicket, mid-off]"
Glenn Maxwell,Australia,"[mid-wicket, long-on, cover]","[long-on, mid-wicket, square-leg]"
Kane Williamson,New Zealand,"[mid-wicket, square-leg, cover]","[mid-wicket, cover, square-leg]"
Quinton de Kock,South Africa,"[mid-wicket, cover, square-leg]","[mid-wicket, square-leg, cover]"
Rohit Sharma,India,"[mid-wicket, long-on, point]","[mid-wicket, third-man, cover]"
Ross Taylor,New Zealand,"[mid-wicket, long-on, square-leg]","[mid-wicket, third-man, point]"
Virat Kohli,India,"[mid-wicket, long-on, cover]","[mid-wicket, point, cover]"


In [62]:
def get_region_wise_strike_rate(batsman_data):
    '''
    calculate the region wise strike rate for a givne batsman
    the formula is same as used in the refernce paper.
    '''
    
    region_strike_rate={}
    
    region_strike_rate['Batsman Name']=batsman_data[0]
    region_strike_rate['Team']=batsman_data[1]
    region_score=dict(batsman_data[2])
    region_balls=dict(batsman_data[3])
    
    for region in region_score:
        try:
            region_strike_rate[region]=round(100*region_score[region]/region_balls[region])
        except:
            region_strike_rate[region]=None
    return region_strike_rate


In [75]:
'''
compute and print the result -> region wise strike rate of top10 batsman
'''

SR_result=[]
for batsman in strike_rate_data:
    SR_result.append(get_region_wise_strike_rate(strike_rate_data[batsman]))
    
strike_rate_table=pd.DataFrame(SR_result)
strike_rate_table=strike_rate_table[['Batsman Name', 'Team','mid-wicket', 'mid-on', 'mid-off',  'square-leg','fine-leg','cover-point','extra-cover','point','cover','long-on', 'long-off', 'third-man','slip']]

strike_rate_table

Unnamed: 0,Batsman Name,Team,mid-wicket,mid-on,mid-off,square-leg,fine-leg,cover-point,extra-cover,point,cover,long-on,long-off,third-man,slip
0,Aaron Finch,Australia,212,164,157,225,254,225.0,267,166,146,253,233,157,210
1,Babar Azam,Pakistan,162,103,139,139,184,67.0,109,145,131,153,161,175,271
2,Colin Munro,New Zealand,219,132,162,248,214,,200,179,153,268,249,223,129
3,Fakhar Zaman,Pakistan,174,100,129,190,155,260.0,167,179,139,259,173,192,325
4,Glenn Maxwell,Australia,206,136,184,215,250,50.0,230,198,157,255,216,188,175
5,Kane Williamson,New Zealand,159,118,168,172,251,214.0,241,134,130,152,178,141,123
6,Quinton de Kock,South Africa,196,149,161,197,210,286.0,205,134,167,174,176,161,271
7,Rohit Sharma,India,204,138,181,180,266,94.0,129,151,123,189,216,160,136
8,Ross Taylor,New Zealand,199,80,62,198,220,140.0,150,124,102,202,143,190,182
9,Virat Kohli,India,158,149,167,168,223,131.0,236,148,142,193,205,155,229


In [92]:
'''
Although this part is not needed but I have added it

here we can get pitch-wise strong and weak region of a batsman
The paper has not said/used any approach to calculate it
So I have implemented on my own.
Uncomment it
'''

# pitch_region = ['bouncer', 'short-length', 'back of length', 'good-length', 'full-length', 'yorker']

# def pitch_wise_weak_strong(score,out_edge):
#     combined_score={}
#     combined_out_edge={}
#     for region in score[0]:
#         combined_score[region]=sum([each_pitch[region] for each_pitch in score])
#         combined_out_edge[region]=sum([each_pitch[region] for each_pitch in out_edge])
#     return combined_score,combined_out_edge

# score1,out_edge1,_=rem_technique(batsman_data,pitch_key='full-length')
# score2,out_edge2,_=rem_technique(batsman_data,pitch_key='good-length')
# pitch_wise_weak_strong([dict(score1),dict(score2)],[dict(out_edge1),dict(out_edge2)])