In [1]:
import re
import colored
import pandas as pd
import datetime as dt
import seaborn as sns
from pathlib import Path
import matplotlib.pyplot as plt
from IPython.display import display

# Global seaborn theme used by every figure in this notebook.
sns.set(style="whitegrid")
# Plot-type selectors understood by the plot_* helpers defined below.
bar_plot='bar'
line_plot='line'
point_plot='point'
horizontal_plot='hbar'
box_plot='box'
# Grey colour map used for the background gradient of the displayed summary tables.
cm=sns.light_palette("gray", as_cmap=True)
# Dataset selectors accepted by clean_data().
match_data_type='match'
batting_data_type='batting'
bowling_data_type='bowling'
# Section / figure counters mutated by draw_separator() to number the output ("Fig <main_count>.<count>").
main_count=0
count=0

In [2]:
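# NOTE: disabled helper, kept for reference only. It inner-joins the innings-level CSV
# with the match-results CSV on their shared columns and writes the merged frame to Final_Data.
# def merge_espn_data(Innings_Details,Match_Results,Final_Data):    
#     innings_data = pd.read_csv(Path(Innings_Details))
#     results_data = pd.read_csv(Path(Match_Results))
#     merge_col=['team','result','opposition','ground','date']
#     new_data=pd.merge(innings_data,results_data,how='inner',left_on=merge_col,right_on=merge_col)
#     new_data.to_csv(Path(Final_Data))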

In [3]:
def draw_separator(is_header, separator, message):
    """Print a numbered section banner or a numbered figure caption.

    Parameters
    ----------
    is_header : bool
        Truthy -> print a three-line section banner and start a new section;
        falsy -> print a figure caption inside the current section.
    separator : str
        Character repeated to draw the banner rules. It is only used for
        section banners; figure captions are always underlined with '~'.
    message : str
        Banner / caption text.

    Side effects
    ------------
    Updates the module-level counters: a banner increments ``main_count`` and
    resets ``count`` to 0; a caption increments ``count``.
    """
    global main_count
    global count

    # Every printed line starts with the same grey + bold escape sequence.
    text_style = colored.fg("grey_30") + colored.attr('bold')

    if not is_header:
        count += 1
        print()
        caption_head = 'Fig ' + str(main_count) + '.' + str(count)
        print(text_style + caption_head, '-', message)
        # The underline is 12 characters longer than the message so that it
        # also spans the "Fig x.y - " prefix.
        print(text_style + '~' * (len(message) + 12))
        return

    main_count += 1
    count = 0
    banner_width = 80
    rule = text_style + separator * banner_width
    # Left padding that roughly centres "<n>. <message>" inside the banner.
    left_padding = ' ' * int((banner_width - len(message) + 5) / 2)
    print(rule)
    print(text_style + left_padding, str(main_count) + '. ' + message)
    print(rule)

In [4]:
# Matches the "(TEAM)" tag that follows a player's name in the batting/bowling data.
_TEAM_TAG_PATTERN = '[(]\\w+[)]'


def _normalise_text(frame):
    """Return a new frame with every cell converted to a lower-cased, stripped string."""
    # Column-wise Series.map is used instead of DataFrame.applymap, which is
    # deprecated in recent pandas releases.
    return frame.apply(lambda column: column.map(lambda value: str(value).lower().strip()))


def _strip_versus_prefix(frame):
    """Remove the leading 'v' marker from the ``opposition`` column (in place) and return the frame."""
    frame['opposition'] = frame['opposition'].map(lambda name: re.sub('^v', '', name))
    return frame


def _split_player_and_team(frame):
    """Move the "(TEAM)" tag of the ``player`` column into a new ``team`` column (in place)."""
    frame['team'] = frame['player'].map(
        lambda name: re.sub('[)(]', '', re.findall(_TEAM_TAG_PATTERN, name)[0]))
    frame['player'] = frame['player'].map(lambda name: re.sub(_TEAM_TAG_PATTERN, '', name))
    return frame


def _add_date_parts(frame):
    """Parse the ``date`` column and derive the ``year`` column (in place)."""
    frame['date'] = pd.to_datetime(frame['date'])
    frame['year'] = frame['date'].dt.year
    return frame


def _clean_match_data(data):
    """Clean match-level rows; ``data`` must be a private copy."""
    data = _normalise_text(_strip_versus_prefix(data)).drop_duplicates(keep='first')
    # Only decided matches are analysed; copy so later column assignments do not
    # write into a slice of the de-duplicated frame.
    data = data.loc[data['result'].isin(['won', 'lost'])].copy()
    data = _add_date_parts(data)
    data['toss'] = data['toss'].map(lambda outcome: 'toss ' + outcome)
    data['won'] = data['result'].map(lambda outcome: 1 if outcome == 'won' else 0)
    data['lost'] = data['result'].map(lambda outcome: 1 if outcome == 'lost' else 0)
    data['batting'] = data['innings'].map(lambda number: '1st' if int(number) == 1 else '2nd')
    # A score is either "runs" (treated as all ten wickets lost) or "runs/wickets".
    score_parts = data['score'].map(lambda score: score.split('/'))
    runs = score_parts.map(lambda parts: int(parts[0]))
    data['score_range'] = runs.map(lambda value: int(value / 30) * 30)  # 30-run buckets
    data['wickets'] = score_parts.map(lambda parts: 10 if len(parts) == 1 else int(parts[1]))
    data['score'] = runs
    return data


def _clean_batting_data(data):
    """Clean per-innings batting rows; ``data`` must be a private copy."""
    data = _split_player_and_team(data)
    # str() keeps this working when the column was parsed as numeric; '*' marks a not-out score.
    data['runs'] = data['runs'].map(lambda value: re.sub('[*]', '', str(value)))
    data = _normalise_text(_strip_versus_prefix(data)).drop_duplicates(keep='first')
    # Drop the rows whose runs are the 'dnb' / 'tdnb' markers before converting to int.
    data = data.loc[~data['runs'].isin(['dnb', 'tdnb'])].copy()
    for column in ('runs', 'fours', 'sixes'):
        data[column] = data[column].map(int)
    data = _add_date_parts(data)
    data['batting'] = data['batting'].map(lambda number: '1st' if int(number) == 1 else '2nd')
    return data


def _clean_bowling_data(data):
    """Clean per-innings bowling rows; ``data`` must be a private copy."""
    data = _split_player_and_team(data)
    data = _normalise_text(_strip_versus_prefix(data)).drop_duplicates(keep='first')
    data['date'] = pd.to_datetime(data['date'])
    data['wickets'] = data['wickets'].map(int)
    data['year'] = data['date'].dt.year
    return data


def _attach_ground_info(data, ground_data):
    """Inner-join the ground table on ``ground`` and flag each row as 'home' or 'away'."""
    merged = pd.merge(data, ground_data, how='inner', on='ground')
    # A row is a home fixture when the team name equals the ground's country.
    merged['series'] = ['home' if team == country else 'away'
                        for team, country in zip(merged['team'], merged['country'])]
    return merged


def clean_data(data_type,data,ground_data):
    """Normalise a raw stats frame and enrich it with ground information.

    Parameters
    ----------
    data_type : str
        One of the module constants ``match_data_type``, ``batting_data_type``
        or ``bowling_data_type``; selects which cleaning rules are applied.
    data : pandas.DataFrame
        Raw rows as loaded from CSV. The frame is not modified; all work is
        done on a copy.
    ground_data : pandas.DataFrame
        Ground lookup table; the code relies on its ``ground`` and ``country``
        columns.

    Returns
    -------
    pandas.DataFrame
        The cleaned frame (all text lower-cased and stripped, duplicates
        removed, typed numeric/date columns, ``year`` and ``series`` columns
        added), inner-joined with ``ground_data``. For an unrecognised
        ``data_type`` the input ``data`` is returned unchanged.

    Raises
    ------
    IndexError
        For batting/bowling data when a ``player`` value carries no "(TEAM)" tag.
    """
    if data_type==match_data_type:
        cleaner=_clean_match_data
    elif data_type==batting_data_type:
        cleaner=_clean_batting_data
    elif data_type==bowling_data_type:
        cleaner=_clean_bowling_data
    else:
        return data
    ground_data=_normalise_text(ground_data)
    # Work on a copy so the caller's frame is never mutated.
    return _attach_ground_info(cleaner(data.copy()),ground_data)

In [5]:
def plot_stats(df,title,plot_type,groupby,attribute,team1,team2,as_opponents=False):
    """Tabulate and plot win percentages for one or two teams.

    Two figures are produced: the win percentage per (``groupby``, ``attribute``)
    pair, drawn with the requested ``plot_type``, and an "Impact" bar chart of
    the win percentage per ``attribute`` value alone. The matching summary
    tables are shown with ``display``.

    Parameters
    ----------
    df : pandas.DataFrame
        Cleaned match data; the code reads ``team``, ``opposition``, ``won``,
        ``lost`` plus the ``groupby`` and ``attribute`` columns.
    title : str
        Base title of the figures.
    plot_type : str
        One of ``bar_plot``, ``horizontal_plot``, ``line_plot``, ``point_plot``;
        any other value leaves the first figure empty.
    groupby, attribute : str
        Column used on the category axis and column used as hue.
    team1, team2 : str
        Team names (case-insensitive). ``team2`` may be '' to analyse one team.
    as_opponents : bool
        False: all matches of either team. True: only matches the two teams
        played against each other.
    """
    v_size=(10,4)
    h_size=(10,1)
    outcome_columns=['won','lost']
    first_team=team1.lower()
    second_team=team2.lower()

    if as_opponents:
        plot_title=title+' ['+team1+' vs '+team2+']'
        selected=(((df.team==first_team)&(df.opposition==second_team))|
                  ((df.team==second_team)&(df.opposition==first_team)))
    else:
        if team2!='':
            plot_title=title+' ['+team1+' and '+team2+'] - Historical Data'
        else:
            plot_title=title+' ['+team1+'] - Historical Data'
        selected=(df.team==first_team)|(df.team==second_team)
    data=df.loc[selected]

    # Columns are selected with a list: a tuple key such as grouped['won','lost']
    # is rejected by pandas 2.x.
    plot_data=data.groupby([groupby,attribute])[outcome_columns].sum().reset_index()
    plot_data['win percentage']=(plot_data.won/(plot_data.won+plot_data.lost)*100).round()
    plot_impact_data=data.groupby([attribute])[outcome_columns].sum().reset_index()
    plot_impact_data['win percentage']=(
        plot_impact_data.won/(plot_impact_data.won+plot_impact_data.lost)*100).round()

    draw_separator(False,'-',plot_title)
    display(plot_data.style.background_gradient(cmap=cm))
    plt.figure(1,figsize=v_size)
    plt.title(plot_title)
    if plot_type==bar_plot:
        sns.barplot(y='win percentage',x=groupby,hue=attribute,data=plot_data)
    elif plot_type==horizontal_plot:
        sns.barplot(x='win percentage',y=groupby,hue=attribute,data=plot_data)
    elif plot_type==line_plot:
        sns.lineplot(y='win percentage',x=groupby,hue=attribute,data=plot_data)
    elif plot_type==point_plot:
        sns.pointplot(y='win percentage',x=groupby,hue=attribute,data=plot_data)
    plt.show()

    # Second figure: overall effect of the attribute, ignoring the groupby column.
    plot_title=title+' Impact '
    plt.figure(2,figsize=h_size)
    plt.title(plot_title)
    sns.barplot(x='win percentage',y=attribute,data=plot_impact_data)
    display(plot_impact_data.style.background_gradient(cmap=cm))
    plt.show()

In [6]:
def get_team_stats(df, analysis_start_year, host_team, touring_team):
    """Plot the full set of team-level win-percentage charts.

    Rows of ``df`` from ``analysis_start_year`` onwards are passed to
    ``plot_stats`` once per entry of the table below, in order.
    """
    recent_matches = df.loc[df.year >= analysis_start_year]

    # (title, plot type, groupby column, hue column, team1, team2, as_opponents)
    chart_specs = [
        ('Win Percentage', line_plot, 'year', 'team', host_team, touring_team, False),
        ('Win Percentage', bar_plot, 'year', 'team', host_team, touring_team, True),
    ]
    # Toss / batting-order / home-away: one chart per team, then head-to-head.
    for chart_title, hue_column in (('Toss', 'toss'), ('Batting', 'batting'), ('Series', 'series')):
        chart_specs.append((chart_title, line_plot, 'year', hue_column, host_team, '', False))
        chart_specs.append((chart_title, line_plot, 'year', hue_column, touring_team, '', False))
        chart_specs.append((chart_title, horizontal_plot, 'team', hue_column,
                            host_team, touring_team, True))
    chart_specs.extend([
        ('Scores', line_plot, 'score_range', 'team', host_team, touring_team, False),
        ('Wickets', line_plot, 'wickets', 'team', host_team, touring_team, False),
        ('Scores', bar_plot, 'score_range', 'team', host_team, touring_team, True),
        ('Loss of Wickets', bar_plot, 'wickets', 'team', host_team, touring_team, True),
    ])

    for chart_title, kind, category, hue_column, first, second, head_to_head in chart_specs:
        plot_stats(recent_matches, chart_title, kind, category, hue_column,
                   first, second, head_to_head)

In [7]:
def plot_ground_stats(df,title,plot_type,groupby,attribute,grounds,teams=None):
    """Tabulate and plot per-ground statistics, optionally restricted to some teams.

    Parameters
    ----------
    df : pandas.DataFrame
        Cleaned match data; the code reads ``ground``, ``team``, ``won``,
        ``lost`` plus the ``groupby`` and ``attribute`` columns.
    title : str
        Base title of the figure.
    plot_type : str
        One of ``bar_plot``, ``horizontal_plot``, ``line_plot``, ``point_plot``;
        any other value leaves the figure empty.
    groupby, attribute : str
        Column used on the category axis and column used as hue.
    grounds : list of str
        Ground names exactly as stored in ``df.ground``.
    teams : list of str, optional
        Team names (case-insensitive). ``None`` or an empty list keeps all teams.

    Notes
    -----
    When ``teams`` is given and ``attribute`` is not ``'team'``, the column
    named 'win percentage' holds the NUMBER OF MATCHES in each group, not a
    percentage. In every other case it is ``won / (won + lost) * 100``, rounded.
    """
    v_size=(8,5)
    teams=[] if teams is None else teams

    if teams:
        plot_title=title+str(teams)+' - Historical Data'
    else:
        plot_title=title+' - Historical Data'

    # One boolean mask per filter instead of growing a frame with concat in a loop.
    data=df.loc[df.ground.isin(list(grounds))]
    if teams:
        data=data.loc[data.team.isin([team.lower() for team in teams])]

    grouped_data=data.groupby([groupby,attribute])
    if teams and attribute!='team':
        # Match count per group (see Notes).
        plot_data=grouped_data.size().reset_index(name='win percentage')
    else:
        # Columns are selected with a list: a tuple key is rejected by pandas 2.x.
        plot_data=grouped_data[['won','lost']].sum().reset_index()
        plot_data['win percentage']=(plot_data.won/(plot_data.won+plot_data.lost)*100).round()

    draw_separator(False,'-',plot_title)
    display(plot_data.style.background_gradient(cmap=cm))
    plt.figure(1,figsize=v_size)
    plt.title(plot_title)
    if plot_type==bar_plot:
        sns.barplot(y='win percentage',x=groupby,hue=attribute,data=plot_data)
    elif plot_type==horizontal_plot:
        sns.barplot(x='win percentage',y=groupby,hue=attribute,data=plot_data)
    elif plot_type==line_plot:
        sns.lineplot(y='win percentage',x=groupby,hue=attribute,data=plot_data)
    elif plot_type==point_plot:
        sns.pointplot(y='win percentage',x=groupby,hue=attribute,data=plot_data)
    plt.show()

In [8]:
def get_ground_stats(df, analysis_start_year, host_team, touring_team, series_venue):
    """Plot toss, batting-order and win statistics for the series venues.

    For 'Toss' and 'Batting' three charts are drawn: all teams, the host team
    only, and the touring team only. A final chart compares both teams per
    ground. ``analysis_start_year`` is accepted but not used here.
    """
    matches = df.copy()
    venues = [venue.lower() for venue in series_venue]

    for chart_title, hue_column in (('Toss', 'toss'), ('Batting', 'batting')):
        plot_ground_stats(matches, chart_title, bar_plot, 'ground', hue_column, venues)
        for team in (host_team, touring_team):
            plot_ground_stats(matches, chart_title, bar_plot, 'ground', hue_column, venues, [team])

    plot_ground_stats(matches, 'Win Percentage', bar_plot, 'ground', 'team', venues,
                      [host_team, touring_team])

In [9]:
def plot_graph(plot_data, plot_type, plot_title, x, y, hue):
    """Show ``plot_data`` as a styled table and as a seaborn chart.

    Parameters
    ----------
    plot_data : pandas.DataFrame
        Already aggregated data to display and plot.
    plot_type : str
        ``bar_plot``, ``horizontal_plot``, ``line_plot`` or ``point_plot``.
        Any other value draws nothing; only the title is set before showing.
    plot_title : str
        Caption and figure title.
    x, y, hue : str
        Category column, value column and hue column. For ``horizontal_plot``
        the two axes are swapped (``y`` is drawn along the x axis).
    """
    upright_size = (8, 5)
    flat_size = (10, 3)

    draw_separator(False, '-', plot_title)
    display(plot_data.style.background_gradient(cmap=cm))

    # (selector, figure number, figure size, seaborn function, swap axes?)
    chart_table = (
        (bar_plot, 1, upright_size, sns.barplot, False),
        (horizontal_plot, 2, flat_size, sns.barplot, True),
        (line_plot, 1, upright_size, sns.lineplot, False),
        (point_plot, 1, upright_size, sns.pointplot, False),
    )
    for selector, figure_number, figure_size, draw, swap_axes in chart_table:
        if plot_type == selector:
            plt.figure(figure_number, figure_size)
            if swap_axes:
                draw(x=y, y=x, hue=hue, data=plot_data)
            else:
                draw(y=y, x=x, hue=hue, data=plot_data)
            break

    plt.title(plot_title)
    plt.show()

In [10]:
def get_counts(df,analysis_start_year,team1,team2):
    """Plot how many matches each team played per batting order and per toss outcome.

    Parameters
    ----------
    df : pandas.DataFrame
        Cleaned match data; the code reads ``team``, ``year``, ``batting``,
        ``toss``, ``won`` and ``lost``.
    analysis_start_year : int
        Only rows with ``year >= analysis_start_year`` are counted.
    team1, team2 : str
        Team names (case-insensitive).
    """
    team_names=[team1.lower(),team2.lower()]
    data=df.loc[df.team.isin(team_names)&(df.year>=analysis_start_year)].copy()
    data['total']=1  # one per row, so summing it yields the match count

    for split_column,plot_title in (('batting','Innings Team'),('toss','Toss Team')):
        # Columns are selected with a list: a tuple key such as
        # grouped['total','won','lost'] is rejected by pandas 2.x.
        summary=data.groupby(['team',split_column])[['total','won','lost']].sum().reset_index()
        plot_graph(summary,horizontal_plot,plot_title,split_column,'total','team')

In [11]:
def get_averages(df, analysis_start_year, team1, team2, country, grounds):
    """Plot average wickets and scores per team, per ground, and per team at each ground.

    The team averages use rows of either team played in ``country`` from
    ``analysis_start_year`` onwards. The ground averages use every row of
    ``df`` whose ground is in ``grounds`` (no year or country filter).
    """
    # Team averages in the given country, split by batting order.
    in_country = df.loc[((df.team == team1.lower()) | (df.team == team2.lower()))
                        & (df.country == country.lower())
                        & (df.year >= analysis_start_year)]
    by_team_innings = in_country.groupby(['team', 'batting'])
    plot_graph(by_team_innings.wickets.mean().reset_index(), horizontal_plot,
               'Team Average Wickets', 'batting', 'wickets', 'team')
    plot_graph(by_team_innings.score.mean().reset_index(), horizontal_plot,
               'Team Average Scores', 'batting', 'score', 'team')

    # Rows for the requested grounds, in the order the grounds were given.
    # The empty leading slice keeps the result well-formed when nothing matches.
    ground_slices = [df.loc[(df.ground == '')]]
    ground_slices.extend(df.loc[df.ground == ground.lower()] for ground in grounds)
    at_grounds = pd.concat(ground_slices, axis=0)
    plot_graph(at_grounds.groupby(['ground', 'batting']).score.mean().reset_index(),
               bar_plot, 'Ground Average Scores', 'batting', 'score', 'ground')

    # Same grounds, restricted to the two teams; one chart per ground.
    teams_at_grounds = at_grounds.loc[(at_grounds.team == team1.lower())
                                      | (at_grounds.team == team2.lower())]
    ground_team_avg = (teams_at_grounds
                       .groupby(['ground', 'team', 'batting'])
                       .score.mean()
                       .reset_index())
    for ground in grounds:
        one_ground = ground_team_avg.loc[ground_team_avg.ground == ground.lower()]
        plot_graph(one_ground, horizontal_plot, 'Team Average Scores [' + ground + ']',
                   'batting', 'score', 'team')

In [12]:
def plot_batting_stats(df, title, plot_type, groupby, attribute, team1, team2, country='', as_opponents=False):
    """Tabulate and plot a batting attribute per ``groupby`` value.

    The displayed table holds the MEAN of ``attribute`` per group, while the
    bar / line / point charts draw the SUM per group. ``box_plot`` draws the
    raw rows as a box plot with a swarm overlay.

    Parameters
    ----------
    df : pandas.DataFrame
        Cleaned batting data; the code reads ``team``, ``opposition``,
        ``country`` plus the ``groupby`` and ``attribute`` columns.
    title : str
        Base title of the figure.
    plot_type : str
        ``bar_plot``, ``horizontal_plot``, ``line_plot``, ``point_plot`` or ``box_plot``.
    groupby, attribute : str
        Grouping column and numeric column to aggregate.
    team1, team2 : str
        Team names (case-insensitive).
    country : str
        When not '', keep only rows played in that country; the title then
        becomes "<title> [<team1> and <team2>] - <country>".
    as_opponents : bool
        False: rows of either team. Otherwise only rows where the two teams
        faced each other.
    """
    wide_size = (8, 3)
    tall_size = (8, 8)
    rows = df.copy()

    if as_opponents == False:
        if team2 != '':
            plot_title = title + ' [' + team1 + ' and ' + team2 + '] - Historical Data'
        else:
            plot_title = title + ' [' + team1 + '] - Historical Data'
        keep = (rows.team == team1.lower()) | (rows.team == team2.lower())
    else:
        plot_title = title + ' [' + team1 + ' vs ' + team2 + ']'
        keep = (((rows.team == team1.lower()) & (rows.opposition == team2.lower()))
                | ((rows.team == team2.lower()) & (rows.opposition == team1.lower())))
    rows = rows.loc[keep]

    if country != '':
        rows = rows.loc[rows.country == country.lower()]
        plot_title = title + ' [' + team1 + ' and ' + team2 + '] - ' + country

    per_group = rows.groupby([groupby])
    averages = per_group[attribute].mean().reset_index().sort_values(by=attribute, ascending=False)

    draw_separator(False, '-', plot_title)
    display(averages.style.background_gradient(cmap=cm))

    # The charts use totals, not the averages shown in the table above.
    totals = per_group[attribute].sum().reset_index().sort_values(by=attribute, ascending=False)
    if plot_type == bar_plot:
        plt.figure(1, wide_size)
        sns.barplot(y=attribute, x=groupby, data=totals)
    elif plot_type == horizontal_plot:
        plt.figure(1, tall_size)
        sns.barplot(x=attribute, y=groupby, data=totals)
    elif plot_type == line_plot:
        plt.figure(1, wide_size)
        sns.lineplot(y=attribute, x=groupby, data=totals)
    elif plot_type == point_plot:
        plt.figure(1, wide_size)
        sns.pointplot(y=attribute, x=groupby, data=totals)
    elif plot_type == box_plot:
        plt.figure(1, (10, 20))
        # Box and swarm plots are drawn from the individual rows.
        rows = rows.sort_values(by=attribute)
        sns.boxplot(y=groupby, x=attribute, data=rows)
        sns.swarmplot(y=groupby, x=attribute, data=rows, color=".2")
    plt.title(plot_title)
    plt.show()

In [13]:
def get_batting_stats(df,analysis_start_year,host_team,touring_team,players):
    """Plot runs, fours and sixes for the selected players.

    For each attribute three charts are drawn, in this order:
    all matches of either team (historical), matches between the two teams
    only, and matches played in the host country.

    Parameters
    ----------
    df : pandas.DataFrame
        Cleaned batting data; the code reads ``player`` and ``year`` here, the
        rest is consumed by ``plot_batting_stats``.
    analysis_start_year : int
        Only rows with ``year >= analysis_start_year`` are used.
    host_team, touring_team : str
        Team names; ``host_team`` is also used as the host country.
    players : list of str
        Player names (case-insensitive) to include.
    """
    # Collect each player's rows with a single concat (the empty leading slice
    # keeps the columns when no player matches).
    player_slices=[df.loc[(df.player=='')]]
    player_slices.extend(df.loc[df.player==player.lower()] for player in players)
    data=pd.concat(player_slices,axis=0)
    data=data.loc[data.year>=analysis_start_year]

    chart_specs=(('Runs Scored',box_plot,'runs'),
                 ('Fours Hit',horizontal_plot,'fours'),
                 ('Sixes Hit',horizontal_plot,'sixes'))
    for chart_title,plot_type,attribute in chart_specs:
        # Previously the first "Fours Hit"/"Sixes Hit" call repeated the
        # host-country chart, so the historical view was never drawn.
        plot_batting_stats(data,chart_title,plot_type,'player',attribute,host_team,touring_team)
        plot_batting_stats(data,chart_title,plot_type,'player',attribute,host_team,touring_team,'',True)
        plot_batting_stats(data,chart_title,plot_type,'player',attribute,host_team,touring_team,host_team)

In [14]:
def plot_bowling_stats(df, title, plot_type, groupby, attribute, team1, team2, country='', as_opponents=False):
    """Tabulate and plot a bowling attribute per ``groupby`` value.

    The displayed table holds the SUM of ``attribute`` per group, while the
    chart draws the MEAN per group.

    Parameters
    ----------
    df : pandas.DataFrame
        Cleaned bowling data; the code reads ``team``, ``opposition``,
        ``country`` plus the ``groupby`` and ``attribute`` columns.
    title : str
        Base title of the figure.
    plot_type : str
        ``bar_plot``, ``horizontal_plot``, ``line_plot`` or ``point_plot``;
        any other value draws nothing before the title is set.
    groupby, attribute : str
        Grouping column and numeric column to aggregate.
    team1, team2 : str
        Team names (case-insensitive).
    country : str
        When not '', keep only rows played in that country; the title then
        becomes "<title> [<team1> and <team2>] - <country>".
    as_opponents : bool
        False: rows of either team. Otherwise only rows where the two teams
        faced each other.
    """
    wide_size = (8, 3)
    tall_size = (8, 8)
    rows = df.copy()

    if as_opponents == False:
        if team2 != '':
            plot_title = title + ' [' + team1 + ' and ' + team2 + '] - Historical Data'
        else:
            plot_title = title + ' [' + team1 + '] - Historical Data'
        keep = (rows.team == team1.lower()) | (rows.team == team2.lower())
    else:
        plot_title = title + ' [' + team1 + ' vs ' + team2 + ']'
        keep = (((rows.team == team1.lower()) & (rows.opposition == team2.lower()))
                | ((rows.team == team2.lower()) & (rows.opposition == team1.lower())))
    rows = rows.loc[keep]

    if country != '':
        rows = rows.loc[rows.country == country.lower()]
        plot_title = title + ' [' + team1 + ' and ' + team2 + '] - ' + country

    per_group = rows.groupby([groupby])
    totals = per_group[attribute].sum().reset_index().sort_values(by=attribute, ascending=False)

    draw_separator(False, '-', plot_title)
    display(totals.style.background_gradient(cmap=cm))

    # The chart uses averages, not the totals shown in the table above.
    averages = per_group[attribute].mean().reset_index().sort_values(by=attribute, ascending=False)
    if plot_type == bar_plot:
        plt.figure(1, wide_size)
        sns.barplot(y=attribute, x=groupby, data=averages)
    elif plot_type == horizontal_plot:
        plt.figure(1, tall_size)
        sns.barplot(x=attribute, y=groupby, data=averages)
    elif plot_type == line_plot:
        plt.figure(1, wide_size)
        sns.lineplot(y=attribute, x=groupby, data=averages)
    elif plot_type == point_plot:
        plt.figure(1, wide_size)
        sns.pointplot(y=attribute, x=groupby, data=averages)
    plt.title(plot_title)
    plt.show()

In [15]:
def get_bowling_stats(df, analysis_start_year, host_team, touring_team, players):
    """Plot wickets for the selected players.

    Three charts are drawn: all matches of either team, matches between the
    two teams only, and matches played in the host country (``host_team`` is
    passed as the country). Only rows from ``analysis_start_year`` onwards
    are used.
    """
    # Rows of the requested players, in the order the players were listed; the
    # empty leading slice keeps the columns when no player matches.
    player_slices = [df.loc[(df.player == '')]]
    player_slices.extend(df.loc[df.player == name.lower()] for name in players)
    selected = pd.concat(player_slices, axis=0)
    selected = selected.loc[selected.year >= analysis_start_year]

    # (country filter, head-to-head only?)
    variants = (('', False), ('', True), (host_team, False))
    for country, head_to_head in variants:
        plot_bowling_stats(selected, 'Wickets', horizontal_plot, 'player', 'wickets',
                           host_team, touring_team, country, head_to_head)

In [22]:
def main():
    """Run the complete India v Australia analysis and render every section.

    Loads the four CSV files from the working directory, cleans them with
    ``clean_data`` and then prints each numbered section banner followed by
    that section's tables and charts. Resets the global section counter
    first, so section numbering starts at 1 on every call.
    """
    global main_count

    main_count = 0

    # --- Series configuration -------------------------------------------
    host_team = 'India'
    touring_team = 'Australia'
    series_venue = ['Nagpur', 'Ranchi', 'Mohali', 'Delhi', 'Hyderabad (Deccan)']
    players = [
        'v kohli', 'a zampa', 'aj finch', 'aj tye', 'at carey', 'at rayudu', 'b kumar',
        'djm short', 'gj maxwell', 'ja richardson', 'jj bumrah', 'jp behrendorff',
        'kl rahul', 'km jadhav', 'kuldeep yadav', 'mohammed shami', 'mp stoinis',
        'ms dhoni', 'nm coulter-nile', 'nm lyon', 'pj cummins', 'psp handscomb',
        'ra jadeja', 'rg sharma', 'rr pant', 's dhawan', 's kaul', 'se marsh',
        'sn thakur', 'ut khawaja', 'v shankar', 'ys chahal',
    ]
    analysis_start_year = 2015

    # --- Input files ------------------------------------------------------
    ground_data_path = 'Grounds.csv'
    match_data_path = 'Data.csv'
    batting_data_path = 'Batting.csv'
    bowling_data_path = 'Bowling.csv'

    # --- Load and clean ---------------------------------------------------
    grounds = pd.read_csv(Path(ground_data_path))
    match_data = clean_data(match_data_type, pd.read_csv(Path(match_data_path)), grounds)
    batting_data = clean_data(batting_data_type, pd.read_csv(Path(batting_data_path)), grounds)
    bowling_data = clean_data(bowling_data_type, pd.read_csv(Path(bowling_data_path)), grounds)

    # --- Report: (banner text, section renderer), executed in order -------
    since = 'since year ' + str(analysis_start_year)
    sections = (
        ('Match Stats ' + since,
         lambda: get_counts(match_data, analysis_start_year, host_team, touring_team)),
        ('Team Stats ' + since,
         lambda: get_team_stats(match_data, analysis_start_year, host_team, touring_team)),
        ('Ground Stats',
         lambda: get_ground_stats(match_data, analysis_start_year, host_team, touring_team,
                                  series_venue)),
        ('Average Stats',
         lambda: get_averages(match_data, analysis_start_year, host_team, touring_team,
                              host_team, series_venue)),
        ('Batting Stats ' + since,
         lambda: get_batting_stats(batting_data, analysis_start_year, host_team, touring_team,
                                   players)),
        ('Bowling Stats ' + since,
         lambda: get_bowling_stats(bowling_data, analysis_start_year, host_team, touring_team,
                                   players)),
    )
    for banner_text, render_section in sections:
        draw_separator(True, '=', banner_text)
        render_section()

In [None]:
# Entry point of the notebook: loads the CSV files and renders every report section.
main()