In [1]:
import pandas as pd
import numpy as np
from datetime import datetime
from matplotlib import pyplot as plt
from sklearn.preprocessing import MinMaxScaler
from tqdm.notebook import tqdm
import dateutil
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import OneHotEncoder

from sklearn.metrics.pairwise import cosine_similarity
import json
import pickle

In [2]:
custom_date_parser = lambda x: datetime.strptime(x, "%Y-%m-%d")

In [3]:
cutoff_start_year = '2013'
cutoff_start_date = datetime.strptime(cutoff_start_year+'-01-01','%Y-%m-%d')

In [4]:
match_summary_df = pd.read_csv('csv_data/match_list.csv',parse_dates=['date'],date_parser=custom_date_parser)
recent_match_summary_df = match_summary_df[match_summary_df['date']>=cutoff_start_date]
recent_match_summary_df.shape

(781, 11)

In [5]:
match_id_list = list(recent_match_summary_df['match_id'].unique())

# Jump to LOOP

In [6]:
selected_match_id = match_id_list[0]

In [8]:
is_train = recent_match_summary_df[recent_match_summary_df['match_id']==selected_match_id]['train_data'].values[0]

if is_train:
    match_detail_df = pd.read_csv('csv_data/train/'+str(selected_match_id)+'.csv')
else:
    match_detail_df = pd.read_csv('csv_data/test/'+str(selected_match_id)+'.csv')




In [9]:
match_detail_df.columns

Index(['match_id', 'innings', 'team', 'opponent', 'ball', 'batsman',
       'non_striker', 'bowler', 'scored_runs', 'extras', 'total', 'extra_type',
       'wicket', 'wicket_type', 'player_out', 'fielders', 'winner'],
      dtype='object')

In [10]:
recent_match_summary_df.columns

Index(['match_id', 'date', 'location', 'first_innings', 'second_innings',
       'winner', 'win_by', 'win_dif', 'toss_winner', 'player_of_match',
       'train_data'],
      dtype='object')

In [18]:
first_innings = recent_match_summary_df[recent_match_summary_df['match_id']==selected_match_id]['first_innings'].values[0]
second_innings= recent_match_summary_df[recent_match_summary_df['match_id']==selected_match_id]['second_innings'].values[0]

first_innings_team_batsman_list = list(match_detail_df[match_detail_df['team']==first_innings]['batsman'].unique())
first_innings_team_bowler_list = list(match_detail_df[match_detail_df['team']==second_innings]['bowler'].unique())


first_innings_team_stats = {}
first_innings_team_stats['match_id']=selected_match_id
first_innings_team_stats['team_statistics']=first_innings

   
for bi in range(11):
    if bi<len(first_innings_team_batsman_list):
        batsman = first_innings_team_batsman_list[bi]
        first_innings_team_stats['batsman_'+str(bi+1)]= batsman
        first_innings_team_stats['batsman_'+str(bi+1)+'_runs']= match_detail_df[match_detail_df['batsman']==batsman]['scored_runs'].sum()
    else:
        first_innings_team_stats['batsman_'+str(bi+1)]= 'not_batted'
        first_innings_team_stats['batsman_'+str(bi+1)+'_runs']= 0
        
for boi in range(7):
    if boi<len(first_innings_team_bowler_list):
        bowler = first_innings_team_bowler_list[boi]
        first_innings_team_stats['bowler_'+str(boi+1)]= bowler
        first_innings_team_stats['bowler_'+str(boi+1)+'_wickets']= \
        match_detail_df[match_detail_df['bowler']==bowler]['wicket'].sum() -\
        match_detail_df[(match_detail_df['bowler']==bowler) & (match_detail_df['wicket_type']=='run out')].shape[0]
        
    else:
        first_innings_team_stats['bowler_'+str(boi+1)]= 'not_bowled'
        first_innings_team_stats['bowler_'+str(boi+1)+'_wickets']= 0
        
first_innings_team_stats['total_run']=match_detail_df[match_detail_df['team']==first_innings]['total'].sum()
first_innings_team_stats['total_wickets']=match_detail_df[match_detail_df['team']==second_innings]['wicket'].sum()



    
        
    





In [14]:
first_innings_team_stats

{'match_id': 661689,
 'team_statistics': 'New Zealand',
 'batsman_1': 'MJ Guptill',
 'batsman_1_runs': 1,
 'batsman_2': 'JD Ryder',
 'batsman_2_runs': 104,
 'batsman_3': 'BB McCullum',
 'batsman_3_runs': 33,
 'batsman_4': 'LRPL Taylor',
 'batsman_4_runs': 9,
 'batsman_5': 'CJ Anderson',
 'batsman_5_runs': 131,
 'batsman_6': 'L Ronchi',
 'batsman_6_runs': 3,
 'batsman_7': 'not_batted',
 'batsman_7_runs': 0,
 'batsman_8': 'not_batted',
 'batsman_8_runs': 0,
 'batsman_9': 'not_batted',
 'batsman_9_runs': 0,
 'batsman_10': 'not_batted',
 'batsman_10_runs': 0,
 'batsman_11': 'not_batted',
 'batsman_11_runs': 0,
 'bowler_1': 'KD Mills',
 'bowler_1_wickets': 1,
 'bowler_2': 'MJ McClenaghan',
 'bowler_2_wickets': 2,
 'bowler_3': 'NL McCullum',
 'bowler_3_wickets': 0,
 'bowler_4': 'JD Ryder',
 'bowler_4_wickets': 1,
 'bowler_5': 'JDS Neesham',
 'bowler_5_wickets': 1,
 'bowler_6': 'MJ Guptill',
 'bowler_6_wickets': 0,
 'bowler_7': 'AF Milne',
 'bowler_7_wickets': 0,
 'total_run': 283,
 'total_wi

In [22]:
first_innings = recent_match_summary_df[recent_match_summary_df['match_id']==selected_match_id]['first_innings'].values[0]
second_innings= recent_match_summary_df[recent_match_summary_df['match_id']==selected_match_id]['second_innings'].values[0]


for batting_innings,bowling_innings in zip([first_innings,second_innings],[second_innings,first_innings]):
    
    team_batsman_list = list(match_detail_df[match_detail_df['team']==batting_innings]['batsman'].unique())
    team_bowler_list = list(match_detail_df[match_detail_df['team']==bowling_innings]['bowler'].unique())


    team_stats = {}
    team_stats['match_id']=selected_match_id
    team_stats['team_statistics']=batting_innings

   
    for bi in range(11):
        if bi<len(team_batsman_list):
            batsman = team_batsman_list[bi]
            team_stats['batsman_'+str(bi+1)]= batsman
            team_stats['batsman_'+str(bi+1)+'_runs']= match_detail_df[match_detail_df['batsman']==batsman]['scored_runs'].sum()
        else:
            team_stats['batsman_'+str(bi+1)]= 'not_batted'
            team_stats['batsman_'+str(bi+1)+'_runs']= 0
        
    for boi in range(11):
        if boi<len(team_bowler_list):
            bowler = team_bowler_list[boi]
            team_stats['bowler_'+str(boi+1)]= bowler
            team_stats['bowler_'+str(boi+1)+'_wickets']= \
            match_detail_df[match_detail_df['bowler']==bowler]['wicket'].sum() -\
            match_detail_df[(match_detail_df['bowler']==bowler) & (match_detail_df['wicket_type']=='run out')].shape[0]

        else:
            team_stats['bowler_'+str(boi+1)]= 'not_bowled'
            team_stats['bowler_'+str(boi+1)+'_wickets']= 0

    team_stats['total_run']=match_detail_df[match_detail_df['team']==batting_innings]['total'].sum()
    team_stats['total_wickets']=match_detail_df[match_detail_df['team']==bowling_innings]['wicket'].sum()
    
    print(team_stats)
    print("==============")





{'match_id': 661689, 'team_statistics': 'New Zealand', 'batsman_1': 'MJ Guptill', 'batsman_1_runs': 1, 'batsman_2': 'JD Ryder', 'batsman_2_runs': 104, 'batsman_3': 'BB McCullum', 'batsman_3_runs': 33, 'batsman_4': 'LRPL Taylor', 'batsman_4_runs': 9, 'batsman_5': 'CJ Anderson', 'batsman_5_runs': 131, 'batsman_6': 'L Ronchi', 'batsman_6_runs': 3, 'batsman_7': 'not_batted', 'batsman_7_runs': 0, 'batsman_8': 'not_batted', 'batsman_8_runs': 0, 'batsman_9': 'not_batted', 'batsman_9_runs': 0, 'batsman_10': 'not_batted', 'batsman_10_runs': 0, 'batsman_11': 'not_batted', 'batsman_11_runs': 0, 'bowler_1': 'KD Mills', 'bowler_1_wickets': 1, 'bowler_2': 'MJ McClenaghan', 'bowler_2_wickets': 2, 'bowler_3': 'NL McCullum', 'bowler_3_wickets': 0, 'bowler_4': 'JD Ryder', 'bowler_4_wickets': 1, 'bowler_5': 'JDS Neesham', 'bowler_5_wickets': 1, 'bowler_6': 'MJ Guptill', 'bowler_6_wickets': 0, 'bowler_7': 'AF Milne', 'bowler_7_wickets': 0, 'total_run': 283, 'total_wickets': 5}
{'match_id': 661689, 'team_s

# Loop

In [6]:
match_stat_list = []
batsman_set = set()
bowler_set = set()
for selected_match_id in tqdm(match_id_list):
    
    is_train = recent_match_summary_df[recent_match_summary_df['match_id']==selected_match_id]['train_data'].values[0]

    if is_train:
        match_detail_df = pd.read_csv('csv_data/train/'+str(selected_match_id)+'.csv')
    else:
        match_detail_df = pd.read_csv('csv_data/test/'+str(selected_match_id)+'.csv')



    first_innings = recent_match_summary_df[recent_match_summary_df['match_id']==selected_match_id]['first_innings'].values[0]
    second_innings= recent_match_summary_df[recent_match_summary_df['match_id']==selected_match_id]['second_innings'].values[0]


    for batting_innings,bowling_innings in zip([first_innings,second_innings],[second_innings,first_innings]):

        team_batsman_list = list(match_detail_df[match_detail_df['team']==batting_innings]['batsman'].unique())
        team_bowler_list = list(match_detail_df[match_detail_df['team']==bowling_innings]['bowler'].unique())


        team_stats = {}
        team_stats['match_id']=selected_match_id
        team_stats['team_statistics']=batting_innings
        #batsman_set=batsman_set.union(set(team_batsman_list))
        #bowler_set=bowler_set.union(set(team_bowler_list))
        concatenated_batsman_list = []
        concatenated_bowler_list = []
        for bi in range(11):
            if bi<len(team_batsman_list):
                batsman = team_batsman_list[bi]
                team_stats['batsman_'+str(bi+1)]= batsman
                team_stats['batsman_'+str(bi+1)+'_runs']= match_detail_df[match_detail_df['batsman']==batsman]['scored_runs'].sum()
                concatenated_batsman_list.append(batting_innings.strip()+' '+batsman.strip())
            else:
                team_stats['batsman_'+str(bi+1)]= 'not_batted'
                team_stats['batsman_'+str(bi+1)+'_runs']= 0

        for boi in range(11):
            if boi<len(team_bowler_list):
                bowler = team_bowler_list[boi]
                team_stats['bowler_'+str(boi+1)]= bowler
                team_stats['bowler_'+str(boi+1)+'_wickets']= \
                match_detail_df[match_detail_df['bowler']==bowler]['wicket'].sum() -\
                match_detail_df[(match_detail_df['bowler']==bowler) & (match_detail_df['wicket_type']=='run out')].shape[0]
                concatenated_bowler_list.append(batting_innings.strip()+' '+bowler.strip())
                
            else:
                team_stats['bowler_'+str(boi+1)]= 'not_bowled'
                team_stats['bowler_'+str(boi+1)+'_wickets']= 0
        batsman_set=batsman_set.union(set(concatenated_batsman_list))
        bowler_set=bowler_set.union(set(concatenated_bowler_list))
        
        team_stats['total_run']=match_detail_df[match_detail_df['team']==batting_innings]['total'].sum()
        team_stats['total_wickets']=match_detail_df[match_detail_df['team']==bowling_innings]['wicket'].sum()
        match_stat_list.append(team_stats)
        
match_stats_df = pd.DataFrame(match_stat_list)
match_stats_df.to_csv('csv_data/match_stats.csv',index=False)


HBox(children=(FloatProgress(value=0.0, max=781.0), HTML(value='')))




In [7]:
match_stats_df.head()

Unnamed: 0,match_id,team_statistics,batsman_1,batsman_1_runs,batsman_2,batsman_2_runs,batsman_3,batsman_3_runs,batsman_4,batsman_4_runs,...,bowler_8,bowler_8_wickets,bowler_9,bowler_9_wickets,bowler_10,bowler_10_wickets,bowler_11,bowler_11_wickets,total_run,total_wickets
0,589309,Pakistan,Nasir Jamshed,106,Mohammad Hafeez,76,Azhar Ali,2,Younis Khan,10,...,not_bowled,0,not_bowled,0,not_bowled,0,not_bowled,0,250,10
1,589309,India,G Gambhir,11,V Sehwag,31,V Kohli,6,Yuvraj Singh,9,...,not_bowled,0,not_bowled,0,not_bowled,0,not_bowled,0,165,10
2,589310,India,G Gambhir,15,AM Rahane,4,V Kohli,7,Yuvraj Singh,23,...,not_bowled,0,not_bowled,0,not_bowled,0,not_bowled,0,167,10
3,589310,Pakistan,Nasir Jamshed,34,Kamran Akmal,0,Younis Khan,6,Misbah-ul-Haq,39,...,not_bowled,0,not_bowled,0,not_bowled,0,not_bowled,0,157,10
4,565812,England,AN Cook,75,IR Bell,85,KP Pietersen,44,EJG Morgan,41,...,not_bowled,0,not_bowled,0,not_bowled,0,not_bowled,0,325,9


In [8]:
len(list(batsman_set))

845

In [9]:
len(list(bowler_set))

649

In [15]:
pickle.dump(list(batsman_set),open('batsman_list.pkl','wb'))
pickle.dump(list(bowler_set),open('bowler_list.pkl','wb'))

In [13]:
#batsman_set