In [1]:
# import
import warnings
warnings.filterwarnings('ignore')

import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
%matplotlib inline

import seaborn as sns

from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split

pd.set_option('max_columns',100)


import os
os.chdir('/Users/shubhamjain/Downloads/AV/IPL/')

In [2]:
## training data

ball_data = pd.read_csv('train/ball_by_ball_data.csv')
match_data = pd.read_csv('train/match_data.csv')
key_teams = pd.read_csv('train/key_teams.csv')
players = pd.read_csv('train/player_rosters.csv')
player_attributes = pd.read_csv('player_attributes.csv')

## test data

test_player = pd.read_csv('test/player_predictions.csv')
test_matches = pd.read_csv('test/matches_2018.csv')
test_extras = pd.read_csv('test/total_extras.csv')

## submission

sub_player = pd.read_csv('sample_submission/player_predictions.csv')
sub_extras = pd.read_csv('sample_submission/total_extras.csv')

In [3]:
ball_data['inning'].value_counts()

1    78029
2    72350
3       43
4       38
Name: inning, dtype: int64

In [4]:
ball_data.loc[ball_data['inning'] == 3,'inning'] = 1
ball_data.loc[ball_data['inning'] == 4,'inning'] = 2

In [5]:
ball_data['season'] = ball_data['match_id'].apply(lambda x: str(x).split('_')[0]).astype('int')
ball_data['match_id'] = ball_data['match_id'].apply(lambda x: str(x).split('_')[1]).astype('int')

In [6]:
match_data['match_id'] = match_data['match_id'].apply(lambda x: str(x).split('_')[1]).astype('int')

In [7]:
## selecting only last three seasons
ball_data_3 = ball_data[ball_data['season'] > 2014]

In [8]:
ball_data_3 = ball_data_3.merge(match_data, on =['season', 'match_id'], how='left')

In [9]:
## runs scored by each batsman in each match
runs_scored = ball_data_3.groupby(['season','match_id','inning','batsman'])['batsman_runs'].apply(lambda x: np.sum(x)).reset_index()

In [10]:
## balls faced
temp = ball_data_3.groupby(['season','match_id','inning','batsman'])['ball'].apply(lambda x: np.count_nonzero(x)).reset_index().rename(columns = {'ball':'balls_faced'})
runs_scored = runs_scored.merge(temp, on = ['season','match_id','inning','batsman'], how='left')
del temp

In [11]:
## total runs scored
temp = ball_data_3.groupby(['season','match_id','inning'])['batsman_runs'].apply(lambda x: np.sum(x)).reset_index().rename(columns = {'batsman_runs':'total'})
runs_scored = runs_scored.merge(temp, on = ['season','match_id','inning'], how='left')
del temp

In [12]:
## 4 scored
temp = ball_data_3.groupby(['season','match_id','inning','batsman'])['batsman_runs'].apply(lambda x: np.count_nonzero(x == 4)).reset_index().rename(columns = {'batsman_runs':'fours'})
runs_scored = runs_scored.merge(temp, on = ['season','match_id','inning','batsman'], how='left')
del temp

In [13]:
## 6 scored
temp = ball_data_3.groupby(['season','match_id','inning','batsman'])['batsman_runs'].apply(lambda x: np.count_nonzero(x == 6)).reset_index().rename(columns = {'batsman_runs':'sixes'})
runs_scored = runs_scored.merge(temp, on = ['season','match_id','inning','batsman'], how='left')
del temp

In [14]:
## dot balls

temp = ball_data_3.groupby(['season','match_id','inning','batsman'])['batsman_runs'].apply(lambda x: np.count_nonzero(x == 0)).reset_index().rename(columns = {'batsman_runs':'dots'})
runs_scored = runs_scored.merge(temp, on = ['season','match_id','inning','batsman'], how='left')
del temp

In [15]:
## additional match details
runs_scored = runs_scored.merge(match_data[['season','match_id', 'toss_winner', 'toss_decision','winner',
                             'win_by_runs', 'win_by_wickets', 'player_of_match']] , on =['season','match_id'],how='left')

In [16]:
## batsman team name and aganist team

temp = ball_data_3.groupby(['season','match_id','inning'])[['batting_team','bowling_team']].apply(lambda x: x.head(1)).reset_index().drop('level_3',axis=1)
runs_scored = runs_scored.merge(temp, on =['season','match_id','inning'], how='left')
del temp

In [17]:
runs_scored.head()

Unnamed: 0,season,match_id,inning,batsman,batsman_runs,balls_faced,total,fours,sixes,dots,toss_winner,toss_decision,winner,win_by_runs,win_by_wickets,player_of_match,batting_team,bowling_team
0,2015,1,1,Aaron Finch,5,5,165,1,0,3,KKR,field,KKR,0,7,Morne Morkel,MI,KKR
1,2015,1,1,Aditya Tare,7,8,165,1,0,4,KKR,field,KKR,0,7,Morne Morkel,MI,KKR
2,2015,1,1,Ambati Rayudu,0,2,165,0,0,2,KKR,field,KKR,0,7,Morne Morkel,MI,KKR
3,2015,1,1,Corey Anderson,55,41,165,4,3,16,KKR,field,KKR,0,7,Morne Morkel,MI,KKR
4,2015,1,1,Rohit Sharma,98,66,165,12,4,25,KKR,field,KKR,0,7,Morne Morkel,MI,KKR


In [18]:
## no of teams/ no of unique teams 

player = players.groupby('Player')['Team'].apply(lambda x: np.count_nonzero(x)/ x.nunique() ).reset_index().rename(columns = {'Team':'Team_Consistency', 'Player':'batsman'})
runs_scored = runs_scored.merge(player, on = 'batsman', how='left')

In [19]:
## adding players attributes

player = player_attributes[['player_name','date_of_birth', 'birth_place', 'role', 'batting_style','bowling_style']].rename(columns = {'player_name':'batsman'})
runs_scored = runs_scored.merge(player, on = 'batsman', how='left')

In [20]:
## adding player's id
temp = ball_data_3.groupby('batsman')['batsman_id'].apply(lambda x: x.head(1)).reset_index().drop('level_1',axis=1)
runs_scored = runs_scored.merge(temp, on='batsman', how='left')
del temp

In [21]:
## no of hundreds, fifty, and 30+ scores

hundreds = (runs_scored.groupby(['batsman'])['batsman_runs'].apply(lambda x: ((np.sum(x >= 100))))).reset_index().rename(columns
                                                                                                             ={'batsman_runs':'100s'})
fifties = (runs_scored.groupby(['batsman'])['batsman_runs'].apply(lambda x: ((np.sum(x >= 50))))).reset_index().rename(columns
                                                                                                             ={'batsman_runs':'50s'})
thirty = (runs_scored.groupby(['batsman'])['batsman_runs'].apply(lambda x: ((np.sum(x >= 30))))).reset_index().rename(columns
                                                                                                             ={'batsman_runs':'30+'})

In [22]:
runs_scored.head()

Unnamed: 0,season,match_id,inning,batsman,batsman_runs,balls_faced,total,fours,sixes,dots,toss_winner,toss_decision,winner,win_by_runs,win_by_wickets,player_of_match,batting_team,bowling_team,Team_Consistency,date_of_birth,birth_place,role,batting_style,bowling_style,batsman_id
0,2015,1,1,Aaron Finch,5,5,165,1,0,3,KKR,field,KKR,0,7,Morne Morkel,MI,KKR,1.285714,"Nov 17, 1986","Colac, Victoria",Batsman,Right Handed Bat,Left-Arm Orthodox,322
1,2015,1,1,Aditya Tare,7,8,165,1,0,4,KKR,field,KKR,0,7,Morne Morkel,MI,KKR,3.0,"Nov 07, 1987","Bombay (Now Mumbai), Maharashtra",WK-Batsman,Right Handed Bat,,311
2,2015,1,1,Ambati Rayudu,0,2,165,0,0,2,KKR,field,KKR,0,7,Morne Morkel,MI,KKR,4.5,"Sep 23, 1985","Guntur, Andhra Pradesh",WK-Batsman,Right Handed Bat,Right-Arm Offbreak,309
3,2015,1,1,Corey Anderson,55,41,165,4,3,16,KKR,field,KKR,0,7,Morne Morkel,MI,KKR,2.0,"Dec 13, 1990","Christchurch, Canterbury",Batting Allrounder,Left Handed Bat,Left-Arm Fast-Medium,519
4,2015,1,1,Rohit Sharma,98,66,165,12,4,25,KKR,field,KKR,0,7,Morne Morkel,MI,KKR,5.5,"Apr 30, 1987","Nagpur, Maharashtra",Batsman,Right Handed Bat,Right-Arm Offbreak,30


In [23]:
runs_scored.to_csv('saved/runs_score.csv', index=False)

### Building Batsman data 

In [23]:
np.setdiff1d(test_player['player_id'] ,runs_scored['batsman_id'])

array([157, 449, 464, 467, 531, 571, 593, 596, 608, 613, 614, 617, 618,
       619, 622, 635, 637, 638, 639, 640, 641, 642, 643, 644, 645, 646,
       648, 650, 651, 652, 654, 656, 657, 658, 659, 662, 663, 665, 666,
       669, 670, 671, 672, 673, 674, 675, 676, 678, 679, 680, 681, 683,
       684, 685, 686, 687, 688, 689, 690, 692, 693, 696])

Some players who have played in recent years 2016-17 will be considered afterwards.

In [24]:
batsman = pd.DataFrame(runs_scored['batsman'].unique(), columns= ['batsman'])

In [25]:
batsman.head()

Unnamed: 0,batsman
0,Aaron Finch
1,Aditya Tare
2,Ambati Rayudu
3,Corey Anderson
4,Rohit Sharma


In [26]:
## total runs / total balls for last 3seasons

temp = pd.DataFrame(runs_scored.groupby('batsman')['batsman_runs'].apply(lambda x: x.sum())/runs_scored.groupby('batsman')['balls_faced'].apply(lambda x: x.sum()) * 100).rename(columns = {0:'Average'}).reset_index()
batsman = batsman.merge(temp, on='batsman', how='left')

In [27]:
## no of seasons played

temp = players.groupby('Player')['Season'].apply(lambda x: np.count_nonzero(x)).reset_index().rename(columns = {'Player':'batsman','Season':'Season_played'})
batsman = batsman.merge(temp, on='batsman', how='left')
del temp

In [28]:
## total runs scored

temp = runs_scored.groupby('batsman')['batsman_runs'].apply(lambda x: x.sum()).reset_index().rename(columns = {'batsman_runs':'total_runs'})
batsman = batsman.merge(temp, on='batsman', how='left')
del temp

In [29]:
## no of centuries, fifties and 30+

batsman = batsman.merge(hundreds, on='batsman', how='left')
batsman = batsman.merge(fifties, on='batsman', how='left')
batsman = batsman.merge(thirty, on='batsman', how='left')

In [30]:
## no of innings played

temp = runs_scored['batsman'].value_counts().reset_index()
temp.columns = ['batsman','total_innings']
batsman = batsman.merge(temp, on='batsman', how='left')

In [31]:
## 30+ score/ no of innings

batsman['ability'] = batsman['30+']/batsman['total_innings']

In [32]:
## no of dots/ no of balls faced

temp = pd.DataFrame(runs_scored.groupby('batsman')['dots'].apply(lambda x: x.sum() *100)/ (runs_scored.groupby('batsman')['balls_faced'].apply(lambda x: x.sum()))).reset_index().rename(columns = {0:'dot%'})
batsman = batsman.merge(temp, on='batsman', how='left')
del temp

In [33]:
## power hitting 4s + 6s/ no of balls faced
temp = runs_scored.groupby('batsman')[['fours','sixes','balls_faced']].apply(lambda x: x.sum()).reset_index()
temp['power_hitting'] = (temp['fours'] + temp['sixes'])/temp['balls_faced']
temp = temp[['batsman','power_hitting']]

batsman = batsman.merge(temp, on='batsman', how='left')
del temp

In [34]:
## check if batsman was out or not out
temp = ball_data_3[~ball_data_3['player_dismissed'].isnull()][['season','match_id','inning','player_dismissed']].rename(columns = {'player_dismissed':'batsman'})
temp['is_out'] = 1

runs_scored = runs_scored.merge(temp, on = ['season','match_id','inning','batsman'], how='left')
runs_scored['is_out'] = runs_scored['is_out'].fillna(0)

In [35]:
## consistency = number of runs / no of innings in which he got out

temp = pd.DataFrame(runs_scored.groupby('batsman')['batsman_runs'].apply(lambda x: np.sum(x))/runs_scored.groupby('batsman')['is_out'].apply(lambda x: np.count_nonzero(x == 1))).reset_index().rename(columns = {0:'consistency'})

batsman = batsman.merge(temp, on='batsman', how='left')
batsman.loc[batsman['consistency'] == np.inf, 'consistency'] = 0
del temp

In [36]:
## average in each innings = total runs score in that innings/ total innings in which we got out

temp = pd.DataFrame(runs_scored[runs_scored['inning'] == 1].groupby(['batsman'])['batsman_runs'].apply(lambda x: np.sum(x))
             /runs_scored[runs_scored['inning'] == 1].groupby(['batsman'])['is_out'].apply(lambda x: np.count_nonzero(x == 1))).reset_index().rename(columns = {0:'avg_first_ing'}).fillna(0)

batsman = batsman.merge(temp, on='batsman', how='left')
batsman.loc[batsman['avg_first_ing'] == np.inf, 'avg_first_ing'] = 0

## 2nd inngs
temp = pd.DataFrame(runs_scored[runs_scored['inning'] == 2].groupby(['batsman'])['batsman_runs'].apply(lambda x: np.sum(x))
             /runs_scored[runs_scored['inning'] == 2].groupby(['batsman'])['is_out'].apply(lambda x: np.count_nonzero(x == 1))).reset_index().rename(columns = {0:'avg_second_ing'}).fillna(0)

batsman = batsman.merge(temp, on='batsman', how='left')
batsman.loc[batsman['avg_second_ing'] == np.inf, 'avg_second_ing'] = 0

In [37]:
runs_scored.head()

Unnamed: 0,season,match_id,inning,batsman,batsman_runs,balls_faced,total,fours,sixes,dots,toss_winner,toss_decision,winner,win_by_runs,win_by_wickets,player_of_match,batting_team,bowling_team,Team_Consistency,date_of_birth,birth_place,role,batting_style,bowling_style,batsman_id,is_out
0,2015,1,1,Aaron Finch,5,5,165,1,0,3,KKR,field,KKR,0,7,Morne Morkel,MI,KKR,1.285714,"Nov 17, 1986","Colac, Victoria",Batsman,Right Handed Bat,Left-Arm Orthodox,322,1.0
1,2015,1,1,Aditya Tare,7,8,165,1,0,4,KKR,field,KKR,0,7,Morne Morkel,MI,KKR,3.0,"Nov 07, 1987","Bombay (Now Mumbai), Maharashtra",WK-Batsman,Right Handed Bat,,311,1.0
2,2015,1,1,Ambati Rayudu,0,2,165,0,0,2,KKR,field,KKR,0,7,Morne Morkel,MI,KKR,4.5,"Sep 23, 1985","Guntur, Andhra Pradesh",WK-Batsman,Right Handed Bat,Right-Arm Offbreak,309,1.0
3,2015,1,1,Corey Anderson,55,41,165,4,3,16,KKR,field,KKR,0,7,Morne Morkel,MI,KKR,2.0,"Dec 13, 1990","Christchurch, Canterbury",Batting Allrounder,Left Handed Bat,Left-Arm Fast-Medium,519,0.0
4,2015,1,1,Rohit Sharma,98,66,165,12,4,25,KKR,field,KKR,0,7,Morne Morkel,MI,KKR,5.5,"Apr 30, 1987","Nagpur, Maharashtra",Batsman,Right Handed Bat,Right-Arm Offbreak,30,0.0


In [38]:
## getting batman average = runs scored/ no of innings in which he got out ### vs each team

temp = runs_scored.groupby(['batsman','bowling_team']).agg({'batsman_runs':'sum',
                                                    'is_out': lambda x: np.count_nonzero(x == 1)}).reset_index()
temp['avg_team_wise'] = temp['batsman_runs']/temp['is_out']

temp = pd.pivot_table(data= temp, index = 'batsman', columns='bowling_team', values='avg_team_wise').fillna(0).replace(np.inf, 0).round(2)
temp = temp.reset_index()

batsman = batsman.merge(temp , on ='batsman', how='left')

In [39]:
## finding out 3 top order batsman

def f(x):
    x = x.drop_duplicates(keep='first')
    return (x.head(3))

temp = ball_data_3.groupby(['season','match_id','batting_team'])['batsman'].apply(lambda x: f(x)).reset_index().drop('level_3',axis=1)

top_order = temp['batsman'].value_counts()[temp['batsman'].value_counts() > 5].index
batsman['top_order'] = 0
batsman.loc[batsman['batsman'].isin(top_order), 'top_order'] = 1

In [40]:
## adding team consistency

temp = runs_scored.groupby('batsman')['Team_Consistency'].apply(lambda x: x.mean()).reset_index()
batsman = batsman.merge(temp , on='batsman', how='left')

In [41]:
## adding icc ranking of top 100 batsman

df = pd.read_csv('rating.csv')
batsman = batsman.merge(df[['batsman','current_rank']], on ='batsman', how='left')

In [42]:
## fillings missing values
batsman.loc[batsman['batsman'] == 'J P Duminy', 'Season_played'] = 7
batsman.loc[batsman['batsman'] == 'KL Rahul', 'Season_played'] = 4
batsman.loc[batsman['batsman'] == 'B Kumar', 'Season_played'] = 7


batsman['current_rank'].fillna(999, inplace=True)
batsman.fillna(0, inplace=True)

In [43]:
## adding player's price and country

temp = pd.read_csv('cricbuzz_ipl.csv')
temp = temp.rename(columns = {'player':'batsman'})

batsman = batsman.merge(temp ,on ='batsman',how='left')

In [44]:
batsman[batsman['batsman'] == 'Mahendra Singh Dhoni']

Unnamed: 0,batsman,Average,Season_played,total_runs,100s,50s,30+,total_innings,ability,dot%,power_hitting,consistency,avg_first_ing,avg_second_ing,CSK,DD,GL,KKR,KXI,MI,RCB,RPS,RR,SRH,top_order,Team_Consistency,current_rank,Price,country
14,Mahendra Singh Dhoni,119.746835,11.0,946,0,3,12,44,0.272727,41.139241,0.135443,31.533333,34.8125,27.785714,0.0,20.0,41.5,15.75,68.0,25.5,27.83,0.0,0.0,40.0,0,5.5,999.0,,


In [45]:
## last season runs
batsman['last_season_runs'] = batsman['batsman']
batsman['last_season_runs'] = batsman['last_season_runs'].map(runs_scored[runs_scored['season'] == 2017]
                                                              .groupby('batsman')['batsman_runs'].apply(lambda x: x.sum()))
batsman['last_season_runs'].fillna(0, inplace=True)

In [46]:
## role batsman , all rounder, or bowler

temp = runs_scored.groupby('batsman')['role'].apply(lambda x: x.head(1)).reset_index().drop('level_1',axis=1)
batsman = batsman.merge(temp ,on ='batsman',how='left')

In [47]:
## player of match / total inngs

batsman['mom'] = batsman['batsman']
batsman['mom'] = batsman['mom'].map(runs_scored['player_of_match'].value_counts()).fillna(0).astype('int')

batsman['mom'] = batsman['mom']/batsman['total_innings']

In [48]:
batsman.head()

Unnamed: 0,batsman,Average,Season_played,total_runs,100s,50s,30+,total_innings,ability,dot%,power_hitting,consistency,avg_first_ing,avg_second_ing,CSK,DD,GL,KKR,KXI,MI,RCB,RPS,RR,SRH,top_order,Team_Consistency,current_rank,Price,country,last_season_runs,role,mom
0,Aaron Finch,134.082397,9.0,716,0,7,10,29,0.344828,41.7603,0.194757,28.64,19.538462,38.5,0.0,33.67,0.0,21.2,24.25,33.5,33.0,48.0,10.0,26.5,1,1.285714,4.0,620.0,Australia,300.0,Batsman,1.517241
1,Aditya Tare,68.965517,9.0,40,0,0,0,7,0.0,62.068966,0.086207,5.714286,5.0,6.25,0.0,0.0,0.0,7.0,3.5,0.0,18.0,4.0,0.0,0.0,1,3.0,999.0,20.0,India,18.0,WK-Batsman,0.0
2,Ambati Rayudu,126.296959,9.0,706,0,4,11,31,0.354839,35.420394,0.150268,28.24,28.571429,27.818182,54.5,58.5,20.0,20.6,27.33,0.0,22.25,13.2,0.0,22.33,1,4.5,999.0,220.0,India,91.0,WK-Batsman,1.032258
3,Corey Anderson,112.280702,4.0,256,0,2,4,15,0.266667,50.438596,0.149123,25.6,34.6,16.6,4.0,0.0,24.0,57.0,31.0,5.0,3.0,5.0,50.0,0.0,0,2.0,73.0,,,142.0,Batting Allrounder,1.2
4,Rohit Sharma,131.850354,11.0,1304,0,11,18,46,0.391304,34.782609,0.169869,33.435897,27.136364,41.588235,34.25,31.17,27.33,83.75,9.2,0.0,40.0,35.6,13.5,22.4,1,5.5,17.0,1500.0,India,333.0,Batsman,1.826087


#### Use batting avg on each venue.

### Building bowler's data

In [49]:
bowler = pd.DataFrame(ball_data_3['bowler'].unique(), columns= ['bowler'])

In [50]:
## getting ids of bowlers
temp = players[players['Player'].isin(bowler['bowler'])][['player_id','Player']].drop_duplicates(keep='first').rename(columns = {'Player':'bowler'})

bowler = bowler.merge(temp, on ='bowler', how='left')

In [51]:
## filling null values
print (bowler[bowler['player_id'].isnull()])

bowler.loc[bowler['bowler'] == 'B Kumar', 'player_id'] = 691
bowler.loc[bowler['bowler'] == 'J P Duminy', 'player_id'] = 677

         bowler  player_id
8       B Kumar        NaN
113  J P Duminy        NaN


In [52]:
## adding player's attributes -- bowling arm and bowling style

temp = player_attributes[['player_id', 'role' , 'bowling_style']]
bowler = bowler.merge(temp, on ='player_id', how='left')

## filling missing
bowler.loc[bowler['player_id'] == 153, 'bowling_style']  = 'Right-Arm Fast-Medium'
bowler.loc[bowler['player_id'] == 602, 'bowling_style']  = 'Right-Arm Fast-Medium'

bowler['bowling_arm'] = bowler['bowling_style'].apply(lambda x: str(x).split(" ")[0])
bowler['bowling_style'] = bowler['bowling_style'].apply(lambda x: str(x).split(" ")[1])

In [53]:
## auction price and country

temp = pd.read_csv('cricbuzz_ipl.csv').rename(columns = {'player':'bowler'})
bowler = bowler.merge(temp, on = 'bowler', how='left')

bowler.loc[bowler['bowler'] == 'B Kumar', 'Price'] = 850
bowler.loc[bowler['bowler'] == 'B Kumar', 'country'] = 'India'

bowler.loc[bowler['bowler'] == 'J P Duminy', 'Price'] = 100
bowler.loc[bowler['bowler'] == 'J P Duminy', 'country'] = 'Africa'

In [54]:
## removing unsold players
#bowler = bowler[~bowler['Price'].isnull()].reset_index(drop=True)

In [55]:
## total overs bowled by each bowler

temp = ball_data_3.groupby(['season','match_id','bowler'])['over'].apply(lambda x: x.nunique()).reset_index()
temp = temp.groupby('bowler')['over'].apply(lambda x: x.sum()).reset_index().rename(columns = {'over':'overs_bowled'})

bowler = bowler.merge(temp , on='bowler', how='left')

In [56]:
## consistency == total runs given / total overs bowled

temp = ball_data_3.groupby('bowler')['batsman_runs'].apply(lambda x: x.sum()).reset_index().rename(columns = {'batsman_runs':
                                                                                                       'consistency'})

bowler = bowler.merge(temp, on = 'bowler', how='left')
bowler['consistency'] = bowler['consistency']/bowler['overs_bowled']

In [57]:
## total wickets taken & attacking == wickets/ over bowled

temp = ball_data_3[~ball_data_3['player_dismissed'].isnull()]
temp = temp[(temp['dismissal_kind'] != 'run out') ].groupby('bowler')['player_dismissed'].apply(lambda x: np.count_nonzero(x)).reset_index().rename(columns= {'player_dismissed':'wickets'})

bowler = bowler.merge(temp , on='bowler', how='left')

bowler['attacking'] = bowler['wickets']/bowler['overs_bowled']

In [58]:
## total maiden overs

temp =ball_data_3.groupby(['season','match_id','bowler','over'])['batsman_runs'].apply(lambda x: x.sum()).reset_index()

temp = temp[temp['batsman_runs'] == 0].groupby('bowler')['over'].apply(lambda x: np.count_nonzero(x)).reset_index().rename(columns
                                                                                                                   = {'over':'maidan_overs'})

bowler = bowler.merge(temp , on='bowler', how='left')

In [59]:
## dot % = dot/ balls 
temp = ball_data_3[ball_data['batsman_runs'] == 0].groupby(['bowler'])['ball'].apply(lambda x: np.count_nonzero(x)).reset_index().rename(columns = {'ball':'dot%'})

bowler = bowler.merge(temp , on='bowler', how='left')
bowler['dot%'] = bowler['dot%'] / (bowler['overs_bowled']*6)

In [60]:
#bowler.fillna(0, inplace=True)

In [61]:
## boundary% = boundary / overs 

temp = ball_data_3[(ball_data['batsman_runs'] >= 4)].groupby(['bowler'])['ball'].apply(lambda x: np.count_nonzero(x)).reset_index().rename(columns = {'ball':'boundary%'})

bowler = bowler.merge(temp , on='bowler', how='left')
bowler['boundary%'] = bowler['boundary%'] / (bowler['overs_bowled'])

In [62]:
## strike rate = wickets / ball

bowler['strike_rate'] = bowler['wickets']/ (bowler['overs_bowled'] * 6) *100

In [63]:
## last year wicket taken and matches played

temp = ball_data_3[~ball_data_3['player_dismissed'].isnull()]
temp = temp[(temp['dismissal_kind'] != 'run out') & (temp['season'] == 2017)].groupby('bowler')['player_dismissed'].apply(lambda x: np.count_nonzero(x)).reset_index().rename(columns = {'player_dismissed':'orange_cap'})

bowler = bowler.merge(temp , on='bowler', how='left')

temp = ball_data_3[ball_data_3['season'] == 2017].groupby('bowler')['match_id'].apply(lambda x: (x.nunique())).reset_index().rename(columns = {'match_id':'prev_match_count'})
bowler = bowler.merge(temp , on='bowler', how='left')

In [64]:
## no of 4 or 5 wickets hauls

temp = ball_data_3[~ball_data_3['player_dismissed'].isnull()]
temp = temp[(temp['dismissal_kind'] != 'run out')].groupby(['season','match_id','bowler'])['player_dismissed'].apply(lambda x: np.count_nonzero(x)).reset_index().rename(columns= {'player_dismissed':'wickets'})

temp = temp.groupby(['bowler'])['wickets'].apply(lambda x: np.count_nonzero(x > 3)).reset_index().rename(columns = {'wickets':
                                                                                                            'hauls'})
bowler = bowler.merge(temp , on='bowler', how='left')
#bowler.fillna(0, inplace=True)

In [65]:
## match played = = experience

temp = ball_data_3.groupby(['season','bowler'])['match_id'].apply(lambda x: (x.nunique())).reset_index()
temp = temp.groupby('bowler')['match_id'].apply(lambda x: x.sum()).reset_index().rename(columns = {'match_id': 
                                                                                                   'experience'})
bowler = bowler.merge(temp , on='bowler', how='left')

In [66]:
## team consistency

temp = players.groupby('Player')['Team'].apply(lambda x:  np.count_nonzero(x)/x.nunique()).reset_index().rename(
columns = {'Team':'Team_Consistency', 'Player':'bowler'})

bowler = bowler.merge(temp , on='bowler', how='left')

In [67]:
## player of match / matches played

temp = ball_data_3.groupby(['season','match_id'])['player_of_match'].apply(lambda x: x.head(1)).reset_index()
bowler['mom'] = bowler['bowler']

bowler['mom'] = bowler['mom'].map(temp['player_of_match'].value_counts()).fillna(0)

In [68]:
## avg wickets in each inng

temp = ball_data_3[~ball_data_3['player_dismissed'].isnull()]
temp1 = temp[(temp['dismissal_kind'] != 'run out')].groupby(['bowler', 'inning']).agg(
    {'player_dismissed':lambda x: np.count_nonzero(x)}).reset_index()

temp = ball_data_3[(ball_data_3['dismissal_kind'] != 'run out')].groupby(['season','match_id','bowler'])['inning'].apply(lambda x: x.head(1)).reset_index()
temp2 = temp.groupby('bowler')['inning'].apply(lambda x: x.value_counts()).reset_index().rename(columns= {
    'inning':'count', 'level_1':'inning'
})

temp = temp1.merge(temp2, on =['bowler','inning'], how='left')
temp['avg_inng'] = temp['player_dismissed']/ temp['count']

## first inng
temp.drop(['player_dismissed','count'],axis=1, inplace=True)
temp1 = temp[temp['inning'] == 1][['bowler','avg_inng']].rename(columns =  {'avg_inng': 'avg_first_ing'})
temp2 = temp[temp['inning'] == 2][['bowler','avg_inng']].rename(columns =  {'avg_inng': 'avg_second_ing'})

bowler = bowler.merge(temp1 , on='bowler', how='left')
bowler = bowler.merge(temp2 , on='bowler', how='left')

In [69]:
## completed 4 overs / matches played

temp = ball_data_3.groupby(['season', 'match_id' , 'bowler'])['over'].apply(lambda x: x.nunique()).reset_index()
temp = temp.groupby('bowler')['over'].apply(lambda x: np.count_nonzero(x == 4)).reset_index().rename(columns = {'over':'bharosa'})

bowler = bowler.merge(temp , on='bowler', how='left')
bowler['bharosa'] = bowler['bharosa'] / bowler['experience']

In [70]:
## runs given/ wickets taken

temp = ball_data_3.groupby(['bowler'])['batsman_runs'].apply(lambda x: x.sum()).reset_index().rename(columns = {
    'batsman_runs':'bharosa2'
})
bowler = bowler.merge(temp , on='bowler', how='left')

bowler['bharosa2'] = bowler['bharosa2']/ bowler['wickets']

In [71]:
bowler.head()

Unnamed: 0,bowler,player_id,role,bowling_style,bowling_arm,Price,country,overs_bowled,consistency,wickets,attacking,maidan_overs,dot%,boundary%,strike_rate,orange_cap,prev_match_count,hauls,experience,Team_Consistency,mom,avg_first_ing,avg_second_ing,bharosa,bharosa2
0,Tymal Mills,621,Bowler,Fast-Medium,Left-Arm,,,18,8.0,5.0,0.277778,,0.453704,1.166667,4.62963,5.0,5.0,0.0,5,1.0,0.0,1.5,0.666667,0.8,28.8
1,Aniket Choudhary,480,Bowler,Medium,Left-Arm,30.0,India,17,8.058824,5.0,0.294118,2.0,0.480392,1.117647,4.901961,5.0,5.0,0.0,5,1.5,0.0,1.5,0.666667,0.6,27.4
2,Yuzvendra Chahal,682,Bowler,Legbreak,Right-Arm,600.0,India,142,7.873239,58.0,0.408451,1.0,0.422535,0.866197,6.807512,14.0,13.0,1.0,40,4.0,0.0,1.421053,1.47619,0.7,19.275862
3,Sreenath Aravind,237,Bowler,Fast-Medium,Left-Arm,,,79,7.632911,24.0,0.303797,1.0,0.405063,0.898734,5.063291,5.0,10.0,1.0,24,6.0,0.0,0.9,1.071429,0.541667,25.125
4,Shane Watson,175,Batting Allrounder,Fast-Medium,Right-Arm,400.0,Australia,113,8.442478,31.0,0.274336,1.0,0.410029,0.982301,4.572271,5.0,8.0,1.0,34,3.666667,2.0,0.8,1.0,0.588235,30.774194


In [72]:
batsman.head()

Unnamed: 0,batsman,Average,Season_played,total_runs,100s,50s,30+,total_innings,ability,dot%,power_hitting,consistency,avg_first_ing,avg_second_ing,CSK,DD,GL,KKR,KXI,MI,RCB,RPS,RR,SRH,top_order,Team_Consistency,current_rank,Price,country,last_season_runs,role,mom
0,Aaron Finch,134.082397,9.0,716,0,7,10,29,0.344828,41.7603,0.194757,28.64,19.538462,38.5,0.0,33.67,0.0,21.2,24.25,33.5,33.0,48.0,10.0,26.5,1,1.285714,4.0,620.0,Australia,300.0,Batsman,1.517241
1,Aditya Tare,68.965517,9.0,40,0,0,0,7,0.0,62.068966,0.086207,5.714286,5.0,6.25,0.0,0.0,0.0,7.0,3.5,0.0,18.0,4.0,0.0,0.0,1,3.0,999.0,20.0,India,18.0,WK-Batsman,0.0
2,Ambati Rayudu,126.296959,9.0,706,0,4,11,31,0.354839,35.420394,0.150268,28.24,28.571429,27.818182,54.5,58.5,20.0,20.6,27.33,0.0,22.25,13.2,0.0,22.33,1,4.5,999.0,220.0,India,91.0,WK-Batsman,1.032258
3,Corey Anderson,112.280702,4.0,256,0,2,4,15,0.266667,50.438596,0.149123,25.6,34.6,16.6,4.0,0.0,24.0,57.0,31.0,5.0,3.0,5.0,50.0,0.0,0,2.0,73.0,,,142.0,Batting Allrounder,1.2
4,Rohit Sharma,131.850354,11.0,1304,0,11,18,46,0.391304,34.782609,0.169869,33.435897,27.136364,41.588235,34.25,31.17,27.33,83.75,9.2,0.0,40.0,35.6,13.5,22.4,1,5.5,17.0,1500.0,India,333.0,Batsman,1.826087


In [73]:
## adding player id's
temp = players.groupby('Player')['player_id'].apply(lambda x: x.head(1)).reset_index().drop('level_1',axis=1).rename(columns=
                                                                                                             {'Player':'batsman'})
batsman = batsman.merge(temp, on ='batsman', how='left')

In [74]:
## saving batsman and bolwer features
batsman.to_csv('saved/batsman.csv', index=False)
bowler.to_csv('saved/bowler.csv', index=False)