Exploring and Cleaning Deliveries

In [61]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [62]:
df = pd.read_csv("../Data/deliveries.csv")

In [63]:
df.shape

(225792, 12)

In [64]:
df.columns

Index(['batsman', 'bowler', 'over', 'ball', 'total_runs', 'batsman_runs',
       'extras', 'match_no', 'is_wicket', 'dismissal_type', 'fielder',
       'innings'],
      dtype='object')

In [65]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 225792 entries, 0 to 225791
Data columns (total 12 columns):
 #   Column          Non-Null Count   Dtype  
---  ------          --------------   -----  
 0   batsman         225792 non-null  object 
 1   bowler          225792 non-null  object 
 2   over            225792 non-null  int64  
 3   ball            225792 non-null  int64  
 4   total_runs      225792 non-null  int64  
 5   batsman_runs    225792 non-null  int64  
 6   extras          225792 non-null  int64  
 7   match_no        225792 non-null  int64  
 8   is_wicket       225792 non-null  float64
 9   dismissal_type  11122 non-null   object 
 10  fielder         8036 non-null    object 
 11  innings         225792 non-null  int64  
dtypes: float64(1), int64(7), object(4)
memory usage: 20.7+ MB


dismissal_Type and fielder have null values

In [66]:
df.describe()

Unnamed: 0,over,ball,total_runs,batsman_runs,extras,match_no,is_wicket,innings
count,225792.0,225792.0,225792.0,225792.0,225792.0,225792.0,225792.0,225792.0
mean,9.192195,3.610983,1.310077,1.243498,0.066579,476.825778,0.049262,1.847227
std,5.678512,1.80364,1.605784,1.617903,0.340369,274.666293,0.216415,0.35977
min,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0
25%,4.0,2.0,0.0,0.0,0.0,239.0,0.0,2.0
50%,9.0,4.0,1.0,1.0,0.0,476.0,0.0,2.0
75%,14.0,5.0,1.0,1.0,0.0,716.0,0.0,2.0
max,19.0,10.0,7.0,6.0,7.0,950.0,1.0,2.0


In [67]:
df.columns

Index(['batsman', 'bowler', 'over', 'ball', 'total_runs', 'batsman_runs',
       'extras', 'match_no', 'is_wicket', 'dismissal_type', 'fielder',
       'innings'],
      dtype='object')

In [68]:
df["dismissal_type"].unique()

array([nan, 'caught', 'bowled', 'run out', 'lbw', 'retired hurt',
       'stumped', 'caught and bowled', 'hit wicket',
       'obstructing the field', 'runout', 'retired out'], dtype=object)

In [69]:
df[df["dismissal_type"] == "runout"] = "run out"

In [70]:
df[(df["dismissal_type"] == "retired hurt")|(df["dismissal_type"] == "retired out")] = "retired"

In [71]:
df["dismissal_type"].unique()

array([nan, 'caught', 'bowled', 'run out', 'lbw', 'retired', 'stumped',
       'caught and bowled', 'hit wicket', 'obstructing the field'],
      dtype=object)

In [78]:
df["batsman"].unique()

array(['SC Ganguly', 'BB McCullum', 'RT Ponting', 'DJ Hussey',
       'Mohammad Hafeez', 'R Dravid', 'W Jaffer', 'V Kohli', 'JH Kallis',
       'CL White', 'MV Boucher', 'B Akhil', 'AA Noffke', 'P Kumar',
       'Z Khan', 'SB Joshi', 'PA Patel', 'ML Hayden', 'MEK Hussey',
       'MS Dhoni', 'SK Raina', 'JDP Oram', 'S Badrinath', 'K Goel',
       'JR Hopes', 'KC Sangakkara', 'Yuvraj Singh', 'SM Katich',
       'IK Pathan', 'T Kohli', 'YK Pathan', 'SR Watson', 'M Kaif',
       'DS Lehmann', 'RA Jadeja', 'M Rawat', 'D Salunkhe', 'SK Warne',
       'SK Trivedi', 'G Gambhir', 'V Sehwag', 'S Dhawan', 'AC Gilchrist',
       'Y Venugopal Rao', 'VVS Laxman', 'A Symonds', 'RG Sharma',
       'SB Styris', 'AS Yadav', 'SB Bangar', 'WPUJC Vaas', 'RP Singh',
       'WP Saha', 'LR Shukla', 'L Ronchi', 'ST Jayasuriya', 'DJ Thornely',
       'retired', 'RV Uthappa', 'PR Shah', 'AM Nayar', 'SM Pollock',
       'Harbhajan Singh', 'S Chanderpaul', 'LRPL Taylor',
       'DPMD Jayawardene', 'S Sohal', 'B Le

In [79]:
df["bowler"].unique()

array(['P Kumar', 'Z Khan', 'AA Noffke', 'JH Kallis', 'SB Joshi',
       'CL White', 'AB Dinda', 'I Sharma', 'AB Agarkar', 'SC Ganguly',
       'LR Shukla', 'B Lee', 'S Sreesanth', 'JR Hopes', 'IK Pathan',
       'K Goel', 'PP Chawla', 'WA Mota', 'JDP Oram', 'MS Gony',
       'M Muralitharan', 'P Amarnath', 'Joginder Sharma', 'GD McGrath',
       'B Geeves', 'MF Maharoof', 'R Bhatia', 'DL Vettori', 'MM Patel',
       'SR Watson', 'SK Trivedi', 'SK Warne', 'YK Pathan', 'D Salunkhe',
       'M Kartik', 'Mohammad Hafeez', 'DJ Hussey', 'WPUJC Vaas',
       'RP Singh', 'SB Styris', 'SB Bangar', 'A Symonds', 'PP Ojha',
       'retired', 'R Vinay Kumar', 'B Akhil', 'A Nehra', 'SM Pollock',
       'DS Kulkarni', 'ST Jayasuriya', 'Harbhajan Singh', 'AM Nayar',
       'Pankaj Singh', 'Mohammad Asif', 'VY Mahesh', 'Shahid Afridi',
       'DJ Bravo', 'VS Yeligati', 'MA Khote', 'D Kalyankrishna',
       'VRV Singh', 'Sohail Tanvir', 'A Kumble', 'DNT Zoysa',
       'SD Chitnis', 'Yuvraj Singh', 'Sho

In [72]:
df["fielder"].unique()

array([nan, "['JH Kallis']", "['P Kumar']", ..., 'Akash Deep',
       'TL Seifert', 'TU Deshpande'], dtype=object)

In [73]:
import re

In [74]:
df["fielder"] = df["fielder"].fillna("")

In [75]:
df["fielder"] = df["fielder"].apply(lambda x: re.sub("[^a-zA-Z]", "", x))

In [76]:
df["fielder"].unique()

array(['', 'JHKallis', 'PKumar', 'CLWhite', 'MKartik', 'RTPonting',
       'WPSaha', 'ABAgarkarWPSaha', 'BBMcCullum', 'KCSangakkara',
       'IKPathan', 'PAPatel', 'JoginderSharma', 'PAmarnath', 'MManhas',
       'BGeeves', 'LRShukla', 'MohammadHafeez', 'ABDinda', 'DJHussey',
       'SCGanguly', 'MohammadHafeezWPSaha', 'RGSharma', 'SBStyris',
       'ASymonds', 'VKohli', 'retired', 'ZKhan', 'MVBoucher', 'AMNayar',
       'LRonchi', 'KamranAkmal', 'RAJadejaKamranAkmal', 'PPChawla',
       'SDhawan', 'KDKarthik', 'MKTiwaryKDKarthik', 'GDMcGrath',
       'GGambhir', 'ACGilchrist', 'DJBravo', 'HarbhajanSingh',
       'SBadrinath', 'JDPOram', 'SKRaina', 'RAJadeja', 'DSalunkhe',
       'SBBangar', 'DKalyankrishna', 'VVSLaxmanRPSingh', 'DSKulkarni',
       'DJBravoLRonchi', 'AMRahanesub', 'ANehra', 'HarbhajanSinghLRonchi',
       'BLee', 'RRSarwan', 'YuvrajSingh', 'MMuralitharan',
       'MSDhoniJoginderSharma', 'ISharma', 'MMPatel', 'GCSmith', 'MRawat',
       'SohailTanvirMRawat', 'MRawatSR

In [77]:
df.to_csv("../Data/deliveries.csv", index = False)