# Loading Data:

In [1]:
import pandas as pd
import numpy as np

In [2]:
# Display floats that hold whole numbers without a decimal part and everything
# else in compact '{:g}' form.  float.is_integer() is False for inf and nan, so
# those values fall through to '{:g}' instead of raising from int(x)
# (OverflowError for inf, ValueError for nan).
pd.set_option(
    'display.float_format',
    lambda x: '{:.0f}'.format(x) if float(x).is_integer() else '{:g}'.format(x),
)

In [3]:
# Read the two raw CSV exports (one row per match, one row per delivery);
# the paths are relative to the notebook's working directory.
matches_path, deliveries_path = '../data/raw/matches.csv', '../data/raw/deliveries.csv'
matches = pd.read_csv(matches_path)
deliveries = pd.read_csv(deliveries_path)

# Matches Data Cleaning:

In [4]:
matches.head()

Unnamed: 0,id,season,city,date,match_type,player_of_match,venue,team1,team2,toss_winner,toss_decision,winner,result,result_margin,target_runs,target_overs,super_over,method,umpire1,umpire2
0,335982,2007/08,Bangalore,2008-04-18,League,BB McCullum,M Chinnaswamy Stadium,Royal Challengers Bangalore,Kolkata Knight Riders,Royal Challengers Bangalore,field,Kolkata Knight Riders,runs,140,223,20,N,,Asad Rauf,RE Koertzen
1,335983,2007/08,Chandigarh,2008-04-19,League,MEK Hussey,"Punjab Cricket Association Stadium, Mohali",Kings XI Punjab,Chennai Super Kings,Chennai Super Kings,bat,Chennai Super Kings,runs,33,241,20,N,,MR Benson,SL Shastri
2,335984,2007/08,Delhi,2008-04-19,League,MF Maharoof,Feroz Shah Kotla,Delhi Daredevils,Rajasthan Royals,Rajasthan Royals,bat,Delhi Daredevils,wickets,9,130,20,N,,Aleem Dar,GA Pratapkumar
3,335985,2007/08,Mumbai,2008-04-20,League,MV Boucher,Wankhede Stadium,Mumbai Indians,Royal Challengers Bangalore,Mumbai Indians,bat,Royal Challengers Bangalore,wickets,5,166,20,N,,SJ Davis,DJ Harper
4,335986,2007/08,Kolkata,2008-04-20,League,DJ Hussey,Eden Gardens,Kolkata Knight Riders,Deccan Chargers,Deccan Chargers,bat,Kolkata Knight Riders,wickets,5,111,20,N,,BF Bowden,K Hariharan


In [5]:
matches.describe()

Unnamed: 0,id,result_margin,target_runs,target_overs
count,1095.0,1076.0,1092.0,1092.0
mean,904828.0,17.2593,165.684,19.7593
std,367740.0,21.7874,33.427,1.58111
min,335982.0,1.0,43.0,5.0
25%,548332.0,6.0,146.0,20.0
50%,980961.0,8.0,166.0,20.0
75%,1254060.0,20.0,187.0,20.0
max,1426312.0,146.0,288.0,20.0


In [6]:
matches.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1095 entries, 0 to 1094
Data columns (total 20 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   id               1095 non-null   int64  
 1   season           1095 non-null   object 
 2   city             1044 non-null   object 
 3   date             1095 non-null   object 
 4   match_type       1095 non-null   object 
 5   player_of_match  1090 non-null   object 
 6   venue            1095 non-null   object 
 7   team1            1095 non-null   object 
 8   team2            1095 non-null   object 
 9   toss_winner      1095 non-null   object 
 10  toss_decision    1095 non-null   object 
 11  winner           1090 non-null   object 
 12  result           1095 non-null   object 
 13  result_margin    1076 non-null   float64
 14  target_runs      1092 non-null   float64
 15  target_overs     1092 non-null   float64
 16  super_over       1095 non-null   object 
 17  method        

In [7]:
def latest_teams(df, cols):
    """Rewrite variant or former team names to one canonical spelling.

    Every column named in ``cols`` is passed through ``Series.replace`` with a
    fixed name mapping; values that are not keys of the mapping are left as
    they are.  ``df`` is modified in place and is also returned.

    Args:
        df: DataFrame that contains the team-name columns.
        cols: A column name, or an iterable of column names, to normalise.

    Returns:
        The same ``df`` object, with the listed columns updated.

    Raises:
        KeyError: If a requested column is not in ``df``.  All columns are
            checked before anything is modified, so a failed call leaves
            ``df`` untouched.
    """
    # Variant name -> canonical name.  Note that the 'Bengaluru' spelling is
    # folded into 'Royal Challengers Bangalore', so both spellings end up as a
    # single team.
    team_name_map = {
        'Deccan Chargers': 'Sunrisers Hyderabad',
        'Delhi Daredevils': 'Delhi Capitals',
        'Royal Challengers Bengaluru': 'Royal Challengers Bangalore',
        'Kings XI Punjab': 'Punjab Kings',
        'Rising Pune Supergiants': 'Rising Pune Supergiant',
        'Pune Warriors': 'Pune Warriors India'
    }

    # A bare string would otherwise be iterated character by character.
    cols = [cols] if isinstance(cols, str) else list(cols)

    # Validate up front so an unknown column cannot leave df half-updated.
    for col in cols:
        if col not in df.columns:
            raise KeyError(f"Column '{col}' not found in DataFrame")

    for col in cols:
        df[col] = df[col].replace(team_name_map)

    return df

In [8]:
def unique_stadium(matches_df):
    """Collapse variant venue names in the ``venue`` column to one name each.

    The lookup is an exact string match on the lowercase ``venue`` column, so
    this must run before ``title()`` renames the columns.  Venues that are not
    keys of the mapping are left unchanged.

    Args:
        matches_df: DataFrame with a ``venue`` column; modified in place.

    Returns:
        The same ``matches_df`` object, returned for consistency with the
        other cleaning helpers.  Callers that ignore the return value still
        see the in-place update.

    Raises:
        KeyError: If ``matches_df`` has no ``venue`` column.
    """
    # Variant venue string -> canonical venue name.
    venue_map = {
        'Arun Jaitley Stadium, Delhi': 'Arun Jaitley Stadium',
        'Brabourne Stadium, Mumbai': 'Brabourne Stadium',
        'Dr DY Patil Sports Academy, Mumbai': 'Dr DY Patil Sports Academy',
        'Dr. Y.S. Rajasekhara Reddy ACA-VDCA Cricket Stadium, Visakhapatnam': 'Dr. Y.S. Rajasekhara Reddy ACA-VDCA Cricket Stadium',
        'Eden Gardens, Kolkata': 'Eden Gardens',
        'Himachal Pradesh Cricket Association Stadium, Dharamsala': 'Himachal Pradesh Cricket Association Stadium',
        'M.Chinnaswamy Stadium': 'M Chinnaswamy Stadium',
        'M Chinnaswamy Stadium, Bengaluru': 'M Chinnaswamy Stadium',
        'M Chinnaswamy Stadium, Bengalore': 'M Chinnaswamy Stadium',
        'MA Chidambaram Stadium, Chepauk': 'MA Chidambaram Stadium',
        'MA Chidambaram Stadium, Chepauk, Chennai': 'MA Chidambaram Stadium',
        'Maharashtra Cricket Association Stadium, Pune': 'Maharashtra Cricket Association Stadium',
        'Punjab Cricket Association Stadium, Mohali': 'Punjab Cricket Association IS Bindra Stadium',
        'Punjab Cricket Association IS Bindra Stadium': 'Punjab Cricket Association IS Bindra Stadium',
        'Punjab Cricket Association IS Bindra Stadium, Mohali': 'Punjab Cricket Association IS Bindra Stadium',
        'Punjab Cricket Association IS Bindra Stadium, Mohali, Chandigarh': 'Punjab Cricket Association IS Bindra Stadium',
        'Rajiv Gandhi International Stadium, Uppal': 'Rajiv Gandhi International Stadium',
        'Rajiv Gandhi International Stadium, Uppal, Hyderabad': 'Rajiv Gandhi International Stadium',
        'Sawai Mansingh Stadium, Jaipur': 'Sawai Mansingh Stadium',
        'Wankhede Stadium, Mumbai': 'Wankhede Stadium',
        'Feroz Shah Kotla': 'Arun Jaitley Stadium',
        'Zayed Cricket Stadium, Abu Dhabi': 'Sheikh Zayed Stadium',
        'Sardar Patel Stadium, Motera': 'Narendra Modi Stadium',
        'Narendra Modi Stadium, Ahmedabad': 'Narendra Modi Stadium',
        'Bharat Ratna Shri Atal Bihari Vajpayee Ekana Cricket Stadium, Lucknow': 'Bharat Ratna Shri Atal Bihari Vajpayee Ekana Cricket Stadium',
        'Barsapara Cricket Stadium, Guwahati': 'Barsapara Cricket Stadium',
        'Maharaja Yadavindra Singh International Cricket Stadium, Mullanpur': 'Maharaja Yadavindra Singh International Cricket Stadium',
        'Vidarbha Cricket Association Stadium, Jamtha': 'Vidarbha Cricket Association Stadium'
    }
    matches_df['venue'] = matches_df['venue'].replace(venue_map)
    return matches_df

In [9]:
def trimSpaceInValues(df):
    """Strip leading and trailing whitespace from string values in object columns.

    Only elements that are actual ``str`` instances are changed.  Non-string
    values stored in an object column (numbers, ``datetime.date`` objects,
    missing values) are kept as they are; calling ``.str.strip()`` directly
    would turn them into NaN, or raise when the column holds no strings at all.

    Args:
        df: DataFrame to clean; modified in place.

    Returns:
        The same ``df`` object.
    """
    for col in df.columns:
        if df[col].dtype == 'object':
            is_str = df[col].map(lambda value: isinstance(value, str))
            # Skip columns with no strings: the .str accessor rejects them.
            if is_str.any():
                # Keep the stripped text where the element is a string and the
                # original element everywhere else.
                df[col] = df[col].str.strip().where(is_str, df[col])
    return df

In [10]:
def title(matches):
    """Title-case the column labels and the text values of a DataFrame.

    Column labels are stripped and title-cased (``'toss_winner'`` becomes
    ``'Toss_Winner'``).  String values in object columns are then stripped and
    title-cased, except in columns that hold proper names, whose original
    capitalisation is kept.

    ``Player_Dismissed`` and ``Fielder`` are part of that exception list: they
    hold player names, and title-casing them would turn e.g. ``'MS Dhoni'``
    into ``'Ms Dhoni'``, which would no longer match the untouched ``Batter``
    and ``Bowler`` columns.

    Args:
        matches: DataFrame to normalise; modified in place.

    Returns:
        The same ``matches`` object.
    """
    # Title-cased labels of the columns whose values must not be re-cased.
    keep_as_is = {
        'Umpire1', 'Umpire2', 'Player_Of_Match', 'Venue', 'Batter', 'Bowler',
        'Non_Striker', 'Player_Dismissed', 'Fielder', 'Time',
    }

    matches.columns = matches.columns.str.strip().str.title()

    for col in matches.columns:
        if col in keep_as_is:
            continue
        if matches[col].dtype == 'object':
            is_str = matches[col].map(lambda value: isinstance(value, str))
            # Only touch real strings; .str methods would replace any other
            # value with NaN, and reject columns that contain no strings.
            if is_str.any():
                recased = matches[col].str.strip().str.title()
                matches[col] = recased.where(is_str, matches[col])
    return matches

In [11]:
# Strip stray whitespace first so the exact-match team and venue lookups see
# clean keys.  title() renames the columns (and re-cases values), so it has to
# run after the helpers that address columns by their lowercase names.
matches = trimSpaceInValues(matches)
matches = latest_teams(matches, ['team1', 'team2', 'toss_winner', 'winner'])
unique_stadium(matches)  # updates matches['venue'] in place
matches = title(matches)

In [12]:
matches.head()

Unnamed: 0,Id,Season,City,Date,Match_Type,Player_Of_Match,Venue,Team1,Team2,Toss_Winner,Toss_Decision,Winner,Result,Result_Margin,Target_Runs,Target_Overs,Super_Over,Method,Umpire1,Umpire2
0,335982,2007/08,Bangalore,2008-04-18,League,BB McCullum,M Chinnaswamy Stadium,Royal Challengers Bangalore,Kolkata Knight Riders,Royal Challengers Bangalore,Field,Kolkata Knight Riders,Runs,140,223,20,N,,Asad Rauf,RE Koertzen
1,335983,2007/08,Chandigarh,2008-04-19,League,MEK Hussey,Punjab Cricket Association IS Bindra Stadium,Punjab Kings,Chennai Super Kings,Chennai Super Kings,Bat,Chennai Super Kings,Runs,33,241,20,N,,MR Benson,SL Shastri
2,335984,2007/08,Delhi,2008-04-19,League,MF Maharoof,Arun Jaitley Stadium,Delhi Capitals,Rajasthan Royals,Rajasthan Royals,Bat,Delhi Capitals,Wickets,9,130,20,N,,Aleem Dar,GA Pratapkumar
3,335985,2007/08,Mumbai,2008-04-20,League,MV Boucher,Wankhede Stadium,Mumbai Indians,Royal Challengers Bangalore,Mumbai Indians,Bat,Royal Challengers Bangalore,Wickets,5,166,20,N,,SJ Davis,DJ Harper
4,335986,2007/08,Kolkata,2008-04-20,League,DJ Hussey,Eden Gardens,Kolkata Knight Riders,Sunrisers Hyderabad,Sunrisers Hyderabad,Bat,Kolkata Knight Riders,Wickets,5,111,20,N,,BF Bowden,K Hariharan


In [13]:
all_teams = np.union1d(matches['Team1'].unique(), matches['Team2'].unique())
all_teams

array(['Chennai Super Kings', 'Delhi Capitals', 'Gujarat Lions',
       'Gujarat Titans', 'Kochi Tuskers Kerala', 'Kolkata Knight Riders',
       'Lucknow Super Giants', 'Mumbai Indians', 'Pune Warriors India',
       'Punjab Kings', 'Rajasthan Royals', 'Rising Pune Supergiant',
       'Royal Challengers Bangalore', 'Sunrisers Hyderabad'], dtype=object)

In [14]:
all_venues = matches['Venue'].unique()
all_venues

array(['M Chinnaswamy Stadium',
       'Punjab Cricket Association IS Bindra Stadium',
       'Arun Jaitley Stadium', 'Wankhede Stadium', 'Eden Gardens',
       'Sawai Mansingh Stadium', 'Rajiv Gandhi International Stadium',
       'MA Chidambaram Stadium', 'Dr DY Patil Sports Academy', 'Newlands',
       "St George's Park", 'Kingsmead', 'SuperSport Park', 'Buffalo Park',
       'New Wanderers Stadium', 'De Beers Diamond Oval',
       'OUTsurance Oval', 'Brabourne Stadium', 'Narendra Modi Stadium',
       'Barabati Stadium', 'Vidarbha Cricket Association Stadium',
       'Himachal Pradesh Cricket Association Stadium', 'Nehru Stadium',
       'Holkar Cricket Stadium',
       'Dr. Y.S. Rajasekhara Reddy ACA-VDCA Cricket Stadium',
       'Subrata Roy Sahara Stadium',
       'Maharashtra Cricket Association Stadium',
       'Shaheed Veer Narayan Singh International Stadium',
       'JSCA International Stadium Complex', 'Sheikh Zayed Stadium',
       'Sharjah Cricket Stadium', 'Dubai Intern

In [15]:
# Parse the date strings and keep only the calendar date (datetime.date objects).
matches['Date'] = pd.to_datetime(matches['Date']).dt.date
matches['Date'][0]

datetime.date(2008, 4, 18)

In [16]:
# Season is overwritten with the calendar year of the match date, replacing
# labels such as '2007/08' with a single integer year.
parsed_dates = pd.to_datetime(matches['Date'])
matches['Date'] = parsed_dates
matches['Season'] = parsed_dates.dt.year

In [17]:
# Drop the time component again so Date holds plain calendar dates.
date_only = matches['Date'].dt.date
matches['Date'] = date_only
matches.head()

Unnamed: 0,Id,Season,City,Date,Match_Type,Player_Of_Match,Venue,Team1,Team2,Toss_Winner,Toss_Decision,Winner,Result,Result_Margin,Target_Runs,Target_Overs,Super_Over,Method,Umpire1,Umpire2
0,335982,2008,Bangalore,2008-04-18,League,BB McCullum,M Chinnaswamy Stadium,Royal Challengers Bangalore,Kolkata Knight Riders,Royal Challengers Bangalore,Field,Kolkata Knight Riders,Runs,140,223,20,N,,Asad Rauf,RE Koertzen
1,335983,2008,Chandigarh,2008-04-19,League,MEK Hussey,Punjab Cricket Association IS Bindra Stadium,Punjab Kings,Chennai Super Kings,Chennai Super Kings,Bat,Chennai Super Kings,Runs,33,241,20,N,,MR Benson,SL Shastri
2,335984,2008,Delhi,2008-04-19,League,MF Maharoof,Arun Jaitley Stadium,Delhi Capitals,Rajasthan Royals,Rajasthan Royals,Bat,Delhi Capitals,Wickets,9,130,20,N,,Aleem Dar,GA Pratapkumar
3,335985,2008,Mumbai,2008-04-20,League,MV Boucher,Wankhede Stadium,Mumbai Indians,Royal Challengers Bangalore,Mumbai Indians,Bat,Royal Challengers Bangalore,Wickets,5,166,20,N,,SJ Davis,DJ Harper
4,335986,2008,Kolkata,2008-04-20,League,DJ Hussey,Eden Gardens,Kolkata Knight Riders,Sunrisers Hyderabad,Sunrisers Hyderabad,Bat,Kolkata Knight Riders,Wickets,5,111,20,N,,BF Bowden,K Hariharan


In [18]:
matches.isna().sum()

Id                    0
Season                0
City                 51
Date                  0
Match_Type            0
Player_Of_Match       5
Venue                 0
Team1                 0
Team2                 0
Toss_Winner           0
Toss_Decision         0
Winner                5
Result                0
Result_Margin        19
Target_Runs           3
Target_Overs          3
Super_Over            0
Method             1074
Umpire1               0
Umpire2               0
dtype: int64

In [19]:
matches[matches['City'].isna()]['Venue'].unique()

array(['Sharjah Cricket Stadium', 'Dubai International Cricket Stadium'],
      dtype=object)

In [20]:
def missing_city(matches):
    """Fill empty ``City`` cells with the first word of the row's ``Venue``.

    For the two venues that lack a city in this dataset ('Sharjah Cricket
    Stadium' and 'Dubai International Cricket Stadium') the first word is the
    city name.  Rows that already have a city are left untouched.

    Args:
        matches: DataFrame with ``City`` and ``Venue`` columns; modified in place.

    Returns:
        The same ``matches`` object.
    """
    no_city = matches['City'].isnull()
    # First whitespace-separated token of each affected venue name.
    leading_word = matches.loc[no_city, 'Venue'].str.split().str.get(0)
    matches.loc[no_city, 'City'] = leading_word
    return matches

In [21]:
matches = missing_city(matches)

In [22]:
def handle_missing_values_matches(matches):
    """Replace the remaining missing values in the matches table with placeholders.

    Missing ``Method`` becomes ``'Normal'``; every other listed column gets
    ``'No Result'``.

    NOTE(review): ``Result_Margin``, ``Target_Runs`` and ``Target_Overs`` are
    float columns, so filling them with a string leaves them holding a mix of
    numbers and text.  Confirm that later steps expect that before doing
    arithmetic on these columns.

    Args:
        matches: DataFrame with the title-cased match columns; modified in place.

    Returns:
        The same ``matches`` object.
    """
    placeholders = {
        'Player_Of_Match': 'No Result',
        'Winner': 'No Result',
        'Result_Margin': 'No Result',
        'Target_Runs': 'No Result',
        'Target_Overs': 'No Result',
        'Method': 'Normal',
    }
    for column, placeholder in placeholders.items():
        matches[column] = matches[column].fillna(placeholder)
    return matches

In [23]:
matches = handle_missing_values_matches(matches)

In [24]:
matches.isna().sum()

Id                 0
Season             0
City               0
Date               0
Match_Type         0
Player_Of_Match    0
Venue              0
Team1              0
Team2              0
Toss_Winner        0
Toss_Decision      0
Winner             0
Result             0
Result_Margin      0
Target_Runs        0
Target_Overs       0
Super_Over         0
Method             0
Umpire1            0
Umpire2            0
dtype: int64

# Deliveries Data Cleaning:

In [25]:
deliveries.head()

Unnamed: 0,match_id,inning,batting_team,bowling_team,over,ball,batter,bowler,non_striker,batsman_runs,extra_runs,total_runs,extras_type,is_wicket,player_dismissed,dismissal_kind,fielder
0,335982,1,Kolkata Knight Riders,Royal Challengers Bangalore,0,1,SC Ganguly,P Kumar,BB McCullum,0,1,1,legbyes,0,,,
1,335982,1,Kolkata Knight Riders,Royal Challengers Bangalore,0,2,BB McCullum,P Kumar,SC Ganguly,0,0,0,,0,,,
2,335982,1,Kolkata Knight Riders,Royal Challengers Bangalore,0,3,BB McCullum,P Kumar,SC Ganguly,0,1,1,wides,0,,,
3,335982,1,Kolkata Knight Riders,Royal Challengers Bangalore,0,4,BB McCullum,P Kumar,SC Ganguly,0,0,0,,0,,,
4,335982,1,Kolkata Knight Riders,Royal Challengers Bangalore,0,5,BB McCullum,P Kumar,SC Ganguly,0,0,0,,0,,,


In [26]:
deliveries.describe()

Unnamed: 0,match_id,inning,over,ball,batsman_runs,extra_runs,total_runs,is_wicket
count,260920,260920.0,260920.0,260920.0,260920.0,260920.0,260920.0,260920.0
mean,907067,1.48353,9.19768,3.62449,1.265,0.0678062,1.33281,0.0496321
std,367991,0.502643,5.68348,1.81492,1.6393,0.343265,1.62642,0.217184
min,335982,1.0,0.0,1.0,0.0,0.0,0.0,0.0
25%,548334,1.0,4.0,2.0,0.0,0.0,0.0,0.0
50%,980967,1.0,9.0,4.0,1.0,0.0,1.0,0.0
75%,1254066,2.0,14.0,5.0,1.0,0.0,1.0,0.0
max,1426312,6.0,19.0,11.0,6.0,7.0,7.0,1.0


In [27]:
deliveries.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 260920 entries, 0 to 260919
Data columns (total 17 columns):
 #   Column            Non-Null Count   Dtype 
---  ------            --------------   ----- 
 0   match_id          260920 non-null  int64 
 1   inning            260920 non-null  int64 
 2   batting_team      260920 non-null  object
 3   bowling_team      260920 non-null  object
 4   over              260920 non-null  int64 
 5   ball              260920 non-null  int64 
 6   batter            260920 non-null  object
 7   bowler            260920 non-null  object
 8   non_striker       260920 non-null  object
 9   batsman_runs      260920 non-null  int64 
 10  extra_runs        260920 non-null  int64 
 11  total_runs        260920 non-null  int64 
 12  extras_type       14125 non-null   object
 13  is_wicket         260920 non-null  int64 
 14  player_dismissed  12950 non-null   object
 15  dismissal_kind    12950 non-null   object
 16  fielder           9354 non-null    obj

In [28]:
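# The missing values here are expected from domain knowledge: extras_type, player_dismissed, dismissal_kind and fielder are only filled in for deliveries that involve an extra or a wicket. They do not introduce bias, so no imputation is needed and the data is ready to use.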

In [29]:
# Strip stray whitespace first so the exact-match team-name lookup sees clean
# keys.  title() renames the columns, so it runs after latest_teams(), which
# addresses them by their lowercase names.
deliveries = trimSpaceInValues(deliveries)
deliveries = latest_teams(deliveries, ['batting_team', 'bowling_team'])
deliveries = title(deliveries)