# Loading Data:

In [1]:
import numpy as np
import pandas as pd
import import_ipynb
from datetime import datetime, timedelta

In [2]:
from P01_Pre_Processing import matches, trimSpaceInValues, title, latest_teams

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1095 entries, 0 to 1094
Data columns (total 20 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   id               1095 non-null   int64  
 1   season           1095 non-null   object 
 2   city             1044 non-null   object 
 3   date             1095 non-null   object 
 4   match_type       1095 non-null   object 
 5   player_of_match  1090 non-null   object 
 6   venue            1095 non-null   object 
 7   team1            1095 non-null   object 
 8   team2            1095 non-null   object 
 9   toss_winner      1095 non-null   object 
 10  toss_decision    1095 non-null   object 
 11  winner           1090 non-null   object 
 12  result           1095 non-null   object 
 13  result_margin    1076 non-null   float64
 14  target_runs      1092 non-null   float64
 15  target_overs     1092 non-null   float64
 16  super_over       1095 non-null   object 
 17  method        

In [3]:
all_matches = pd.read_csv('../data/raw/all_matches.csv')

In [4]:
all_matches = latest_teams(all_matches, ['Team 1', 'Team 2'])
all_matches = trimSpaceInValues(all_matches)
all_matches = title(all_matches)


In [5]:
all_matches.head()

Unnamed: 0,Match,Team 1,Team 2,Date,Time,Season
0,Match 1,Royal Challengers Bangalore,Kolkata Knight Riders,2008-04-18,8:00 pm,2008
1,Match 2,Punjab Kings,Chennai Super Kings,2008-04-19,4:00 pm,2008
2,Match 3,Delhi Capitals,Rajasthan Royals,2008-04-19,8:00 pm,2008
3,Match 4,Kolkata Knight Riders,Sunrisers Hyderabad,2008-04-20,4:00 pm,2008
4,Match 5,Mumbai Indians,Royal Challengers Bangalore,2008-04-20,8:00 pm,2008


In [6]:
matches.head()

Unnamed: 0,Id,Season,City,Date,Match_Type,Player_Of_Match,Venue,Team1,Team2,Toss_Winner,Toss_Decision,Winner,Result,Result_Margin,Target_Runs,Target_Overs,Super_Over,Method,Umpire1,Umpire2
0,335982,2008,Bangalore,2008-04-18,League,BB McCullum,M Chinnaswamy Stadium,Royal Challengers Bangalore,Kolkata Knight Riders,Royal Challengers Bangalore,Field,Kolkata Knight Riders,Runs,140,223,20,N,Normal,Asad Rauf,RE Koertzen
1,335983,2008,Chandigarh,2008-04-19,League,MEK Hussey,Punjab Cricket Association IS Bindra Stadium,Punjab Kings,Chennai Super Kings,Chennai Super Kings,Bat,Chennai Super Kings,Runs,33,241,20,N,Normal,MR Benson,SL Shastri
2,335984,2008,Delhi,2008-04-19,League,MF Maharoof,Arun Jaitley Stadium,Delhi Capitals,Rajasthan Royals,Rajasthan Royals,Bat,Delhi Capitals,Wickets,9,130,20,N,Normal,Aleem Dar,GA Pratapkumar
3,335985,2008,Mumbai,2008-04-20,League,MV Boucher,Wankhede Stadium,Mumbai Indians,Royal Challengers Bangalore,Mumbai Indians,Bat,Royal Challengers Bangalore,Wickets,5,166,20,N,Normal,SJ Davis,DJ Harper
4,335986,2008,Kolkata,2008-04-20,League,DJ Hussey,Eden Gardens,Kolkata Knight Riders,Sunrisers Hyderabad,Sunrisers Hyderabad,Bat,Kolkata Knight Riders,Wickets,5,111,20,N,Normal,BF Bowden,K Hariharan


# Processing Data:

In [7]:
all_matches['Time'].unique()

array(['8:00 pm', '4:00 pm', '6:30 pm', '2:30 pm', '7:00 pm', '7:30 pm',
       '07:30 PM', '03:30 PM', '3:30 pm'], dtype=object)

In [8]:
def clean_time(time):
    if pd.isna(time):
        return None
    time = str(time).strip().lower()
    time = time.replace('.', '')
    time = time.replace(' ', '')
    time = time.replace('am', ' am').replace('pm', ' pm')
    time = time.lstrip('0') 
    return time


In [9]:
all_matches['Time'] = all_matches['Time'].apply(clean_time)

In [10]:
# After careful examination, from espncricinfo.com and iplt20.com, we figured out the Matc 24 data was missing in iplt20.com data

In [11]:
# --- Step 1: Create the new match row ---
new_row = {
    'Match': 'Match 24',
    'Team 1': 'Chennai Super Kings',
    'Team 2': 'Pune Warriors India',
    'Date': pd.to_datetime('2012-04-19'),
    'Time': '8:00 pm',
    'Season': 2012
}

In [12]:
# --- Step 2: Insert at index 275 ---
before = all_matches.iloc[:275]
after = all_matches.iloc[275:]
all_matches = pd.concat([before, pd.DataFrame([new_row]), after], ignore_index=True)

In [13]:
# --- Step 3: Extract numeric match number safely ---
all_matches['Match_Num'] = (
    all_matches['Match']
    .astype(str)
    .str.extract(r'(\d+)')[0]
)
all_matches['Match_Num'] = pd.to_numeric(all_matches['Match_Num'], errors='coerce')

In [14]:
# --- Step 4: Increment match numbers for Season 2012, >= 24, but not the new row ---
new_row_index = 275
mask = (
    (all_matches['Season'] == 2012) &
    (all_matches['Match_Num'] >= 24) &
    (all_matches.index != new_row_index)
)
all_matches.loc[mask, 'Match_Num'] = all_matches.loc[mask, 'Match_Num'] + 1

In [15]:
# --- Step 5: Rebuild safely ---
all_matches['Match_Num'] = all_matches['Match_Num'].astype('Int64')
all_matches.loc[all_matches['Match_Num'].notna(), 'Match'] = 'Match ' + all_matches['Match_Num'].astype(str)
all_matches.drop(columns=['Match_Num'], inplace=True)

In [16]:
# Due to rain, reserve day was used. Matches uses the scheduled date

In [17]:
matches.loc[matches['Id'] == 734043, 'Date'] = pd.to_datetime('2014-05-28')

In [18]:
# Step 1: Convert to datetime safely
all_matches['Date'] = pd.to_datetime(all_matches['Date'], errors='coerce')

# Step 2:Keep only the date part (drop the time component)
all_matches['Date'] = all_matches['Date'].dt.date

In [19]:
display(all_matches.loc[270:280])

Unnamed: 0,Match,Team 1,Team 2,Date,Time,Season
270,Match 19,Mumbai Indians,Delhi Capitals,2012-04-16,8:00 pm,2012
271,Match 20,Rajasthan Royals,Sunrisers Hyderabad,2012-04-17,4:00 pm,2012
272,Match 21,Royal Challengers Bangalore,Pune Warriors India,2012-04-17,8:00 pm,2012
273,Match 22,Punjab Kings,Kolkata Knight Riders,2012-04-18,8:00 pm,2012
274,Match 23,Delhi Capitals,Sunrisers Hyderabad,2012-04-19,4:00 pm,2012
275,Match 24,Chennai Super Kings,Pune Warriors India,2012-04-19,8:00 pm,2012
276,Match 25,Punjab Kings,Royal Challengers Bangalore,2012-04-20,8:00 pm,2012
277,Match 26,Chennai Super Kings,Rajasthan Royals,2012-04-21,4:00 pm,2012
278,Match 27,Delhi Capitals,Pune Warriors India,2012-04-21,8:00 pm,2012
279,Match 28,Mumbai Indians,Punjab Kings,2012-04-22,4:00 pm,2012


# Concatenating:

In [20]:
all_matches['Date'] = pd.to_datetime(all_matches['Date'])
all_matches['match_key'] = all_matches.apply(lambda x: tuple(sorted([x['Team 1'], x['Team 2']])), axis=1)
all_matches.rename(columns={'Match': 'Match_No', 'Time': 'Time'}, inplace=True)

In [21]:
matches['Date'] = pd.to_datetime(matches['Date'])
matches['match_key'] = matches.apply(lambda x: tuple(sorted([x['Team1'], x['Team2']])), axis=1)

In [22]:
def merge_match_data(all_matches = all_matches, matches = matches):
    """Merge both datasets using Season, Date, and team match_key."""
    
    merged = pd.merge(
        matches,
        all_matches[['Season', 'Date', 'match_key', 'Time', 'Match_No']],
        on=['Season', 'Date', 'match_key'],
        how='left',
        indicator=True
    )
    
    return merged

In [23]:
def find_unmatched_rows(merged, all_matches = all_matches, matches = matches):
    """Find rows that didn't match in either dataset."""
    used_all = merged.dropna(subset=['Match_No'])[['Season', 'Match_No', 'Date', 'match_key']]
    
    # Unmatched from matches
    unmatched_matches = merged[merged['_merge'] == 'left_only']
    
    # Unmatched from all_matches
    all_matches_keys = all_matches[['Season', 'Match_No', 'Date', 'match_key']]
    used_keys = used_all[['Season', 'Match_No', 'Date', 'match_key']]
    unmatched_all_matches = all_matches_keys.merge(used_keys, on=['Season', 'Match_No', 'Date', 'match_key'], how='left', indicator=True)
    unmatched_all_matches = unmatched_all_matches[unmatched_all_matches['_merge'] == 'left_only']

    unmatched_matches = pd.DataFrame(unmatched_matches)
    unmatched_all_matches = pd.DataFrame(unmatched_all_matches)
    
    return unmatched_matches, unmatched_all_matches

In [24]:
matches = merge_match_data(all_matches, matches)

unmatched_matches, unmatched_all_matches = find_unmatched_rows(matches, all_matches, matches)

In [25]:
print("✅ Merged dataset shape:", matches.shape)
print("❌ Unmatched in matches:", len(unmatched_matches))
print("❌ Unmatched in all_matches:", len(unmatched_all_matches))

✅ Merged dataset shape: (1095, 24)
❌ Unmatched in matches: 0
❌ Unmatched in all_matches: 11


In [26]:
for i in range(2008, 2025):
    count = len(all_matches[all_matches['Season'] == i])
    print(f"{i}: {count} matches")

2008: 59 matches
2009: 59 matches
2010: 60 matches
2011: 74 matches
2012: 76 matches
2013: 76 matches
2014: 60 matches
2015: 60 matches
2016: 60 matches
2017: 60 matches
2018: 60 matches
2019: 60 matches
2020: 60 matches
2021: 60 matches
2022: 74 matches
2023: 74 matches
2024: 74 matches


In [27]:
unmatched_all_matches

Unnamed: 0,Season,Match_No,Date,match_key,_merge
46,2008,Match 47,2008-05-22,"(Delhi Capitals, Kolkata Knight Riders)",left_only
65,2009,Match 7,2009-04-21,"(Mumbai Indians, Rajasthan Royals)",left_only
71,2009,Match 13,2009-04-25,"(Chennai Super Kings, Kolkata Knight Riders)",left_only
197,2011,Match 20,2011-04-19,"(Rajasthan Royals, Royal Challengers Bangalore)",left_only
283,2012,Match 32,2012-04-24,"(Kolkata Knight Riders, Sunrisers Hyderabad)",left_only
285,2012,Match 34,2012-04-25,"(Chennai Super Kings, Royal Challengers Bangal...",left_only
488,2015,Match 25,2015-04-26,"(Kolkata Knight Riders, Rajasthan Royals)",left_only
612,2017,Match 29,2017-04-25,"(Royal Challengers Bangalore, Sunrisers Hydera...",left_only
1094,2024,Match 63,2024-05-13,"(Gujarat Titans, Kolkata Knight Riders)",left_only
1097,2024,Match 66,2024-05-16,"(Gujarat Titans, Sunrisers Hyderabad)",left_only


# Local Time:

In [28]:
matches['City'].unique()

array(['Bangalore', 'Chandigarh', 'Delhi', 'Mumbai', 'Kolkata', 'Jaipur',
       'Hyderabad', 'Chennai', 'Cape Town', 'Port Elizabeth', 'Durban',
       'Centurion', 'East London', 'Johannesburg', 'Kimberley',
       'Bloemfontein', 'Ahmedabad', 'Cuttack', 'Nagpur', 'Dharamsala',
       'Kochi', 'Indore', 'Visakhapatnam', 'Pune', 'Raipur', 'Ranchi',
       'Abu Dhabi', 'Sharjah', 'Dubai', 'Rajkot', 'Kanpur', 'Bengaluru',
       'Navi Mumbai', 'Lucknow', 'Guwahati', 'Mohali'], dtype=object)

In [29]:
matches['Time'].unique()

array(['8:00 pm', '4:00 pm', '6:30 pm', '2:30 pm', '7:00 pm', '7:30 pm',
       '3:30 pm'], dtype=object)

In [30]:
city_to_offset = {
    # India (no offset)
    'Bangalore': 0, 'Chandigarh': 0, 'Delhi': 0, 'Mumbai': 0, 'Kolkata': 0,
    'Jaipur': 0, 'Hyderabad': 0, 'Chennai': 0, 'Ahmedabad': 0, 'Cuttack': 0,
    'Nagpur': 0, 'Dharamsala': 0, 'Kochi': 0, 'Indore': 0, 'Visakhapatnam': 0,
    'Pune': 0, 'Raipur': 0, 'Ranchi': 0, 'Rajkot': 0, 'Kanpur': 0,
    'Bengaluru': 0, 'Navi Mumbai': 0, 'Lucknow': 0, 'Guwahati': 0, 'Mohali': 0,

    # South Africa (2009)
    'Cape Town': -3.5, 'Port Elizabeth': -3.5, 'Durban': -3.5,
    'Centurion': -3.5, 'East London': -3.5, 'Johannesburg': -3.5,
    'Kimberley': -3.5, 'Bloemfontein': -3.5,

    # UAE (2014, 2020, 2021)
    'Abu Dhabi': -1.5, 'Dubai': -1.5, 'Sharjah': -1.5
}

In [31]:
def ist_to_local(time_str, city):
    if pd.isna(time_str):
        return None
    
    # normalize string
    time_str = str(time_str).strip().lower()
    offset = city_to_offset.get(city, 0)
    
    try:
        time = datetime.strptime(time_str, "%I:%M %p")
        local_t = time + timedelta(hours=offset)
        # Use %I (zero-padded), then strip leading zeros manually for Windows compatibility
        return local_t.strftime("%I:%M %p").lstrip("0").lower()
    except Exception as e:
        print(f"⚠️ Error parsing time '{time_str}' for city '{city}': {e}")
        return None

In [32]:
matches['Time'] = matches.apply(lambda x: ist_to_local(x['Time'], x['City']), axis=1)

In [33]:
matches

Unnamed: 0,Id,Season,City,Date,Match_Type,Player_Of_Match,Venue,Team1,Team2,Toss_Winner,...,Target_Runs,Target_Overs,Super_Over,Method,Umpire1,Umpire2,match_key,Time,Match_No,_merge
0,335982,2008,Bangalore,2008-04-18,League,BB McCullum,M Chinnaswamy Stadium,Royal Challengers Bangalore,Kolkata Knight Riders,Royal Challengers Bangalore,...,223,20,N,Normal,Asad Rauf,RE Koertzen,"(Kolkata Knight Riders, Royal Challengers Bang...",8:00 pm,Match 1,both
1,335983,2008,Chandigarh,2008-04-19,League,MEK Hussey,Punjab Cricket Association IS Bindra Stadium,Punjab Kings,Chennai Super Kings,Chennai Super Kings,...,241,20,N,Normal,MR Benson,SL Shastri,"(Chennai Super Kings, Punjab Kings)",4:00 pm,Match 2,both
2,335984,2008,Delhi,2008-04-19,League,MF Maharoof,Arun Jaitley Stadium,Delhi Capitals,Rajasthan Royals,Rajasthan Royals,...,130,20,N,Normal,Aleem Dar,GA Pratapkumar,"(Delhi Capitals, Rajasthan Royals)",8:00 pm,Match 3,both
3,335985,2008,Mumbai,2008-04-20,League,MV Boucher,Wankhede Stadium,Mumbai Indians,Royal Challengers Bangalore,Mumbai Indians,...,166,20,N,Normal,SJ Davis,DJ Harper,"(Mumbai Indians, Royal Challengers Bangalore)",8:00 pm,Match 5,both
4,335986,2008,Kolkata,2008-04-20,League,DJ Hussey,Eden Gardens,Kolkata Knight Riders,Sunrisers Hyderabad,Sunrisers Hyderabad,...,111,20,N,Normal,BF Bowden,K Hariharan,"(Kolkata Knight Riders, Sunrisers Hyderabad)",4:00 pm,Match 4,both
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1090,1426307,2024,Hyderabad,2024-05-19,League,Abhishek Sharma,Rajiv Gandhi International Stadium,Punjab Kings,Sunrisers Hyderabad,Punjab Kings,...,215,20,N,Normal,Nitin Menon,VK Sharma,"(Punjab Kings, Sunrisers Hyderabad)",3:30 pm,Match 69,both
1091,1426309,2024,Ahmedabad,2024-05-21,Qualifier 1,MA Starc,Narendra Modi Stadium,Sunrisers Hyderabad,Kolkata Knight Riders,Sunrisers Hyderabad,...,160,20,N,Normal,AK Chaudhary,R Pandit,"(Kolkata Knight Riders, Sunrisers Hyderabad)",7:30 pm,Match 1,both
1092,1426310,2024,Ahmedabad,2024-05-22,Eliminator,R Ashwin,Narendra Modi Stadium,Royal Challengers Bangalore,Rajasthan Royals,Rajasthan Royals,...,173,20,N,Normal,KN Ananthapadmanabhan,MV Saidharshan Kumar,"(Rajasthan Royals, Royal Challengers Bangalore)",7:30 pm,Eliminator,both
1093,1426311,2024,Chennai,2024-05-24,Qualifier 2,Shahbaz Ahmed,MA Chidambaram Stadium,Sunrisers Hyderabad,Rajasthan Royals,Rajasthan Royals,...,176,20,N,Normal,Nitin Menon,VK Sharma,"(Rajasthan Royals, Sunrisers Hyderabad)",7:30 pm,Match 2,both
