# Loading Data:

In [None]:
import numpy as np
import pandas as pd
import import_ipynb
from datetime import datetime, timedelta

In [None]:
from P01_Pre_Processing import matches, trimSpaceInValues, title, latest_teams

In [None]:
# Read ../data/raw/all_matches.csv into a DataFrame (path is relative to the
# working directory the notebook is run from).
all_matches = pd.read_csv('../data/raw/all_matches.csv')

In [None]:
# Run the helpers imported from P01_Pre_Processing in a fixed order:
# latest_teams (on the two team columns) -> trimSpaceInValues -> title.
# Their exact effects are defined in that notebook.
all_matches = title(
    trimSpaceInValues(
        latest_teams(all_matches, ['Team 1', 'Team 2'])
    )
)


In [None]:
# Preview the first rows of all_matches after the pre-processing helpers ran.
all_matches.head()

In [None]:
# Preview the first rows of `matches`, which is imported from P01_Pre_Processing.
matches.head()

# Processing Data:

In [None]:
# List the distinct values of the Time column before any clean-up.
all_matches['Time'].unique()

In [None]:
def clean_time(time):
    """Normalise a clock-time string to the form ``'h:mm am'`` / ``'h:mm pm'``.

    Args:
        time: Raw time value; anything ``pd.isna`` treats as missing is
            accepted.

    Returns:
        ``None`` for missing input, otherwise the lower-cased string with
        surrounding/inner whitespace removed, punctuation dots dropped, a
        single space before ``am``/``pm`` and the hour's zero padding removed.
        Examples: ``'08:00 PM' -> '8:00 pm'``, ``'3.30 p.m.' -> '3:30 pm'``.
    """
    import re  # local import: the notebook's import cell does not load `re`

    if pd.isna(time):
        return None

    text = str(time).strip().lower()
    # A dot between two digits separates hours from minutes ("7.30 pm");
    # turn it into a colon before the remaining dots ("p.m.") are dropped,
    # otherwise the value collapses to "730 pm".
    text = re.sub(r'(?<=\d)\.(?=\d)', ':', text)
    text = text.replace('.', '').replace(' ', '')
    text = text.replace('am', ' am').replace('pm', ' pm')
    # Remove zero padding only while a digit follows, so the hour never
    # disappears completely (plain lstrip('0') turned "00:30 am" into ":30 am").
    return re.sub(r'^0+(?=\d)', '', text)


In [None]:
# Normalise every Time value with clean_time; missing values come back as None.
all_matches['Time'] = all_matches['Time'].apply(clean_time)

In [None]:
# After carefully comparing espncricinfo.com and iplt20.com, we found that Match 24 was missing from the iplt20.com data

In [None]:
# --- Step 1: Create the new match row ---
# Field/value pairs for the fixture that is inserted into all_matches below.
new_row = dict([
    ('Match', 'Match 24'),
    ('Team 1', 'Chennai Super Kings'),
    ('Team 2', 'Pune Warriors India'),
    ('Date', pd.to_datetime('2012-04-19')),
    ('Time', '8:00 pm'),
    ('Season', 2012),
])

In [None]:
# --- Step 2: Insert at index 275 ---
# Split the frame at position 275 and stitch it back together around the new
# row; ignore_index renumbers the result 0..n-1 so the new row ends up at 275.
before, after = all_matches.iloc[:275], all_matches.iloc[275:]
all_matches = pd.concat(
    [before, pd.DataFrame([new_row]), after],
    ignore_index=True,
)

In [None]:
# --- Step 3: Extract numeric match number safely ---
# Pull the first run of digits out of the Match label and convert it to a
# number; labels without digits end up as NaN in the helper column.
all_matches['Match_Num'] = pd.to_numeric(
    all_matches['Match'].astype(str).str.extract(r'(\d+)')[0],
    errors='coerce',
)

In [None]:
# --- Step 4: Increment match numbers for Season 2012, >= 24, but not the new row ---
# The inserted row already carries number 24, so every other 2012 match
# numbered 24 or higher moves up by one.
new_row_index = 275
mask = (
    all_matches['Season'].eq(2012)
    & all_matches['Match_Num'].ge(24)
    & (all_matches.index != new_row_index)
)
all_matches.loc[mask, 'Match_Num'] += 1

In [None]:
# --- Step 5: Rebuild safely ---
# Cast to pandas' nullable integer dtype; presumably so the numbers stringify
# as "24" rather than "24.0" while rows without a number stay missing.
all_matches['Match_Num'] = all_matches['Match_Num'].astype('Int64')
# Rewrite the label only where a number is present; rows whose Match value had
# no digits keep their original text.
all_matches.loc[all_matches['Match_Num'].notna(), 'Match'] = 'Match ' + all_matches['Match_Num'].astype(str)
# Match_Num was only a helper column for the renumbering; remove it again.
all_matches.drop(columns=['Match_Num'], inplace=True)

In [None]:
# Due to rain, the reserve day was used, but `matches` records the scheduled date

In [None]:
# Overwrite the Date of the match with Id 734043 with 2014-05-28
# (presumably the reserve day on which it was actually played — confirm).
matches.loc[matches['Id'] == 734043, 'Date'] = pd.to_datetime('2014-05-28')

In [None]:
# Parse Date into datetimes (values that cannot be parsed become NaT) and keep
# only the calendar-date part, dropping any time-of-day component.
all_matches['Date'] = pd.to_datetime(all_matches['Date'], errors='coerce').dt.date

In [None]:
# Show rows labelled 270 through 280 (label slicing with .loc includes both
# ends), i.e. the neighbourhood of the row inserted at index 275.
display(all_matches.loc[270:280])

# Concatenating:

In [None]:
# Prepare all_matches for the join: datetime Date, an order-independent team
# key, and the Match column renamed to Match_No (Time keeps its name).
all_matches['Date'] = pd.to_datetime(all_matches['Date'])
all_matches['match_key'] = all_matches[['Team 1', 'Team 2']].apply(
    lambda pair: tuple(sorted(pair)),  # (A, B) and (B, A) give the same key
    axis=1,
)
all_matches.rename(columns={'Match': 'Match_No'}, inplace=True)

In [None]:
# Prepare `matches` for the join the same way: datetime Date plus a team key
# that does not depend on which side is listed as Team1 or Team2.
matches['Date'] = pd.to_datetime(matches['Date'])
matches['match_key'] = matches[['Team1', 'Team2']].apply(
    lambda pair: tuple(sorted(pair)),
    axis=1,
)

In [None]:
def merge_match_data(all_matches = all_matches, matches = matches):
    """Left-join the Time and Match_No columns of ``all_matches`` onto ``matches``.

    Rows are paired on Season, Date and match_key. Every row of ``matches`` is
    kept; the ``_merge`` indicator column added by pandas records whether a
    partner row was found ('both') or not ('left_only').

    Note: the defaults are the frames bound to these names when the function
    is defined, not when it is called.

    Returns:
        The merged DataFrame.
    """
    join_keys = ['Season', 'Date', 'match_key']
    schedule = all_matches[join_keys + ['Time', 'Match_No']]
    return matches.merge(schedule, how='left', on=join_keys, indicator=True)

In [None]:
def find_unmatched_rows(merged, all_matches = all_matches, matches = matches):
    """Report the rows on each side that found no partner in the merge.

    Args:
        merged: Result of the left merge; must contain the ``_merge``
            indicator column and ``Match_No``.
        all_matches: Frame whose Season/Date/match_key keys are checked
            against the keys that were matched.
        matches: Accepted for signature symmetry; not read by this function.

    Returns:
        ``(unmatched_matches, unmatched_all_matches)``: the rows of ``merged``
        flagged 'left_only', and the ``all_matches`` keys that do not occur
        among the merged rows that have a Match_No.
    """
    key_cols = ['Season', 'Date', 'match_key']

    # Rows of `merged` that found no partner during the merge.
    left_only_rows = merged[merged['_merge'] == 'left_only']

    # Keys that did receive a Match_No, i.e. were matched.
    matched_keys = merged.dropna(subset=['Match_No'])[key_cols]

    # Anti-join: all_matches keys that are absent from the matched keys.
    probe = all_matches[key_cols].merge(
        matched_keys, on=key_cols, how='left', indicator=True
    )
    orphan_keys = probe[probe['_merge'] == 'left_only']

    return pd.DataFrame(left_only_rows), pd.DataFrame(orphan_keys)

In [None]:
# Attach Time / Match_No to `matches`, then collect what stayed unpaired on
# either side. The merged frame is passed as both `merged` and `matches`.
# NOTE(review): `matches` now carries the `_merge` column, so re-running this
# cell without re-running the earlier ones is likely to fail — confirm.
matches = merge_match_data(all_matches=all_matches, matches=matches)

unmatched_matches, unmatched_all_matches = find_unmatched_rows(
    merged=matches,
    all_matches=all_matches,
    matches=matches,
)

In [None]:
# Summarise the merge: resulting shape and the number of unpaired rows per side.
for _label, _value in (
    ("✅ Merged dataset shape:", matches.shape),
    ("❌ Unmatched in matches:", len(unmatched_matches)),
    ("❌ Unmatched in all_matches:", len(unmatched_all_matches)),
):
    print(_label, _value)

In [None]:
# Print how many all_matches rows belong to each season from 2008 to 2024.
for i in range(2008, 2025):
    count = int((all_matches['Season'] == i).sum())
    print(f"{i}: {count} matches")

In [None]:
# Show the all_matches keys (Season, Date, match_key) that found no partner.
unmatched_all_matches

# Local Time:

In [None]:
# List the distinct City values; these are the keys looked up in city_to_offset.
matches['City'].unique()

In [None]:
# List the distinct Time values that the merge brought into `matches`.
matches['Time'].unique()

In [None]:
# Hours to add to an IST clock time to obtain the local clock time in each
# host city; cities not listed are treated as offset 0 by ist_to_local.
city_to_offset = {
    # India (no offset)
    **dict.fromkeys(
        [
            'Bangalore', 'Chandigarh', 'Delhi', 'Mumbai', 'Kolkata',
            'Jaipur', 'Hyderabad', 'Chennai', 'Ahmedabad', 'Cuttack',
            'Nagpur', 'Dharamsala', 'Kochi', 'Indore', 'Visakhapatnam',
            'Pune', 'Raipur', 'Ranchi', 'Rajkot', 'Kanpur',
            'Bengaluru', 'Navi Mumbai', 'Lucknow', 'Guwahati', 'Mohali',
        ],
        0,
    ),
    # South Africa (2009)
    **dict.fromkeys(
        [
            'Cape Town', 'Port Elizabeth', 'Durban',
            'Centurion', 'East London', 'Johannesburg',
            'Kimberley', 'Bloemfontein',
        ],
        -3.5,
    ),
    # UAE (2014, 2020, 2021)
    **dict.fromkeys(['Abu Dhabi', 'Dubai', 'Sharjah'], -1.5),
}

In [None]:
def ist_to_local(time_str, city, offsets=None):
    """Shift an IST clock time to the local clock time of ``city``.

    Args:
        time_str: Time in ``'h:mm am'`` / ``'h:mm pm'`` form (case and
            surrounding whitespace are ignored); missing values are allowed.
        city: Host city used to look up the hour offset; unknown cities are
            treated as offset 0.
        offsets: Optional mapping of city name to hour offset. Defaults to the
            module-level ``city_to_offset`` table.

    Returns:
        The shifted time as a lower-case ``'h:mm am/pm'`` string, or ``None``
        when ``time_str`` is missing or cannot be parsed (a warning is printed
        in the latter case). Only the time of day is returned; a shift across
        midnight does not report the date change.
    """
    if pd.isna(time_str):
        return None

    if offsets is None:
        offsets = city_to_offset

    # normalize string
    time_str = str(time_str).strip().lower()
    offset = offsets.get(city, 0)

    # Only the parse can legitimately fail on bad data, so guard just that
    # call and only for the error strptime raises on a format mismatch.
    try:
        parsed = datetime.strptime(time_str, "%I:%M %p")
    except ValueError as e:
        print(f"⚠️ Error parsing time '{time_str}' for city '{city}': {e}")
        return None

    local_t = parsed + timedelta(hours=offset)
    # Use %I (zero-padded), then strip the leading zero manually for Windows compatibility
    return local_t.strftime("%I:%M %p").lstrip("0").lower()

In [None]:
# Replace each Time with the venue-local time computed by ist_to_local from the
# row's Time and City; missing or unparseable times become None.
matches['Time'] = matches.apply(lambda x: ist_to_local(x['Time'], x['City']), axis=1)