# Data Preprocessing

This notebook cleans the raw IPL data and saves it as parquet files in `data/processed`.

In [1]:
import pandas as pd
import numpy as np
import os

# Create the output folder if it is not there yet; an existing folder is left as it is
processed_dir = '../data/processed'
os.makedirs(processed_dir, exist_ok=True)

## Load Data

In [2]:
# Read the two raw CSV files with pandas' default parsing options
matches_csv = '../data/raw/matches.csv'
deliveries_csv = '../data/raw/deliveries.csv'

matches = pd.read_csv(matches_csv)
deliveries = pd.read_csv(deliveries_csv)

## Define Cleaning Functions

In [3]:
def trim_and_title(df):
    """Strip and title-case the column labels and every text value of ``df``.

    The frame is modified in place and is also returned, so the call can be
    used in an assignment or a chain.

    Only ``str`` elements are rewritten. Numbers, dates and missing values that
    sit inside a text column are kept as they are; pushing the whole column
    through the ``.str`` accessor would turn every non-string element into NaN.

    Note that ``str.title()`` lower-cases everything after the first letter of
    each word, so acronyms change too (``'XI'`` becomes ``'Xi'``).

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose column labels are strings.

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in.
    """
    def _clean(value):
        # Leave anything that is not text exactly as it was
        return value.strip().title() if isinstance(value, str) else value

    # Standardize column names
    df.columns = df.columns.str.strip().str.title()

    # Standardize string values
    for col in df.columns:
        column = df[col]
        # Cover classic object columns as well as pandas' dedicated string dtypes
        holds_text = (
            pd.api.types.is_object_dtype(column.dtype)
            or pd.api.types.is_string_dtype(column.dtype)
        )
        if holds_text:
            # map() re-infers the dtype (an all-missing column would come back
            # as float64), so cast back to the dtype the column started with
            df[col] = column.map(_clean).astype(column.dtype)
    return df

def standardize_teams(df, cols):
    """Collapse alternate franchise names into a single name per team.

    Matching ignores letter case, so a name that was title-cased before it
    reached this function (``'Kings Xi Punjab'``) is still recognised as
    ``'Kings XI Punjab'`` and renamed. A case-sensitive lookup would leave such
    rows under the old name.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to update; the listed columns are overwritten in place.
    cols : iterable of str
        Columns that hold team names. Names not present in ``df`` are skipped.

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in.
    """
    team_name_map = {
        'Deccan Chargers': 'Sunrisers Hyderabad',
        'Delhi Daredevils': 'Delhi Capitals',
        'Royal Challengers Bengaluru': 'Royal Challengers Bangalore',
        'Kings XI Punjab': 'Punjab Kings',
        'Rising Pune Supergiants': 'Rising Pune Supergiant',
        'Pune Warriors': 'Pune Warriors India'
    }

    # Case-folded lookup: old names point at the canonical name, and the
    # canonical names point at themselves so their spelling is normalised too
    canonical_by_key = {old.casefold(): new for old, new in team_name_map.items()}
    for canonical in team_name_map.values():
        canonical_by_key.setdefault(canonical.casefold(), canonical)

    for col in cols:
        if col not in df.columns:
            continue
        # Resolve each distinct value once, then let replace() rewrite the column
        distinct = [v for v in df[col].dropna().unique() if isinstance(v, str)]
        renames = {
            name: canonical_by_key[name.casefold()]
            for name in distinct
            if canonical_by_key.get(name.casefold(), name) != name
        }
        if renames:
            df[col] = df[col].replace(renames)
    return df

def standardize_venues(df):
    """Map the spelling variants of each ground in ``Venue`` to one canonical name.

    Matching ignores letter case, so values that were title-cased before they
    reached this function (``'Ma Chidambaram Stadium, Chepauk'``) still match
    keys written with capitals (``'MA Chidambaram Stadium, Chepauk'``). A value
    that already equals a canonical name apart from letter case is rewritten to
    the canonical spelling, which keeps one ground from appearing under two
    spellings.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to update; ``Venue`` is overwritten in place. A frame without a
        ``Venue`` column is returned untouched.

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in.
    """
    venue_map = {
        'Arun Jaitley Stadium, Delhi': 'Arun Jaitley Stadium',
        'Brabourne Stadium, Mumbai': 'Brabourne Stadium',
        'Dr DY Patil Sports Academy, Mumbai': 'Dr DY Patil Sports Academy',
        'Dr. Y.S. Rajasekhara Reddy ACA-VDCA Cricket Stadium, Visakhapatnam': 'Dr. Y.S. Rajasekhara Reddy ACA-VDCA Cricket Stadium',
        'Eden Gardens, Kolkata': 'Eden Gardens',
        'Himachal Pradesh Cricket Association Stadium, Dharamsala': 'Himachal Pradesh Cricket Association Stadium',
        'M.Chinnaswamy Stadium': 'M Chinnaswamy Stadium',
        'M Chinnaswamy Stadium, Bengaluru': 'M Chinnaswamy Stadium',
        # NOTE(review): 'Bengalore' looks like a misspelling - confirm against the raw data
        'M Chinnaswamy Stadium, Bengalore': 'M Chinnaswamy Stadium',
        'MA Chidambaram Stadium, Chepauk': 'MA Chidambaram Stadium',
        'MA Chidambaram Stadium, Chepauk, Chennai': 'MA Chidambaram Stadium',
        'Maharashtra Cricket Association Stadium, Pune': 'Maharashtra Cricket Association Stadium',
        'Punjab Cricket Association Stadium, Mohali': 'Punjab Cricket Association IS Bindra Stadium',
        'Punjab Cricket Association IS Bindra Stadium': 'Punjab Cricket Association IS Bindra Stadium',
        'Punjab Cricket Association IS Bindra Stadium, Mohali': 'Punjab Cricket Association IS Bindra Stadium',
        'Punjab Cricket Association IS Bindra Stadium, Mohali, Chandigarh': 'Punjab Cricket Association IS Bindra Stadium',
        'Rajiv Gandhi International Stadium, Uppal': 'Rajiv Gandhi International Stadium',
        'Rajiv Gandhi International Stadium, Uppal, Hyderabad': 'Rajiv Gandhi International Stadium',
        'Sawai Mansingh Stadium, Jaipur': 'Sawai Mansingh Stadium',
        'Wankhede Stadium, Mumbai': 'Wankhede Stadium',
        'Feroz Shah Kotla': 'Arun Jaitley Stadium',
        'Zayed Cricket Stadium, Abu Dhabi': 'Sheikh Zayed Stadium',
        'Sardar Patel Stadium, Motera': 'Narendra Modi Stadium',
        'Narendra Modi Stadium, Ahmedabad': 'Narendra Modi Stadium',
        'Bharat Ratna Shri Atal Bihari Vajpayee Ekana Cricket Stadium, Lucknow': 'Bharat Ratna Shri Atal Bihari Vajpayee Ekana Cricket Stadium',
        'Barsapara Cricket Stadium, Guwahati': 'Barsapara Cricket Stadium',
        'Maharaja Yadavindra Singh International Cricket Stadium, Mullanpur': 'Maharaja Yadavindra Singh International Cricket Stadium',
        'Vidarbha Cricket Association Stadium, Jamtha': 'Vidarbha Cricket Association Stadium'
    }
    if 'Venue' not in df.columns:
        return df

    # Case-folded lookup: variants point at the canonical name, and canonical
    # names point at themselves so a re-cased canonical name is restored as well
    canonical_by_key = {variant.casefold(): name for variant, name in venue_map.items()}
    for name in venue_map.values():
        canonical_by_key.setdefault(name.casefold(), name)

    # Resolve each distinct value once, then let replace() rewrite the column
    distinct = [v for v in df['Venue'].dropna().unique() if isinstance(v, str)]
    renames = {
        venue: canonical_by_key[venue.casefold()]
        for venue in distinct
        if canonical_by_key.get(venue.casefold(), venue) != venue
    }
    if renames:
        df['Venue'] = df['Venue'].replace(renames)
    return df

def handle_missing_city(df):
    """Fill empty ``City`` cells with the first whitespace-separated word of ``Venue``.

    Rows that already have a city are not touched. The frame is modified in
    place and returned; if either column is absent it is returned unchanged.
    """
    if not {'City', 'Venue'}.issubset(df.columns):
        return df

    needs_city = df['City'].isna()
    # First word of the venue name stands in for the city
    first_word = df.loc[needs_city, 'Venue'].str.split().str[0]
    df.loc[needs_city, 'City'] = first_word
    return df

def handle_missing_values_matches(df):
    """Return a new frame with the gaps in the match columns filled.

    Text columns: ``Player_Of_Match`` and ``Winner`` get ``'No Result'``,
    ``Method`` gets ``'Normal'``.
    Numeric columns: ``Result_Margin``, ``Target_Runs`` and ``Target_Overs``
    get ``-1`` when present, which keeps them numeric instead of mixing
    numbers with a text placeholder.

    The frame passed in is not modified.
    """
    text_defaults = {
        'Player_Of_Match': 'No Result',
        'Winner': 'No Result',
        'Method': 'Normal'
    }
    # fillna without inplace hands back a new frame, so the caller's data stays as it was
    filled = df.fillna(text_defaults)

    numeric_filled = {
        name: filled[name].fillna(-1)
        for name in ('Result_Margin', 'Target_Runs', 'Target_Overs')
        if name in filled.columns
    }
    return filled.assign(**numeric_filled)

## Process Matches Data

In [4]:
# Run the cleaning helpers in order; each one receives the frame returned by the previous step
matches = (
    matches
    .pipe(trim_and_title)
    .pipe(standardize_teams, ['Team1', 'Team2', 'Toss_Winner', 'Winner'])
    .pipe(standardize_venues)
    .pipe(handle_missing_city)
    .pipe(handle_missing_values_matches)
)

# Parse the dates once (unparseable values become NaT), then derive
# Season as the calendar year and store Date as plain date objects
match_dates = pd.to_datetime(matches['Date'], errors='coerce')
matches['Season'] = match_dates.dt.year
matches['Date'] = match_dates.dt.date

print("Matches data cleaned.")
matches.info()

Matches data cleaned.
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1095 entries, 0 to 1094
Data columns (total 20 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   Id               1095 non-null   int64  
 1   Season           1095 non-null   int32  
 2   City             1095 non-null   object 
 3   Date             1095 non-null   object 
 4   Match_Type       1095 non-null   object 
 5   Player_Of_Match  1095 non-null   object 
 6   Venue            1095 non-null   object 
 7   Team1            1095 non-null   object 
 8   Team2            1095 non-null   object 
 9   Toss_Winner      1095 non-null   object 
 10  Toss_Decision    1095 non-null   object 
 11  Winner           1095 non-null   object 
 12  Result           1095 non-null   object 
 13  Result_Margin    1095 non-null   float64
 14  Target_Runs      1095 non-null   float64
 15  Target_Overs     1095 non-null   float64
 16  Super_Over       1095 non-null   objec

## Process Deliveries Data

In [5]:
# Clean the text columns first, then unify the team names in both team columns
deliveries = (
    deliveries
    .pipe(trim_and_title)
    .pipe(standardize_teams, ['Batting_Team', 'Bowling_Team'])
)

print("Deliveries data cleaned.")
deliveries.info()

Deliveries data cleaned.
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 260920 entries, 0 to 260919
Data columns (total 17 columns):
 #   Column            Non-Null Count   Dtype 
---  ------            --------------   ----- 
 0   Match_Id          260920 non-null  int64 
 1   Inning            260920 non-null  int64 
 2   Batting_Team      260920 non-null  object
 3   Bowling_Team      260920 non-null  object
 4   Over              260920 non-null  int64 
 5   Ball              260920 non-null  int64 
 6   Batter            260920 non-null  object
 7   Bowler            260920 non-null  object
 8   Non_Striker       260920 non-null  object
 9   Batsman_Runs      260920 non-null  int64 
 10  Extra_Runs        260920 non-null  int64 
 11  Total_Runs        260920 non-null  int64 
 12  Extras_Type       14125 non-null   object
 13  Is_Wicket         260920 non-null  int64 
 14  Player_Dismissed  12950 non-null   object
 15  Dismissal_Kind    12950 non-null   object
 16  Fielder      

## Save to Parquet

In [6]:
# No engine is passed, so pandas chooses its default parquet engine.
# Writing these two files is what this notebook is for, so a failure is
# reported and then re-raised: the cell must not finish as if the export worked.
try:
    matches.to_parquet('../data/processed/matches.parquet', index=False)
    deliveries.to_parquet('../data/processed/deliveries.parquet', index=False)
except Exception as e:
    print(f"Error saving parquet: {e}")
    raise
else:
    print("Data saved to parquet in ../data/processed/")

Data saved to parquet in ../data/processed/
