In [4]:
# 📌 IPL Data Preprocessing Script (2008–2024 → Standardized to 2025 Teams)
import pandas as pd


# 1. Input / output locations
MATCHES_CSV = "matches_2008-2024.csv"
DELIVERIES_CSV = "deliveries_2008-2024.csv"
CLEANED_MATCHES_CSV = "cleaned_matches.csv"
CLEANED_DELIVERIES_CSV = "cleaned_deliveries.csv"
MERGED_CSV = "merged_ipl_data.csv"

# 5. Deliveries column names that get renamed, and the names they are renamed to.
rename_map = {
    'batsman': 'batter',
    'batsman_runs': 'batsman_run',
    'extra_runs': 'extras_run',
    'total_runs': 'total_run',
}

# Deliveries columns (post-rename names) that are coerced to numeric.
numeric_cols = ['inning', 'over', 'ball', 'batsman_run', 'extras_run', 'total_run']

# 6. Team name standardisation (old → current IPL teams).
# Names that are not keys of this dict are left untouched by `Series.replace`,
# so the identity entries below only document the set of target names.
team_name_mapping = {
    "Delhi Daredevils": "Delhi Capitals",
    "Kings XI Punjab": "Punjab Kings",
    "Deccan Chargers": "Sunrisers Hyderabad",
    "Rising Pune Supergiant": "Lucknow Super Giants",
    "Rising Pune Supergiants": "Lucknow Super Giants",
    "Pune Warriors": "Lucknow Super Giants",
    "Gujarat Lions": "Gujarat Titans",
    "Kochi Tuskers Kerala": "Lucknow Super Giants",  # dissolved → mapped
    # Alternate spelling folded into the label this script already uses, so the
    # franchise is not split across two names if both occur in the data.
    # NOTE(review): confirm which spelling downstream analysis should see.
    "Royal Challengers Bengaluru": "Royal Challengers Bangalore",

    "Chennai Super Kings": "Chennai Super Kings",
    "Mumbai Indians": "Mumbai Indians",
    "Kolkata Knight Riders": "Kolkata Knight Riders",
    "Royal Challengers Bangalore": "Royal Challengers Bangalore",
    "Rajasthan Royals": "Rajasthan Royals",
    "Sunrisers Hyderabad": "Sunrisers Hyderabad",
    "Gujarat Titans": "Gujarat Titans",
    "Lucknow Super Giants": "Lucknow Super Giants",
}


def clean_matches(matches: pd.DataFrame) -> pd.DataFrame:
    """Return a cleaned copy of the match-level table.

    Steps, in order: fill missing ``city`` / ``winner`` / ``player_of_match``,
    drop fully duplicated rows, parse ``date``, standardise team names, strip
    whitespace from ``player_of_match``, then add ``year`` and
    ``match_result``. Every column-specific step is skipped when the column is
    absent. The caller's frame is not modified.
    """
    # 2. Handle missing values. `fillna` returns a new frame, so everything
    # below works on a private copy.
    matches = matches.fillna(
        {'city': 'Unknown', 'winner': 'No Result', 'player_of_match': 'Unknown'}
    )

    # 3. Remove duplicates (in place on the private copy).
    matches.drop_duplicates(inplace=True)

    # 4. Date conversion; unparseable values become NaT.
    if 'date' in matches.columns:
        matches['date'] = pd.to_datetime(matches['date'], errors='coerce')

    # 6. Standardise team names.
    for col in ('team1', 'team2', 'toss_winner', 'winner'):
        if col in matches.columns:
            matches[col] = matches[col].replace(team_name_mapping)

    # 7. Clean player names.
    if 'player_of_match' in matches.columns:
        matches['player_of_match'] = matches['player_of_match'].astype(str).str.strip()

    # 8. Extra features.
    if 'date' in matches.columns:
        matches['year'] = matches['date'].dt.year
    else:
        matches['year'] = None

    if 'winner' in matches.columns:
        # Missing winners were filled with 'No Result' in step 2.
        is_no_result = matches['winner'].eq('No Result')
        matches['match_result'] = is_no_result.map(
            {True: 'No Result', False: 'Completed'}
        )
    else:
        # Same fallback style as `year`: the column exists but carries no data.
        matches['match_result'] = None

    return matches


def clean_deliveries(deliveries: pd.DataFrame) -> pd.DataFrame:
    """Return a cleaned copy of the ball-by-ball table.

    Steps, in order: fill missing values with 0, drop fully duplicated rows,
    rename columns per ``rename_map``, coerce ``numeric_cols`` to numbers,
    standardise team names, strip whitespace from player-name columns, then
    add ``is_super_over`` (if absent), ``strike_rate`` and ``economy``.
    The caller's frame is not modified.
    """
    # 2. Handle missing values.
    # NOTE(review): this also writes 0 into any non-numeric column that has
    # gaps; kept as-is so existing consumers of the output see the same values.
    deliveries = deliveries.fillna(0)

    # 3. Remove duplicates (in place on the private copy made by `fillna`).
    deliveries.drop_duplicates(inplace=True)

    # 5. Standardise column names.
    deliveries = deliveries.rename(columns=rename_map)

    # Coercion turns non-numeric tokens into NaN *after* the fill above, so
    # fill again here to keep NaN out of the derived columns and the CSVs.
    for col in numeric_cols:
        if col in deliveries.columns:
            deliveries[col] = pd.to_numeric(deliveries[col], errors='coerce').fillna(0)

    # 6. Standardise team names.
    for col in ('batting_team', 'bowling_team'):
        if col in deliveries.columns:
            deliveries[col] = deliveries[col].replace(team_name_mapping)

    # 7. Clean player names.
    for col in ('batter', 'bowler', 'non_striker'):
        if col in deliveries.columns:
            deliveries[col] = deliveries[col].astype(str).str.strip()

    # 8. Extra features.
    if 'is_super_over' in deliveries.columns:
        # Anything that does not compare equal to 1 becomes 0.
        deliveries['is_super_over'] = deliveries['is_super_over'].eq(1).astype(int)
    else:
        deliveries['is_super_over'] = 0

    # Per-row values: runs off this ball scaled by 100 and by 6 respectively.
    if 'batsman_run' in deliveries.columns:
        deliveries['strike_rate'] = deliveries['batsman_run'] * 100
    else:
        deliveries['strike_rate'] = 0

    if 'total_run' in deliveries.columns:
        deliveries['economy'] = deliveries['total_run'] * 6
    else:
        deliveries['economy'] = 0

    return deliveries


def merge_ball_by_ball(deliveries: pd.DataFrame, matches: pd.DataFrame) -> pd.DataFrame:
    """Left-join match columns onto deliveries (``match_id`` → ``id``).

    When either key column is missing, a copy of ``deliveries`` is returned
    instead.
    """
    # 9. Merge datasets
    if 'match_id' in deliveries.columns and 'id' in matches.columns:
        return deliveries.merge(matches, left_on='match_id', right_on='id', how='left')
    return deliveries.copy()  # fallback


def preprocess(matches: pd.DataFrame, deliveries: pd.DataFrame) -> "tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame]":
    """Clean both raw tables and build the merged table.

    Returns ``(cleaned_matches, cleaned_deliveries, merged)``. Performs no
    file I/O and leaves the input frames unmodified.
    """
    cleaned_matches = clean_matches(matches)
    cleaned_deliveries = clean_deliveries(deliveries)
    merged = merge_ball_by_ball(cleaned_deliveries, cleaned_matches)
    return cleaned_matches, cleaned_deliveries, merged


def main() -> "tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame]":
    """Load the raw CSVs, preprocess them, write the three output CSVs.

    Returns the same three frames that were written, so the caller can keep
    working with them in memory.
    """
    # 1. Load datasets
    raw_matches = pd.read_csv(MATCHES_CSV)
    raw_deliveries = pd.read_csv(DELIVERIES_CSV)

    cleaned_matches, cleaned_deliveries, merged = preprocess(raw_matches, raw_deliveries)

    # 10. Save Preprocessed Data
    cleaned_matches.to_csv(CLEANED_MATCHES_CSV, index=False)
    cleaned_deliveries.to_csv(CLEANED_DELIVERIES_CSV, index=False)
    merged.to_csv(MERGED_CSV, index=False)

    print("✅ Preprocessing Completed Successfully!")
    print("   - cleaned_matches.csv")
    print("   - cleaned_deliveries.csv")
    print("   - merged_ipl_data.csv")

    return cleaned_matches, cleaned_deliveries, merged


# Runs when executed as a script or from a notebook cell; skipped on import.
# The three frames are rebound at top level so later code can keep using
# `matches`, `deliveries` and `ipl`.
if __name__ == "__main__":
    matches, deliveries, ipl = main()


✅ Preprocessing Completed Successfully!
   - cleaned_matches.csv
   - cleaned_deliveries.csv
   - merged_ipl_data.csv
