In [5]:
# ============================================
# NOTEBOOK 4: FEATURE ENGINEERING
# ============================================

# CELL 1: Import Libraries
import pandas as pd

# CELL 2: Load Cleaned Data
# Both inputs are read from the ../data directory relative to the notebook.
matches_df = pd.read_csv('../data/matches_cleaned.csv')
deliveries_df = pd.read_csv('../data/deliveries_cleaned.csv')

# Section banner: a 60-character rule above and below the title.
print("="*60, "FEATURE ENGINEERING", "="*60, sep="\n")

# CELL 3: Team Statistics
# Tally how often each value appears in the 'winner' column and store the
# result as a two-column table (team, wins), most frequent first.
print("\nTeam Statistics...")
team_wins = (
    matches_df['winner']
    .value_counts()
    .rename_axis('team')
    .reset_index(name='wins')
)
team_wins.to_csv('../outputs/team_stats.csv', index=False)
print("✓ Team stats saved")

# CELL 4: Batsman Statistics
# The striker column is looked up under either of two names ('batsman' is
# tried first, then 'batter'); None means neither is present.
print("\nBatsman Statistics...")
batsman_col = next(
    (name for name in ('batsman', 'batter') if name in deliveries_df.columns),
    None,
)

if batsman_col is None:
    print("! Batsman column not found")
else:
    # Sum 'batsman_runs' per player, highest total first.
    runs_per_batsman = deliveries_df.groupby(batsman_col)['batsman_runs'].sum()
    batsman_stats = runs_per_batsman.sort_values(ascending=False)
    batsman_stats.to_csv('../outputs/batsman_stats.csv')
    print("✓ Batsman stats saved")

# CELL 5: Bowler Statistics
# Sum 'total_runs' per bowler and save the totals in ascending order.
# NOTE(review): 'total_runs' presumably includes extras that are not charged
# to the bowler -- confirm this is the intended measure.
print("\nBowler Statistics...")
bowler_col = 'bowler'

if bowler_col not in deliveries_df.columns:
    print("! Bowler column not found")
else:
    runs_per_bowler = deliveries_df.groupby(bowler_col)['total_runs'].sum()
    bowler_stats = runs_per_bowler.sort_values()
    bowler_stats.to_csv('../outputs/bowler_stats.csv')
    print("✓ Bowler stats saved")

# CELL 6: Create Machine Learning Features
# Adds integer-encoded versions of the categorical match columns to a copy of
# matches_df and writes the result to ../data/matches_features.csv.
print("\nCreating ML Features...")
matches_ml = matches_df.copy()

# Season and venue are independent categories, so each gets its own codes.
# pd.factorize numbers labels in order of first appearance; missing -> -1.
matches_ml['season_encoded'] = pd.factorize(matches_ml['season'])[0]
matches_ml['venue_encoded'] = pd.factorize(matches_ml['venue'])[0]

# 1 when the toss decision is 'bat' / 'bat first', 0 for anything else
# (including missing values). Vectorised equivalent of a per-row comparison.
matches_ml['toss_decision_encoded'] = (
    matches_ml['toss_decision'].isin(['bat', 'bat first']).astype('int64')
)

# team1, team2 and winner all hold team names, so they must share ONE code
# table. Factorizing each column separately would hand the same team a
# different integer in each column, making winner_encoded incomparable with
# team1_encoded / team2_encoded.
team_columns = ['team1', 'team2', 'winner']
team_labels = (
    pd.concat([matches_ml[name] for name in team_columns], ignore_index=True)
    .dropna()
    .unique()
)
for name in team_columns:
    # Values absent from team_labels (i.e. missing) get code -1, matching
    # the sentinel pd.factorize uses.
    shared_codes = pd.Categorical(matches_ml[name], categories=team_labels).codes
    matches_ml[f'{name}_encoded'] = shared_codes.astype('int64')

matches_ml.to_csv('../data/matches_features.csv', index=False)
print("✓ Features created and saved with all necessary encoded columns")

print("\n✅ FEATURE ENGINEERING COMPLETED!")


FEATURE ENGINEERING

Team Statistics...
✓ Team stats saved

Batsman Statistics...
✓ Batsman stats saved

Bowler Statistics...
✓ Bowler stats saved

Creating ML Features...
✓ Features created and saved with all necessary encoded columns

✅ FEATURE ENGINEERING COMPLETED!
