# MLB Win Prediction - Data Exploration

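This notebook explores the processed MLB games dataset: overall and home/away win rates, feature correlations with the win outcome, run distributions, monthly win-rate trends, and per-team performance.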


In [None]:
import sys
from pathlib import Path

# Make the project's src/ directory importable.
# The project root is taken to be the parent of the current working directory.
project_root = Path().resolve().parent
src_dir = project_root / "src"
sys.path.insert(0, str(src_dir))

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from mlb_win_pred.config import get_config
from mlb_win_pred.utils import get_data_path

# Load the project configuration and set notebook-wide plotting defaults.
config = get_config()
sns.set_style("whitegrid")
plt.rc('figure', figsize=(12, 6))  # default figure size for cells that do not override it


In [None]:
# Read the processed games table and convert game_date to datetimes
processed_file = get_data_path(config, "games_processed.csv", subdir="processed")
df = pd.read_csv(processed_file).assign(
    game_date=lambda frame: pd.to_datetime(frame['game_date'])
)

# Quick overview: size, covered date span, and available columns
first_game_date = df['game_date'].min()
last_game_date = df['game_date'].max()
print(f"Dataset shape: {df.shape}")
print(f"Date range: {first_game_date} to {last_game_date}")
print(f"\nColumns: {df.columns.tolist()}")


## Basic Statistics


In [None]:
# Overall win/loss summary across every row of the dataset
win_outcomes = df['win']
win_rate = win_outcomes.mean()
total_wins = win_outcomes.sum()
# Losses are the rows whose win flag is falsy
total_losses = (~win_outcomes.astype(bool)).sum()

print(f"Overall win rate: {win_rate:.3f}")
print(f"Total games: {len(df)}")
print(f"Wins: {total_wins}")
print(f"Losses: {total_losses}")


In [None]:
# Compare win rates for home rows (is_home == 1) and away rows (is_home == 0).
# Skipped entirely when the dataset has no is_home column.
if 'is_home' in df.columns:
    home_rows = df['is_home'] == 1
    away_rows = df['is_home'] == 0
    home_win_rate = df.loc[home_rows, 'win'].mean()
    away_win_rate = df.loc[away_rows, 'win'].mean()
    home_edge = home_win_rate - away_win_rate

    print(f"Home win rate: {home_win_rate:.3f}")
    print(f"Away win rate: {away_win_rate:.3f}")
    print(f"Home advantage: {home_edge:.3f}")

    # Side-by-side bars on a fixed 0-1 scale
    fig, ax = plt.subplots()
    ax.bar(['Home', 'Away'], [home_win_rate, away_win_rate])
    ax.set_ylabel('Win Rate')
    ax.set_title('Win Rate: Home vs Away')
    ax.set_ylim([0, 1])
    plt.tight_layout()
    plt.show()


## Feature Correlations


In [None]:
# Get feature columns: every team_/opp_ prefixed column plus is_home and month
feature_cols = [c for c in df.columns if c.startswith('team_') or c.startswith('opp_') or c in ['is_home', 'month']]

# Only numeric (or boolean) columns can be correlated. The prefix match above may
# also pick up text columns, and DataFrame.corr() raises on those in pandas >= 2.0.
numeric_feature_cols = [c for c in feature_cols if pd.api.types.is_numeric_dtype(df[c])]
skipped_feature_cols = [c for c in feature_cols if c not in numeric_feature_cols]
if skipped_feature_cols:
    print(f"Skipping non-numeric columns: {skipped_feature_cols}")

# Calculate correlation with win.
# Constant features yield NaN correlations; drop them so they do not crowd out
# the most negative values at the tail of the sorted series.
correlations = (
    df[numeric_feature_cols + ['win']]
    .corr()['win']
    .drop(labels='win')
    .dropna()
    .sort_values(ascending=False)
)

print("Top 10 features correlated with win:")
print(correlations.head(10))
print("\nBottom 10 features correlated with win:")
print(correlations.tail(10))


In [None]:
# Heatmap of pairwise correlations among win and the 15 features whose
# correlation with win is largest in absolute value
strongest_by_abs = correlations.abs().nlargest(15)
top_features = list(strongest_by_abs.index)
corr_matrix = df[[*top_features, 'win']].corr()

fig, ax = plt.subplots(figsize=(12, 10))
sns.heatmap(corr_matrix, annot=True, fmt='.2f', cmap='coolwarm', center=0, ax=ax)
ax.set_title('Correlation Matrix: Top Features vs Win')
plt.tight_layout()
plt.show()


## Distribution Analysis


In [None]:
# Histograms of runs scored and runs allowed, each with a dashed line at its mean.
# Skipped unless both columns are present.
if {'runs_scored', 'runs_allowed'}.issubset(df.columns):
    fig, axes = plt.subplots(1, 2, figsize=(14, 5))

    # (column, display label, bar colour); None keeps matplotlib's default colour
    panel_specs = [
        ('runs_scored', 'Runs Scored', None),
        ('runs_allowed', 'Runs Allowed', 'orange'),
    ]
    for panel_ax, (runs_col, runs_label, bar_color) in zip(axes, panel_specs):
        runs = df[runs_col]
        runs_mean = runs.mean()
        panel_ax.hist(runs, bins=20, alpha=0.7, edgecolor='black', color=bar_color)
        panel_ax.set_xlabel(runs_label)
        panel_ax.set_ylabel('Frequency')
        panel_ax.set_title(f'Distribution of {runs_label}')
        panel_ax.axvline(runs_mean, color='red', linestyle='--', label=f'Mean: {runs_mean:.2f}')
        panel_ax.legend()

    plt.tight_layout()
    plt.show()


## Time Series Analysis


In [None]:
# Mean win value per calendar month, plotted against a 50% reference line.
# Note: this adds a year_month (monthly Period) column to df.
game_month = df['game_date'].dt.to_period('M')
df['year_month'] = game_month
monthly_win_rate = df.groupby('year_month')['win'].mean()

fig, ax = plt.subplots(figsize=(14, 6))
monthly_win_rate.plot(kind='line', marker='o', ax=ax)
ax.axhline(0.5, color='red', linestyle='--', label='50% Baseline')
ax.set_xlabel('Month')
ax.set_ylabel('Win Rate')
ax.set_title('Monthly Win Rate Over Time')
ax.legend()
plt.xticks(rotation=45)
ax.grid(True, alpha=0.3)
plt.tight_layout()
plt.show()


## Team Performance


In [None]:
# Win rate by team: mean of win and number of rows per team, best team first
team_win_rates = (
    df.groupby('team')['win']
    .agg(win_rate='mean', games='count')
    .sort_values('win_rate', ascending=False)
)

print("Top 10 teams by win rate:")
print(team_win_rates.head(10))

# Plot the 15 best teams as horizontal bars
top_teams = team_win_rates.head(15)
bar_positions = range(len(top_teams))

fig, ax = plt.subplots(figsize=(14, 8))
ax.barh(bar_positions, top_teams['win_rate'])
ax.set_yticks(bar_positions)
ax.set_yticklabels(top_teams.index)
# barh places position 0 at the bottom; invert the axis so the chart reads
# best-to-worst from top to bottom, matching the printed ranking.
ax.invert_yaxis()
ax.set_xlabel('Win Rate')
ax.set_title('Top 15 Teams by Win Rate')
ax.axvline(0.5, color='red', linestyle='--', alpha=0.5)  # .500 reference line
plt.tight_layout()
plt.show()
