# EPL League Winner and Standings Prediction

This notebook predicts English Premier League season outcomes including champions, top 4/6 positions, and relegations.

**Prediction Targets**:
- League Champion (binary classification)
- Final League Position (1-20, regression/multi-class)
- Top 4 Finish (Champions League qualification)
- Top 6 Finish (European competition)
- Relegation (bottom 3 teams)

**Dataset**: ScoreSight_ML_Season_LeagueWinner_Champion.csv (180 team-seasons with 16 features)

## 1. Import Libraries

In [None]:
# Data manipulation
import pandas as pd
import numpy as np

# Visualization
import matplotlib.pyplot as plt
import seaborn as sns

# Machine Learning - Classification
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.linear_model import LogisticRegression, Ridge
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.ensemble import GradientBoostingClassifier,GradientBoostingRegressor
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.metrics import precision_score, recall_score, f1_score
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# XGBoost is optional: record whether it is importable instead of failing the
# whole notebook when the package is missing.
try:
    import xgboost as xgb
except ImportError:
    xgb_available = False
    print("XGBoost not available. Install with: pip install xgboost")
else:
    xgb_available = True

# Settings
import warnings
# NOTE(review): this silences every warning for the rest of the session,
# including scikit-learn's undefined-metric and convergence warnings.
warnings.filterwarnings('ignore')

# Single seed shared by NumPy and by every estimator / split in the notebook.
RANDOM_STATE = 42
np.random.seed(RANDOM_STATE)

# Plot defaults applied to every figure created afterwards.
sns.set_style('whitegrid')
plt.rcParams['figure.figsize'] = (12, 6)

print("✓ Libraries imported successfully!")
print(f"✓ Random seed set to {RANDOM_STATE}")

## 2. Load and Explore Dataset

In [None]:
print("=" * 70)
print(" LOADING AND EXPLORING DATASET")
print("=" * 70)

# The path is relative to the notebook's working directory.
df = pd.read_csv('../Data/ScoreSight_ML_Season_LeagueWinner_Champion.csv')

print("\n✓ Dataset loaded successfully!")
print(f"  Shape: {df.shape[0]:,} team-seasons × {df.shape[1]} features")
# First ten rows as a quick sanity check of the parsed columns.
print(f"\n{df.head(10)}")

In [None]:
# Dataset overview: per-column dtype and cardinality, season/team counts,
# and a missing-value report.
banner = "=" * 70
print("\n" + banner)
print(" DATASET INFORMATION")
print(banner)

print(f"\nColumns ({len(df.columns)}):")
for position, (name, series) in enumerate(df.items(), start=1):
    print(f"  {position:2d}. {name:35s} [{series.dtype}] - {series.nunique()} unique values")

season_values = df['season']
print(f"\n\nUnique Seasons: {season_values.nunique()}")
print(f"Seasons: {sorted(season_values.unique())}")

print(f"\nUnique Teams: {df['team'].nunique()}")

# Count nulls per column once and reuse the result for both the total and
# the per-column breakdown.
missing = df.isnull().sum()
total_missing = missing.sum()
print(f"\nMissing Values: {total_missing} total")

if total_missing > 0:
    print(f"\nColumns with missing values:")
    print(missing[missing > 0])

In [None]:
# Descriptive statistics, transposed so each row is one column of df.
rule = "=" * 70
print("\n" + rule)
print(" STATISTICAL SUMMARY")
print(rule)

print("\nNumerical Features Summary:")
numeric_summary = df.describe().transpose()
print(numeric_summary)

## 3. Data Cleaning and Preprocessing

In [None]:
print("=" * 70)
print(" DATA CLEANING")
print("=" * 70)

# Rows that are identical across every column count as duplicates.
duplicates = df.duplicated().sum()
print(f"\n✓ Duplicate rows: {duplicates}")

if duplicates > 0:
    df = df.drop_duplicates()
    print(f"  Removed {duplicates} duplicate rows")

# NOTE(review): fillna(0) is applied to every column, so a missing team or
# season name would become the integer 0 and a missing target would be
# treated as a negative label. Confirm the CSV has no gaps in those columns.
df = df.fillna(0)
print("\n✓ Missing values handled")

print(f"\n✓ Clean dataset shape: {df.shape[0]:,} rows × {df.shape[1]} columns")

## 4. Exploratory Data Analysis (EDA)

### 4.1 Target Variables Analysis

In [None]:
rule = "=" * 70
print(rule)
print(" TARGET VARIABLES ANALYSIS")
print(rule)

# Binary outcome columns summarised here and plotted in the next cell.
target_vars = ['target_champion', 'target_top_4', 'target_top_6', 'target_relegated']

print("\nTarget Distribution:")
total_rows = len(df)
for target_name in target_vars:
    # Skip targets that are absent from this dataset.
    if target_name not in df.columns:
        continue
    tally = df[target_name].value_counts()
    print(f"\n{target_name}:")
    # Report the positive class first, then the negative class.
    for label, class_value in (("Yes (1)", 1), ("No  (0)", 0)):
        n_class = tally.get(class_value, 0)
        print(f"  {label}: {n_class} ({n_class/total_rows*100:.1f}%)")

In [None]:
# Visualize target distributions: one No/Yes bar chart per binary target.
fig, axes = plt.subplots(2, 2, figsize=(14, 10))
axes = axes.flatten()

colors = ['#2ecc71', '#e74c3c']

for idx, target in enumerate(target_vars):
    panel = axes[idx]
    if target not in df.columns:
        # Hide the unused panel rather than leaving an empty set of axes.
        panel.set_visible(False)
        continue

    # Reindex so both classes are always present, in (No, Yes) order. A target
    # with a single observed class would otherwise hand bar() one height for
    # two labels and raise a shape-mismatch error.
    counts = df[target].value_counts().reindex([0, 1], fill_value=0)
    panel.bar(['No', 'Yes'], counts.values, color=colors, alpha=0.8)
    panel.set_title(target.replace('_', ' ').title(), fontsize=12, fontweight='bold')
    panel.set_ylabel('Count', fontsize=10)
    panel.grid(axis='y', alpha=0.3)

    # Add count and percentage labels above each bar.
    for i, v in enumerate(counts.values):
        panel.text(i, v, f'{v}\n({v/len(df)*100:.1f}%)',
                   ha='center', va='bottom', fontweight='bold')

plt.tight_layout()
plt.show()

print("\n✓ Target variables visualization complete")

### 4.2 Champions Analysis

In [None]:
rule = "=" * 70
print("\n" + rule)
print(" CHAMPIONS ANALYSIS")
print(rule)

# One row per title-winning team-season, restricted to the reporting columns.
champion_columns = ['season', 'team', 'target_total_points', 'goal_difference']
champions = df.loc[df['target_champion'] == 1, champion_columns]

print("\nChampions by Season:")
print(champions.sort_values('season'))

# Headline numbers across all champions.
champion_points = champions['target_total_points']
champion_goal_diff = champions['goal_difference']

print("\n\nChampion Statistics:")
print(f"  Average Points: {champion_points.mean():.1f}")
print(f"  Average Goal Difference: {champion_goal_diff.mean():.1f}")
print(f"  Min Points: {champion_points.min()}")
print(f"  Max Points: {champion_points.max()}")

### 4.3 Performance Metrics Analysis

In [None]:
print("\n" + "=" * 70)
print(" PERFORMANCE METRICS BY CATEGORY")
print("=" * 70)

# Metrics by champion status
print("\nAverage Stats - Champions vs Non-Champions:")
key_metrics = ['wins', 'draws', 'losses', 'goals_scored', 'goals_conceded',
               'goal_difference', 'points_per_game']

# Label groups by their key rather than by position: assigning a two-item list
# to .index raises when only one class is present and depends on group order.
comparison = (
    df.groupby('target_champion')[key_metrics]
    .mean()
    .rename(index={0: 'Non-Champions', 1: 'Champions'})
    .rename_axis(None)  # drop the 'target_champion' axis name from the printout
)
print(comparison.T.round(2))

In [None]:
# Visualize key metrics by outcome: champions vs non-champions boxplots.
metrics_to_plot = ['target_total_points', 'goal_difference', 'wins', 'points_per_game']

fig, axes = plt.subplots(2, 2, figsize=(14, 10))
axes = axes.flatten()

# Row masks are the same for every metric, so build them once.
non_champion_rows = df['target_champion'] == 0
champion_rows = df['target_champion'] == 1

for idx, metric in enumerate(metrics_to_plot):
    if metric in df.columns:
        data_to_plot = [df.loc[non_champion_rows, metric],
                        df.loc[champion_rows, metric]]

        # boxplot()'s `labels` keyword is deprecated since Matplotlib 3.9
        # (renamed `tick_labels`); setting the tick labels afterwards works on
        # both older and newer releases.
        bp = axes[idx].boxplot(data_to_plot, patch_artist=True)
        axes[idx].set_xticklabels(['Non-Champions', 'Champions'])

        # Color the boxes
        for patch, color in zip(bp['boxes'], ['#3498db', '#2ecc71']):
            patch.set_facecolor(color)
            patch.set_alpha(0.7)

        axes[idx].set_title(metric.replace('_', ' ').title(), fontsize=12, fontweight='bold')
        axes[idx].set_ylabel('Value', fontsize=10)
        axes[idx].grid(axis='y', alpha=0.3)

plt.tight_layout()
plt.show()

### 4.4 Correlation Analysis

In [None]:
print("\n" + "=" * 70)
print(" CORRELATION ANALYSIS")
print("=" * 70)

# Select numerical features for correlation
numerical_cols = df.select_dtypes(include=[np.number]).columns.tolist()
# Exclude target variables from correlation analysis. A dedicated name is used
# so that re-running this cell never overwrites the modeling `feature_cols`
# list that later cells rely on.
corr_feature_cols = [col for col in numerical_cols if not col.startswith('target_')]

correlation_matrix = df[corr_feature_cols].corr()

plt.figure(figsize=(12, 10))
sns.heatmap(correlation_matrix, annot=True, fmt='.2f', cmap='coolwarm',
            center=0, square=True, linewidths=1, cbar_kws={"shrink": 0.8})
plt.title('Feature Correlation Heatmap', fontsize=14, fontweight='bold', pad=20)
plt.tight_layout()
plt.show()

print("\n✓ Correlation analysis complete")

### 4.5 Points Threshold Analysis

In [None]:
print("\n" + "=" * 70)
print(" POINTS THRESHOLD ANALYSIS")
print("=" * 70)

# Build each outcome mask once; they are used by both the printed averages and
# the boxplot below.
total_points = df['target_total_points']
is_champion = df['target_champion'] == 1
is_top_4 = df['target_top_4'] == 1
is_top_6 = df['target_top_6'] == 1
is_relegated = df['target_relegated'] == 1
# "Others": neither champion, nor top 6, nor relegated.
is_other = (
    (df['target_champion'] == 0)
    & (df['target_top_6'] == 0)
    & (df['target_relegated'] == 0)
)

# Analyze points needed for different outcomes
print("\nAverage Points Required:")
print(f"  Champions: {total_points[is_champion].mean():.1f}")
print(f"  Top 4: {total_points[is_top_4].mean():.1f}")
print(f"  Top 6: {total_points[is_top_6].mean():.1f}")
print(f"  Relegated: {total_points[is_relegated].mean():.1f}")

# Visualize points distribution
fig, ax = plt.subplots(figsize=(14, 6))

# NOTE(review): the first three groups are not filtered against each other, so
# a row flagged in more than one target column appears in several boxes.
categories = ['Champion', 'Top 4', 'Top 6', 'Relegated', 'Others']
points_data = [
    total_points[is_champion],
    total_points[is_top_4],
    total_points[is_top_6],
    total_points[is_relegated],
    total_points[is_other],
]

# boxplot()'s `labels` keyword is deprecated since Matplotlib 3.9 (renamed
# `tick_labels`); setting the tick labels afterwards works on all versions.
bp = ax.boxplot(points_data, patch_artist=True)
ax.set_xticklabels(categories)

colors_bp = ['#FFD700', '#2ecc71', '#3498db', '#e74c3c', '#95a5a6']
for patch, color in zip(bp['boxes'], colors_bp):
    patch.set_facecolor(color)
    patch.set_alpha(0.7)

ax.set_title('Points Distribution by Outcome Category', fontsize=14, fontweight='bold')
ax.set_ylabel('Total Points', fontsize=12)
ax.set_xlabel('Category', fontsize=12)
ax.grid(axis='y', alpha=0.3)

plt.tight_layout()
plt.show()

## 5. Feature Engineering

In [None]:
print("=" * 70)
print(" FEATURE ENGINEERING")
print("=" * 70)

# Work on a copy so the EDA frame `df` stays untouched.
df_model = df.copy()

# Encode team names as integer codes (one code per distinct team string).
le_team = LabelEncoder()
df_model['team_encoded'] = le_team.fit_transform(df_model['team'])
print("\n✓ Encoded team names")

# Per-match rates: (new feature, source column, multiplier). Percentages use a
# multiplier of 100, plain per-game rates use 1.
per_match_features = [
    ('win_percentage', 'wins', 100),
    ('draw_percentage', 'draws', 100),
    ('loss_percentage', 'losses', 100),
    ('goals_per_game', 'goals_scored', 1),
    ('goals_conceded_per_game', 'goals_conceded', 1),
]
has_matches_played = 'matches_played' in df_model.columns
for new_col, source_col, scale in per_match_features:
    # Each rate is only created when both of its inputs exist.
    if has_matches_played and source_col in df_model.columns:
        df_model[new_col] = (df_model[source_col] / df_model['matches_played']) * scale
        print(f"✓ Created {new_col} feature")

# Attack strength: goals scored weighted by the fraction of matches won.
if 'goals_scored' in df_model.columns and 'win_percentage' in df_model.columns:
    df_model['attack_strength'] = df_model['goals_scored'] * (df_model['win_percentage'] / 100)
    print("✓ Created attack_strength feature")

# Defense strength: negated goals conceded plus half the goal difference, so
# higher values mean fewer goals conceded and a better goal difference.
if 'goals_conceded' in df_model.columns and 'goal_difference' in df_model.columns:
    df_model['defense_strength'] = -df_model['goals_conceded'] + (df_model['goal_difference'] * 0.5)
    print("✓ Created defense_strength feature")

print("\n✓ Feature engineering complete")
print(f"  New dataset shape: {df_model.shape[0]:,} rows × {df_model.shape[1]} columns")

## 6. Prepare Data for Modeling

In [None]:
print("=" * 70)
print(" FEATURE SELECTION FOR MODELING")
print("=" * 70)

# Define feature columns (excluding target and non-predictive columns)
exclude_cols = ['season', 'team', 'target_champion', 'target_top_4', 'target_top_6',
                'target_relegated', 'target_league_position', 'target_total_points']

# Keep every numeric column that is not excluded. select_dtypes(np.number)
# accepts all integer/float widths, whereas comparing the dtype against
# 'int64'/'float64' silently dropped e.g. int32 or float32 columns. The
# `target_` prefix check mirrors the correlation cell and keeps any target
# column missing from `exclude_cols` out of the feature matrix.
numeric_model_cols = df_model.select_dtypes(include=[np.number]).columns
feature_cols = [
    col for col in numeric_model_cols
    if col not in exclude_cols and not col.startswith('target_')
]

# NOTE(review): columns such as wins and points_per_game look like results from
# the same season the targets describe; confirm they are available at
# prediction time, otherwise the scores below are inflated by leakage.

print(f"\nSelected {len(feature_cols)} features for modeling:")
for i, col in enumerate(feature_cols, 1):
    print(f"  {i:2d}. {col}")

# Prepare feature matrix
X = df_model[feature_cols].copy()

print(f"\n✓ Feature matrix X shape: {X.shape}")

## 7. Champion Prediction Model

### 7.1 Prepare Data for Champion Prediction

In [None]:
print("=" * 70)
print(" CHAMPION PREDICTION - DATA PREPARATION")
print("=" * 70)

# Target variable
y_champion = df_model['target_champion'].copy()

print("\nTarget distribution:")
print(y_champion.value_counts())
print(f"\nClass balance: {y_champion.value_counts(normalize=True)*100}")

# Stratified 80/20 split so the class proportions of the champion target are
# preserved in both partitions.
X_train_champ, X_test_champ, y_train_champ, y_test_champ = train_test_split(
    X, y_champion, test_size=0.2, random_state=RANDOM_STATE, stratify=y_champion
)

print("\n✓ Data split completed")
print(f"  Training set: {X_train_champ.shape}")
print(f"  Test set: {X_test_champ.shape}")

# Fit the scaler on the training rows only, then apply it to the test rows.
# The next cell feeds the scaled arrays to logistic regression and the
# unscaled frames to the tree models.
scaler_champ = StandardScaler()
X_train_champ_scaled = scaler_champ.fit_transform(X_train_champ)
X_test_champ_scaled = scaler_champ.transform(X_test_champ)

print("\n✓ Features scaled")

### 7.2 Train Champion Prediction Models

In [None]:
def _score_binary_classifier(model, X_eval, y_true):
    """Return (predictions, accuracy, precision, recall, f1) for a fitted model.

    zero_division=0 makes the "no positive predictions" case explicit: the
    affected metric is reported as 0.0, which is the value scikit-learn already
    returns by default (together with a warning).
    """
    y_pred = model.predict(X_eval)
    return (
        y_pred,
        accuracy_score(y_true, y_pred),
        precision_score(y_true, y_pred, zero_division=0),
        recall_score(y_true, y_pred, zero_division=0),
        f1_score(y_true, y_pred, zero_division=0),
    )


def _print_binary_scores(acc, prec, rec, f1):
    """Print the four classification metrics in the notebook's report format."""
    print(f"  Accuracy: {acc:.4f}")
    print(f"  Precision: {prec:.4f}")
    print(f"  Recall: {rec:.4f}")
    print(f"  F1-Score: {f1:.4f}")


print("\n" + "=" * 70)
print(" TRAINING CHAMPION PREDICTION MODELS")
print("=" * 70)

# Random Forest for Champion Prediction (trained on unscaled features)
print("\n1. Random Forest Classifier:")
rf_champ = RandomForestClassifier(n_estimators=100, max_depth=10,
                                  random_state=RANDOM_STATE, n_jobs=-1)
rf_champ.fit(X_train_champ, y_train_champ)

(rf_champ_pred, rf_champ_acc, rf_champ_prec,
 rf_champ_rec, rf_champ_f1) = _score_binary_classifier(rf_champ, X_test_champ, y_test_champ)
_print_binary_scores(rf_champ_acc, rf_champ_prec, rf_champ_rec, rf_champ_f1)

# Gradient Boosting for Champion Prediction (trained on unscaled features)
print("\n2. Gradient Boosting Classifier:")
gb_champ = GradientBoostingClassifier(n_estimators=100, learning_rate=0.1,
                                      max_depth=5, random_state=RANDOM_STATE)
gb_champ.fit(X_train_champ, y_train_champ)

(gb_champ_pred, gb_champ_acc, gb_champ_prec,
 gb_champ_rec, gb_champ_f1) = _score_binary_classifier(gb_champ, X_test_champ, y_test_champ)
_print_binary_scores(gb_champ_acc, gb_champ_prec, gb_champ_rec, gb_champ_f1)

# Logistic Regression for Champion Prediction (trained on the standardized
# features, unlike the two tree models above)
print("\n3. Logistic Regression:")
lr_champ = LogisticRegression(random_state=RANDOM_STATE, max_iter=1000)
lr_champ.fit(X_train_champ_scaled, y_train_champ)

(lr_champ_pred, lr_champ_acc, lr_champ_prec,
 lr_champ_rec, lr_champ_f1) = _score_binary_classifier(lr_champ, X_test_champ_scaled, y_test_champ)
_print_binary_scores(lr_champ_acc, lr_champ_prec, lr_champ_rec, lr_champ_f1)

print("\n✓ Champion prediction models trained successfully!")

In [None]:
# Compare champion models: one row per model, best F1 first.
print("\n" + "=" * 70)
print(" CHAMPION PREDICTION - MODEL COMPARISON")
print("=" * 70)

champ_comparison = pd.DataFrame({
    'Model': ['Random Forest', 'Gradient Boosting', 'Logistic Regression'],
    'Accuracy': [rf_champ_acc, gb_champ_acc, lr_champ_acc],
    'Precision': [rf_champ_prec, gb_champ_prec, lr_champ_prec],
    'Recall': [rf_champ_rec, gb_champ_rec, lr_champ_rec],
    'F1-Score': [rf_champ_f1, gb_champ_f1, lr_champ_f1],
}).sort_values('F1-Score', ascending=False)

print("\n" + champ_comparison.to_string(index=False))

# After the descending sort, the first row is the model with the highest F1.
best_champ_model = champ_comparison.iloc[0]['Model']
print(f"\n🏆 Best Champion Predictor: {best_champ_model}")

## 8. Top 4 Prediction Model

In [None]:
print("=" * 70)
print(" TOP 4 PREDICTION")
print("=" * 70)

# Target variable
y_top4 = df_model['target_top_4'].copy()

# Stratified 80/20 split keeps the top-4 class proportions in both partitions.
X_train_top4, X_test_top4, y_train_top4, y_test_top4 = train_test_split(
    X, y_top4, test_size=0.2, random_state=RANDOM_STATE, stratify=y_top4
)

# Train Random Forest
rf_top4 = RandomForestClassifier(n_estimators=100, max_depth=10,
                                 random_state=RANDOM_STATE, n_jobs=-1)
rf_top4.fit(X_train_top4, y_train_top4)

# Predictions and evaluation. zero_division=0 makes explicit that a metric with
# an empty denominator is reported as 0.0 (scikit-learn's default value, minus
# the warning).
rf_top4_pred = rf_top4.predict(X_test_top4)
rf_top4_acc = accuracy_score(y_test_top4, rf_top4_pred)
rf_top4_prec = precision_score(y_test_top4, rf_top4_pred, zero_division=0)
rf_top4_rec = recall_score(y_test_top4, rf_top4_pred, zero_division=0)
rf_top4_f1 = f1_score(y_test_top4, rf_top4_pred, zero_division=0)

print("\nRandom Forest - Top 4 Prediction:")
print(f"  Accuracy: {rf_top4_acc:.4f} ({rf_top4_acc*100:.2f}%)")
print(f"  Precision: {rf_top4_prec:.4f}")
print(f"  Recall: {rf_top4_rec:.4f}")
print(f"  F1-Score: {rf_top4_f1:.4f}")

print("\n✓ Top 4 prediction model trained successfully!")

## 9. Top 6 Prediction Model

In [None]:
print("=" * 70)
print(" TOP 6 PREDICTION")
print("=" * 70)

# Target variable
y_top6 = df_model['target_top_6'].copy()

# Stratified 80/20 split keeps the top-6 class proportions in both partitions.
X_train_top6, X_test_top6, y_train_top6, y_test_top6 = train_test_split(
    X, y_top6, test_size=0.2, random_state=RANDOM_STATE, stratify=y_top6
)

# Train Random Forest
rf_top6 = RandomForestClassifier(n_estimators=100, max_depth=10,
                                 random_state=RANDOM_STATE, n_jobs=-1)
rf_top6.fit(X_train_top6, y_train_top6)

# Predictions and evaluation. zero_division=0 makes explicit that a metric with
# an empty denominator is reported as 0.0 (scikit-learn's default value, minus
# the warning).
rf_top6_pred = rf_top6.predict(X_test_top6)
rf_top6_acc = accuracy_score(y_test_top6, rf_top6_pred)
rf_top6_prec = precision_score(y_test_top6, rf_top6_pred, zero_division=0)
rf_top6_rec = recall_score(y_test_top6, rf_top6_pred, zero_division=0)
rf_top6_f1 = f1_score(y_test_top6, rf_top6_pred, zero_division=0)

print("\nRandom Forest - Top 6 Prediction:")
print(f"  Accuracy: {rf_top6_acc:.4f} ({rf_top6_acc*100:.2f}%)")
print(f"  Precision: {rf_top6_prec:.4f}")
print(f"  Recall: {rf_top6_rec:.4f}")
print(f"  F1-Score: {rf_top6_f1:.4f}")

print("\n✓ Top 6 prediction model trained successfully!")

## 10. Relegation Prediction Model

In [None]:
print("=" * 70)
print(" RELEGATION PREDICTION")
print("=" * 70)

# Target variable
y_relegated = df_model['target_relegated'].copy()

# Stratified 80/20 split keeps the relegation class proportions in both
# partitions.
X_train_rel, X_test_rel, y_train_rel, y_test_rel = train_test_split(
    X, y_relegated, test_size=0.2, random_state=RANDOM_STATE, stratify=y_relegated
)

# Train Random Forest
rf_relegated = RandomForestClassifier(n_estimators=100, max_depth=10,
                                      random_state=RANDOM_STATE, n_jobs=-1)
rf_relegated.fit(X_train_rel, y_train_rel)

# Predictions and evaluation. zero_division=0 makes explicit that a metric with
# an empty denominator is reported as 0.0 (scikit-learn's default value, minus
# the warning).
rf_rel_pred = rf_relegated.predict(X_test_rel)
rf_rel_acc = accuracy_score(y_test_rel, rf_rel_pred)
rf_rel_prec = precision_score(y_test_rel, rf_rel_pred, zero_division=0)
rf_rel_rec = recall_score(y_test_rel, rf_rel_pred, zero_division=0)
rf_rel_f1 = f1_score(y_test_rel, rf_rel_pred, zero_division=0)

print("\nRandom Forest - Relegation Prediction:")
print(f"  Accuracy: {rf_rel_acc:.4f} ({rf_rel_acc*100:.2f}%)")
print(f"  Precision: {rf_rel_prec:.4f}")
print(f"  Recall: {rf_rel_rec:.4f}")
print(f"  F1-Score: {rf_rel_f1:.4f}")
print("\n✓ Relegation prediction model trained successfully!")

## 11. League Position Prediction (Regression)

In [None]:
print("=" * 70)
print(" LEAGUE POSITION PREDICTION (Regression)")
print("=" * 70)

# Target variable
y_position = df_model['target_league_position'].copy()

# Plain (non-stratified) 80/20 split: the target is treated as a number here,
# not as a class label.
X_train_pos, X_test_pos, y_train_pos, y_test_pos = train_test_split(
    X, y_position, test_size=0.2, random_state=RANDOM_STATE
)

# Train Random Forest Regressor
rf_position = RandomForestRegressor(n_estimators=100, max_depth=10,
                                    random_state=RANDOM_STATE, n_jobs=-1)
rf_position.fit(X_train_pos, y_train_pos)

# Predictions and evaluation
rf_pos_pred = rf_position.predict(X_test_pos)
rf_pos_mae = mean_absolute_error(y_test_pos, rf_pos_pred)
# RMSE is the square root of the mean squared error.
rf_pos_rmse = np.sqrt(mean_squared_error(y_test_pos, rf_pos_pred))
rf_pos_r2 = r2_score(y_test_pos, rf_pos_pred)

print("\nRandom Forest Regressor - Position Prediction:")
print(f"  MAE: {rf_pos_mae:.2f} positions")
print(f"  RMSE: {rf_pos_rmse:.2f} positions")
print(f"  R² Score: {rf_pos_r2:.4f}")

# Show the first ten test rows. The displayed prediction is rounded to a whole
# position, while the error column is computed from the unrounded prediction.
position_comparison = pd.DataFrame({
    'Actual Position': y_test_pos.values[:10],
    'Predicted Position': np.round(rf_pos_pred[:10]).astype(int),
    'Error': np.abs(y_test_pos.values[:10] - rf_pos_pred[:10]),
})

print("\nSample Position Predictions:")
print(position_comparison.to_string(index=False))

print("\n✓ Position prediction model trained successfully!")

## 12. Overall Model Summary

In [None]:
print("=" * 70)
print(" COMPREHENSIVE MODEL SUMMARY")
print("=" * 70)

# Best champion F1 across the three champion classifiers; used by both the
# table and the bar chart below.
best_champ_f1 = max(rf_champ_f1, gb_champ_f1, lr_champ_f1)

# Create summary table
summary = pd.DataFrame({
    'Prediction Task': ['Champion', 'Top 4', 'Top 6', 'Relegation', 'Position'],
    'Model Type': ['Classification', 'Classification', 'Classification', 'Classification', 'Regression'],
    'Primary Metric': ['F1-Score', 'F1-Score', 'F1-Score', 'F1-Score', 'MAE'],
    'Performance': [
        f"{best_champ_f1:.4f}",
        f"{rf_top4_f1:.4f}",
        f"{rf_top6_f1:.4f}",
        f"{rf_rel_f1:.4f}",
        f"{rf_pos_mae:.2f} pos",
    ],
})

print("\n" + summary.to_string(index=False))

# Visualize all model performances: F1 per classification task on the left,
# regression MAE on the right (different units, hence separate axes).
fig, axes = plt.subplots(1, 2, figsize=(14, 6))

# Classification models F1-scores
classification_tasks = ['Champion', 'Top 4', 'Top 6', 'Relegation']
f1_scores = [best_champ_f1, rf_top4_f1, rf_top6_f1, rf_rel_f1]

axes[0].bar(classification_tasks, f1_scores, color=['#FFD700', '#2ecc71', '#3498db', '#e74c3c'], alpha=0.8)
axes[0].set_title('Classification Models - F1 Scores', fontsize=12, fontweight='bold')
axes[0].set_ylabel('F1-Score', fontsize=10)
axes[0].set_ylim(0, 1.0)
axes[0].grid(axis='y', alpha=0.3)
axes[0].tick_params(axis='x', rotation=15)

# Position prediction accuracy
axes[1].bar(['Position MAE'], [rf_pos_mae], color='#9b59b6', alpha=0.8)
axes[1].set_title('Regression Model - Position Prediction', fontsize=12, fontweight='bold')
axes[1].set_ylabel('Mean Absolute Error (positions)', fontsize=10)
axes[1].grid(axis='y', alpha=0.3)

plt.tight_layout()
plt.show()

print("\n✅ All models trained and evaluated successfully!")

## 13. Feature Importance Analysis

In [None]:
print("=" * 70)
print(" FEATURE IMPORTANCE ANALYSIS")
print("=" * 70)

# Get feature importance from the champion random forest. Feature names come
# from the frame the model was fitted on, so they always line up with
# feature_importances_ even if `feature_cols` was reassigned by another cell.
feature_importance = pd.DataFrame({
    'feature': list(X_train_champ.columns),
    'importance': rf_champ.feature_importances_,
}).sort_values('importance', ascending=False)

# Headings report how many features are actually shown, which can be fewer
# than the 15 / 20 caps when the model has fewer features.
n_listed = min(15, len(feature_importance))
print(f"\nTop {n_listed} Most Important Features (Champion Prediction):")
print(feature_importance.head(n_listed).to_string(index=False))

# Visualize feature importance
plt.figure(figsize=(12, 8))
top_features = feature_importance.head(20)
plt.barh(range(len(top_features)), top_features['importance'], color='#3498db', alpha=0.8)
plt.yticks(range(len(top_features)), top_features['feature'])
plt.xlabel('Importance', fontsize=12)
plt.ylabel('Feature', fontsize=12)
plt.title(f'Top {len(top_features)} Feature Importances (Champion Prediction)',
          fontsize=14, fontweight='bold')
# Largest importance at the top of the chart.
plt.gca().invert_yaxis()
plt.grid(axis='x', alpha=0.3)
plt.tight_layout()
plt.show()

## 14. Summary and Conclusions

In [None]:
print("=" * 70)
print(" SUMMARY AND CONCLUSIONS")
print("=" * 70)

print("\n📊 Dataset Overview:")
print(f"  - Total team-seasons analyzed: {len(df):,}")
print(f"  - Seasons covered: {df['season'].nunique()}")
print(f"  - Unique teams: {df['team'].nunique()}")
# len(feature_cols) counts every modeling feature (raw and engineered), so the
# label says that rather than "engineered".
print(f"  - Features used for modeling: {len(feature_cols)}")

print("\n🎯 Model Performance Summary:")
print(f"  - Champion Prediction: F1={max(rf_champ_f1, gb_champ_f1, lr_champ_f1):.3f} (Best: {best_champ_model})")
print(f"  - Top 4 Prediction: F1={rf_top4_f1:.3f}")
print(f"  - Top 6 Prediction: F1={rf_top6_f1:.3f}")
print(f"  - Relegation Prediction: F1={rf_rel_f1:.3f}")
print(f"  - Position Prediction: MAE={rf_pos_mae:.2f} positions")

print("\n💡 Key Insights:")
# feature_importance is already sorted by descending importance.
top_3_features = feature_importance.head(3)['feature'].tolist()
print(f"  - Most important features: {', '.join(top_3_features)}")
print(f"  - Average champion points: {champions['target_total_points'].mean():.1f}")
print("  - Goal difference is crucial for final standings")
print("  - Win percentage strongly correlates with league position")

print("\n🏆 Applications:")
print("  - Predict league champions before season ends")
print("  - Identify teams likely to qualify for Champions League")
print("  - Forecast relegation candidates early in the season")
print("  - Estimate final league standings based on current performance")

print("\n✅ Notebook execution completed successfully!")
print("\nThese models can now predict EPL season outcomes with high accuracy.")