# Olympic Medal Prediction - Model Development and Testing

## 1. Setup and Load Data

In [8]:
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings('ignore')

from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import (
    classification_report, confusion_matrix, roc_auc_score, 
    roc_curve, precision_recall_curve, accuracy_score, f1_score
)
from sklearn.pipeline import Pipeline

import joblib
from datetime import datetime
import pickle

# Load the artefacts written by the EDA / feature-engineering notebook.
# NOTE(review): pickle executes arbitrary code on load -- only read files
# that this project produced itself.
try:
    X = pd.read_pickle('features_X.pkl')
    y = pd.read_pickle('target_y.pkl')
    with open('feature_names.pkl', 'rb') as names_file:
        available_features = pickle.load(names_file)
except FileNotFoundError:
    # A missing input means the upstream notebook has not been run; tell the
    # reader and re-raise so later cells do not run on undefined names.
    print("Please run the EDA and feature engineering notebook first.")
    raise
else:
    # Quick sanity summary: dimensions plus the mean of the target.
    print("Data loaded successfully:")
    print(f"Feature matrix shape: {X.shape}")
    print(f"Target shape: {y.shape}")
    print(f"Medal rate: {y.mean():.1%}")

Data loaded successfully:
Feature matrix shape: (21398, 16)
Target shape: (21398,)
Medal rate: 10.8%


## 2. Train-Test Split

In [None]:
# Hold out 20% of the rows as the test set. The split is seeded for
# reproducibility and stratified on y.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

print(f"Training set: {X_train.shape[0]:,} samples")
print(f"Test set: {X_test.shape[0]:,} samples")
print(f"\nTraining set target distribution:")
# pandas spells this keyword `normalize`; the previous `normalise=True`
# raised TypeError (unexpected keyword argument) and stopped the cell.
train_dist = y_train.value_counts(normalize=True)
# Looked up by label: assumes the target is coded 0 = no medal, 1 = medal
# -- TODO confirm against the feature-engineering notebook.
print(f"   No Medal: {train_dist[0]:.1%}")
print(f"   Medal: {train_dist[1]:.1%}")

Training set: 17,118 samples
Test set: 4,280 samples

Training set target distribution:
   No Medal: 89.2%
   Medal: 10.8%


## 3. Baseline Model Training

In [None]:
print("Baseline Gradient Boosting Model:")

# Reference model: a seeded gradient-boosting classifier with fixed
# hyperparameters, preceded by feature standardisation.
baseline_gb = GradientBoostingClassifier(
    n_estimators=100,
    learning_rate=0.1,
    max_depth=3,
    random_state=42,
)
baseline_pipeline = Pipeline(steps=[
    ('scaler', StandardScaler()),
    ('gb', baseline_gb),
])

# Fit once on the full training partition.
baseline_pipeline.fit(X_train, y_train)

# 5-fold cross-validated ROC AUC on the training partition only; the test
# set is not touched here.
cv_scores = cross_val_score(
    estimator=baseline_pipeline,
    X=X_train,
    y=y_train,
    scoring='roc_auc',
    cv=5,
)

auc_mean = cv_scores.mean()
auc_std = cv_scores.std()
print(f"CV AUC: {auc_mean:.4f} (±{auc_std:.4f})")

Baseline Gradient Boosting Model:
CV AUC: 0.7506 (±0.0101)


## 4. Hyperparameter Optimisation

In [None]:
print("Hyperparameter Optimisation:")

# Search space for the 'gb' step of the pipeline (2 x 3 x 3 x 2 = 36
# combinations). The `gb__` prefix routes each parameter to that step.
param_grid = dict(
    gb__n_estimators=[100, 200],
    gb__learning_rate=[0.05, 0.1, 0.15],
    gb__max_depth=[3, 4, 5],
    gb__min_samples_split=[10, 20],
)

# Exhaustive grid search scored by ROC AUC with 3-fold CV, using all
# available cores (n_jobs=-1).
grid_search = GridSearchCV(
    estimator=baseline_pipeline,
    param_grid=param_grid,
    scoring='roc_auc',
    cv=3,
    n_jobs=-1,
    verbose=1,
)
grid_search.fit(X_train, y_train)

# Keep the winning pipeline for evaluation and persistence in later cells.
best_model = grid_search.best_estimator_

print(f"Best CV Score: {grid_search.best_score_:.4f}")
print(f"Best Parameters: {grid_search.best_params_}")

Hyperparameter Optimisation:
Fitting 3 folds for each of 36 candidates, totalling 108 fits
Best CV Score: 0.7578
Best Parameters: {'gb__learning_rate': 0.1, 'gb__max_depth': 5, 'gb__min_samples_split': 10, 'gb__n_estimators': 100}


## 5. Model Evaluation

In [12]:
# Score the held-out test set with the tuned pipeline: hard class labels
# plus the predicted probability of the second class (column index 1).
y_test_pred = best_model.predict(X_test)
y_test_pred_proba = best_model.predict_proba(X_test)[:, 1]

# AUC is computed from the probabilities; accuracy and F1 from the labels.
test_auc = roc_auc_score(y_test, y_test_pred_proba)
test_accuracy = accuracy_score(y_test, y_test_pred)
test_f1 = f1_score(y_test, y_test_pred)

print("FINAL TEST RESULTS:")
metric_rows = (
    ("AUC-ROC:", test_auc),
    ("Accuracy:", test_accuracy),
    ("F1-Score:", test_f1),
)
for metric_label, metric_value in metric_rows:
    # Labels are left-padded to 9 characters so the values line up.
    print(f"   {metric_label:<9} {metric_value:.4f}")

print("\nClassification Report:")
class_labels = ['No Medal', 'Medal']
print(classification_report(y_test, y_test_pred, target_names=class_labels))

FINAL TEST RESULTS:
   AUC-ROC:  0.7646
   Accuracy: 0.8963
   F1-Score: 0.1591

Classification Report:
              precision    recall  f1-score   support

    No Medal       0.90      0.99      0.94      3818
       Medal       0.64      0.09      0.16       462

    accuracy                           0.90      4280
   macro avg       0.77      0.54      0.55      4280
weighted avg       0.87      0.90      0.86      4280



## 6. Feature Importance Analysis

In [13]:
# Pair each feature name with the importance reported by the fitted
# gradient-boosting step, most important first.
# NOTE(review): assumes `available_features` lists the columns of X in the
# same order the model was trained on -- confirm against the upstream notebook.
gb_step = best_model.named_steps['gb']
importance_table = pd.DataFrame({
    'feature': available_features,
    'importance': gb_step.feature_importances_,
})
feature_importance_df = importance_table.sort_values('importance', ascending=False)

print("10 most important features:")
print("=" * 45)
top_features = feature_importance_df.head(10)
ranked_pairs = zip(top_features['feature'], top_features['importance'])
for rank, (feature_name, importance) in enumerate(ranked_pairs, start=1):
    print(f"{rank:2d}. {feature_name:<25}: {importance:.4f}")

# Thematic grouping of the feature names. Not consumed by any code visible
# in this notebook section; kept for downstream reporting.
feature_categories = dict(
    Physical=['height', 'weight', 'bmi'],
    Health=['bodyFat', 'heartRateVariability', 'vo2Max', 'bloodOxygen', 'injurySeverityScore'],
    Derived=['fitness_score', 'risk_score', 'estimated_age'],
    Economic=['gdp_per_capita'],
    Performance=['country_medal_rate', 'country_avg_ranking', 'sport_medal_rate', 'sport_avg_ranking'],
)


10 most important features:
=============================================
 1. country_medal_rate       : 0.2557
 2. sport_medal_rate         : 0.1088
 3. fitness_score            : 0.0619
 4. sport_avg_ranking        : 0.0573
 5. bodyFat                  : 0.0572
 6. bmi                      : 0.0542
 7. vo2Max                   : 0.0520
 8. risk_score               : 0.0513
 9. bloodOxygen              : 0.0503
10. heartRateVariability     : 0.0496


## 7. Save Model and Results

In [14]:
# Timestamp recorded in the metadata for this training run (local time).
trained_at = datetime.now().strftime('%Y-%m-%d %H:%M:%S')

# Summary describing the tuned model: headline test metrics, the
# grid-search CV score and parameters, feature names and split sizes.
model_metadata = dict(
    model_name='Olympic Medal Predictor - Gradient Boosting',
    model_type='GradientBoostingClassifier',
    test_auc=test_auc,
    test_accuracy=test_accuracy,
    test_f1=test_f1,
    cv_score=grid_search.best_score_,
    best_params=grid_search.best_params_,
    feature_names=available_features,
    training_date=trained_at,
    training_samples=len(X_train),
    test_samples=len(X_test),
)

# Persist the fitted pipeline and its metadata side by side with joblib.
joblib_outputs = (
    (best_model, 'olympic_medal_predictor_gb.pkl'),
    (model_metadata, 'model_metadata.pkl'),
)
for artefact, artefact_path in joblib_outputs:
    joblib.dump(artefact, artefact_path)

# Bundle the objects gathered during evaluation (test labels, predictions,
# importances, the model and the full search object) into one pickle.
evaluation_results = dict(
    y_test=y_test,
    y_test_pred=y_test_pred,
    y_test_pred_proba=y_test_pred_proba,
    feature_importance_df=feature_importance_df,
    best_model=best_model,
    grid_search=grid_search,
)

with open('evaluation_results.pkl', 'wb') as results_file:
    pickle.dump(evaluation_results, results_file)