In [1]:
"""
MODEL TRAINING - Soccer Match Predictor
Goal: Train ML models and BEAT 72% ACCURACY!
"""

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from xgboost import XGBClassifier




import warnings
# Silence every warning for the rest of the notebook.
# NOTE(review): this also hides deprecation and convergence warnings from
# the libraries used below - consider narrowing the filter.
warnings.filterwarnings(action='ignore')

# Plot appearance: seaborn dark grid, 10x6 inch default figure
sns.set_style('darkgrid')
plt.rcParams.update({'figure.figsize': (10, 6)})

# Banner
print("ü§ñ MODEL TRAINING ")
print("=" * 60)

ü§ñ MODEL TRAINING 


In [2]:
# ============================================
# LOAD PROCESSED DATA
# ============================================
# Reads the match table with engineered features and prints a quick
# overview: row count, shape, column names, target balance and first rows.

print("üìÇ Loading processed data with features...\n")

df = pd.read_csv('../data/processed/matches_with_features.csv')

# Fail early with a clear message: the target column is read just below
# and by the data-preparation cell.
if 'result' not in df.columns:
    raise KeyError("Column 'result' not found in matches_with_features.csv")

print(f"‚úÖ Loaded {len(df)} matches")
print(f"üìä Shape: {df.shape}")

# Column names are listed once here (a duplicate debug dump that used to
# follow read_csv was removed).
print(f"\nüëÄ Columns:")
print(df.columns.tolist())

# Class counts, then the same counts as a percentage of all rows
print(f"\nüéØ Target distribution:")
print(df['result'].value_counts())
print(f"\nPercentages:")
print((df['result'].value_counts() / len(df) * 100).round(2))

# Quick peek
print(f"\nüìã Sample data:")
print(df.head())

üìÇ Loading processed data with features...

['date', 'home_team', 'away_team', 'home_score', 'away_score', 'matchday', 'result', 'home_form', 'away_form', 'form_diff', 'home_goals_scored_avg', 'home_goals_conceded_avg', 'away_goals_scored_avg', 'away_goals_conceded_avg', 'offensive_strength_diff', 'defensive_strength_diff', 'home_team_value', 'away_team_value', 'value_diff', 'home_win_rate', 'away_win_rate', 'win_rate_diff', 'home_points_match', 'away_points_match', 'home_points_cum', 'away_points_cum', 'home_goal_diff_match', 'away_goal_diff_match', 'home_goal_diff_cum', 'away_goal_diff_cum', 'home_recent_points5', 'away_recent_points5', 'rest_days_home', 'rest_days_away', 'home_big6', 'away_big6']
‚úÖ Loaded 760 matches
üìä Shape: (760, 36)

üëÄ Columns:
['date', 'home_team', 'away_team', 'home_score', 'away_score', 'matchday', 'result', 'home_form', 'away_form', 'form_diff', 'home_goals_scored_avg', 'home_goals_conceded_avg', 'away_goals_scored_avg', 'away_goals_conceded_avg', '

In [3]:
# ============================================
# DATA PREPARATION FOR ML - ALL FEATURES
# ============================================
# Builds the feature matrix X and the encoded target from `df`, then
# creates a stratified 80/20 train/test split.

print("üîß Preparing data with ALL FEATURES...\n")

# 1. Columns that are NOT model inputs
# Identifiers, raw scores and the target itself
metadata_cols = [
    'date',
    'home_team',
    'away_team',
    'home_score',
    'away_score',
    'matchday',
    'result',
]

# Features derived from the score (data leakage!)
score_derived = [
    'home_points_match',
    'away_points_match',
    'home_goal_diff_match',
    'away_goal_diff_match',
]

# 2. Every other column is a feature (original column order is kept)
exclude_cols = metadata_cols + score_derived
excluded_lookup = set(exclude_cols)
feature_columns = [col for col in df.columns if col not in excluded_lookup]

print(f"üìä Using {len(feature_columns)} features:")
for position, feature_name in enumerate(feature_columns, start=1):
    print(f"  {position}. {feature_name}")

# 3. Feature matrix and raw target (copies, so `df` is left untouched)
X = df[feature_columns].copy()
y = df['result'].copy()

# 4. Missing values are replaced by 0
missing_before = X.isna().sum().sum()
print(f"\nüîç Missing values: {missing_before} total")
X = X.fillna(0)
missing_after = X.isna().sum().sum()
print(f"‚úÖ After fillna: {missing_after} missing")

print(f"\n‚úÖ Features (X): {X.shape}")
print(f"‚úÖ Target (y): {y.shape}")

# 5. Encode the target as integers; the mapping is printed below
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)

print(f"\nüî§ Target encoding:")
for code, label in enumerate(label_encoder.classes_):
    print(f"  {label} ‚Üí {code}")

# 6. Train/test split (80/20), stratified on the encoded target
# NOTE(review): the split is random, not chronological - if the features
# are rolling statistics over dated matches, confirm this does not let
# the model train on matches played after the test ones.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y_encoded,
    test_size=0.2,
    random_state=42,
    stratify=y_encoded,
)

train_share = len(X_train) / len(X) * 100
test_share = len(X_test) / len(X) * 100
print(f"\nüì¶ Train set: {len(X_train)} matches ({train_share:.1f}%)")
print(f"üì¶ Test set: {len(X_test)} matches ({test_share:.1f}%)")

# Majority-class share of the full target = naive accuracy to beat
majority_share = y.value_counts().max() / len(y) * 100
print(f"\n‚úÖ DATA READY FOR TRAINING!")
print(f"üéØ Baseline to beat: {majority_share:.2f}%")

# 7. Keep the class labels for the reports in later cells
labels = label_encoder.classes_

üîß Preparing data with ALL FEATURES...

üìä Using 25 features:
  1. home_form
  2. away_form
  3. form_diff
  4. home_goals_scored_avg
  5. home_goals_conceded_avg
  6. away_goals_scored_avg
  7. away_goals_conceded_avg
  8. offensive_strength_diff
  9. defensive_strength_diff
  10. home_team_value
  11. away_team_value
  12. value_diff
  13. home_win_rate
  14. away_win_rate
  15. win_rate_diff
  16. home_points_cum
  17. away_points_cum
  18. home_goal_diff_cum
  19. away_goal_diff_cum
  20. home_recent_points5
  21. away_recent_points5
  22. rest_days_home
  23. rest_days_away
  24. home_big6
  25. away_big6

üîç Missing values: 2 total
‚úÖ After fillna: 0 missing

‚úÖ Features (X): (760, 25)
‚úÖ Target (y): (760,)

üî§ Target encoding:
  A ‚Üí 0
  D ‚Üí 1
  H ‚Üí 2

üì¶ Train set: 608 matches (80.0%)
üì¶ Test set: 152 matches (20.0%)

‚úÖ DATA READY FOR TRAINING!
üéØ Baseline to beat: 43.42%


In [4]:
# ============================================
# MODEL 1: LOGISTIC REGRESSION (BASELINE ML)
# ============================================
# Fits a scaled, class-weighted logistic regression on the training split
# and reports train/test accuracy, the confusion matrix and the per-class
# classification report.

print("ü§ñ Training Model 1: Logistic Regression")
print("="*60)
print("This is our baseline ML model - simplest algorithm\n")

# Majority-class baseline, computed from the data instead of hard-coded
# so the comparison stays correct if the dataset changes.
baseline_pct = y.value_counts().max() / len(y) * 100

# Train
# `multi_class` is intentionally not passed: it is deprecated in recent
# scikit-learn releases, and the default lbfgs solver already fits a
# multinomial model when the target has more than two classes.
lr_model = Pipeline([
    ('scaler', StandardScaler()),
    ('clf', LogisticRegression(
        max_iter=1000,
        random_state=42,
        class_weight='balanced'
    ))
])
lr_model.fit(X_train, y_train)

# Predict
y_pred_train = lr_model.predict(X_train)
y_pred_test = lr_model.predict(X_test)

# Evaluate
train_acc = accuracy_score(y_train, y_pred_train)
test_acc = accuracy_score(y_test, y_pred_test)

print(f"üìä RESULTS:")
print(f"  Training Accuracy: {train_acc*100:.2f}%")
print(f"  Test Accuracy: {test_acc*100:.2f}%")
print(f"\nüéØ vs Baseline ({baseline_pct:.2f}%): {test_acc*100 - baseline_pct:+.2f} points")

if test_acc > 0.7:
    print(f"\nüî•üî•üî• HOLY SHIT! WE BEAT 70%!! üî•üî•üî•")
elif test_acc > 0.6:
    print(f"\nüî• NICE! Above 60%! We're getting there!")
elif test_acc > 0.5:
    # With three outcomes random guessing is ~33%, not 50%, so the
    # message only states the threshold that was actually checked.
    print(f"\n‚úÖ Good! Above 50%!")
elif test_acc * 100 > baseline_pct:
    print(f"\n‚ö†Ô∏è  Needs improvement, but better than baseline!")
else:
    # Only claim "better than baseline" when it is actually true.
    print(f"\n‚ö†Ô∏è  Needs improvement - not beating the baseline yet!")

# Confusion Matrix
print(f"\nüìã CONFUSION MATRIX:")
cm = confusion_matrix(y_test, y_pred_test)
labels = label_encoder.classes_

# Pretty print. confusion_matrix puts the TRUE class on the rows and the
# PREDICTED class on the columns, so the axes are named accordingly.
print("\nRows = actual, columns = predicted")
cm_df = pd.DataFrame(
    cm,
    index=pd.Index(labels, name='Actual'),
    columns=pd.Index(labels, name='Predicted')
)
print(cm_df)

# Detailed report
print(f"\nüìä DETAILED CLASSIFICATION REPORT:")
print(classification_report(y_test, y_pred_test, target_names=labels))

ü§ñ Training Model 1: Logistic Regression
This is our baseline ML model - simplest algorithm

üìä RESULTS:
  Training Accuracy: 53.45%
  Test Accuracy: 46.05%

üéØ vs Baseline (43.42%): +2.63 points

‚ö†Ô∏è  Needs improvement, but better than baseline!

üìã CONFUSION MATRIX:

Actual ‚Üí
Predicted ‚Üì
    A   D   H
A  29  12  10
D  14   9  12
H  21  13  32

üìä DETAILED CLASSIFICATION REPORT:
              precision    recall  f1-score   support

           A       0.45      0.57      0.50        51
           D       0.26      0.26      0.26        35
           H       0.59      0.48      0.53        66

    accuracy                           0.46       152
   macro avg       0.44      0.44      0.43       152
weighted avg       0.47      0.46      0.46       152



In [5]:
# ============================================
# MODEL 2: RANDOM FOREST (MORE POWERFUL!)
# ============================================
# Fits a class-weighted random forest on the training split, compares it
# with the baseline and with the logistic regression from the previous
# cell (`test_acc`), and lists the feature importances.

print("üå≥ Training Model 2: Random Forest")
print("="*60)
print("More powerful than Logistic Regression - should handle")
print("the class imbalance better!\n")

# Majority-class baseline, computed from the data instead of hard-coded
baseline_pct = y.value_counts().max() / len(y) * 100

# Train with class_weight to handle the class imbalance
rf_model = RandomForestClassifier(
    n_estimators=400,        # more trees = more stable
    max_depth=8,             # shallower = less overfitting
    min_samples_leaf=2,      # every leaf keeps at least 2 matches
    max_features='sqrt',     # standard choice for random forests
    class_weight='balanced',
    random_state=42
)

rf_model.fit(X_train, y_train)

# Predict
y_pred_train_rf = rf_model.predict(X_train)
y_pred_test_rf = rf_model.predict(X_test)

# Evaluate
train_acc_rf = accuracy_score(y_train, y_pred_train_rf)
test_acc_rf = accuracy_score(y_test, y_pred_test_rf)

print(f"üìä RESULTS:")
print(f"  Training Accuracy: {train_acc_rf*100:.2f}%")
print(f"  Test Accuracy: {test_acc_rf*100:.2f}%")
print(f"\nüéØ vs Baseline ({baseline_pct:.2f}%): {test_acc_rf*100 - baseline_pct:+.2f} points")
print(f"üÜö vs Logistic Regression: {test_acc_rf*100 - test_acc*100:+.2f} points")

if test_acc_rf > 0.72:
    print(f"\nüî•üî•üî• WE DID IT!! 72%+ ACHIEVED!! üî•üî•üî•")
elif test_acc_rf > 0.65:
    print(f"\nüî• GREAT! We're getting close to 72%!")
elif test_acc_rf > 0.6:
    print(f"\n‚úÖ Good improvement! Above 60%!")
else:
    print(f"\nüìà Better, but still work to do!")

# Confusion Matrix (rows = actual class, columns = predicted class)
print(f"\nüìã CONFUSION MATRIX:")
cm_rf = confusion_matrix(y_test, y_pred_test_rf)
cm_rf_df = pd.DataFrame(
    cm_rf,
    index=pd.Index(labels, name='Actual'),
    columns=pd.Index(labels, name='Predicted')
)
print(cm_rf_df)

# Classification Report
print(f"\nüìä DETAILED CLASSIFICATION REPORT:")
print(classification_report(y_test, y_pred_test_rf, target_names=labels))

# Feature Importance, most important first
print(f"\nüéØ FEATURE IMPORTANCE:")
feature_importance = pd.DataFrame({
    'feature': feature_columns,
    'importance': rf_model.feature_importances_
}).sort_values('importance', ascending=False)

print(feature_importance.to_string(index=False))

üå≥ Training Model 2: Random Forest
More powerful than Logistic Regression - should handle
the class imbalance better!

üìä RESULTS:
  Training Accuracy: 95.72%
  Test Accuracy: 51.32%

üéØ vs Baseline (43.42%): +7.90 points
üÜö vs Logistic Regression: +5.26 points

üìà Better, but still work to do!

üìã CONFUSION MATRIX:
    A  D   H
A  33  6  12
D  14  6  15
H  18  9  39

üìä DETAILED CLASSIFICATION REPORT:
              precision    recall  f1-score   support

           A       0.51      0.65      0.57        51
           D       0.29      0.17      0.21        35
           H       0.59      0.59      0.59        66

    accuracy                           0.51       152
   macro avg       0.46      0.47      0.46       152
weighted avg       0.49      0.51      0.50       152


üéØ FEATURE IMPORTANCE:
                feature  importance
             value_diff    0.110624
        home_team_value    0.063178
          win_rate_diff    0.053534
        home_points_cum    0.

In [6]:
# Ten most important Random Forest features, as a Series sorted from
# most to least important. pandas is already imported in the first cell,
# so the redundant mid-notebook re-import was dropped.
feature_importances = pd.Series(
    rf_model.feature_importances_,
    index=feature_columns
).sort_values(ascending=False)

print("\nüèÖ Top Feature Importances (Random Forest):")
print(feature_importances.head(10))



üèÖ Top Feature Importances (Random Forest):
value_diff            0.110624
home_team_value       0.063178
win_rate_diff         0.053534
home_points_cum       0.049033
home_goal_diff_cum    0.048136
away_team_value       0.046268
away_points_cum       0.046000
home_win_rate         0.045134
form_diff             0.044725
rest_days_home        0.043615
dtype: float64


In [7]:
# ============================================
# MODEL 3: XGBOOST (BALANCED)
# ============================================
# Fits a regularised XGBoost classifier on the training split and
# compares its test accuracy with the baseline and with an earlier
# XGBoost run. "Balanced" here refers to the capacity/regularisation
# trade-off of the hyperparameters: no class weights are passed to fit().

print("\nüî• Training Model 3: XGBoost (Balanced)")
print("============================================================")

# Majority-class baseline, computed from the data instead of hard-coded
baseline_pct = y.value_counts().max() / len(y) * 100

# Test accuracy (in %) of the earlier XGBoost configuration, as recorded
# to two decimals - kept in one named constant instead of three literals.
ORIGINAL_XGB_TEST_ACC = 53.29

xgb_model = XGBClassifier(
    objective='multi:softprob',
    num_class=len(np.unique(y_train)),
    eval_metric='mlogloss',

    # Sweet spot between capacity and regularisation
    n_estimators=200,        # Keep original
    learning_rate=0.08,      # between 0.05 and 0.1
    max_depth=4,             # slightly deeper (3 -> 4)

    # Moderate regularisation
    min_child_weight=6,      # between 5 and 8
    subsample=0.75,          # between 0.7 and 0.8
    colsample_bytree=0.75,   # between 0.7 and 0.8
    gamma=0.5,               # moderate (not 1.0)
    reg_lambda=2.5,          # between 2 and 3
    reg_alpha=1.2,           # between 1 and 1.5

    tree_method='hist',
    random_state=42,
    n_jobs=-1
)

print("Training with balanced hyperparameters...")
xgb_model.fit(X_train, y_train)

y_train_pred_xgb = xgb_model.predict(X_train)
y_test_pred_xgb  = xgb_model.predict(X_test)

# Accuracies are kept in percent in this cell
train_acc_xgb = accuracy_score(y_train, y_train_pred_xgb) * 100
test_acc_xgb  = accuracy_score(y_test, y_test_pred_xgb) * 100

print("\nüìä RESULTS (XGBoost Balanced):")
print(f"  Training Accuracy: {train_acc_xgb:.2f}%")
print(f"  Test Accuracy:     {test_acc_xgb:.2f}%")
print(f"  Overfit Gap:       {train_acc_xgb - test_acc_xgb:.2f}%")

# The reference value is only known to two decimals, so compare at the
# same precision; otherwise an identical score shows up as "-0.00".
test_acc_xgb_2dp = round(test_acc_xgb, 2)
delta_vs_original = test_acc_xgb_2dp - ORIGINAL_XGB_TEST_ACC

print(f"\nüéØ vs Baseline ({baseline_pct:.2f}%): {test_acc_xgb - baseline_pct:+.2f} points")
print(f"üÜö vs Original XGB ({ORIGINAL_XGB_TEST_ACC:.2f}%): {delta_vs_original:+.2f} points")

if test_acc_xgb_2dp > ORIGINAL_XGB_TEST_ACC:
    print(f"\nüî• IMPROVED! New best: {test_acc_xgb:.2f}%")
elif test_acc_xgb_2dp > 52.5:
    print(f"\n‚úÖ Close to original, similar performance")
else:
    print(f"\n‚ö†Ô∏è  Original was better at {ORIGINAL_XGB_TEST_ACC:.2f}%")

# Confusion Matrix (rows = actual class, columns = predicted class)
print(f"\nüìã CONFUSION MATRIX:")
cm_xgb = confusion_matrix(y_test, y_test_pred_xgb)
cm_xgb_df = pd.DataFrame(
    cm_xgb,
    index=pd.Index(labels, name='Actual'),
    columns=pd.Index(labels, name='Predicted')
)
print(cm_xgb_df)

print(f"\nüìä CLASSIFICATION REPORT:")
print(classification_report(y_test, y_test_pred_xgb, target_names=labels))

# Feature Importance: ten highest-scoring features
print(f"\nüéØ TOP 10 FEATURES:")
feature_imp = pd.DataFrame({
    'feature': feature_columns,
    'importance': xgb_model.feature_importances_
}).sort_values('importance', ascending=False).head(10)
print(feature_imp.to_string(index=False))


üî• Training Model 3: XGBoost (Balanced)
Training with balanced hyperparameters...

üìä RESULTS (XGBoost Balanced):
  Training Accuracy: 89.14%
  Test Accuracy:     53.29%
  Overfit Gap:       35.86%

üéØ vs Baseline (43.42%): +9.87 points
üÜö vs Original XGB (53.29%): -0.00 points

‚úÖ Close to original, similar performance

üìã CONFUSION MATRIX:
    A  D   H
A  32  5  14
D  14  4  17
H  13  8  45

üìä CLASSIFICATION REPORT:
              precision    recall  f1-score   support

           A       0.54      0.63      0.58        51
           D       0.24      0.11      0.15        35
           H       0.59      0.68      0.63        66

    accuracy                           0.53       152
   macro avg       0.46      0.47      0.46       152
weighted avg       0.49      0.53      0.51       152


üéØ TOP 10 FEATURES:
                feature  importance
             value_diff    0.068478
        home_team_value    0.055144
              form_diff    0.054930
              h