In [None]:
# ============================================================
# TITANIC - MACHINE LEARNING
# ============================================================

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# ML libraries
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.metrics import roc_auc_score, roc_curve
from xgboost import XGBClassifier

import warnings
warnings.filterwarnings('ignore')

# ============================================================
# 1. CHARGEMENT DES DONN√âES ENGINEER√âES
# ============================================================

data = pd.read_csv('data/train_engineered_final.csv')
print(f"‚úÖ Dataset charg√© : {data.shape}")

# ============================================================
# 2. S√âLECTION & PR√âPARATION DES FEATURES
# ============================================================

# √Ä d√©finir ensemble...

# ============================================================
# 3. ENCODAGE DES VARIABLES CAT√âGORIELLES
# ============================================================

# √Ä faire...

# ============================================================
# 4. TRAIN/TEST SPLIT
# ============================================================

# √Ä faire...

# ============================================================
# 5. ENTRA√éNEMENT DES MOD√àLES
# ============================================================

# √Ä tester plusieurs mod√®les...

# ============================================================
# 6. √âVALUATION & COMPARAISON
# ============================================================

# M√©triques de performance...

# ============================================================
# 7. OPTIMISATION DU MEILLEUR MOD√àLE
# ============================================================

# Hyperparameter tuning...

# ============================================================
# 8. SAUVEGARDE DU MOD√àLE FINAL
# ============================================================

# Pickle ou joblib...

‚úÖ Dataset charg√© : (891, 21)


In [2]:
data.info()
print(data.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 21 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   Survived         891 non-null    int64  
 1   Pclass           891 non-null    int64  
 2   Name             891 non-null    object 
 3   Sex              891 non-null    object 
 4   Age              891 non-null    float64
 5   SibSp            891 non-null    int64  
 6   Parch            891 non-null    int64  
 7   Ticket           891 non-null    object 
 8   Fare             891 non-null    float64
 9   Cabin            891 non-null    object 
 10  Embarked         891 non-null    object 
 11  Title            891 non-null    object 
 12  Age_Was_Missing  891 non-null    int64  
 13  Title_Is_Rare    891 non-null    int64  
 14  Title_Simple     891 non-null    object 
 15  FamilySize       891 non-null    int64  
 16  HasFamily        891 non-null    int64  
 17  Large_Family    

In [3]:
features_to_keep = [
    'Survived',           # TARGET
    'Pclass',             # Classe sociale
    'Sex',                # Sexe (impact majeur)
    'Age',                # √Çge num√©rique
    'Fare',               # Prix num√©rique
    'Title_Simple',       # Titre simplifi√©
    'FamilySize',         # Taille famille
    'HasFamily',          # Flag famille
    'Large_Family',       # Flag grande famille
    'Age_Was_Missing'     # Flag age imput√©
]

data2 = data[features_to_keep].copy()

print(f"\nüìä Statistiques rapides :")
print(f"   ‚Ä¢ Variables num√©riques : {data2.select_dtypes(include=[np.number]).shape[1]}")
print(f"   ‚Ä¢ Variables cat√©gorielles : {data2.select_dtypes(include=['object']).shape[1]}")

print("\n" + "="*60)


üìä Statistiques rapides :
   ‚Ä¢ Variables num√©riques : 8
   ‚Ä¢ Variables cat√©gorielles : 2



ENCODAGE
===

In [4]:
# ============================================================
# ENCODAGE DES VARIABLES CAT√âGORIELLES
# ============================================================

# S√©parer features et target
X = data2.drop('Survived', axis=1).copy()
y = data2['Survived'].copy()

print(f"\n‚úÖ X (features) : {X.shape}")
print(f"‚úÖ y (target) : {y.shape}")


‚úÖ X (features) : (891, 9)
‚úÖ y (target) : (891,)


In [7]:
#===========================================================
# ONE-HOT ENCODING DES VARIABLES CAT√âGORIELLES
#===========================================================

X = pd.get_dummies(X, columns=['Sex', 'Title_Simple'], drop_first=True)
print(f"\n‚úÖ Dataset avec One-Hot : {X.shape}")
print(f"   Nouvelles colonnes : {list(X.columns)}")

KeyError: "None of [Index(['Sex', 'Title_Simple'], dtype='object')] are in the [columns]"

In [8]:
# ============================================================
# CR√âATION DE FEATURES D'INTERACTION
# ============================================================

print("="*60)
print("üîó CR√âATION DE FEATURES D'INTERACTION")
print("="*60)

# Version SANS interactions (pour Random Forest, XGBoost)
X_base = X.copy()

# Version AVEC interactions (pour Logistic Regression, SVM)
X_interact = X.copy()

# ============================================================
# Interactions Cl√©s Identifi√©es dans l'EDA
# ============================================================

print("\nüìä Interactions √† cr√©er (bas√©es sur l'EDA) :")

# 1. SEX √ó PCLASS (Impact majeur observ√©)
# Les femmes en 1√®re classe ont ~97% survie vs ~50% en 3√®me
X_interact['Sex_male_x_Pclass'] = X_interact['Sex_male'] * X_interact['Pclass']

print("   ‚úÖ 1. Sex √ó Pclass")

# 2. AGE √ó SEX (Enfants vs Adultes, diff√©rent selon sexe)
X_interact['Age_x_Sex_male'] = X_interact['Age'] * X_interact['Sex_male']

print("   ‚úÖ 2. Age √ó Sex")

# 3. FAMILYSIZE √ó PCLASS (Grandes familles en 3√®me classe = danger)
X_interact['FamilySize_x_Pclass'] = X_interact['FamilySize'] * X_interact['Pclass']

print("   ‚úÖ 3. FamilySize √ó Pclass")

# 4. FARE √ó PCLASS (Nuances au sein d'une classe)
X_interact['Fare_x_Pclass'] = X_interact['Fare'] * X_interact['Pclass']

print("   ‚úÖ 4. Fare √ó Pclass")

# 5. HASFAMILY √ó PCLASS (Famille protectrice sauf en 3√®me classe)
X_interact['HasFamily_x_Pclass'] = X_interact['HasFamily'] * X_interact['Pclass']

print("   ‚úÖ 5. HasFamily √ó Pclass")

# 6. AGE √ó PCLASS (Enfants prioritaires, mais moins en 3√®me)
X_interact['Age_x_Pclass'] = X_interact['Age'] * X_interact['Pclass']

print("   ‚úÖ 6. Age √ó Pclass")

# ============================================================
# V√âRIFICATION
# ============================================================

print(f"\nüìä R√©sultat :")
print(f"   X_base (sans interactions)  : {X_base.shape[1]} features")
print(f"   X_interact (avec interactions) : {X_interact.shape[1]} features")
print(f"   ‚Üí {X_interact.shape[1] - X_base.shape[1]} interactions ajout√©es")

print(f"\nüìã Nouvelles colonnes d'interaction :")
interaction_cols = [col for col in X_interact.columns if '_x_' in col]
for i, col in enumerate(interaction_cols, 1):
    print(f"   {i}. {col}")

print("\n" + "="*60)

üîó CR√âATION DE FEATURES D'INTERACTION

üìä Interactions √† cr√©er (bas√©es sur l'EDA) :
   ‚úÖ 1. Sex √ó Pclass
   ‚úÖ 2. Age √ó Sex
   ‚úÖ 3. FamilySize √ó Pclass
   ‚úÖ 4. Fare √ó Pclass
   ‚úÖ 5. HasFamily √ó Pclass
   ‚úÖ 6. Age √ó Pclass

üìä R√©sultat :
   X_base (sans interactions)  : 13 features
   X_interact (avec interactions) : 19 features
   ‚Üí 6 interactions ajout√©es

üìã Nouvelles colonnes d'interaction :
   1. Sex_male_x_Pclass
   2. Age_x_Sex_male
   3. FamilySize_x_Pclass
   4. Fare_x_Pclass
   5. HasFamily_x_Pclass
   6. Age_x_Pclass



In [10]:
# ============================================================
# TRAIN/TEST SPLIT - VERSION BASE
# ============================================================

X_train_base, X_test_base, y_train, y_test = train_test_split(
    X_base, y,
    test_size=0.2,
    random_state=42,
    stratify=y
)

print("‚úÖ Split BASE cr√©√©")

# ============================================================
# TRAIN/TEST SPLIT - VERSION AVEC INTERACTIONS
# ============================================================

X_train_interact, X_test_interact, _, _ = train_test_split(
    X_interact, y,
    test_size=0.2,
    random_state=42,  # M√äME random_state !
    stratify=y
)

print("‚úÖ Split INTERACTIONS cr√©√©")

# ============================================================
# STANDARDISATION - LES 2 VERSIONS
# ============================================================

from sklearn.preprocessing import StandardScaler

# Colonnes num√©riques √† standardiser
numeric_cols = ['Pclass', 'Age', 'Fare', 'FamilySize']

# BASE
scaler_base = StandardScaler()
X_train_base_scaled = X_train_base.copy()
X_test_base_scaled = X_test_base.copy()
X_train_base_scaled[numeric_cols] = scaler_base.fit_transform(X_train_base[numeric_cols])
X_test_base_scaled[numeric_cols] = scaler_base.transform(X_test_base[numeric_cols])

# INTERACTIONS (scaler sur TOUTES les colonnes num√©riques incluant interactions)
numeric_cols_interact = numeric_cols + interaction_cols
scaler_interact = StandardScaler()
X_train_interact_scaled = X_train_interact.copy()
X_test_interact_scaled = X_test_interact.copy()
X_train_interact_scaled[numeric_cols_interact] = scaler_interact.fit_transform(X_train_interact[numeric_cols_interact])
X_test_interact_scaled[numeric_cols_interact] = scaler_interact.transform(X_test_interact[numeric_cols_interact])

print("‚úÖ Standardisation appliqu√©e aux 2 versions")

print("\n" + "="*60)
print("üì¶ DATASETS PR√äTS POUR LE ML")
print("="*60)
print("\nüéØ Pour mod√®les lin√©aires (Logistic, SVM) :")
print(f"   ‚Üí X_train_interact_scaled, X_test_interact_scaled")
print("\nüå≥ Pour mod√®les d'arbres (Random Forest, XGBoost) :")
print(f"   ‚Üí X_train_base, X_test_base")
print("="*60)

‚úÖ Split BASE cr√©√©
‚úÖ Split INTERACTIONS cr√©√©
‚úÖ Standardisation appliqu√©e aux 2 versions

üì¶ DATASETS PR√äTS POUR LE ML

üéØ Pour mod√®les lin√©aires (Logistic, SVM) :
   ‚Üí X_train_interact_scaled, X_test_interact_scaled

üå≥ Pour mod√®les d'arbres (Random Forest, XGBoost) :
   ‚Üí X_train_base, X_test_base


In [12]:
# ============================================================
# CR√âATION DE FEATURES D'INTERACTION
# ============================================================

print("="*60)
print("üîó CR√âATION DE FEATURES D'INTERACTION")
print("="*60)

# Version SANS interactions (pour Random Forest, XGBoost)
X_base = X.copy()

# Version AVEC interactions (pour Logistic Regression, SVM)
X_interact = X.copy()

# ============================================================
# Interactions Cl√©s (bas√©es sur l'EDA)
# ============================================================

print("\nüìä Cr√©ation des interactions :")

# 1. SEX √ó PCLASS - Impact majeur observ√©
if 'Sex_male' in X_interact.columns:
    X_interact['Sex_male_x_Pclass'] = X_interact['Sex_male'] * X_interact['Pclass']
    print("   ‚úÖ 1. Sex_male √ó Pclass")

# 2. AGE √ó SEX - Enfants vs Adultes diff√©rent selon sexe
if 'Sex_male' in X_interact.columns:
    X_interact['Age_x_Sex_male'] = X_interact['Age'] * X_interact['Sex_male']
    print("   ‚úÖ 2. Age √ó Sex_male")

# 3. FAMILYSIZE √ó PCLASS - Grandes familles en 3√®me = danger
X_interact['FamilySize_x_Pclass'] = X_interact['FamilySize'] * X_interact['Pclass']
print("   ‚úÖ 3. FamilySize √ó Pclass")

# 4. FARE √ó PCLASS - Nuances au sein d'une classe
X_interact['Fare_x_Pclass'] = X_interact['Fare'] * X_interact['Pclass']
print("   ‚úÖ 4. Fare √ó Pclass")

# 5. HASFAMILY √ó PCLASS - Famille protectrice sauf en 3√®me
X_interact['HasFamily_x_Pclass'] = X_interact['HasFamily'] * X_interact['Pclass']
print("   ‚úÖ 5. HasFamily √ó Pclass")

# 6. AGE √ó PCLASS - Enfants prioritaires, moins en 3√®me
X_interact['Age_x_Pclass'] = X_interact['Age'] * X_interact['Pclass']
print("   ‚úÖ 6. Age √ó Pclass")

# Colonnes d'interaction cr√©√©es
interaction_cols = [col for col in X_interact.columns if '_x_' in col]

print(f"\nüìä R√©sultat :")
print(f"   X_base : {X_base.shape[1]} features")
print(f"   X_interact : {X_interact.shape[1]} features (+{len(interaction_cols)} interactions)")

print("\n" + "="*60)

üîó CR√âATION DE FEATURES D'INTERACTION

üìä Cr√©ation des interactions :
   ‚úÖ 1. Sex_male √ó Pclass
   ‚úÖ 2. Age √ó Sex_male
   ‚úÖ 3. FamilySize √ó Pclass
   ‚úÖ 4. Fare √ó Pclass
   ‚úÖ 5. HasFamily √ó Pclass
   ‚úÖ 6. Age √ó Pclass

üìä R√©sultat :
   X_base : 13 features
   X_interact : 19 features (+6 interactions)



In [14]:
# ============================================================
# TRAIN/TEST SPLIT
# ============================================================

from sklearn.model_selection import train_test_split

print("="*60)
print("‚úÇÔ∏è S√âPARATION TRAIN/TEST")
print("="*60)

# VERSION BASE (sans interactions)
X_train_base, X_test_base, y_train, y_test = train_test_split(
    X_base, y,
    test_size=0.2,
    random_state=42,
    stratify=y
)

print(f"\nüì¶ VERSION BASE :")
print(f"   Train : {X_train_base.shape}")
print(f"   Test  : {X_test_base.shape}")

# VERSION AVEC INTERACTIONS (M√äME random_state !)
X_train_interact, X_test_interact, _, _ = train_test_split(
    X_interact, y,
    test_size=0.2,
    random_state=42,  # Important : m√™me split !
    stratify=y
)

print(f"\nüì¶ VERSION INTERACTIONS :")
print(f"   Train : {X_train_interact.shape}")
print(f"   Test  : {X_test_interact.shape}")

print(f"\n‚öñÔ∏è Distribution survie :")
print(f"   Train : {y_train.mean():.1%}")
print(f"   Test  : {y_test.mean():.1%}")

print("\n" + "="*60)

‚úÇÔ∏è S√âPARATION TRAIN/TEST

üì¶ VERSION BASE :
   Train : (712, 13)
   Test  : (179, 13)

üì¶ VERSION INTERACTIONS :
   Train : (712, 19)
   Test  : (179, 19)

‚öñÔ∏è Distribution survie :
   Train : 38.3%
   Test  : 38.5%



In [17]:
# ============================================================
# STANDARDISATION
# ============================================================

from sklearn.preprocessing import StandardScaler

print("="*60)
print("üìè STANDARDISATION DES FEATURES")
print("="*60)

# Colonnes num√©riques de base
numeric_cols = ['Pclass', 'Age', 'Fare', 'FamilySize']

# VERSION BASE
scaler_base = StandardScaler()
X_train_base_scaled = X_train_base.copy()
X_test_base_scaled = X_test_base.copy()

X_train_base_scaled[numeric_cols] = scaler_base.fit_transform(X_train_base[numeric_cols])
X_test_base_scaled[numeric_cols] = scaler_base.transform(X_test_base[numeric_cols])

print(f"‚úÖ Base standardis√©e")

# VERSION INTERACTIONS (standardiser aussi les interactions)
numeric_cols_interact = numeric_cols + interaction_cols

scaler_interact = StandardScaler()
X_train_interact_scaled = X_train_interact.copy()
X_test_interact_scaled = X_test_interact.copy()

X_train_interact_scaled[numeric_cols_interact] = scaler_interact.fit_transform(
    X_train_interact[numeric_cols_interact]
)
X_test_interact_scaled[numeric_cols_interact] = scaler_interact.transform(
    X_test_interact[numeric_cols_interact]
)

print(f"‚úÖ Interactions standardis√©es")

print("\n" + "="*60)
print("üì¶ R√âCAPITULATIF DES DATASETS")
print("="*60)
print("\nüå≥ Pour Random Forest, XGBoost :")
print(f"   ‚Üí X_train_base ({X_train_base.shape[1]} features)")
print(f"   ‚Üí X_test_base ({X_test_base.shape[1]} features)")

print("\nüìà Pour Logistic Regression, SVM :")
print(f"   ‚Üí X_train_interact_scaled ({X_train_interact_scaled.shape[1]} features)")
print(f"   ‚Üí X_test_interact_scaled ({X_test_interact_scaled.shape[1]} features)")

print("\nüéØ Target :")
print(f"   ‚Üí y_train ({len(y_train)} samples)")
print(f"   ‚Üí y_test ({len(y_test)} samples)")

print("="*60)
print("‚úÖ PR√äT POUR L'ENTRA√éNEMENT !")
print("="*60)

üìè STANDARDISATION DES FEATURES
‚úÖ Base standardis√©e
‚úÖ Interactions standardis√©es

üì¶ R√âCAPITULATIF DES DATASETS

üå≥ Pour Random Forest, XGBoost :
   ‚Üí X_train_base (13 features)
   ‚Üí X_test_base (13 features)

üìà Pour Logistic Regression, SVM :
   ‚Üí X_train_interact_scaled (19 features)
   ‚Üí X_test_interact_scaled (19 features)

üéØ Target :
   ‚Üí y_train (712 samples)
   ‚Üí y_test (179 samples)
‚úÖ PR√äT POUR L'ENTRA√éNEMENT !


‚úÖ X_train_base, X_test_base ‚Üí Pour Random Forest, XGBoost (pas d'interactions n√©cessaires)
‚úÖ X_train_base_scaled, X_test_base_scaled ‚Üí Pour mod√®les lin√©aires version simple
‚úÖ X_train_interact_scaled, X_test_interact_scaled ‚Üí Pour Logistic/SVM avec interactions
‚úÖ y_train, y_test ‚Üí Target identique pour tous

In [19]:
# ============================================================
# BASELINE : PR√âDIRE TOUJOURS LA CLASSE MAJORITAIRE
# ============================================================

from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

print("="*70)
print("üìä BASELINE - PR√âDIRE TOUJOURS LA CLASSE MAJORITAIRE")
print("="*70)

# Classe majoritaire dans le train
baseline_prediction = y_train.mode()[0]
print(f"\nClasse majoritaire : {baseline_prediction} ({'D√©c√©d√©' if baseline_prediction == 0 else 'Surv√©cu'})")

# Pr√©dire toujours cette classe
y_pred_baseline = [baseline_prediction] * len(y_test)

# Accuracy baseline
baseline_accuracy = accuracy_score(y_test, y_pred_baseline)

print(f"\nüéØ BASELINE ACCURACY : {baseline_accuracy:.4f} ({baseline_accuracy*100:.2f}%)")
print(f"\nüí° Objectif : Nos mod√®les doivent faire MIEUX que {baseline_accuracy*100:.2f}% !")

print("\n" + "="*70)

üìä BASELINE - PR√âDIRE TOUJOURS LA CLASSE MAJORITAIRE

Classe majoritaire : 0 (D√©c√©d√©)

üéØ BASELINE ACCURACY : 0.6145 (61.45%)

üí° Objectif : Nos mod√®les doivent faire MIEUX que 61.45% !



In [20]:
# ============================================================
# MOD√àLE 1 : LOGISTIC REGRESSION (avec interactions)
# ============================================================

from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, roc_auc_score

print("="*70)
print("üìà MOD√àLE 1 : LOGISTIC REGRESSION (avec interactions)")
print("="*70)

# Entra√Æner
lr_model = LogisticRegression(random_state=42, max_iter=1000)
lr_model.fit(X_train_interact_scaled, y_train)

# Pr√©dictions
y_pred_lr_train = lr_model.predict(X_train_interact_scaled)
y_pred_lr_test = lr_model.predict(X_test_interact_scaled)

# Probabilit√©s (pour ROC-AUC)
y_pred_lr_proba = lr_model.predict_proba(X_test_interact_scaled)[:, 1]

# M√©triques
lr_train_acc = accuracy_score(y_train, y_pred_lr_train)
lr_test_acc = accuracy_score(y_test, y_pred_lr_test)
lr_roc_auc = roc_auc_score(y_test, y_pred_lr_proba)

print(f"\nüìä R√âSULTATS :")
print(f"   Accuracy Train : {lr_train_acc:.4f} ({lr_train_acc*100:.2f}%)")
print(f"   Accuracy Test  : {lr_test_acc:.4f} ({lr_test_acc*100:.2f}%)")
print(f"   ROC-AUC Score  : {lr_roc_auc:.4f}")
print(f"   Overfitting    : {(lr_train_acc - lr_test_acc)*100:.2f}%")

print(f"\nüìã MATRICE DE CONFUSION :")
cm_lr = confusion_matrix(y_test, y_pred_lr_test)
print(cm_lr)
print(f"\n   TN={cm_lr[0,0]}, FP={cm_lr[0,1]}")
print(f"   FN={cm_lr[1,0]}, TP={cm_lr[1,1]}")

print(f"\nüìù CLASSIFICATION REPORT :")
print(classification_report(y_test, y_pred_lr_test, target_names=['D√©c√©d√©', 'Surv√©cu']))

print("="*70)

üìà MOD√àLE 1 : LOGISTIC REGRESSION (avec interactions)

üìä R√âSULTATS :
   Accuracy Train : 0.8413 (84.13%)
   Accuracy Test  : 0.8436 (84.36%)
   ROC-AUC Score  : 0.8755
   Overfitting    : -0.23%

üìã MATRICE DE CONFUSION :
[[99 11]
 [17 52]]

   TN=99, FP=11
   FN=17, TP=52

üìù CLASSIFICATION REPORT :
              precision    recall  f1-score   support

      D√©c√©d√©       0.85      0.90      0.88       110
     Surv√©cu       0.83      0.75      0.79        69

    accuracy                           0.84       179
   macro avg       0.84      0.83      0.83       179
weighted avg       0.84      0.84      0.84       179



In [21]:
# ============================================================
# MOD√àLE 2 : RANDOM FOREST (sans interactions)
# ============================================================

from sklearn.ensemble import RandomForestClassifier

print("="*70)
print("üå≥ MOD√àLE 2 : RANDOM FOREST (sans interactions)")
print("="*70)

# Entra√Æner
rf_model = RandomForestClassifier(
    n_estimators=100,
    random_state=42,
    max_depth=10,
    min_samples_split=5
)
rf_model.fit(X_train_base, y_train)

# Pr√©dictions
y_pred_rf_train = rf_model.predict(X_train_base)
y_pred_rf_test = rf_model.predict(X_test_base)
y_pred_rf_proba = rf_model.predict_proba(X_test_base)[:, 1]

# M√©triques
rf_train_acc = accuracy_score(y_train, y_pred_rf_train)
rf_test_acc = accuracy_score(y_test, y_pred_rf_test)
rf_roc_auc = roc_auc_score(y_test, y_pred_rf_proba)

print(f"\nüìä R√âSULTATS :")
print(f"   Accuracy Train : {rf_train_acc:.4f} ({rf_train_acc*100:.2f}%)")
print(f"   Accuracy Test  : {rf_test_acc:.4f} ({rf_test_acc*100:.2f}%)")
print(f"   ROC-AUC Score  : {rf_roc_auc:.4f}")
print(f"   Overfitting    : {(rf_train_acc - rf_test_acc)*100:.2f}%")

print(f"\nüìã MATRICE DE CONFUSION :")
cm_rf = confusion_matrix(y_test, y_pred_rf_test)
print(cm_rf)
print(f"\n   TN={cm_rf[0,0]}, FP={cm_rf[0,1]}")
print(f"   FN={cm_rf[1,0]}, TP={cm_rf[1,1]}")

print(f"\nüìù CLASSIFICATION REPORT :")
print(classification_report(y_test, y_pred_rf_test, target_names=['D√©c√©d√©', 'Surv√©cu']))

# Feature Importance
print(f"\n‚≠ê TOP 10 FEATURES IMPORTANTES :")
feature_imp = pd.DataFrame({
    'Feature': X_train_base.columns,
    'Importance': rf_model.feature_importances_
}).sort_values('Importance', ascending=False)

for idx, row in feature_imp.head(10).iterrows():
    print(f"   {row['Feature']:25s} : {row['Importance']:.4f}")

print("="*70)

üå≥ MOD√àLE 2 : RANDOM FOREST (sans interactions)

üìä R√âSULTATS :
   Accuracy Train : 0.9199 (91.99%)
   Accuracy Test  : 0.8045 (80.45%)
   ROC-AUC Score  : 0.8642
   Overfitting    : 11.55%

üìã MATRICE DE CONFUSION :
[[95 15]
 [20 49]]

   TN=95, FP=15
   FN=20, TP=49

üìù CLASSIFICATION REPORT :
              precision    recall  f1-score   support

      D√©c√©d√©       0.83      0.86      0.84       110
     Surv√©cu       0.77      0.71      0.74        69

    accuracy                           0.80       179
   macro avg       0.80      0.79      0.79       179
weighted avg       0.80      0.80      0.80       179


‚≠ê TOP 10 FEATURES IMPORTANTES :
   Fare                      : 0.2295
   Title_Simple_Mr           : 0.1661
   Sex_male                  : 0.1642
   Age                       : 0.1562
   Pclass                    : 0.0905
   FamilySize                : 0.0606
   Title_Simple_Miss         : 0.0402
   Title_Simple_Mrs          : 0.0347
   Large_Family        

In [22]:
# ============================================================
# MOD√àLE 3 : XGBOOST (sans interactions)
# ============================================================

from xgboost import XGBClassifier

print("="*70)
print("üöÄ MOD√àLE 3 : XGBOOST (sans interactions)")
print("="*70)

# Entra√Æner
xgb_model = XGBClassifier(
    n_estimators=100,
    learning_rate=0.1,
    max_depth=5,
    random_state=42,
    eval_metric='logloss'
)
xgb_model.fit(X_train_base, y_train)

# Pr√©dictions
y_pred_xgb_train = xgb_model.predict(X_train_base)
y_pred_xgb_test = xgb_model.predict(X_test_base)
y_pred_xgb_proba = xgb_model.predict_proba(X_test_base)[:, 1]

# M√©triques
xgb_train_acc = accuracy_score(y_train, y_pred_xgb_train)
xgb_test_acc = accuracy_score(y_test, y_pred_xgb_test)
xgb_roc_auc = roc_auc_score(y_test, y_pred_xgb_proba)

print(f"\nüìä R√âSULTATS :")
print(f"   Accuracy Train : {xgb_train_acc:.4f} ({xgb_train_acc*100:.2f}%)")
print(f"   Accuracy Test  : {xgb_test_acc:.4f} ({xgb_test_acc*100:.2f}%)")
print(f"   ROC-AUC Score  : {xgb_roc_auc:.4f}")
print(f"   Overfitting    : {(xgb_train_acc - xgb_test_acc)*100:.2f}%")

print(f"\nüìã MATRICE DE CONFUSION :")
cm_xgb = confusion_matrix(y_test, y_pred_xgb_test)
print(cm_xgb)
print(f"\n   TN={cm_xgb[0,0]}, FP={cm_xgb[0,1]}")
print(f"   FN={cm_xgb[1,0]}, TP={cm_xgb[1,1]}")

print(f"\nüìù CLASSIFICATION REPORT :")
print(classification_report(y_test, y_pred_xgb_test, target_names=['D√©c√©d√©', 'Surv√©cu']))

# Feature Importance
print(f"\n‚≠ê TOP 10 FEATURES IMPORTANTES :")
feature_imp_xgb = pd.DataFrame({
    'Feature': X_train_base.columns,
    'Importance': xgb_model.feature_importances_
}).sort_values('Importance', ascending=False)

for idx, row in feature_imp_xgb.head(10).iterrows():
    print(f"   {row['Feature']:25s} : {row['Importance']:.4f}")

print("="*70)

ModuleNotFoundError: No module named 'xgboost'

In [None]:
# ============================================================
# COMPARAISON DES MOD√àLES
# ============================================================

print("="*70)
print("üìä COMPARAISON DES PERFORMANCES")
print("="*70)

results = pd.DataFrame({
    'Mod√®le': ['Baseline', 'Logistic Regression', 'Random Forest', 'XGBoost'],
    'Accuracy Test': [
        baseline_accuracy,
        lr_test_acc,
        rf_test_acc,
        xgb_test_acc
    ],
    'ROC-AUC': [
        0.5,
        lr_roc_auc,
        rf_roc_auc,
        xgb_roc_auc
    ],
    'Overfitting': [
        0,
        (lr_train_acc - lr_test_acc),
        (rf_train_acc - rf_test_acc),
        (xgb_train_acc - xgb_test_acc)
    ]
})

results = results.sort_values('Accuracy Test', ascending=False)
display(results)

print(f"\nüèÜ MEILLEUR MOD√àLE : {results.iloc[0]['Mod√®le']}")
print(f"   Accuracy : {results.iloc[0]['Accuracy Test']:.4f}")
print(f"   ROC-AUC  : {results.iloc[0]['ROC-AUC']:.4f}")

print("\n" + "="*70)