In [1]:
import pandas as pd

df = pd.read_csv("/kaggle/input/insurance-recommendation-challenge/train_.csv")
df.head()

Unnamed: 0.1,Unnamed: 0,sex,marital_status,branch_code,occupation_code,occupation_category_code,P5DA,RIBP,8NN1,7POT,...,K6QO,QBOL,JWFN,JZ9D,J9JW,GHYX,ECY3,join_year,age,total_products
0,0,F,M,1X1H,2A7I,T4MS,0,0,0,0,...,1,0,0,0,0,0,0,2019.0,33,2
1,1,F,M,UAOD,2A7I,T4MS,0,0,0,0,...,1,0,0,0,0,0,0,2019.0,39,2
2,2,M,U,748L,QZYX,90QI,0,0,0,0,...,0,0,0,0,0,0,1,2013.0,29,3
3,3,M,M,1X1H,BP09,56SI,0,0,0,0,...,1,0,0,0,0,0,0,2019.0,30,2
4,4,M,M,748L,NO3L,T4MS,0,0,0,0,...,0,0,0,1,1,0,0,2019.0,30,2


In [2]:
import pandas as pd
from sklearn.preprocessing import StandardScaler, OneHotEncoder


# Liste des colonnes produits
PRODUCT_COLUMNS = [
    'P5DA', 'RIBP', '8NN1', '7POT', '66FJ', 'GYSR', 'SOP4', 'RVSZ',
    'PYUQ', 'LJR9', 'N2MW', 'AHXO', 'BSTQ', 'FM3X', 'K6QO', 'QBOL',
    'JWFN', 'JZ9D', 'J9JW', 'GHYX', 'ECY3'
]

# Colonnes catégorielles à encoder
CATEGORICAL_COLUMNS = ['sex', 'marital_status', 'branch_code', 'occupation_code', 'occupation_category_code']

# Colonnes numériques à normaliser
NUMERICAL_COLUMNS = ['join_year', 'age', 'total_products']


def preprocess(df):
    """
    Preprocess the input DataFrame by standardizing date formats, extracting year information,
    calculating age, and dropping unnecessary columns.
    :param df: Input DataFrame with columns 'ID', 'join_date', 'birth_year', 'sex', etc.
    :return: Preprocessed DataFrame
    """

    # Convert 'join_date' to datetime, standardize
    df['join_date'] = pd.to_datetime(df['join_date'], errors='coerce')

    # Standardize 'sex' to uppercase
    df['sex'] = df['sex'].str.upper()

    # Extract 'join_year' from 'join_date' and calculate 'age'
    df['join_year'] = df['join_date'].dt.year
    current_year = 2020
    df['age'] = current_year - df['birth_year']

    # Drop unnecessary columns
    cols = list(df.columns)
    cols = [col for col in cols if col not in ['ID', 'join_date', 'birth_year']]
    df = df[cols]

    return df

def prepare_for_training(df, fit_encoders=True, encoder=None, scaler=None):
    """
    Prépare les données pour l'entraînement d'un modèle de classification multi-label en utilisant One-Hot Encoding
    et la normalisation des variables numériques.

    Etapes:
    1. Supprimer la colonne 'Unnamed: 0' si elle est présente.

    2. Gérer les valeurs manquantes:
        - Pour les colonnes catégorielles, remplacer les NaN par 'UNKNOWN'.
        - Pour les colonnes numériques, remplacer les NaN par la médiane de la colonne.

    3. Séparer les colonnes catégorielles et numériques.

    4. Appliquer le One-Hot Encoding aux colonnes catégorielles.

    5. Normaliser les colonnes numériques avec StandardScaler.

    :param df: DataFrame préprocessé
    :param fit_encoders: Si True, ajuste les encoders sur les données
    :param encoder: OneHotEncoder déjà ajusté (pour le mode test)
    :param scaler: Scaler déjà ajusté (pour le mode test)
    :return: X (features), y (labels), encoder, scaler
    """

    df = df.copy()

    # 1. Supprimer 'Unnamed: 0' si présent
    if 'Unnamed: 0' in df.columns:
        df = df.drop(columns=['Unnamed: 0'])

    # 2. Gestion des valeurs manquantes
    for col in CATEGORICAL_COLUMNS:
        if col in df.columns:
            df[col] = df[col].fillna('UNKNOWN').astype(str)

    for col in NUMERICAL_COLUMNS:
        if col in df.columns:
            df[col] = df[col].fillna(df[col].median())

    # 3. Séparer colonnes catégorielles et numériques
    cat_cols_present = [col for col in CATEGORICAL_COLUMNS if col in df.columns]
    num_cols_present = [col for col in NUMERICAL_COLUMNS if col in df.columns]

    # 4. One-Hot Encoding des variables catégorielles
    if fit_encoders:
        encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
        cat_encoded = encoder.fit_transform(df[cat_cols_present])
    else:
        cat_encoded = encoder.transform(df[cat_cols_present])

    cat_encoded_df = pd.DataFrame(
        cat_encoded,
        columns=encoder.get_feature_names_out(cat_cols_present),
        index=df.index
    )

    # 5. Normalisation des variables numériques
    if fit_encoders:
        scaler = StandardScaler()
        num_scaled = scaler.fit_transform(df[num_cols_present])
    else:
        num_scaled = scaler.transform(df[num_cols_present])

    num_scaled_df = pd.DataFrame(
        num_scaled,
        columns=num_cols_present,
        index=df.index
    )

    # 6. Combiner features
    X = pd.concat([num_scaled_df, cat_encoded_df], axis=1)

    # 7. Extraire labels
    label_columns = [col for col in PRODUCT_COLUMNS if col in df.columns]
    y = df[label_columns] if label_columns else None

    return X, y, encoder, scaler

In [3]:

from sklearn.model_selection import train_test_split


def split_data(df, test_size=0.2, random_state=42):
    """
    Sépare le DataFrame en ensembles d'entraînement et de test.

    :param df: DataFrame complet
    :param test_size: Proportion des données pour le test (défaut: 0.2)
    :param random_state: Seed pour la reproductibilité (défaut: 42)
    :return: df_train, df_test
    """
    df_train, df_test = train_test_split(
        df,
        test_size=test_size,
        random_state=random_state,
        shuffle=True
    )

    return df_train.reset_index(drop=True), df_test.reset_index(drop=True)


df_train, df_test = split_data(df)

In [4]:
# Préparer pour l'entraînement
X_train, y_train, encoder, scaler = prepare_for_training(df_train, fit_encoders=True)

# Pour les données de test (utiliser les mêmes encoders)
X_test, y_test, _, _ = prepare_for_training(df_test, fit_encoders=False, encoder=encoder, scaler=scaler)

In [5]:
X_test.head()

Unnamed: 0,join_year,age,total_products,sex_F,sex_M,marital_status_D,marital_status_M,marital_status_P,marital_status_R,marital_status_S,...,occupation_code_ZCQR,occupation_code_ZHC2,occupation_code_ZKQ3,occupation_code_ZWPL,occupation_category_code_56SI,occupation_category_code_90QI,occupation_category_code_AHH5,occupation_category_code_JD7X,occupation_category_code_L44T,occupation_category_code_T4MS
0,0.668407,0.379659,-0.464141,0.0,1.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
1,0.145822,1.133025,6.143558,0.0,1.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
2,0.145822,-0.373706,1.187784,0.0,1.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
3,0.145822,-1.342319,-0.464141,0.0,1.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
4,0.145822,-1.449943,-0.464141,0.0,1.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


In [6]:
import numpy as np
import pickle
from pathlib import Path

# Sklearn
from sklearn.ensemble import RandomForestClassifier
from sklearn.multioutput import MultiOutputClassifier
from sklearn.metrics import (
    f1_score, precision_score, recall_score,
    hamming_loss, accuracy_score
)

# XGBoost & CatBoost
from xgboost import XGBClassifier
from catboost import CatBoostClassifier

# TensorFlow
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, BatchNormalization, Input
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau
from tensorflow.keras.optimizers import Adam

PRODUCT_COLUMNS = [
    'P5DA', 'RIBP', '8NN1', '7POT', '66FJ', 'GYSR', 'SOP4', 'RVSZ',
    'PYUQ', 'LJR9', 'N2MW', 'AHXO', 'BSTQ', 'FM3X', 'K6QO', 'QBOL',
    'JWFN', 'JZ9D', 'J9JW', 'GHYX', 'ECY3'
]


# ============================================================================
# CONFIGURATION
# ============================================================================

RANDOM_STATE = 42
MODELS_DIR = Path("/kaggle/working/models")
MODELS_DIR.mkdir(exist_ok=True)


# ============================================================================
# FONCTIONS D'ÉVALUATION
# ============================================================================

def evaluate_multilabel(y_true, y_pred, model_name="Model"):
    """
    Évalue les performances d'un modèle de classification multi-label.
    """
    results = {
        "model": model_name,
        "f1_micro": f1_score(y_true, y_pred, average='micro'),
        "f1_macro": f1_score(y_true, y_pred, average='macro'),
        "f1_weighted": f1_score(y_true, y_pred, average='weighted'),
        "precision_micro": precision_score(y_true, y_pred, average='micro'),
        "recall_micro": recall_score(y_true, y_pred, average='micro'),
        "hamming_loss": hamming_loss(y_true, y_pred),
        "subset_accuracy": accuracy_score(y_true, y_pred),
    }

    print(f"\n{'-'*50}")
    print(f"Résultats pour {model_name}")
    print(f"{'-'*50}")
    print(f"F1 Score (micro):     {results['f1_micro']:.4f}")
    print(f"F1 Score (macro):     {results['f1_macro']:.4f}")
    print(f"F1 Score (weighted):  {results['f1_weighted']:.4f}")
    print(f"Precision (micro):    {results['precision_micro']:.4f}")
    print(f"Recall (micro):       {results['recall_micro']:.4f}")
    print(f"Hamming Loss:         {results['hamming_loss']:.4f}")
    print(f"Subset Accuracy:      {results['subset_accuracy']:.4f}")

    return results


def save_model(model, name, encoder=None, scaler=None):
    """Sauvegarde un modèle et ses transformers."""
    model_path = MODELS_DIR / f"{name}.pkl"

    with open(model_path, 'wb') as f:
        pickle.dump({
            'model': model,
            'encoder': encoder,
            'scaler': scaler
        }, f)

    print(f"Modèle sauvegardé: {model_path}")


# ============================================================================
# MODÈLE 1: RANDOM FOREST
# ============================================================================

def train_random_forest(X_train, y_train, X_test, y_test):
    """
    Entraîne un Random Forest pour classification multi-label.
    """
    print("\n Entraînement Random Forest...")

    rf_base = RandomForestClassifier(
        n_estimators=200,
        max_depth=15,
        min_samples_split=5,
        min_samples_leaf=2,
        n_jobs=-1,
        random_state=RANDOM_STATE,
        class_weight='balanced'
    )

    model = MultiOutputClassifier(rf_base, n_jobs=-1)
    model.fit(X_train, y_train)

    # Prédictions
    y_pred = model.predict(X_test)

    # Évaluation
    results = evaluate_multilabel(y_test, y_pred, "Random Forest")

    return model, results


# ============================================================================
# MODÈLE 2: XGBOOST
# ============================================================================

def train_xgboost(X_train, y_train, X_test, y_test):
    """
    Entraîne un XGBoost pour classification multi-label.
    """
    print("\n Entraînement XGBoost...")

    xgb_base = XGBClassifier(
        n_estimators=200,
        max_depth=8,
        learning_rate=0.1,
        subsample=0.8,
        colsample_bytree=0.8,
        n_jobs=-1,
        random_state=RANDOM_STATE,
        eval_metric='logloss',
        use_label_encoder=False
    )

    model = MultiOutputClassifier(xgb_base, n_jobs=-1)
    model.fit(X_train, y_train)

    # Prédictions
    y_pred = model.predict(X_test)

    # Évaluation
    results = evaluate_multilabel(y_test, y_pred, "XGBoost")

    return model, results


# ============================================================================
# MODÈLE 3: CATBOOST
# ============================================================================

def train_catboost(X_train, y_train, X_test, y_test):
    """
    Entraîne un CatBoost pour classification multi-label.
    """
    print("\n Entraînement CatBoost...")

    cb_base = CatBoostClassifier(
        iterations=200,
        depth=8,
        learning_rate=0.1,
        random_seed=RANDOM_STATE,
        verbose=False,
        auto_class_weights='Balanced'
    )

    model = MultiOutputClassifier(cb_base, n_jobs=-1)
    model.fit(X_train, y_train)

    # Prédictions
    y_pred = model.predict(X_test)

    # Évaluation
    results = evaluate_multilabel(y_test, y_pred, "CatBoost")

    return model, results


# ============================================================================
# MODÈLE 4: DEEP LEARNING (TENSORFLOW)
# ============================================================================

def build_deep_model(input_dim, output_dim):
    """
    Construit un réseau de neurones pour classification multi-label.
    """
    model = Sequential([
        Input(shape=(input_dim,)),

        Dense(256, activation='relu'),
        BatchNormalization(),
        Dropout(0.3),

        Dense(128, activation='relu'),
        BatchNormalization(),
        Dropout(0.3),

        Dense(64, activation='relu'),
        BatchNormalization(),
        Dropout(0.2),

        Dense(32, activation='relu'),
        Dropout(0.2),

        # Couche de sortie: sigmoid pour multi-label
        Dense(output_dim, activation='sigmoid')
    ])

    model.compile(
        optimizer=Adam(learning_rate=0.001),
        loss='binary_crossentropy',
        metrics=['accuracy', tf.keras.metrics.AUC(name='auc')]
    )

    return model


def train_deep_learning(X_train, y_train, X_test, y_test):
    """
    Entraîne un modèle de Deep Learning pour classification multi-label.
    """
    print("\n Entraînement Deep Learning (TensorFlow)...")

    # Conversion en arrays numpy
    X_train_np = X_train.values.astype(np.float32)
    X_test_np = X_test.values.astype(np.float32)
    y_train_np = y_train.values.astype(np.float32)
    y_test_np = y_test.values.astype(np.float32)

    # Construction du modèle
    model = build_deep_model(X_train_np.shape[1], y_train_np.shape[1])

    print(model.summary())

    # Callbacks
    callbacks = [
        EarlyStopping(
            monitor='val_loss',
            patience=10,
            restore_best_weights=True,
            verbose=1
        ),
        ReduceLROnPlateau(
            monitor='val_loss',
            factor=0.5,
            patience=5,
            min_lr=1e-6,
            verbose=1
        )
    ]

    # Entraînement
    history = model.fit(
        X_train_np, y_train_np,
        validation_data=(X_test_np, y_test_np),
        epochs=100,
        batch_size=64,
        callbacks=callbacks,
        verbose=1
    )

    # Prédictions (seuil à 0.5)
    y_pred_proba = model.predict(X_test_np)
    y_pred = (y_pred_proba >= 0.5).astype(int)

    # Évaluation
    results = evaluate_multilabel(y_test_np, y_pred, "Deep Learning (TensorFlow)")

    return model, history, results


def main():
    """
    Fonction principale pour entraîner tous les modèles.
    """
    print("-"*60)
    print(" ZIMNAT INSURANCE - CLASSIFICATION MULTI-LABEL")
    print("-"*60)

    # Stockage des résultats
    all_results = []

    # 1. Random Forest
    rf_model, rf_results = train_random_forest(X_train, y_train, X_test, y_test)
    save_model(rf_model, "random_forest", encoder, scaler)
    all_results.append(rf_results)

    # 2. XGBoost
    xgb_model, xgb_results = train_xgboost(X_train, y_train, X_test, y_test)
    save_model(xgb_model, "xgboost", encoder, scaler)
    all_results.append(xgb_results)

    # 3. CatBoost
    cb_model, cb_results = train_catboost(X_train, y_train, X_test, y_test)
    save_model(cb_model, "catboost", encoder, scaler)
    all_results.append(cb_results)

    # 4. Deep Learning
    dl_model, dl_history, dl_results = train_deep_learning(X_train, y_train, X_test, y_test)
    dl_model.save(MODELS_DIR / "deep_learning.keras")
    print(f"Modèle Deep Learning sauvegardé: {MODELS_DIR / 'deep_learning.keras'}")
    all_results.append(dl_results)

    # Résumé comparatif
    print("\n" + "-"*60)
    print("RÉSUMÉ COMPARATIF DES MODÈLES")
    print("-"*60)

    results_df = pd.DataFrame(all_results)
    results_df = results_df.set_index('model')
    print(results_df.round(4).to_string())

    # Meilleur modèle selon F1-micro
    best_model = results_df['f1_micro'].idxmax()
    print(f"\n Meilleur modèle (F1-micro): {best_model}")

    # Sauvegarder les résultats
    results_df.to_csv(MODELS_DIR / "model_comparison.csv")
    print(f"Résultats sauvegardés: {MODELS_DIR / 'model_comparison.csv'}")

    return results_df


if __name__ == "__main__":
    main()

2026-01-12 13:54:45.298310: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1768226085.475731      24 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1768226085.524997      24 cuda_blas.cc:1407] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
W0000 00:00:1768226085.939238      24 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1768226085.939285      24 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1768226085.939287      24 computation_placer.cc:177] computation placer alr

------------------------------------------------------------
 ZIMNAT INSURANCE - CLASSIFICATION MULTI-LABEL
------------------------------------------------------------

 Entraînement Random Forest...

--------------------------------------------------
Résultats pour Random Forest
--------------------------------------------------
F1 Score (micro):     0.6917
F1 Score (macro):     0.4177
F1 Score (weighted):  0.7917
Precision (micro):    0.5858
Recall (micro):       0.8443
Hamming Loss:         0.0812
Subset Accuracy:      0.4891
Modèle sauvegardé: /kaggle/working/models/random_forest.pkl

 Entraînement XGBoost...


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.



--------------------------------------------------
Résultats pour XGBoost
--------------------------------------------------
F1 Score (micro):     0.8465
F1 Score (macro):     0.4513
F1 Score (weighted):  0.8248
Precision (micro):    0.8630
Recall (micro):       0.8305
Hamming Loss:         0.0325
Subset Accuracy:      0.6954
Modèle sauvegardé: /kaggle/working/models/xgboost.pkl

 Entraînement CatBoost...

--------------------------------------------------
Résultats pour CatBoost
--------------------------------------------------
F1 Score (micro):     0.7136
F1 Score (macro):     0.4155
F1 Score (weighted):  0.7910
Precision (micro):    0.6226
Recall (micro):       0.8358
Hamming Loss:         0.0723
Subset Accuracy:      0.4737
Modèle sauvegardé: /kaggle/working/models/catboost.pkl

 Entraînement Deep Learning (TensorFlow)...


I0000 00:00:1768226201.596237      24 gpu_device.cc:2019] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 14497 MB memory:  -> device: 0, name: Tesla P100-PCIE-16GB, pci bus id: 0000:00:04.0, compute capability: 6.0


None
Epoch 1/100


I0000 00:00:1768226206.016805     534 service.cc:152] XLA service 0x345e6cf0 initialized for platform CUDA (this does not guarantee that XLA will be used). Devices:
I0000 00:00:1768226206.016847     534 service.cc:160]   StreamExecutor device (0): Tesla P100-PCIE-16GB, Compute Capability 6.0
I0000 00:00:1768226206.596232     534 cuda_dnn.cc:529] Loaded cuDNN version 91002


[1m 62/365[0m [32m━━━[0m[37m━━━━━━━━━━━━━━━━━[0m [1m0s[0m 3ms/step - accuracy: 0.0895 - auc: 0.6537 - loss: 0.6243

I0000 00:00:1768226209.674015     534 device_compiler.h:188] Compiled cluster using XLA!  This line is logged at most once for the lifetime of the process.


[1m365/365[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12s[0m 16ms/step - accuracy: 0.3624 - auc: 0.8384 - loss: 0.3604 - val_accuracy: 0.8325 - val_auc: 0.9719 - val_loss: 0.1137 - learning_rate: 0.0010
Epoch 2/100
[1m365/365[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - accuracy: 0.7150 - auc: 0.9645 - loss: 0.1239 - val_accuracy: 0.8598 - val_auc: 0.9781 - val_loss: 0.0997 - learning_rate: 0.0010
Epoch 3/100
[1m365/365[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - accuracy: 0.7556 - auc: 0.9715 - loss: 0.1130 - val_accuracy: 0.8589 - val_auc: 0.9798 - val_loss: 0.0964 - learning_rate: 0.0010
Epoch 4/100
[1m365/365[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - accuracy: 0.7756 - auc: 0.9737 - loss: 0.1106 - val_accuracy: 0.8584 - val_auc: 0.9800 - val_loss: 0.0952 - learning_rate: 0.0010
Epoch 5/100
[1m365/365[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - accuracy: 0.8044 - auc: 0.9755 - loss: 0.1065

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
