In [None]:
#Chargement des datasets
import numpy as np
import requests
import io

base_url = "https://raw.githubusercontent.com/lmuxz/SCDA/master/data/"
suffixes = ["test", "test_label", "train", "train_label"]

# Seconds to wait for the server before giving up. `requests` applies no
# timeout by default, so without this a stalled connection would block the
# notebook indefinitely.
REQUEST_TIMEOUT_S = 30

# Maps each downloaded file name to the NumPy array decoded from it.
data_store = {}

for i in range(4):
    for suffix in suffixes:
        file_name = f"kaggle_source_cate_{i}_{suffix}.npy"
        url = f"{base_url}{file_name}"

        response = requests.get(url, timeout=REQUEST_TIMEOUT_S)

        if response.status_code == 200:
            # np.load needs a file-like object, hence the BytesIO wrapper
            # around the raw response bytes.
            data_store[file_name] = np.load(io.BytesIO(response.content))
            print(f"Charg√© : {file_name} | Forme : {data_store[file_name].shape}")
        else:
            # A failed download is only reported; the file is then simply
            # absent from data_store.
            print(f"√âchec pour {file_name} (Code : {response.status_code})")

# Example access:
# train_data_0 = data_store['kaggle_source_cate_0_train.npy']

In [None]:
# Passage en dataframe pandas
import pandas as pd

# Per-source frames are collected here, then merged once after the loop.
all_train_dfs = []
all_test_dfs = []
datasets_par_index = {}

for i in range(4):
    print(f"Traitement de l'index {i}...")

    # 1. Fetch the four raw arrays of this source index
    X_train_raw = data_store[f'kaggle_source_cate_{i}_train.npy']
    y_train_raw = data_store[f'kaggle_source_cate_{i}_train_label.npy']
    X_test_raw  = data_store[f'kaggle_source_cate_{i}_test.npy']
    y_test_raw  = data_store[f'kaggle_source_cate_{i}_test_label.npy']

    # 2. Features become columns feat_0, feat_1, ...
    df_X_train = pd.DataFrame(X_train_raw).add_prefix('feat_')
    df_X_test  = pd.DataFrame(X_test_raw).add_prefix('feat_')

    # Label column names (label_0, label_1, ...) follow the width of the
    # training label array.
    col_labels = [f'label_{j}' for j in range(y_train_raw.shape[1])]
    df_y_train = pd.DataFrame(y_train_raw, columns=col_labels)
    df_y_test  = pd.DataFrame(y_test_raw, columns=col_labels)

    # 3. Side-by-side concatenation of features and labels
    df_train_full = pd.concat([df_X_train, df_y_train], axis=1)
    df_test_full  = pd.concat([df_X_test, df_y_test], axis=1)

    # Remember which source each row came from
    df_train_full['source_index'] = i
    df_test_full['source_index'] = i

    datasets_par_index[i] = {'train': df_train_full, 'test': df_test_full}
    all_train_dfs.append(df_train_full)
    all_test_dfs.append(df_test_full)

# 4. Final merge of all sources (row index is rebuilt from 0)
df_final_train = pd.concat(all_train_dfs, ignore_index=True)
df_final_test  = pd.concat(all_test_dfs, ignore_index=True)

print(f"\nTermin√© ! Colonnes cr√©√©es pour les labels : {col_labels}")
print(f"Format final du train : {df_final_train.shape}")
# Fixed: this line reports the test frame, so it is labelled accordingly.
print(f"Format final du test : {df_final_test.shape}")
# Fixed: head must be called; printing the bare attribute showed the bound
# method's repr instead of the first rows.
print(df_final_train.head())

In [None]:
# Preview of the label columns for source index 0.
# The label column names are read from the index-0 frame itself instead of
# from y_train_raw, which is a loop variable left over from the cell that
# builds the DataFrames (and refers to the last source, not index 0).
train_df_0 = datasets_par_index[0]['train']
label_cols = [c for c in train_df_0.columns if c.startswith('label_')]

print("Aper√ßu des labels (index 0) :")
display(train_df_0[label_cols].head())

# Distinct values per label column, to check whether labels are binary (0/1)
print("\nValeurs uniques par colonne de label :")
for col in label_cols:
    unique_vals = train_df_0[col].unique()
    print(f"{col} : {unique_vals}")

La colonne "label_1" servira de cible (1 = fraude, 0 = non-fraude).

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# 1. Class counts and percentages of the target in the merged training set
counts = df_final_train['label_1'].value_counts()
proportions = df_final_train['label_1'].value_counts(normalize=True) * 100

print("--- R√©partition des classes ---")
for val, count in counts.items():
    print(f"Classe {val} : {count} individus ({proportions[val]:.2f}%)")

# Bars are drawn in ascending class order, passed explicitly to seaborn so the
# annotations below can rely on it. value_counts() orders by frequency, which
# only coincides with class order while class 0 is the majority.
class_order = sorted(counts.index)

# 2. Bar chart
plt.figure(figsize=(8, 5))
sns.barplot(x=counts.index, y=counts.values, order=class_order, palette='viridis')

plt.title('R√©partition des classes (0 vs 1) dans le dataset')
plt.xlabel('Classe (Target)')
plt.ylabel('Nombre de lignes')
plt.xticks(ticks=[0, 1], labels=['Classe 0', 'Classe 1'])

# Percentage above each bar. Values are looked up by class label, not by bar
# position, so each bar gets its own class's numbers.
label_offset = counts.max() * 0.01
for position, class_label in enumerate(class_order):
    plt.text(position, counts.loc[class_label] + label_offset,
             f'{proportions.loc[class_label]:.1f}%', ha='center', fontweight='bold')

plt.show()

In [None]:
# 1. Class counts and percentages of the target in the merged test set
counts = df_final_test['label_1'].value_counts()
proportions = df_final_test['label_1'].value_counts(normalize=True) * 100

print("--- R√©partition des classes ---")
for val, count in counts.items():
    print(f"Classe {val} : {count} individus ({proportions[val]:.2f}%)")

# Bars are drawn in ascending class order, passed explicitly to seaborn so the
# annotations below can rely on it. value_counts() orders by frequency, which
# only coincides with class order while class 0 is the majority.
class_order = sorted(counts.index)

# 2. Bar chart
plt.figure(figsize=(8, 5))
sns.barplot(x=counts.index, y=counts.values, order=class_order, palette='viridis')

plt.title('R√©partition des classes (0 vs 1) dans le dataset')
plt.xlabel('Classe (Target)')
plt.ylabel('Nombre de lignes')
plt.xticks(ticks=[0, 1], labels=['Classe 0', 'Classe 1'])

# Percentage above each bar. Values are looked up by class label, not by bar
# position, so each bar gets its own class's numbers.
label_offset = counts.max() * 0.01
for position, class_label in enumerate(class_order):
    plt.text(position, counts.loc[class_label] + label_offset,
             f'{proportions.loc[class_label]:.1f}%', ha='center', fontweight='bold')

plt.show()

In [None]:
# Check whether any feature column is constant across the merged training set.
# Feature columns are the ones whose name starts with 'feat_'.
feat_cols = [name for name in df_final_train.columns if name.startswith('feat_')]

# Count distinct (non-null) values per feature in one pass; a column with at
# most one distinct value carries no information.
distinct_counts = df_final_train[feat_cols].nunique()
constant_features = distinct_counts[distinct_counts <= 1].index.tolist()

# Report
print(f"Nombre total de descripteurs analys√©s : {len(feat_cols)}")
print(f"Nombre de descripteurs constants trouv√©s : {len(constant_features)}")

if not constant_features:
    print("\nAucun descripteur constant d√©tect√©. Tous les 'feat_' varient !")
else:
    print("\nListe des descripteurs constants :")
    print(constant_features)
    # Optional follow-up (not executed here): drop `constant_features` from
    # both df_final_train and df_final_test.

In [None]:
# Number of entries shown at each end of the ranking (also used in headings,
# which previously said "Top 10" while 15 rows were printed).
TOP_N = 15

# 1. Pearson correlation of every column with the target 'label_1'.
# corrwith() computes only the column-vs-target correlations instead of the
# full column-by-column matrix.
# The target is removed by name (not by filtering on value < 1.0), and
# undefined correlations (NaN, e.g. for constant columns) are dropped so they
# cannot end up in the "most negative" tail after sorting.
# NOTE(review): the remaining label_* columns and 'source_index' are still
# ranked alongside the features - confirm whether they should be excluded.
correlations = (
    df_final_train.corrwith(df_final_train['label_1'])
    .drop(labels='label_1')
    .dropna()
    .sort_values(ascending=False)
)

# 2. Strongest positive and strongest negative correlations
top_positives = correlations.head(TOP_N)
top_negatives = correlations.tail(TOP_N)

print(f"--- Top {TOP_N} des corr√©lations positives (li√©es au '1') ---")
print(top_positives)

print(f"\n--- Top {TOP_N} des corr√©lations n√©gatives (li√©es au '0') ---")
print(top_negatives)

# 3. Horizontal bar chart of both ends of the ranking
plt.figure(figsize=(10, 8))
top_corr_features = pd.concat([top_positives, top_negatives])
sns.barplot(x=top_corr_features.values, y=top_corr_features.index, palette='coolwarm')
plt.title('Descripteurs les plus corr√©l√©s √† la cible (Target)')
plt.xlabel('Coefficient de Corr√©lation')
plt.grid(axis='x', linestyle='--', alpha=0.7)
plt.show()

In [None]:
# 1. Keep the heatmap readable: pick the 15 columns most correlated with the
# target in absolute value.
# The target is dropped by name. Relying on "first row after sorting" breaks
# when another column ties at |r| = 1: the target could then be selected as a
# feature and appear twice in the matrix below.
target_corr = (
    df_final_train.corrwith(df_final_train['label_1'])
    .drop(labels='label_1')
    .abs()
    .dropna()
    .sort_values(ascending=False)
)
top_features = target_corr.head(15).index.tolist()

# 2. Correlation matrix restricted to those columns plus the target
corr_matrix = df_final_train[top_features + ['label_1']].corr()

# 3. Show the matrix as a table
print("Extrait de la matrice de corr√©lation (Top 15 features) :")
display(corr_matrix)

# 4. Heatmap
plt.figure(figsize=(12, 10))

# Hide the upper triangle (diagonal included): it mirrors the lower one.
mask = np.triu(np.ones_like(corr_matrix, dtype=bool))

sns.heatmap(corr_matrix,
            mask=mask,
            annot=True,          # write the value in each cell
            fmt=".2f",           # two decimals
            cmap='coolwarm',     # blue (negative) to red (positive)
            center=0,
            linewidths=.5,
            cbar_kws={"shrink": .8})

plt.title('Heatmap des corr√©lations (Top 15 features + Target)')
plt.show()

In [None]:
# Identify redundant (highly inter-correlated) features.
# Only 'feat_' columns are compared. Including the label columns and
# 'source_index' could put the target itself in the drop suggestion.
feature_cols = [c for c in df_final_train.columns if c.startswith('feat_')]

# 1. Absolute correlation matrix: -0.95 is as redundant as +0.95
corr_matrix = df_final_train[feature_cols].corr().abs()

# 2. Keep the strict upper triangle so each pair (A-B / B-A) appears once;
# everything else becomes NaN.
upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape, dtype=bool), k=1))

# 3. Pairs above the threshold, found with one vectorised comparison instead
# of a Python loop over every cell. NaN compares False, so masked cells are
# skipped. Scanning the transpose yields (column, row) positions with the
# column as the outer key, i.e. the same order as a column-then-row loop.
threshold = 0.90
exceeds = upper.to_numpy() > threshold
names = upper.columns
redundant_pairs = [
    (names[col_pos], names[row_pos], upper.iat[row_pos, col_pos])
    for col_pos, row_pos in np.argwhere(exceeds.T)
]

# 4. Report
print(f"--- Recherche des variables redondantes (Seuil > {threshold}) ---")
if not redundant_pairs:
    print("Aucune paire de variables fortement corr√©l√©es n'a √©t√© trouv√©e.")
else:
    print(f"Nombre de paires trouv√©es : {len(redundant_pairs)}\n")
    for var1, var2, val in redundant_pairs:
        print(f"üî¥ {var1} et {var2} sont corr√©l√©es √† {val:.4f}")

# 5. Optional: columns suggested for removal.
# pair[0] is the later of the two columns in frame order, so the earlier one
# is kept. dict.fromkeys de-duplicates while keeping a deterministic order
# (a set would not).
to_drop = list(dict.fromkeys(pair[0] for pair in redundant_pairs))

print(f"\nSuggestion de colonnes √† supprimer ({len(to_drop)}) : {to_drop}")

In [None]:
# Define the modelling sets X_train, X_test, y_train and y_test.
# These used to be read from df_X_train / df_y_train / ..., loop variables left
# over from the DataFrame-building cell, which silently hold only the LAST
# source (index 3). The selection is now explicit and uses the same data.
# NOTE(review): the exploratory cells work on the merged frames; if the models
# are meant to train on every source, build these from df_final_train /
# df_final_test instead.
MODEL_SOURCE_INDEX = 3

model_train_df = datasets_par_index[MODEL_SOURCE_INDEX]['train']
model_test_df = datasets_par_index[MODEL_SOURCE_INDEX]['test']
model_feature_cols = [c for c in model_train_df.columns if c.startswith('feat_')]

X_train = model_train_df[model_feature_cols]
X_test = model_test_df[model_feature_cols]
# 'label_1' is the classification target
y_train = model_train_df['label_1']
y_test = model_test_df['label_1']

In [None]:
# Logistic regression baseline, tuned by grid search on the F1 score
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV, StratifiedKFold
from sklearn.metrics import classification_report, precision_score, recall_score, f1_score, make_scorer
import numpy as np

# Validation protocol: stratified, shuffled 5-fold split with a fixed seed
cv_strategy = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Base model: 'liblinear' is used because it handles both 'l1' and 'l2'
lr_model = LogisticRegression(solver='liblinear', random_state=42)

# Search space: 10 log-spaced values of C between 1e-3 and 1e2, both penalties
param_grid = dict(
    C=np.logspace(-3, 2, 10),
    penalty=['l1', 'l2'],
)

# Exhaustive search over the grid, ranking candidates by F1
grid_search = GridSearchCV(
    lr_model,
    param_grid,
    scoring='f1',
    cv=cv_strategy,
    n_jobs=-1,
    verbose=1,
)

# Fit on the training set (target passed as a flat 1-D array)
print("D√©but de l'optimisation des hyperparam√®tres...")
y_train_flat = y_train.values.ravel()
grid_search.fit(X_train, y_train_flat)

# Best configuration and its cross-validated score
print(f"\nMeilleurs hyperparam√®tres : {grid_search.best_params_}")
print(f"Meilleur score F1 en validation crois√©e : {grid_search.best_score_:.4f}")

# Test-set predictions through the fitted search object
y_pred = grid_search.predict(X_test)

print("\n--- Rapport de performance sur l'ensemble Test ---")
print(classification_report(y_test, y_pred))

# --------------------------------
# Final evaluation
# --------------------------------

# Best estimator found by the search, and its test-set predictions
best_lr = grid_search.best_estimator_
y_pred_lr = best_lr.predict(X_test)

# Precision / recall / F1 with class 1 as the positive class
precision_lr, recall_lr, f1_lr = (
    metric(y_test, y_pred_lr, pos_label=1)
    for metric in (precision_score, recall_score, f1_score)
)

print("\n--- M√©triques finales ---")
print(f"Precision (fraude): {precision_lr:.3f}")
print(f"Recall    (fraude): {recall_lr:.3f}")
print(f"F1-score  (fraude): {f1_lr:.3f}")

In [None]:
# Random Forest: grid search over forest size and tree depth, scored on F1
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV, StratifiedKFold
from sklearn.metrics import classification_report, precision_score, recall_score, f1_score, make_scorer

# Stratified, shuffled 5-fold protocol with a fixed seed
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Candidate values for number of trees and maximum depth (None = unlimited)
param_grid_rf = dict(
    n_estimators=[100, 500],
    max_depth=[10, 30, None],
)

grid_rf = GridSearchCV(
    estimator=RandomForestClassifier(random_state=42),
    param_grid=param_grid_rf,
    scoring='f1',
    cv=cv,
    n_jobs=-1,
)

# Fit on the training set (target passed as a flat 1-D array)
y_train_flat = y_train.values.ravel()
grid_rf.fit(X_train, y_train_flat)

print(f"RF - Meilleurs param√®tres: {grid_rf.best_params_}")
print(f"RF - Score F1 Test: {grid_rf.score(X_test, y_test):.4f}")

# Test-set predictions through the fitted search object
y_pred = grid_rf.predict(X_test)
print(classification_report(y_test, y_pred))

# --------------------------------
# Random Forest - final evaluation
# --------------------------------

# Best forest found by the search, and its test-set predictions
best_rf = grid_rf.best_estimator_
y_pred_rf = best_rf.predict(X_test)

# Precision / recall / F1 with class 1 as the positive class
precision_rf, recall_rf, f1_rf = (
    metric(y_test, y_pred_rf, pos_label=1)
    for metric in (precision_score, recall_score, f1_score)
)

print("\n--- M√©triques finales ---")
print(f"Precision (fraude): {precision_rf:.3f}")
print(f"Recall    (fraude): {recall_rf:.3f}")
print(f"F1-score  (fraude): {f1_rf:.3f}")

In [None]:
# Refined Random Forest search
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV, StratifiedKFold
# classification_report is imported here as well: this cell calls it, and
# previously only worked because an earlier cell had imported it.
from sklearn.metrics import classification_report, precision_score, recall_score, f1_score

# 1. Validation protocol: stratified, shuffled 5-fold split with a fixed seed
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# 2. Finer grid, centred on the upper bounds of the first Random Forest search
param_grid_refined = {
    'n_estimators': [500, 600, 700],        # beyond the previous maximum of 500
    'max_depth': [30, 40, 50, None],        # finer steps around deep trees
    'min_samples_split': [2, 5, 10],        # minimum samples required to split a node
}

# 3. Search object, ranking candidates by F1
grid_rf_refined = GridSearchCV(
    RandomForestClassifier(random_state=42),
    param_grid=param_grid_refined,
    cv=cv,
    scoring='f1',
    n_jobs=-1,
    verbose=2
)

# 4. Fit on the training set (target passed as a flat 1-D array)
print("Lancement du GridSearch affin√© pour Random Forest...")
grid_rf_refined.fit(X_train, y_train.values.ravel())

# 5. Best configuration and its cross-validated score
print(f"\nMeilleurs param√®tres affin√©s : {grid_rf_refined.best_params_}")
print(f"Meilleur score F1 (Validation) : {grid_rf_refined.best_score_:.4f}")

# F1 on the test set (the scorer configured above)
final_score = grid_rf_refined.score(X_test, y_test)
print(f"Score F1 final sur Test : {final_score:.4f}")

# Test-set predictions through the fitted search object
y_pred = grid_rf_refined.predict(X_test)
print(classification_report(y_test, y_pred))

# ===============================
# Refined Random Forest - final evaluation
# ===============================

# Best refined forest
best_rf_refined = grid_rf_refined.best_estimator_

# Final predictions on the test set
y_pred_rf_refined = best_rf_refined.predict(X_test)

# Metrics with class 1 as the positive class
precision_rf_refined = precision_score(y_test, y_pred_rf_refined, pos_label=1)
recall_rf_refined    = recall_score(y_test, y_pred_rf_refined, pos_label=1)
f1_rf_refined        = f1_score(y_test, y_pred_rf_refined, pos_label=1)

print("\n--- M√©triques finales ---")
print(f"Precision (fraude): {precision_rf_refined:.3f}")
print(f"Recall    (fraude): {recall_rf_refined:.3f}")
print(f"F1-score  (fraude): {f1_rf_refined:.3f}")

In [None]:
# XGBoost: grid search over learning rate and tree depth, scored on F1
from xgboost import XGBClassifier
from sklearn.model_selection import GridSearchCV, StratifiedKFold
from sklearn.metrics import classification_report

# Stratified, shuffled 5-fold protocol with a fixed seed
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

param_grid_xgb = dict(
    learning_rate=[0.01, 0.1],
    max_depth=[3, 6, 9],
)

grid_xgb = GridSearchCV(
    estimator=XGBClassifier(random_state=42, use_label_encoder=False, eval_metric='logloss'),
    param_grid=param_grid_xgb,
    scoring='f1',
    cv=cv,
    n_jobs=-1,
)

# Fit on the training set (target passed as a flat 1-D array)
y_train_flat = y_train.values.ravel()
grid_xgb.fit(X_train, y_train_flat)

print(f"XGB - Meilleurs param√®tres: {grid_xgb.best_params_}")
print(f"XGB - Score F1 Test: {grid_xgb.score(X_test, y_test):.4f}")

# Test-set predictions through the fitted search object
y_pred = grid_xgb.predict(X_test)
print(classification_report(y_test, y_pred))

# --------------------------------
# XGBoost - final evaluation
# --------------------------------

from sklearn.metrics import precision_score, recall_score, f1_score

# Best booster found by the search, and its test-set class predictions
best_xgb = grid_xgb.best_estimator_
y_pred_xgb = best_xgb.predict(X_test)

# Precision / recall / F1 with class 1 (fraud) as the positive class
precision_xgb, recall_xgb, f1_xgb = (
    metric(y_test, y_pred_xgb, pos_label=1)
    for metric in (precision_score, recall_score, f1_score)
)

print("\n--- M√©triques finales ---")
print(f"Precision (fraude): {precision_xgb:.3f}")
print(f"Recall    (fraude): {recall_xgb:.3f}")
print(f"F1-score  (fraude): {f1_xgb:.3f}")

In [None]:
# SVM: grid search over C and kernel, scored on F1
from sklearn.svm import SVC
# StratifiedKFold and classification_report are imported here, and `cv` is
# defined below: this cell previously relied on another cell for all three.
from sklearn.model_selection import GridSearchCV, StratifiedKFold
from sklearn.metrics import classification_report, precision_score, recall_score, f1_score

# Validation protocol: stratified, shuffled 5-fold split with a fixed seed
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

param_grid_svm = {
    'C': [0.1, 1, 10],
    'kernel': ['rbf', 'linear']
}

# NOTE(review): the SVC is fitted on unscaled features, whereas the k-NN cell
# standardises inside a Pipeline - confirm whether scaling is wanted here too.
grid_svm = GridSearchCV(
    SVC(random_state=42),
    param_grid=param_grid_svm,
    cv=cv,
    scoring='f1',
    n_jobs=-1
)

# Fit on the training set (target passed as a flat 1-D array)
grid_svm.fit(X_train, y_train.values.ravel())
print(f"SVM - Meilleurs param√®tres: {grid_svm.best_params_}")
print(f"SVM - Score F1 Test: {grid_svm.score(X_test, y_test):.4f}")
# Test-set predictions through the fitted search object
y_pred = grid_svm.predict(X_test)
print(classification_report(y_test, y_pred))

# ===============================
# SVM - final evaluation
# ===============================

# Best SVM found by the search
best_svm = grid_svm.best_estimator_

# Final predictions on the test set
y_pred_svm = best_svm.predict(X_test)

# Metrics with class 1 (fraud) as the positive class
precision_svm = precision_score(y_test, y_pred_svm, pos_label=1)
recall_svm    = recall_score(y_test, y_pred_svm, pos_label=1)
f1_svm        = f1_score(y_test, y_pred_svm, pos_label=1)

print("\n--- M√©triques finales ---")
print(f"Precision (fraude): {precision_svm:.3f}")
print(f"Recall    (fraude): {recall_svm:.3f}")
print(f"F1-score  (fraude): {f1_svm:.3f}")

In [None]:
# Fit a single linear SVM (C=10) and report its test-set performance.
# NOTE(review): C=10 / linear presumably come from the SVM grid search - confirm.
svc_model = SVC(C=10, kernel='linear')
SVC_model = svc_model  # second name bound by the original cell, kept as an alias

# fit() returns the fitted estimator itself
SVC_model_fit = svc_model.fit(X_train, y_train)

# Fixed: predictions come from .predict(); calling the estimator object
# directly raised "TypeError: 'SVC' object is not callable".
y_pred = SVC_model_fit.predict(X_test)
print(classification_report(y_test, y_pred))

In [None]:
# k-nearest neighbours with standardised features, tuned by grid search on F1

from sklearn.metrics import classification_report, f1_score, make_scorer
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV, StratifiedKFold

# Two-step pipeline: scale, then classify. Because the scaler is part of the
# estimator, it is re-fitted on the training portion of every CV fold.
knn_pipeline = Pipeline(steps=[
    ('scaler', StandardScaler()),
    ('knn', KNeighborsClassifier()),
])

# Stratified, shuffled 5-fold protocol with a fixed seed
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Search space; the 'knn__' prefix routes each parameter to the 'knn' step
param_grid_knn = dict(
    knn__n_neighbors=[3, 5, 7, 11],
    knn__weights=['uniform', 'distance'],
)

grid_knn = GridSearchCV(
    estimator=knn_pipeline,
    param_grid=param_grid_knn,
    scoring='f1',
    cv=cv,
    n_jobs=-1,
    verbose=1,
)

# Fit on the training set (target passed as a flat 1-D array)
print("Entra√Ænement du k-NN avec StandardScaler...")
y_train_flat = y_train.values.ravel()
grid_knn.fit(X_train, y_train_flat)

# Best configuration and its cross-validated score
print(f"\nMeilleurs param√®tres : {grid_knn.best_params_}")
print(f"Meilleur score F1 (Validation CV) : {grid_knn.best_score_:.4f}")

# F1 on the test set (the scorer configured above)
test_score = grid_knn.score(X_test, y_test)
print(f"Score F1 sur l'ensemble Test : {test_score:.4f}")

# Test-set predictions through the fitted search object
y_pred = grid_knn.predict(X_test)
print(classification_report(y_test, y_pred))

# --------------------------------
# k-NN - final evaluation
# --------------------------------

from sklearn.metrics import precision_score, recall_score, f1_score

# Best pipeline found by the search, and its test-set predictions
best_knn = grid_knn.best_estimator_
y_pred_knn = best_knn.predict(X_test)

# Precision / recall / F1 with class 1 (fraud) as the positive class
precision_knn, recall_knn, f1_knn = (
    metric(y_test, y_pred_knn, pos_label=1)
    for metric in (precision_score, recall_score, f1_score)
)

print("\n--- M√©triques finales ---")
print(f"Precision (fraude): {precision_knn:.3f}")
print(f"Recall    (fraude): {recall_knn:.3f}")
print(f"F1-score  (fraude): {f1_knn:.3f}")

In [None]:
# Class imbalance, approach 1: random under-sampling before training XGBoost
from imblearn.under_sampling import RandomUnderSampler
from xgboost import XGBClassifier
from sklearn.metrics import classification_report

# Resample the training set only; the test set is left untouched
rus = RandomUnderSampler(random_state=42)
X_resampled, y_resampled = rus.fit_resample(X_train, y_train)

# Train a classifier on the resampled data (fit() returns the estimator)
model_under = XGBClassifier(
    learning_rate=0.1,
    max_depth=9,
    random_state=42,
).fit(X_resampled, y_resampled)

# Evaluate on the original test set
y_pred = model_under.predict(X_test)
print("--- R√©sultats Under-sampling ---")
print(classification_report(y_test, y_pred))

In [None]:
# Class imbalance, approach 2: SMOTE over-sampling before training XGBoost

from imblearn.over_sampling import SMOTE


# Generate synthetic samples on the training set only
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X_train, y_train)

# Train a classifier on the augmented data (fit() returns the estimator)
model_smote = XGBClassifier(
    learning_rate=0.1,
    max_depth=9,
    random_state=42,
).fit(X_resampled, y_resampled)

# Evaluate on the original test set
y_pred = model_smote.predict(X_test)
print("--- R√©sultats SMOTE ---")
print(classification_report(y_test, y_pred))

In [None]:
# Class imbalance, approach 3: class weighting inside XGBoost


# 1. Weight for the positive class = (# class 0) / (# class 1).
# Counting with boolean masks works for integer and float labels alike,
# whereas np.bincount rejects float input.
n_negative = int((y_train == 0).sum())
n_positive = int((y_train == 1).sum())
if n_positive == 0:
    raise ValueError("y_train contains no positive (class 1) samples; cannot compute the class-weight ratio")

counter = np.array([n_negative, n_positive])  # same layout as a bincount result
ratio = counter[0] / counter[1]

# 2. Training with the positive class up-weighted by that ratio
model_weighted = XGBClassifier(
    learning_rate=0.1,
    max_depth=9,
    scale_pos_weight=ratio,  # the class weight is applied here
    random_state=42
)
model_weighted.fit(X_train, y_train)

# 3. Evaluation on the test set
y_pred = model_weighted.predict(X_test)
print("--- R√©sultats Class-Weighting ---")
print(classification_report(y_test, y_pred))