# Importing


## Importing the dataset from the UCI repository

In [None]:
!wget https://archive.ics.uci.edu/static/public/2/adult.zip


In [None]:
!unzip adult.zip


Archive:  adult.zip
replace Index? [y]es, [n]o, [A]ll, [N]one, [r]ename: 

## Importing the libraries

In [None]:
!pip install xgboost

In [None]:
# TensorFlow provides the Keras API used for the neural-network model.
!pip install tensorflow

In [None]:
!pip uninstall numpy -y
!pip install numpy==1.26.4
!pip install --upgrade --force-reinstall pandas scipy scikit-learn matplotlib seaborn



In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
import seaborn as sns
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, roc_curve, auc
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
import xgboost as xgb
import tensorflow as tf
from sklearn.model_selection import StratifiedKFold
from catboost import CatBoostClassifier
from tensorflow import keras
from tensorflow.keras import layers
from sklearn.model_selection import cross_val_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.naive_bayes import GaussianNB
import lightgbm as lgb

## Importing dataset into variables

In [None]:

# Define column names based on the dataset description
columns = [
    "age", "workclass", "fnlwgt", "education", "education-num", "marital-status",
    "occupation", "relationship", "race", "sex", "capital-gain", "capital-loss",
    "hours-per-week", "native-country", "income"
]

# Fields are separated by a comma followed by optional whitespace. The separator is a
# regular expression, which requires the more flexible "python" parsing engine instead
# of the default "c" engine. A raw string keeps "\s" as a regex token rather than an
# invalid string escape sequence.
FIELD_SEPARATOR = r",\s*"

# Load the dataset
train_data = pd.read_csv("adult.data", names=columns, sep=FIELD_SEPARATOR, engine="python")
# The first line of adult.test is a non-data header ("|1x3 Cross-validation"): skip it.
test_data = pd.read_csv("adult.test", names=columns, sep=FIELD_SEPARATOR, engine="python", skiprows=1)


# Exploring the dataset

## Statistical Analysis

In [None]:
# Merge the two datasets (train + test) into one frame for the exploratory analysis
data = pd.concat([train_data, test_data], ignore_index=True)

In [None]:
# Show the dtype pandas inferred for every column
print("\nTypes de données :", data.dtypes, sep="\n")

In [None]:
# Number of distinct values in each categorical (object-typed) column
categorical_frame = data.select_dtypes(include=["object"])
print("\nValeurs uniques par colonne catégorielle :")
for col in categorical_frame.columns:
    unique_count = categorical_frame[col].nunique()
    print(f"{col}: {unique_count} valeurs uniques")


De l'affichage on constate:
  - `sex` a deux valeurs uniques : on peut lui appliquer un encodage one-hot (une seule colonne binaire suffit).
  - `income` a 4 valeurs uniques, dont seulement 2 réelles : les deux autres (`>50K.` et `<=50K.`) se terminent par un **point** qu'il faudra enlever.

### Explication des statistiques spécifiques

- **Moyenne** : `stats.loc["mean"]`
- **Médiane** : `stats.loc["50%"]`
- **Écart-type** : `stats.loc["std"]` (élevé si les valeurs sont très dispersées).
- **Valeur minimale** : `stats.loc["min"]`
- **Valeur maximale** : `stats.loc["max"]`
- **1er quartile (25%)** : `stats.loc["25%"]`
- **3e quartile (75%)** : `stats.loc["75%"]`


In [None]:
# Descriptive statistics (count, mean, std, min, quartiles, max) of the numeric columns
stats = data.describe()

# Print every row except the first one ("count"), which is the same for all columns
print("📊 Statistiques descriptives des variables numériques :\n")
print(stats.iloc[1:])

De l'affichage on constate que:
1. **Les variables `capital-gain` et `capital-loss` sont majoritairement nulles**.
2. **Le poids final (`fnlwgt`) a une grande dispersion**, une normalisation pourrait être nécessaire.  
3. **Les niveaux d’éducation sont concentrés autour de 9-12 ans**, ce qui peut aider à segmenter la population.  
4. **Les heures travaillées sont fortement centrées sur 40h**, ce qui est cohérent avec des emplois à temps plein.  

In [None]:
print("\nValeurs manquantes par colonne :")
print(data.isnull().sum())

# "?" is sometimes used as a placeholder for missing data, and pandas does not
# count it as null, so count it explicitly for every column.
print("\nValeurs '?' par colonne :")
for col in data.columns:
    # Vectorized count of the matches instead of summing the Series element by element
    question_mark_count = (data[col] == '?').sum()
    print(f"{col}: {question_mark_count}")


## Data visualization

In [None]:
# Display the first few rows of the training set
train_data.head()

In [None]:
# Display the first few rows of the test set
test_data.head()

In [None]:
# Plot one histogram per numeric column to visualise how often each value range occurs
data.hist(figsize=(12, 10), bins=30, edgecolor='black') # split each variable's range into 30 bins
plt.suptitle("Distribution des variables numériques", fontsize=16)
plt.show()


Du schéma on remarque que :
  - `capital-gain` et `capital-loss` sont fortement concentrées en 0 : cela peut affecter la classification du modèle, ce qui confirme l'intérêt de les transformer en variables binaires (`has_capital_gain`, `has_capital_loss`).


In [None]:
categorical_columns = ["workclass", "education", "marital-status", "occupation", "race", "sex", "native-country"]

# One horizontal count plot per categorical column, placed on a 3x3 grid
# (9 slots, of which the 7 columns above use the first 7).
plt.figure(figsize=(12, 10))
for slot, col in enumerate(categorical_columns, start=1):
    plt.subplot(3, 3, slot)
    # Order the categories by frequency so the most frequent one is drawn first
    frequency_order = data[col].value_counts().index
    sns.countplot(data=data, y=col, order=frequency_order, palette="pastel")
    plt.title(f"Répartition de {col}")
    plt.xlabel("Nombre d'individus")

plt.tight_layout()  # keep the subplots from overlapping
plt.show()


Des graphiques ci-dessus on constate que:
  - Le sex male, la race white et la workclass private dominent dans le dataset.
  - La population est majoritairement diplomé d'un Bachelor ou plus
  - Présence de "?" indiquant des valeurs manquantes.
  - La population est majoritairement américaine.


In [None]:
# Bar chart of the number of individuals in each income class
income_ax = sns.countplot(data=data, x="income", palette="coolwarm")
income_ax.set_title("Répartition des classes de revenus")
income_ax.set_xlabel("Revenu")
income_ax.set_ylabel("Nombre d'individus")
plt.show()


Ce graphique confirme notre remarque : les points finaux devront être enlevés durant l'étape de nettoyage. (On les enlève ici, sur la copie fusionnée `data`, uniquement pour la visualisation.)

In [None]:
# Remove the "." characters from the income labels of the merged frame (literal match, not a regex)
data["income"] = data["income"].str.replace(".", "", regex=False)

In [None]:
# How does age relate to income?
# Count the rows for every (age, income) pair; pairs that never occur are set to 0
age_income_pivot = data.pivot_table(index="age", columns="income", aggfunc="size", fill_value=0)

# Draw the counts as a heatmap (one row per age, one column per income class)
plt.figure(figsize=(10, 6))
heatmap_ax = sns.heatmap(age_income_pivot, cmap="coolwarm", annot=False, cbar=True)
heatmap_ax.set_title("Répartition de l'âge selon le revenu")
heatmap_ax.set_xlabel("Revenu")
heatmap_ax.set_ylabel("Âge")
plt.show()



Du heatmap on remarque:
  - Parmi les personnes gagnant <=50K, on observe une forte concentration autour de 23 ans.
  - La majorité de la population gagne <=50K ; ces personnes ont principalement entre 18 et 47 ans.

In [None]:
# How do the weekly working hours relate to income?
plt.figure(figsize=(10, 5))
boxplot_ax = sns.boxplot(data=data, x="income", y="hours-per-week", palette="coolwarm")
boxplot_ax.set_title("Heures travaillées par semaine en fonction du revenu")
plt.show()


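From the boxplot we can read:
  - lower and upper whiskers: the most extreme values that lie within 1.5 × IQR of the quartiles (points beyond the whiskers are drawn individually as outliers)
  - lower and upper quartiles Q1 and Q3: the bottom and top edges of the box
  - median: the horizontal line inside the box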

# Pre-processing the dataset

## Cleaning the dataset: missing values, duplicates

### Deleting Spaces in values of type Object (espaces inutiles au début et à la fin)


In [None]:
def _strip_text_column(column):
    """Return `column` with leading/trailing whitespace removed when it is of object dtype; otherwise return it unchanged."""
    if column.dtype == "object":
        return column.str.strip()
    return column


# Remove the spaces before/after the values of every text column, in both sets
train_data = train_data.apply(_strip_text_column)
test_data = test_data.apply(_strip_text_column)


### Missing Values in columns: workclass, occupation and native-country

In [None]:
# Display the three columns named in the heading above (the ones containing missing values)
train_data[['workclass', 'occupation', 'native-country']]

In [None]:
# pandas does not recognise "?" as a missing value, so convert it to NaN in both sets
train_data = train_data.replace("?", np.nan)
test_data = test_data.replace("?", np.nan)

# Number of missing values per column in the training set
missing_per_column = train_data.isnull().sum()
print(missing_per_column)


In [None]:
# Replace missing values with the most frequent value (mode) of the TRAINING set.
# The test set is filled with the same training-set value, so that no statistic is
# estimated from the test data.
# A dedicated name is used so the global `columns` list (the CSV header passed to
# read_csv) is not overwritten.
columns_to_impute = ['workclass', 'occupation', 'native-country']
for col in columns_to_impute:
    # mode() returns a Series of the most frequent value(s); [0] extracts the first one
    most_frequent = train_data[col].mode()[0]
    train_data[col] = train_data[col].fillna(most_frequent)
    test_data[col] = test_data[col].fillna(most_frequent)

print(train_data.isnull().sum())  # Check that no NaN is left


### Fixing the income column


In [None]:
# Remove the "." characters from the income labels of both sets
# (cast to str first, then a literal, non-regex replacement)
for frame in (train_data, test_data):
    frame["income"] = frame["income"].astype(str).str.replace(".", "", regex=False)

# Visual check on the test set
test_data["income"]

### Let's check the duplicates

In [None]:
# Count the training rows that exactly repeat an earlier row
duplicate_mask = train_data.duplicated()
duplicates = duplicate_mask.sum()
print(f"Number of duplicate rows: {duplicates}")


In [None]:
# Display the rows flagged as duplicates to check whether they are strictly identical
duplicate_rows = train_data.loc[train_data.duplicated()]
duplicate_rows


We conclude that these rows represent different individuals --> not duplicates so we leave them as they are.

## Data Transformation: Encoding and scaling

### Binary Encoding
Previously during the statistical analysis
- **workclass** : 9 valeurs uniques  
- **education** : 16 valeurs uniques  
- **marital-status** : 7 valeurs uniques  
- **occupation** : 15 valeurs uniques  
- **relationship** : 6 valeurs uniques  
- **race** : 5 valeurs uniques  
- **sex** : 2 valeurs uniques  
- **native-country** : 42 valeurs uniques  
- **income** : 4 valeurs uniques  

In [None]:
# Print the distinct income labels of the training set, then of the test set
for frame in (train_data, test_data):
    print(frame["income"].unique())


In [None]:
# Learn the label -> integer mapping on the training labels only,
# then apply that same mapping to both sets.
le = LabelEncoder().fit(train_data["income"])
train_data["income"] = le.transform(train_data["income"])
test_data["income"] = le.transform(test_data["income"])

# Check processed data
for frame in (train_data, test_data):
    print(frame["income"].unique())

### One-hot encoding

In [None]:
# One-hot encoding creates one binary column per category, so both sets must contain
# the same categories to end up with the same columns. Compare the number of distinct
# native-country values in each set.
print(len(train_data["native-country"].unique()))
print(len(test_data["native-country"].unique()))

# Confirm by listing the categories present in the training set but absent from the
# test set. (Comparing the DataFrame column names at this point would always give an
# empty set, because both frames still have the same columns before encoding.)
missing_categories = set(train_data["native-country"]) - set(test_data["native-country"])
missing_categories

Result: the two counts differ (40 and 41), meaning that not every category is present in both sets, so the one-hot encoded columns will have to be aligned.

In [None]:
# Columns to one-hot encode, declared once so both sets are encoded the same way.
one_hot_columns = ['workclass', 'marital-status', 'occupation', 'relationship',
                   'race', 'sex', 'native-country']

# Fix the category list of every column from the TRAINING set and apply it to both
# sets. get_dummies then creates exactly the same columns for train and test, and
# drop_first removes the same baseline category in both. A test value unseen during
# training becomes NaN and is therefore encoded as all zeros.
for col in one_hot_columns:
    train_categories = sorted(train_data[col].dropna().unique())
    train_data[col] = pd.Categorical(train_data[col], categories=train_categories)
    test_data[col] = pd.Categorical(test_data[col], categories=train_categories)

# get_dummies turns each category into a new binary column; drop_first=True drops the
# first category of each variable to avoid multicollinearity (redundant columns).
train_data = pd.get_dummies(train_data, columns=one_hot_columns, drop_first=True)
test_data = pd.get_dummies(test_data, columns=one_hot_columns, drop_first=True)

We drop the `education` column because `education-num` already carries the same information in numeric form, which is sufficient for classification.

In [None]:
# Remove the textual education column from both sets
train_data = train_data.drop("education", axis=1)
test_data = test_data.drop("education", axis=1)

In [None]:
# Verification: both lines must print False (train first, then test)
for frame in (train_data, test_data):
    print("education" in frame.columns)

In [None]:
# Columns produced for the training set that the test set lacks
missing_cols = set(train_data.columns) - set(test_data.columns)

# Align the test columns with the training columns in one step: missing columns are
# added and filled with 0, and columns that exist only in the test set are removed,
# so both frames end up with exactly the same columns.
test_data = test_data.reindex(columns=train_data.columns, fill_value=0)

In [None]:
# Confirm the fix: prints True when both frames have the same number of columns
print(len(test_data.columns)==len(train_data.columns))

### Creation of new variables to avoid missclassification

In [None]:
# Replace the capital-gain / capital-loss amounts by binary "has a non-zero amount"
# flags, then remove the original amount columns, in both sets.
for frame in (train_data, test_data):
    frame["has_capital_gain"] = (frame["capital-gain"] > 0).astype(int)
    frame["has_capital_loss"] = (frame["capital-loss"] > 0).astype(int)
    frame.drop(columns=["capital-gain", "capital-loss"], inplace=True)


In [None]:
# Check that the original capital columns have been removed
print(train_data.head())
print(test_data.head())

### Scaling numerical values

On choisit le Standardscaler car on doit bien gérer les outliers et les différences d’échelle


In [None]:
# Numeric columns to standardise
num_cols = ["age", "fnlwgt", "education-num", "hours-per-week"]

# Estimate the scaling parameters on the training set only...
scaler = StandardScaler()
scaler.fit(train_data[num_cols])

# ...and apply that same transformation to both sets
train_data[num_cols] = scaler.transform(train_data[num_cols])
test_data[num_cols] = scaler.transform(test_data[num_cols])

In [None]:
# Check that the numeric columns have been scaled
print(train_data.head())
print(test_data.head())


Converting the one-hot encoded columns from `True`/`False` to `1`/`0`.

In [None]:
# Boolean (one-hot) columns, as found in the training set
cat_cols = train_data.select_dtypes(include=['bool']).columns

# Cast those columns to integers in both sets
for frame in (train_data, test_data):
    frame[cat_cols] = frame[cat_cols].astype(int)


In [None]:
# Preview both sets after the boolean-to-integer conversion
print(train_data.head())
print(test_data.head())


Réordonner les colonnes du jeu de test dans le même ordre que celles du jeu d'entraînement.

In [None]:
# Put the test columns in the same order as the training columns
test_data = test_data.loc[:, train_data.columns]

# Check that both column lists are now identical, order included
same_order = test_data.columns.tolist() == train_data.columns.tolist()
print("L'ordre est identique :", same_order)

# Data Splitting

### Separating matrix of features X and target variable y

In [None]:
# Name of the target variable
target_column = "income"

# Labels (y): the target column of each set
y_train = train_data[target_column]
y_test = test_data[target_column]

# Features (X): every column except the target
X_train = train_data.drop(target_column, axis=1)
X_test = test_data.drop(target_column, axis=1)


In [None]:
# Display the training feature matrix
X_train

In [None]:
# Display the training labels
y_train

### Optional: the UCI archive already ships a predefined train/test split (`adult.data` / `adult.test`, roughly two-thirds / one-third), so no further split is required


In [None]:

# X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, random_state=42, stratify=y_train)


# Model Evaluation

## Metrics and vizualisation

In [None]:
def evaluate_model(y_true, y_pred, y_prob, model_name=None):
    """
    Evaluate a binary classification model using Accuracy, Precision, Recall,
    F1-Score, Confusion Matrix, and ROC Curve.

    The four scalar metrics are printed, then the confusion matrix and the ROC
    curve are each drawn in their own figure.

    Parameters:
    - y_true: Actual labels
    - y_pred: Predicted labels
    - y_prob: Predicted probabilities of the positive class (for ROC Curve)
    - model_name: Optional display name of the model. When given, it is printed
      as a header before the metrics and appended to both plot titles. When
      omitted, the output is the same as before this parameter existed.

    Returns:
    - Tuple (accuracy, precision, recall, f1, roc_auc)
    """
    # Optional header / title suffix so several models can be told apart
    title_suffix = f" - {model_name}" if model_name else ""
    if model_name:
        print(f"{model_name}:\n")

    # Compute Metrics
    accuracy = accuracy_score(y_true, y_pred)
    precision = precision_score(y_true, y_pred)
    recall = recall_score(y_true, y_pred)
    f1 = f1_score(y_true, y_pred)
    conf_matrix = confusion_matrix(y_true, y_pred)

    # Print Evaluation Metrics
    print(f"✅ Accuracy: {accuracy:.4f}")
    print(f"✅ Precision: {precision:.4f}")
    print(f"✅ Recall: {recall:.4f}")
    print(f"✅ F1-Score: {f1:.4f}\n")

    # Plot Confusion Matrix (rows = actual class, columns = predicted class)
    class_labels = ['Negative', 'Positive']
    plt.figure(figsize=(5, 4))
    sns.heatmap(conf_matrix, annot=True, fmt='d', cmap='Blues',
                xticklabels=class_labels, yticklabels=class_labels)
    plt.xlabel('Predicted')
    plt.ylabel('Actual')
    plt.title(f'Confusion Matrix{title_suffix}')
    plt.show()

    # Compute ROC Curve and AUC
    fpr, tpr, _ = roc_curve(y_true, y_prob)
    roc_auc = auc(fpr, tpr)

    # Plot ROC Curve
    plt.figure(figsize=(6, 5))
    plt.plot(fpr, tpr, color='blue', lw=2, label=f'ROC Curve (AUC = {roc_auc:.4f})')
    plt.plot([0, 1], [0, 1], color='grey', linestyle='--')  # Diagonal reference line
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title(f'ROC Curve{title_suffix}')
    plt.legend(loc='lower right')
    plt.show()

    return accuracy, precision, recall, f1, roc_auc

# Model Training

## Logistic Regression

In [None]:
# Create the logistic regression (up to 1000 solver iterations) and fit it on the training set
model_lr = LogisticRegression(max_iter=1000).fit(X_train, y_train)

# Test-set outputs: probability of the positive class (second column), then hard labels
y_prob_lr = model_lr.predict_proba(X_test)[:, 1]
y_pred_lr = model_lr.predict(X_test)


## KNN

In [None]:
# k-nearest neighbours with k = 5, fitted on the training set
knn = KNeighborsClassifier(n_neighbors=5).fit(X_train, y_train)

# Test-set outputs: probability of the positive class (second column), then hard labels
y_prob_knn = knn.predict_proba(X_test)[:, 1]
y_pred_knn = knn.predict(X_test)


## Naive Bayes

In [None]:
# Gaussian Naive Bayes with default settings, fitted on the training set
nb = GaussianNB().fit(X_train, y_train)

# Test-set outputs: probability of the positive class (second column), then hard labels
y_prob_nb = nb.predict_proba(X_test)[:, 1]
y_pred_nb = nb.predict(X_test)

## SVM

In [None]:
# RBF-kernel SVM; probability=True enables predict_proba, which the ROC curve needs
svm_model = SVC(kernel='rbf', C=1.0, probability=True, random_state=42)
svm_model.fit(X_train, y_train)

# Test-set outputs: probability of the positive class (second column), then hard labels
y_prob_svm = svm_model.predict_proba(X_test)[:, 1]
y_pred_svm = svm_model.predict(X_test)

## XGboost

In [None]:
# Define XGBoost parameters.
# The scikit-learn wrapper (XGBClassifier) is trained directly on X_train / y_train,
# so no xgb.DMatrix conversion is needed here.
params = {
    'objective': 'binary:logistic',  # Binary classification
    'eval_metric': 'logloss',        # Logarithmic loss
    'learning_rate': 0.1,
    'max_depth': 6,
    'n_estimators': 100,
    'random_state': 42
}

# Train the XGBoost classifier
xgb_model = xgb.XGBClassifier(**params)
xgb_model.fit(X_train, y_train)

# Make predictions
y_pred_xgb = xgb_model.predict(X_test)
y_prob_xgb = xgb_model.predict_proba(X_test)[:, 1]  # Probabilities for the positive class

## Decision Trees

In [None]:
# Decision tree limited to depth 5, fitted on the training set
tree_model = DecisionTreeClassifier(max_depth=5, random_state=42).fit(X_train, y_train)

# Test-set outputs: probability of the positive class (second column), then hard labels
y_prob_tree = tree_model.predict_proba(X_test)[:, 1]
y_pred_tree = tree_model.predict(X_test)

## Random Forests

In [None]:
# Random forest of 100 trees, each limited to depth 10, fitted on the training set
rf_model = RandomForestClassifier(n_estimators=100, max_depth=10, random_state=42).fit(X_train, y_train)

# Test-set outputs: probability of the positive class (second column), then hard labels
y_prob_rf = rf_model.predict_proba(X_test)[:, 1]
y_pred_rf = rf_model.predict(X_test)

## CatBoost

In [None]:
# CatBoost: 100 boosting iterations, depth-6 trees, training log silenced (verbose=0)
cat = CatBoostClassifier(iterations=100, learning_rate=0.1, depth=6, verbose=0)
cat.fit(X_train, y_train)

# Test-set outputs: probability of the positive class (second column), then hard labels
y_prob_cat = cat.predict_proba(X_test)[:, 1]
y_pred_cat = cat.predict(X_test)

## Light GBM

In [None]:
# LightGBM: 100 boosting rounds of depth-3 trees
lgbm = lgb.LGBMClassifier(n_estimators=100, learning_rate=0.1, max_depth=3)
lgbm.fit(X_train, y_train)

# Test-set outputs: probability of the positive class (second column), then hard labels
y_prob_lgbm = lgbm.predict_proba(X_test)[:, 1]
y_pred_lgbm = lgbm.predict(X_test)

## Neural Network




### Building the model

In [None]:
# Seed TensorFlow's random number generation (e.g. weight initialisation) to reduce
# run-to-run variation, consistent with the random_state=42 used for the other models.
tf.random.set_seed(42)

# Build Neural Network Model
model = keras.Sequential([
    keras.Input(shape=(X_train.shape[1],)),  # Input: one value per feature column
    layers.Dense(64, activation='relu'),  # Hidden layer 1
    layers.Dense(32, activation='relu'),  # Hidden layer 2
    layers.Dense(16, activation='relu'),  # Hidden layer 3
    layers.Dense(1, activation='sigmoid')  # Output layer (binary classification)
])

# Compile the model
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Train the model.
# NOTE(review): the test set is passed as validation data, so the "validation" curves
# recorded in `history` are test-set curves. No callback uses them, so they are only
# monitored here.
history = model.fit(X_train, y_train, epochs=50, batch_size=32, validation_data=(X_test, y_test), verbose=1)

# Make predictions
y_prob_nn = model.predict(X_test).flatten()  # Probabilities for the positive class
y_pred_nn = (y_prob_nn > 0.5).astype(int)  # Convert probabilities to binary predictions

# Evaluate the model
evaluate_model(y_test, y_pred_nn, y_prob_nn)

### Viewing loss function history

In [None]:
# Plot the per-epoch loss and accuracy recorded during training, side by side
fig, (loss_ax, accuracy_ax) = plt.subplots(1, 2, figsize=(12, 4))

# Left panel: loss curve
loss_ax.plot(history.history['loss'], label='Train Loss')
loss_ax.plot(history.history['val_loss'], label='Validation Loss')
loss_ax.set_xlabel('Epochs')
loss_ax.set_ylabel('Loss')
loss_ax.set_title('Training & Validation Loss')
loss_ax.legend()

# Right panel: accuracy curve
accuracy_ax.plot(history.history['accuracy'], label='Train Accuracy')
accuracy_ax.plot(history.history['val_accuracy'], label='Validation Accuracy')
accuracy_ax.set_xlabel('Epochs')
accuracy_ax.set_ylabel('Accuracy')
accuracy_ax.set_title('Training & Validation Accuracy')
accuracy_ax.legend()

plt.show()


# Optimisation

## Feature Importance

In [None]:
# Get feature importance of random forest
feature_importance = rf_model.feature_importances_
sorted_idx = np.argsort(feature_importance)[::-1]  # indices from most to least important

# Bar chart: one bar per feature, most important first
plt.figure(figsize=(10, 5))
plt.bar(range(X_train.shape[1]), feature_importance[sorted_idx], align='center')
plt.xticks(range(X_train.shape[1]), X_train.columns[sorted_idx], rotation=90)
plt.xlabel("Feature")  # the x-axis lists the features; the scores are on the y-axis
plt.ylabel("Importance Score")
plt.title("Feature Importance in Random Forest")
plt.show()


## Cross-Validation using Stratified K-Fold Cross-Validation

In [None]:
# Shared splitter for the cross-validation cells below: 5 stratified folds, shuffled with a fixed seed
stratified_kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

In [None]:
# Cross-validated accuracy of the logistic regression on the training set
cv_scores_lr = cross_val_score(
    estimator=model_lr, X=X_train, y=y_train, cv=stratified_kfold, scoring='accuracy'
)
print("Mean Logistic Regression accuracy:", np.mean(cv_scores_lr))


In [None]:
# cv_scores_svm = cross_val_score(svm_model, X_train, y_train, cv=stratified_kfold, scoring='accuracy')

# print("Mean SVM accuracy:", cv_scores_svm.mean())


In [None]:
# Cross-validated accuracy of the random forest on the training set
cv_scores_stratified = cross_val_score(
    estimator=rf_model, X=X_train, y=y_train, cv=stratified_kfold, scoring='accuracy'
)
print("Mean Stratified K-Fold accuracy for Random Forest:", np.mean(cv_scores_stratified))


In [None]:
# Cross-validated accuracy of the KNN classifier on the training set
cv_scores_knn = cross_val_score(
    estimator=knn, X=X_train, y=y_train, cv=stratified_kfold, scoring='accuracy'
)
print("KNN - Mean Accuracy:", cv_scores_knn.mean())

In [None]:
# Cross-validated accuracy of the Naive Bayes classifier on the training set
cv_scores_nb = cross_val_score(
    estimator=nb, X=X_train, y=y_train, cv=stratified_kfold, scoring='accuracy'
)
print("Naïve Bayes - Mean Accuracy:", cv_scores_nb.mean())

In [None]:
# Cross-validated accuracy of the XGBoost classifier on the training set
cv_scores_xgb = cross_val_score(
    estimator=xgb_model, X=X_train, y=y_train, cv=stratified_kfold, scoring='accuracy'
)
print("XGBoost - Mean Accuracy:", cv_scores_xgb.mean())

In [None]:
# Cross-validated accuracy of the decision tree on the training set
cv_scores_dt = cross_val_score(
    estimator=tree_model, X=X_train, y=y_train, cv=stratified_kfold, scoring='accuracy'
)
print("Decision Tree - Mean Accuracy:", cv_scores_dt.mean())

In [None]:
# Cross-validated accuracy of the CatBoost classifier on the training set
cv_scores_cb = cross_val_score(
    estimator=cat, X=X_train, y=y_train, cv=stratified_kfold, scoring='accuracy'
)
print("CatBoost - Mean Accuracy:", cv_scores_cb.mean())

In [None]:
# Cross-validated accuracy of the LightGBM classifier on the training set
cv_scores_lgbm = cross_val_score(
    estimator=lgbm, X=X_train, y=y_train, cv=stratified_kfold, scoring='accuracy'
)
print("LightGBM - Mean Accuracy:", cv_scores_lgbm.mean())

## GridSearch

In [None]:
# Hyper-parameter grid explored for the decision tree
param_grid_dt = {
    'max_depth': [3, 5, 10],
    'min_samples_split': [2, 5, 10],
    'criterion': ['gini', 'entropy']
}

# Exhaustive search with 5-fold cross-validation, selecting on accuracy.
# random_state is fixed like for the other models so the search is repeatable;
# n_jobs=-1 evaluates the candidates in parallel, like the other grid searches below.
grid_search_dt = GridSearchCV(DecisionTreeClassifier(random_state=42), param_grid_dt,
                              cv=5, scoring='accuracy', n_jobs=-1)
grid_search_dt.fit(X_train, y_train)

# Best parameters
print("Best Parameters for Decision Tree:", grid_search_dt.best_params_)

# Best model (refitted on the whole training set by GridSearchCV)
best_dt = grid_search_dt.best_estimator_
y_pred_dt = best_dt.predict(X_test)
y_prob_dt = best_dt.predict_proba(X_test)[:, 1]

# Evaluation: print the model name first, then call evaluate_model with its three
# arguments (labels, predictions, probabilities).
print("Decision Tree:\n")
evaluate_model(y_test, y_pred_dt, y_prob_dt)


In [None]:
# Hyper-parameter grid explored for the random forest
param_grid_rf = {
    'n_estimators': [50, 100, 200],
    'max_depth': [3, 6, 9],
    'min_samples_split': [2, 5, 10]
}

# Exhaustive search with 5-fold cross-validation, selecting on accuracy.
# random_state is fixed like for the other models so the search is repeatable;
# n_jobs=-1 evaluates the candidates in parallel, like the other grid searches below.
grid_search_rf = GridSearchCV(RandomForestClassifier(random_state=42), param_grid_rf,
                              cv=5, scoring='accuracy', n_jobs=-1)
grid_search_rf.fit(X_train, y_train)

# Best parameters
print("Best Parameters for Random Forest:", grid_search_rf.best_params_)

# Best model (refitted on the whole training set by GridSearchCV)
best_rf = grid_search_rf.best_estimator_
y_pred_rf = best_rf.predict(X_test)
y_prob_rf = best_rf.predict_proba(X_test)[:, 1]

# Evaluation: print the model name first, then call evaluate_model with its three
# arguments (labels, predictions, probabilities).
print("Random Forest:\n")
evaluate_model(y_test, y_pred_rf, y_prob_rf)


In [None]:
# Parameter grid
param_grid_cat = {
    'iterations': [100, 200, 300],
    'learning_rate': [0.01, 0.1, 0.2],
    'depth': [4, 6, 8]
}

# Base model; verbose=0 avoids flooding the output with training logs
cat_model = CatBoostClassifier(verbose=0)

# Exhaustive search with 5-fold cross-validation, selecting on accuracy, run in parallel
grid_search_cat = GridSearchCV(cat_model, param_grid_cat, cv=5, scoring='accuracy', n_jobs=-1)
grid_search_cat.fit(X_train, y_train)

# Best parameters
print("Best Parameters for CatBoost:", grid_search_cat.best_params_)

# Best model (refitted on the whole training set by GridSearchCV)
best_cat = grid_search_cat.best_estimator_

# Predictions
y_pred_cat = best_cat.predict(X_test)
y_prob_cat = best_cat.predict_proba(X_test)[:, 1]

# Evaluation: print the model name first, then call evaluate_model with its three
# arguments (labels, predictions, probabilities).
print("CatBoost:\n")
evaluate_model(y_test, y_pred_cat, y_prob_cat)


In [None]:
# Parameter grid
param_grid_lgbm = {
    'num_leaves': [10, 20, 31],
    'learning_rate': [0.01, 0.1, 0.2],
    'n_estimators': [50, 100, 200]
}

# Base model. A dedicated name is used (as in the CatBoost / KNN / Naive Bayes cells)
# so the already-fitted `lgbm` classifier from the training section is not overwritten.
lgbm_model = lgb.LGBMClassifier()

# Exhaustive search with 5-fold cross-validation, selecting on accuracy, run in parallel
grid_search_lgbm = GridSearchCV(lgbm_model, param_grid_lgbm, cv=5, scoring='accuracy', n_jobs=-1)
grid_search_lgbm.fit(X_train, y_train)

# Best parameters
print("Best Parameters for LightGBM:", grid_search_lgbm.best_params_)

# Best model (refitted on the whole training set by GridSearchCV)
best_lgbm = grid_search_lgbm.best_estimator_

# Predictions
y_pred_lgbm = best_lgbm.predict(X_test)
y_prob_lgbm = best_lgbm.predict_proba(X_test)[:, 1]

# Evaluation: print the model name first, then call evaluate_model with its three
# arguments (labels, predictions, probabilities).
print("LightGBM:\n")
evaluate_model(y_test, y_pred_lgbm, y_prob_lgbm)


In [None]:
# Parameter grid
param_grid_knn = {
    'n_neighbors': [3, 5, 7, 9],  # Number of neighbours
    'weights': ['uniform', 'distance'],  # Neighbour weighting scheme
    'metric': ['euclidean', 'manhattan']  # Distance metric
}

# Base model
knn_model = KNeighborsClassifier()

# Exhaustive search with 5-fold cross-validation, selecting on accuracy, run in parallel
grid_search_knn = GridSearchCV(knn_model, param_grid_knn, cv=5, scoring='accuracy', n_jobs=-1)
grid_search_knn.fit(X_train, y_train)  # Scaled features are recommended for KNN

# Best parameters
print("Best Parameters for KNN:", grid_search_knn.best_params_)

# Best model (refitted on the whole training set by GridSearchCV)
best_knn = grid_search_knn.best_estimator_

# Predictions
y_pred_knn = best_knn.predict(X_test)
y_prob_knn = best_knn.predict_proba(X_test)[:, 1]

# Evaluation: print the model name first, then call evaluate_model with its three
# arguments (labels, predictions, probabilities).
print("KNN:\n")
evaluate_model(y_test, y_pred_knn, y_prob_knn)


In [None]:
# Parameter grid
param_grid_nb = {
    'var_smoothing': [1e-9, 1e-8, 1e-7, 1e-6]  # Variance-smoothing hyper-parameter
}

# Base model
nb_model = GaussianNB()

# Exhaustive search with 5-fold cross-validation, selecting on accuracy, run in parallel
grid_search_nb = GridSearchCV(nb_model, param_grid_nb, cv=5, scoring='accuracy', n_jobs=-1)
grid_search_nb.fit(X_train, y_train)

# Best parameters
print("Best Parameters for Naive Bayes:", grid_search_nb.best_params_)

# Best model (refitted on the whole training set by GridSearchCV)
best_nb = grid_search_nb.best_estimator_

# Predictions
y_pred_nb = best_nb.predict(X_test)
y_prob_nb = best_nb.predict_proba(X_test)[:, 1]

# Evaluation: print the model name first, then call evaluate_model with its three
# arguments (labels, predictions, probabilities).
print("Naive Bayes:\n")
evaluate_model(y_test, y_pred_nb, y_prob_nb)


In [None]:
# Hyper-parameter grid explored for the SVM
param_grid_svm = {
    'C': [0.1, 1, 10],
    'kernel': ['linear', 'rbf', 'poly'],
    'gamma': ['scale', 'auto']
}

# Exhaustive search with 5-fold cross-validation, selecting on accuracy.
# A model-specific name is used (like grid_search_dt, grid_search_rf, ...) so this
# search is not overwritten by a later one; n_jobs=-1 evaluates the candidates in
# parallel. probability=True is required for predict_proba below.
grid_search_svm = GridSearchCV(SVC(probability=True), param_grid_svm, cv=5,
                               scoring='accuracy', n_jobs=-1)
grid_search_svm.fit(X_train, y_train)

# Best parameters
print("Best Parameters for SVM:", grid_search_svm.best_params_)

# Best model (refitted on the whole training set by GridSearchCV)
best_svm = grid_search_svm.best_estimator_
y_pred_best_svm = best_svm.predict(X_test)
y_prob_best_svm = best_svm.predict_proba(X_test)[:, 1]

# Evaluate best model
evaluate_model(y_test, y_pred_best_svm, y_prob_best_svm)


In [None]:
# Hyper-parameter grid explored for XGBoost
param_grid_xgb = {
    'learning_rate': [0.01, 0.1, 0.2],
    'max_depth': [3, 6, 9],
    'n_estimators': [50, 100, 200]
}

# Exhaustive search with 5-fold cross-validation, selecting on accuracy.
# A model-specific name is used (like grid_search_dt, grid_search_rf, ...) instead of
# the generic `grid_search`, and random_state matches the baseline XGBoost model.
grid_search_xgb = GridSearchCV(xgb.XGBClassifier(objective='binary:logistic', random_state=42),
                               param_grid_xgb, cv=5, scoring='accuracy')
grid_search_xgb.fit(X_train, y_train)

# Best parameters
print("Best Parameters:", grid_search_xgb.best_params_)

# Best model (refitted on the whole training set by GridSearchCV)
best_xgb = grid_search_xgb.best_estimator_
y_pred_best_xgb = best_xgb.predict(X_test)
y_prob_best_xgb = best_xgb.predict_proba(X_test)[:, 1]

# Evaluate best model
evaluate_model(y_test, y_pred_best_xgb, y_prob_best_xgb)


# Results Analysis

## Comparing models' performances

In [None]:
# No tuned logistic-regression model is produced in this notebook, so report the
# predictions of the model trained in the "Logistic Regression" section.
print("Logistic Regression:\n")
evaluate_model(y_test, y_pred_lr, y_prob_lr)

In [None]:
# print("SVM:\n")
# evaluate_model(y_test, y_pred_best_svm, y_prob_best_svm)

In [None]:
# Report the predictions of the tuned XGBoost model (best estimator of the grid search)
print("XGboost:\n")
evaluate_model(y_test, y_pred_best_xgb, y_prob_best_xgb)

In [None]:
# Report the predictions of the tuned decision tree; the grid-search cell stores them
# in y_pred_dt / y_prob_dt.
print("Decision Tree:\n")
evaluate_model(y_test, y_pred_dt, y_prob_dt)

In [None]:
# Report the predictions of the tuned random forest; the grid-search cell stores them
# in y_pred_rf / y_prob_rf.
print("Random Forest:\n")
evaluate_model(y_test, y_pred_rf, y_prob_rf)

In [None]:
# Report the predictions of the neural network trained in the "Neural Network" section
print("Neural Network:\n")
evaluate_model(y_test, y_pred_nn, y_prob_nn)