## 1. Importation :

In [25]:
import warnings
warnings.filterwarnings("ignore")
import pandas as pd
import os
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from pathlib import Path
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder
from sklearn.impute import SimpleImputer
from sklearn.feature_selection import VarianceThreshold
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.linear_model import Lasso, LogisticRegression
from sklearn.metrics import mean_squared_error, r2_score, accuracy_score, f1_score, roc_auc_score
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier, ExtraTreesRegressor, ExtraTreesClassifier, GradientBoostingRegressor, GradientBoostingClassifier


## 2. Load Data : 

In [47]:
def load_dataset(path):
    """Load a CSV file into a pandas DataFrame.

    Args:
        path: Location of the file, given as a string or as any
            ``os.PathLike`` object (for example ``pathlib.Path``).

    Returns:
        The DataFrame produced by ``pd.read_csv``.

    Raises:
        FileNotFoundError: If nothing exists at ``path``.
        ValueError: If the file name does not end in ``.csv``
            (the comparison is case-insensitive).
    """
    # Normalise to a plain string so Path objects go through the same
    # suffix check as strings (Path has no ``endswith`` method).
    path_str = os.fspath(path)

    if not os.path.exists(path_str):
        raise FileNotFoundError(f"Le fichier {path} n'existe pas.")

    # Lower-case before comparing so "DATA.CSV" is accepted as well.
    if not path_str.lower().endswith(".csv"):
        raise ValueError("Format non support√© : seuls les fichiers .csv sont accept√©s.")

    print(f"Chargement du dataset : {path}")
    return pd.read_csv(path_str)

## 2. Data Understanding : 

In [50]:
def _iqr_outlier_counts(df, numeric_cols):
    """Return ``{column: count}`` of values outside the 1.5 * IQR fences."""
    counts = {}
    for col in numeric_cols:
        q1 = df[col].quantile(0.25)
        q3 = df[col].quantile(0.75)
        iqr = q3 - q1
        # NaN compares False on both sides, so missing values are not counted.
        outside = (df[col] < q1 - 1.5 * iqr) | (df[col] > q3 + 1.5 * iqr)
        counts[col] = int(outside.sum())
    return counts


def _plot_correlation_heatmap(df, numeric_cols):
    """Draw the correlation heatmap, hiding cells where |corr| < 0.5."""
    corr = df[numeric_cols].corr()
    # Mask on the absolute value but plot the signed value: the legend below
    # describes negative correlations as blue, which requires keeping the sign.
    mask = corr.abs() < 0.5

    plt.figure(figsize=(18, 14))
    ax = sns.heatmap(
        corr, mask=mask, cmap="coolwarm", annot=False,
        # Fixed symmetric scale so red/blue always mean +1/-1.
        vmin=-1, vmax=1,
        linewidths=0.5,
        cbar_kws={'label': 'Force de corr√©lation'}
    )

    plt.title("Heatmap des corr√©lations (seulement |corr| > 0.5)", fontsize=16)

    # Explanatory legend, placed just above the axes.
    plt.text(
        x=0.02, y=1.12,
        s=(
            "L√©gende des couleurs :\n"
            "Rouge fonc√© ‚Üí Corr√©lation tr√®s positive (‚âà 0.8 √† 1.0)\n"
            "Bleu fonc√© ‚Üí Corr√©lation tr√®s n√©gative (‚âà -0.8 √† -1.0)\n"
            "Blanc ‚Üí Corr√©lation faible (< 0.5) ou masqu√©e"
        ),
        fontsize=12,
        transform=ax.transAxes,
        verticalalignment='top',
        bbox=dict(boxstyle="round,pad=0.4", fc="white", ec="black", alpha=0.8)
    )

    plt.show()


def data_understanding(df, target_column=None):
    """Print and plot an exploratory overview of ``df``.

    The report covers, in order: shape, head/tail, ``info()``, dtypes,
    missing values (counts and percentages), duplicate rows, a preview of
    the target column, IQR outlier counts, histograms, a correlation
    heatmap, a bar plot of missing values and one boxplot per numeric column.

    Args:
        df: The DataFrame to describe. It is not modified.
        target_column: Optional name of the target column to preview.

    Returns:
        A dict mapping each numeric column name to the number of values
        lying outside ``[Q1 - 1.5 * IQR, Q3 + 1.5 * IQR]``. The dict is
        empty when ``df`` has no numeric column.
    """
    # ``display`` only exists as a builtin inside IPython/Jupyter; fall back
    # to ``print`` so the function also runs as a plain script.
    try:
        from IPython.display import display as show
    except ImportError:
        show = print

    print("\nüîπ Shape:", df.shape)

    print("\nüîπ First 5 rows:")
    show(df.head())

    print("\nüîπ Last 5 rows:")
    show(df.tail())

    print("\nüîπ Info:")
    # info() writes its report itself and returns None, so it must not be
    # wrapped in print() (that would emit a stray "None" line).
    df.info()

    print("\nüîπ Data types:")
    print(df.dtypes)

    # Computed once; reused for the bar plot further down.
    missing_counts = df.isnull().sum()
    missing_counts = missing_counts[missing_counts > 0]

    print("\nüîπ Missing values per column:")
    show(missing_counts)

    print("\nüîπ Percentage of missing values per column:")
    missing_percent = (df.isnull().mean() * 100).round(2)
    show(missing_percent[missing_percent > 0])

    n_duplicates = df.duplicated().sum()
    print("\nüîπ Duplicate rows count:", n_duplicates)
    if n_duplicates > 0:
        print("üîπ Duplicate rows:")
        # keep=False shows every member of each duplicated group.
        show(df[df.duplicated(keep=False)])

    print("\nüîπ Target variable preview:")
    if target_column and target_column in df.columns:
        show(df[[target_column]].head())
    else:
        print(" Target column not found or not provided.")

    # Numeric columns are detected once and reused by every section below.
    numeric_cols = df.select_dtypes(include=np.number).columns
    has_numeric = len(numeric_cols) > 0

    print("\nüîπ Numeric columns:", list(numeric_cols))

    # Outlier detection (IQR rule).
    if has_numeric:
        outlier_counts = _iqr_outlier_counts(df, numeric_cols)
    else:
        outlier_counts = {}
        print("\n Aucun champ num√©rique ‚Üí impossible de d√©tecter les outliers.")

    print("\nüîπ Number of outliers per numeric column:")
    show(outlier_counts)

    # Histograms of the numeric variables.
    if has_numeric:
        print("\n Distribution des variables num√©riques :")
        df[numeric_cols].hist(bins=30, figsize=(12, 8))
        plt.tight_layout()
        plt.show()
    else:
        print("\n Aucun champ num√©rique ‚Üí pas d‚Äôhistogrammes.")

    # Correlation heatmap (needs at least two numeric columns).
    if len(numeric_cols) > 1:
        _plot_correlation_heatmap(df, numeric_cols)
    else:
        print("\n Pas assez de colonnes num√©riques pour une heatmap.")

    # Bar plot of missing values.
    if len(missing_counts) > 0:
        plt.figure(figsize=(10, 5))
        missing_counts.sort_values().plot(kind='barh')
        plt.title("Valeurs manquantes par colonne")
        plt.xlabel("Nombre de valeurs manquantes")
        plt.show()
    else:
        print("\n Aucune valeur manquante.")

    # Boxplots to visualise the outliers.
    if has_numeric:
        for col in numeric_cols:
            # A column with fewer than two distinct values has no spread to draw.
            if df[col].dropna().nunique() > 1:
                plt.figure(figsize=(6, 3))
                sns.boxplot(x=df[col])
                plt.title(f"Boxplot ‚Äì {col}")
                plt.show()
            else:
                print(f"Impossible de tracer un boxplot pour {col} (pas assez de valeurs).")
    else:
        print("\n Aucun champ num√©rique ‚Üí pas de boxplots.")

    return outlier_counts


## 3. Data Preparation :

In [54]:
def data_preparation(df, target_column, task="classification", apply_capping=True):
    """Clean ``df`` and turn it into model-ready features and target.

    Steps: drop feature columns that are entirely NaN, drop rows whose target
    is missing, optionally cap numeric outliers, drop constant numeric
    columns, then impute/scale numeric features and impute/one-hot encode
    ``object``/``category`` features.

    Args:
        df: Input DataFrame. A copy is made; the caller's frame is untouched.
        target_column: Name of the target column in ``df``.
        task: Either ``"classification"`` or ``"regression"``.
        apply_capping: When true and numeric features exist, pass the
            features through ``cap_iqr_df`` before any other processing.

    Returns:
        A tuple ``(X_prepared, y_final, preprocessor, df_clean, label_encoder)``:
        the output of ``preprocessor.fit_transform``, the target values
        (label-encoded for an object/category classification target, cast to
        float for regression), the fitted ``ColumnTransformer``, the cleaned
        features concatenated with the target (index reset), and the fitted
        ``LabelEncoder`` or ``None`` when no encoding was applied.

    Raises:
        ValueError: If ``task`` is not a supported value.
        KeyError: If ``target_column`` is not a column of ``df``.
    """
    # Validate up front: an unknown task would otherwise only fail at the
    # very end, after all the fitting work, with an unbound ``y_final``.
    if task not in ("classification", "regression"):
        raise ValueError(
            f"Unknown task {task!r}: expected 'classification' or 'regression'."
        )
    if target_column not in df.columns:
        raise KeyError(f"Target column {target_column!r} not found in the DataFrame.")

    df = df.copy()

    # Drop feature columns that are 100% NaN, then rows without a target.
    nan_cols = [c for c in df.columns if c != target_column and df[c].isna().all()]
    df = df.drop(columns=nan_cols).dropna(subset=[target_column])

    # Split features / target.
    y = df[target_column]
    X = df.drop(columns=[target_column])

    # Type detection. ``np.number`` matches what data_understanding() treats
    # as numeric (int32/float32 included, not only int64/float64).
    # Columns of any other dtype are in neither list and therefore are not
    # passed to either pipeline below.
    numeric_cols = X.select_dtypes(include=np.number).columns.tolist()
    categorical_cols = X.select_dtypes(include=["object", "category"]).columns.tolist()

    # Outlier capping. NOTE(review): cap_iqr_df is not defined in this block;
    # it is expected to be provided elsewhere in the file.
    if apply_capping and numeric_cols:
        X = cap_iqr_df(X, numeric_cols)

    # Remove constant numeric columns. A nunique() check is used instead of
    # VarianceThreshold, which is believed to raise ValueError when *every*
    # column is constant (e.g. one constant numeric column next to
    # categorical ones).
    zero_var_cols = [c for c in numeric_cols if X[c].nunique(dropna=True) <= 1]
    X = X.drop(columns=zero_var_cols)
    dropped = set(zero_var_cols)
    numeric_cols = [c for c in numeric_cols if c not in dropped]

    # Numeric and categorical pipelines.
    numeric_pipeline = Pipeline([
        ("imputer", SimpleImputer(strategy="median")),
        ("scaler", StandardScaler())
    ])
    categorical_pipeline = Pipeline([
        ("imputer", SimpleImputer(strategy="most_frequent")),
        ("encoder", OneHotEncoder(handle_unknown="ignore", sparse_output=False))
    ])
    preprocessor = ColumnTransformer([
        ("num", numeric_pipeline, numeric_cols),
        ("cat", categorical_pipeline, categorical_cols)
    ])
    # NOTE(review): the preprocessor is fitted on all rows; if a train/test
    # split happens afterwards, test rows influence the fitted statistics.
    X_prepared = preprocessor.fit_transform(X)

    # Target encoding.
    label_encoder = None
    if task == "classification":
        if y.dtype == "object" or y.dtype.name == "category":
            label_encoder = LabelEncoder()
            y_final = label_encoder.fit_transform(y)
        else:
            y_final = y.values
    else:  # task == "regression" (validated above)
        y_final = y.values.astype(float)

    # Final cleaned dataset: untransformed (but capped/filtered) features + target.
    df_clean = pd.concat([X.reset_index(drop=True), y.reset_index(drop=True)], axis=1)

    return X_prepared, y_final, preprocessor, df_clean, label_encoder


In [56]:
def save_clean_dataset(name, df_clean, folder="clean_datasets", n_rows=150, random=True):
    """Write a subset of ``df_clean`` to ``<folder>/<name>_clean.csv``.

    Args:
        name: Prefix of the output file name.
        df_clean: DataFrame to export.
        folder: Output directory; created if it does not exist.
        n_rows: Maximum number of rows to write.
        random: When true, rows are sampled with a fixed seed (42);
            otherwise the first ``n_rows`` rows are taken.

    Returns:
        None. The file is written without the index and a confirmation
        line is printed.
    """
    os.makedirs(folder, exist_ok=True)
    destination = os.path.join(folder, f"{name}_clean.csv")

    if random:
        # Never ask for more rows than exist; the fixed seed keeps the
        # selection reproducible between runs.
        sample_size = min(n_rows, len(df_clean))
        subset = df_clean.sample(n=sample_size, random_state=42)
    else:
        subset = df_clean.head(n_rows)

    subset.to_csv(destination, index=False)
    print(f" Dataset sauvegard√© ({len(subset)} observations) : {destination}")

## 4. Modeling :

## 5. Evaluation :

## 6. Deployment :