## 1. Importation :

In [1]:
import os
import warnings
from pathlib import Path

warnings.filterwarnings("ignore")

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.base import clone
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier, ExtraTreesRegressor, ExtraTreesClassifier, GradientBoostingRegressor, GradientBoostingClassifier
from sklearn.feature_selection import VarianceThreshold
from sklearn.impute import SimpleImputer
from sklearn.linear_model import Lasso, LogisticRegression
from sklearn.metrics import mean_squared_error, r2_score, accuracy_score, f1_score, roc_auc_score
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder


## 2. Load Data : 

In [2]:
def load_dataset(path):
    """Load a CSV file into a pandas DataFrame.

    Parameters
    ----------
    path : str or os.PathLike
        Location of the CSV file. ``pathlib.Path`` objects are accepted.

    Returns
    -------
    pandas.DataFrame
        The parsed content of the file, as returned by ``pd.read_csv``.

    Raises
    ------
    FileNotFoundError
        If nothing exists at ``path``.
    ValueError
        If the file name does not end with ``.csv`` (case-insensitive).
    """
    # Normalise to a plain string: Path objects have no ``endswith`` method,
    # so the extension check below used to fail with AttributeError for them.
    path_str = os.fspath(path)

    if not os.path.exists(path_str):
        raise FileNotFoundError(f"Le fichier {path} n'existe pas.")

    # Compare in lower case so that "DATA.CSV" is accepted as well.
    if not path_str.lower().endswith(".csv"):
        raise ValueError("Format non support√© : seuls les fichiers .csv sont accept√©s.")

    print(f"Chargement du dataset : {path}")
    return pd.read_csv(path_str)

## 2. Data Understanding : 

In [3]:
def data_understanding(df, target_column=None):
    """Print and plot an exploratory overview of ``df``.

    The report covers: shape, head/tail, ``info()``, dtypes, missing values
    (counts, percentages and a bar chart), duplicated rows, a preview of the
    target column, IQR-based outlier counts, histograms, a correlation
    heatmap restricted to |corr| >= 0.5, and one boxplot per numeric column.

    Parameters
    ----------
    df : pandas.DataFrame
        Data to describe. It is only read, never modified.
    target_column : str, optional
        Name of the target column; only used to display a short preview.

    Returns
    -------
    dict
        Maps each numeric column name to the number of rows lying outside
        ``[Q1 - 1.5 * IQR, Q3 + 1.5 * IQR]``. Empty when ``df`` has no
        numeric column.

    Notes
    -----
    ``display`` is not imported in this file; presumably the function relies
    on the name injected by IPython/Jupyter (TODO confirm if this code is
    ever run outside a notebook).
    """
    print("\nüîπ Shape:", df.shape)

    print("\nüîπ First 5 rows:")
    display(df.head())

    print("\nüîπ Last 5 rows:")
    display(df.tail())

    print("\nüîπ Info:")
    # DataFrame.info() prints its report itself and returns None, so wrapping
    # it in print() emitted a stray "None" line after the report.
    df.info()

    print("\nüîπ Data types:")
    print(df.dtypes)

    # Computed once and reused for the bar chart further down.
    missing_vals = df.isnull().sum()
    missing_vals = missing_vals[missing_vals > 0]

    print("\nüîπ Missing values per column:")
    display(missing_vals)

    print("\nüîπ Percentage of missing values per column:")
    missing_percent = (df.isnull().mean() * 100).round(2)
    display(missing_percent[missing_percent > 0])

    # Row hashing is not free on large frames: evaluate it a single time.
    duplicate_count = df.duplicated().sum()
    print("\nüîπ Duplicate rows count:", duplicate_count)
    if duplicate_count > 0:
        print("üîπ Duplicate rows:")
        display(df[df.duplicated(keep=False)])

    print("\nüîπ Target variable preview:")
    if target_column and target_column in df.columns:
        display(df[[target_column]].head())
    else:
        print("‚ö†Ô∏è Target column not found or not provided.")

    # -------------------------------------------------
    # Numeric column detection (done once)
    # -------------------------------------------------
    numeric_cols = df.select_dtypes(include=np.number).columns
    has_numeric = len(numeric_cols) > 0

    print("\nüîπ Numeric columns:", list(numeric_cols))

    # -------------------------------------------------
    # Outlier detection (IQR rule)
    # -------------------------------------------------
    outlier_counts = {}

    if has_numeric:
        for col in numeric_cols:
            Q1 = df[col].quantile(0.25)
            Q3 = df[col].quantile(0.75)
            IQR = Q3 - Q1
            lower_bound = Q1 - 1.5 * IQR
            upper_bound = Q3 + 1.5 * IQR
            # NaN compares False on both sides, so missing values are never
            # counted as outliers.
            is_outlier = (df[col] < lower_bound) | (df[col] > upper_bound)
            outlier_counts[col] = int(is_outlier.sum())
    else:
        print("\n‚ö†Ô∏è Aucun champ num√©rique ‚Üí impossible de d√©tecter les outliers.")

    print("\nüîπ Number of outliers per numeric column:")
    display(outlier_counts)

    # -------------------------------------------------
    # Histograms of the numeric variables
    # -------------------------------------------------
    if has_numeric:
        print("\nüìä Distribution des variables num√©riques :")
        df[numeric_cols].hist(bins=30, figsize=(12, 8))
        plt.tight_layout()
        plt.show()
    else:
        print("\n‚ö†Ô∏è Aucun champ num√©rique ‚Üí pas d‚Äôhistogrammes.")

    # -------------------------------------------------
    # Correlation heatmap
    # -------------------------------------------------
    if len(numeric_cols) > 1:
        # Keep the sign of the correlation for colouring and use the absolute
        # value only to decide which cells are hidden. Plotting corr().abs()
        # made the "blue = strongly negative" legend below impossible: blue
        # then marked the weakest displayed correlation instead.
        corr = df[numeric_cols].corr()
        mask = corr.abs() < 0.5

        plt.figure(figsize=(18, 14))
        ax = sns.heatmap(
            corr, mask=mask, cmap="coolwarm", annot=False,
            # Fixed symmetric scale so that red/blue always mean +1/-1.
            vmin=-1, vmax=1, center=0,
            linewidths=0.5,
            cbar_kws={'label': 'Force de corr√©lation'}
        )

        plt.title("Heatmap des corr√©lations (seulement |corr| > 0.5)", fontsize=16)

        # Explanatory legend drawn above the axes
        plt.text(
            x=0.02, y=1.12,
            s=(
                "L√©gende des couleurs :\n"
                "Rouge fonc√© ‚Üí Corr√©lation tr√®s positive (‚âà 0.8 √† 1.0)\n"
                "Bleu fonc√© ‚Üí Corr√©lation tr√®s n√©gative (‚âà -0.8 √† -1.0)\n"
                "Blanc ‚Üí Corr√©lation faible (< 0.5) ou masqu√©e"
            ),
            fontsize=12,
            transform=ax.transAxes,
            verticalalignment='top',
            bbox=dict(boxstyle="round,pad=0.4", fc="white", ec="black", alpha=0.8)
        )

        plt.show()
    else:
        print("\n‚ö†Ô∏è Pas assez de colonnes num√©riques pour une heatmap.")

    # -------------------------------------------------
    # Bar chart of the missing values
    # -------------------------------------------------
    if len(missing_vals) > 0:
        plt.figure(figsize=(10, 5))
        missing_vals.sort_values().plot(kind='barh')
        plt.title("Valeurs manquantes par colonne")
        plt.xlabel("Nombre de valeurs manquantes")
        plt.show()
    else:
        print("\nAucune valeur manquante.")

    # -------------------------------------------------
    # Boxplots to visualise the outliers
    # -------------------------------------------------
    if has_numeric:
        for col in numeric_cols:
            # A boxplot needs at least two distinct non-missing values.
            if df[col].dropna().nunique() > 1:
                plt.figure(figsize=(6, 3))
                sns.boxplot(x=df[col])
                plt.title(f"Boxplot ‚Äì {col}")
                plt.show()
            else:
                print(f"Impossible de tracer un boxplot pour {col} (pas assez de valeurs).")
    else:
        print("\n‚ö†Ô∏è Aucun champ num√©rique ‚Üí pas de boxplots.")

    return outlier_counts


## 3. Data Preparation :

In [4]:
def cap_iqr(df, numeric_columns, factor=1.5):
    """Return a copy of ``df`` whose listed columns are capped at IQR fences.

    For every name in ``numeric_columns`` that exists in ``df``, values below
    ``Q1 - factor * IQR`` are replaced by that lower fence and values above
    ``Q3 + factor * IQR`` by that upper fence. Names absent from ``df`` are
    skipped and the input frame itself is left untouched.
    """
    capped = df.copy()

    for name in (c for c in numeric_columns if c in capped.columns):
        q_low, q_high = (capped[name].quantile(q) for q in (0.25, 0.75))
        spread = q_high - q_low
        floor = q_low - factor * spread
        ceiling = q_high + factor * spread

        # Lower fence first, then upper fence, applied to the intermediate
        # result before the column is written back.
        raised = np.where(capped[name] < floor, floor, capped[name])
        capped[name] = np.where(raised > ceiling, ceiling, raised)

    return capped



def data_preparation(df, target_column, apply_capping=False):
    """Clean ``df`` and build the preprocessing transformer for modelling.

    Steps: drop feature columns that are entirely NaN, drop constant numeric
    feature columns, split features from target, optionally cap numeric
    features with ``cap_iqr``, then fit a ``ColumnTransformer`` that imputes
    and scales numeric columns and imputes and one-hot encodes
    object/category columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw data, including the target column. A copy is used internally.
    target_column : str
        Name of the column to predict.
    apply_capping : bool, default False
        When True, numeric features are capped with ``cap_iqr`` before the
        preprocessor is fitted.

    Returns
    -------
    X : pandas.DataFrame
        Features after column removal (and capping, if requested).
    X_prepared : array-like
        Output of ``preprocessor.fit_transform(X)``.
    y : pandas.Series
        The target column.
    preprocessor : ColumnTransformer
        Transformer fitted on the whole of ``X``.
    removed_cols : list
        Names of the columns dropped in steps 1 and 2.

    Raises
    ------
    ValueError
        If ``target_column`` is not a column of ``df``.
    """
    # Fail before doing any work, with a message that matches the real cause
    # (the target is never dropped below, so it can only be absent from the
    # start).
    if target_column not in df.columns:
        raise ValueError(f"Target column '{target_column}' not found in the DataFrame.")

    df = df.copy()
    original_shape = df.shape

    # 1) Remove columns 100% NaN (except target)
    cols_to_drop = [col for col in df.columns
                    if col != target_column and df[col].isna().all()]
    df = df.drop(columns=cols_to_drop)

    # 2) Remove zero variance columns
    # np.number (rather than only int64/float64) so that int32/float32/...
    # columns are treated as numeric too.
    numeric_features = [col for col in df.select_dtypes(include=np.number).columns
                        if col != target_column]
    # A column with a single distinct non-missing value has zero variance.
    # Counting distinct values avoids VarianceThreshold.fit, which raises
    # when *every* numeric feature is constant.
    zero_var_cols = [col for col in numeric_features
                     if df[col].nunique(dropna=True) <= 1]
    df = df.drop(columns=zero_var_cols)

    removed_cols = cols_to_drop + zero_var_cols

    # 3) Separate target
    y = df[target_column]
    X = df.drop(columns=[target_column])

    # Detect column types. Columns matching neither list are dropped by the
    # ColumnTransformer (its default remainder), hence the broad numeric
    # selector.
    numeric_cols = X.select_dtypes(include=np.number).columns.tolist()
    categorical_cols = X.select_dtypes(include=["object", "category"]).columns.tolist()

    # 4) Apply IQR capping
    if apply_capping:
        X = cap_iqr(X, numeric_cols)

    # 5) Build pipeline
    # Imputation comes first so that partially missing columns do not push
    # NaN into the estimators; data without NaN is transformed as before.
    numeric_pipeline = Pipeline([
        ("imputer", SimpleImputer(strategy="median")),
        ("scaler", StandardScaler()),
    ])
    categorical_pipeline = Pipeline([
        ("imputer", SimpleImputer(strategy="most_frequent")),
        ("encoder", OneHotEncoder(handle_unknown="ignore")),
    ])

    preprocessor = ColumnTransformer([
        ("num", numeric_pipeline, numeric_cols),
        ("cat", categorical_pipeline, categorical_cols)
    ])

    X_prepared = preprocessor.fit_transform(X)

    # Output summary
    print("\n=== DATA PREPARATION SUMMARY ===")
    print(f"Shape before preparation: {original_shape}")
    print(f"Shape after preparation: {df.shape}")
    print(f"Columns removed: {removed_cols}")

    print("\nTarget preview:")
    display(y.head())

    print("\nX (before preprocessing):")
    display(X.head())

    print("\nX_prepared shape:", X_prepared.shape)
    print("\nPreprocessor used:")
    print(preprocessor)

    return X, X_prepared, y, preprocessor, removed_cols


## 4. Modeling :

##  Regression :


In [3]:


from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split

from sklearn.ensemble import RandomForestRegressor,ExtraTreesRegressor, GradientBoostingRegressor



def run_regression_models(X, y, preprocessor, test_size=0.2, random_state=42):
    """Train several tree-ensemble regressors and report hold-out metrics.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature table (not yet transformed).
    y : array-like
        Regression target aligned with ``X``.
    preprocessor : sklearn transformer
        Preprocessing step placed in front of every model. It is cloned for
        each pipeline, so the object passed in is not refitted.
    test_size : float, default 0.2
        Share of the rows kept for evaluation.
    random_state : int, default 42
        Seed used for the train/test split and for every model.

    Returns
    -------
    dict
        ``{model_name: {"RMSE": ..., "MSE": ..., "R2-score": ...}}`` with each
        value rounded to 4 decimals, computed on the test split.
    """
    # Train/test split
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state
    )

    # Candidate models. They share the caller's seed: it used to be
    # hard-coded to 42, so changing ``random_state`` only moved the split.
    # TODO: add an RLT regressor here once an implementation is available.
    models = {
        "RandomForest": RandomForestRegressor(random_state=random_state),
        "ExtraTrees": ExtraTreesRegressor(random_state=random_state),
        "GradientBoosting": GradientBoostingRegressor(random_state=random_state),
    }

    results = {}

    for model_name, model in models.items():
        # Pipeline.fit fits its steps in place; cloning keeps the caller's
        # (possibly already fitted) preprocessor untouched and gives every
        # model its own independent copy.
        reg = Pipeline([
            ("preprocessing", clone(preprocessor)),
            ("regressor", model)
        ])

        # Train
        reg.fit(X_train, y_train)
        y_pred = reg.predict(X_test)

        # Metrics
        mse = mean_squared_error(y_test, y_pred)
        rmse = mse ** 0.5
        r2 = r2_score(y_test, y_pred)

        results[model_name] = {
            "RMSE": round(rmse, 4),
            "MSE": round(mse, 4),
            "R2-score": round(r2, 4)
        }

    return results



##  Classification :

In [5]:

from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score
from sklearn.model_selection import train_test_split

from sklearn.ensemble import (
    RandomForestClassifier,
    ExtraTreesClassifier,
    GradientBoostingClassifier
)


def run_classification_models(X, y, preprocessor, test_size=0.2, random_state=42):
    """Train several tree-ensemble classifiers and report hold-out metrics.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature table (not yet transformed).
    y : array-like
        Class labels aligned with ``X``; also used to stratify the split.
    preprocessor : sklearn transformer
        Preprocessing step placed in front of every model. It is cloned for
        each pipeline, so the object passed in is not refitted.
    test_size : float, default 0.2
        Share of the rows kept for evaluation.
    random_state : int, default 42
        Seed used for the train/test split and for every model.

    Returns
    -------
    dict
        ``{model_name: {"Accuracy": ..., "F1-score": ..., "ROC-AUC": ...}}``.
        Scores are rounded to 4 decimals; F1 is the weighted average.
        "ROC-AUC" is the string ``"N/A"`` when it cannot be computed.
    """
    # Train/test split
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state, stratify=y
    )

    # Candidate models. They share the caller's seed: it used to be
    # hard-coded to 42, so changing ``random_state`` only moved the split.
    # TODO: add an RLT classifier here once an implementation is available.
    models = {
        "RandomForest": RandomForestClassifier(random_state=random_state),
        "ExtraTrees": ExtraTreesClassifier(random_state=random_state),
        "GradientBoosting": GradientBoostingClassifier(random_state=random_state),
    }

    results = {}

    for model_name, model in models.items():
        # Pipeline.fit fits its steps in place; cloning keeps the caller's
        # (possibly already fitted) preprocessor untouched.
        clf = Pipeline([
            ("preprocessing", clone(preprocessor)),
            ("classifier", model)
        ])

        # Fit
        clf.fit(X_train, y_train)
        y_pred = clf.predict(X_test)

        # Scores
        acc = accuracy_score(y_test, y_pred)
        f1 = f1_score(y_test, y_pred, average="weighted")

        auc = "N/A"
        try:
            y_prob = clf.predict_proba(X_test)
            n_classes = y_prob.shape[1]
            if n_classes == 2:
                # Binary target: score with the probability of the second class.
                auc = roc_auc_score(y_test, y_prob[:, 1])
            elif n_classes > 2:
                # Multiclass target: taking column 1 alone made roc_auc_score
                # raise, which the former bare ``except`` turned into "N/A".
                auc = roc_auc_score(
                    y_test, y_prob,
                    multi_class="ovr", average="weighted",
                    labels=model.classes_,
                )
        except (AttributeError, ValueError):
            # No predict_proba available, or AUC undefined for this split
            # (e.g. a class missing from y_test). Other errors propagate.
            auc = "N/A"

        results[model_name] = {
            "Accuracy": round(acc, 4),
            "F1-score": round(f1, 4),
            "ROC-AUC": auc if isinstance(auc, str) else round(auc, 4)
        }

    return results



##  RLT :

In [6]:
def run_rlt_model(X, y, preprocessor, task="regression", test_size=0.2, random_state=42):
    """Train an RLT model behind ``preprocessor`` and return hold-out metrics.

    NOTE(review): ``RLTRegressor`` and ``RLTClassifier`` are not defined in
    the visible part of this notebook; they must be provided elsewhere
    before this function can run.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature table (not yet transformed).
    y : array-like
        Target aligned with ``X``.
    preprocessor : sklearn transformer
        Preprocessing step placed in front of the model. It is cloned, so the
        object passed in is not refitted.
    task : str, default "regression"
        ``"regression"`` selects the regressor; any other value is handled as
        classification (stratified split, classification metrics).
    test_size : float, default 0.2
        Share of the rows kept for evaluation.
    random_state : int, default 42
        Seed of the train/test split.

    Returns
    -------
    dict
        Regression: ``{"RMSE", "R2"}``. Classification: ``{"Accuracy",
        "F1-score", "ROC-AUC"}`` where "ROC-AUC" is ``"N/A"`` when it cannot
        be computed. Numeric values are rounded to 4 decimals.
    """
    # Single source of truth for the branch taken. The split used to stratify
    # only for task == "classification" while the model choice treated every
    # non-"regression" value as classification.
    is_regression = task == "regression"

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state,
        stratify=None if is_regression else y
    )

    # RLT model initialisation (to be replaced by the actual Python implementation)
    if is_regression:
        model = RLTRegressor(ntrees=100, nmin=2, muting_rate=0.5, k=2, alpha=0.25)
    else:
        model = RLTClassifier(ntrees=100, nmin=2, muting_rate=0.5, k=2, alpha=0.25)

    # Pipeline.fit fits its steps in place; cloning keeps the caller's
    # (possibly already fitted) preprocessor untouched.
    pipe = Pipeline([
        ("preprocessing", clone(preprocessor)),
        ("rlt", model)
    ])

    pipe.fit(X_train, y_train)
    y_pred = pipe.predict(X_test)

    if is_regression:
        mse = mean_squared_error(y_test, y_pred)
        rmse = mse ** 0.5
        r2 = r2_score(y_test, y_pred)
        return {"RMSE": round(rmse, 4), "R2": round(r2, 4)}

    auc = "N/A"
    try:
        y_prob = pipe.predict_proba(X_test)
        n_classes = y_prob.shape[1]
        if n_classes == 2:
            # Binary target: score with the probability of the second class.
            auc = round(roc_auc_score(y_test, y_prob[:, 1]), 4)
        elif n_classes > 2:
            # Multiclass target: one-vs-rest AUC over all probability columns.
            auc = round(
                roc_auc_score(
                    y_test, y_prob,
                    multi_class="ovr", average="weighted",
                    labels=pipe.classes_,
                ),
                4,
            )
    except (AttributeError, ValueError):
        # No predict_proba/classes_ on the model, or AUC undefined for this
        # split. Other errors propagate instead of being silently swallowed.
        auc = "N/A"

    return {
        "Accuracy": round(accuracy_score(y_test, y_pred), 4),
        "F1-score": round(f1_score(y_test, y_pred, average="weighted"), 4),
        "ROC-AUC": auc
    }
