## 1. Imports:

In [73]:
import warnings
warnings.filterwarnings("ignore")
import pandas as pd
import os
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from pathlib import Path
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.feature_selection import VarianceThreshold
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.linear_model import Lasso, LogisticRegression
from sklearn.metrics import mean_squared_error, r2_score, accuracy_score, f1_score, roc_auc_score
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier, ExtraTreesRegressor, ExtraTreesClassifier, GradientBoostingRegressor, GradientBoostingClassifier


## 2. Load Data : 

In [75]:
def load_dataset(path):
    """Load a CSV file into a pandas DataFrame.

    Args:
        path: Location of the file, as a ``str`` or any ``os.PathLike``
            object (for example ``pathlib.Path``).

    Returns:
        The DataFrame produced by ``pd.read_csv``.

    Raises:
        FileNotFoundError: If nothing exists at ``path``.
        ValueError: If the file name does not end with ``.csv``
            (the comparison is case-insensitive).
    """
    # Normalise once so that pathlib.Path inputs work with the string
    # checks below (Path objects have no ``endswith`` method).
    path_str = os.fspath(path)

    if not os.path.exists(path_str):
        raise FileNotFoundError(f"Le fichier {path} n'existe pas.")

    # Lower-case the name so "DATA.CSV" is accepted like "data.csv".
    if not path_str.lower().endswith(".csv"):
        raise ValueError("Format non support√© : seuls les fichiers .csv sont accept√©s.")

    print(f"Chargement du dataset : {path}")
    return pd.read_csv(path_str)

## 2. Data Understanding : 

In [77]:
def data_understanding(df, target_column=None):
    """Print and plot an exploratory overview of ``df``.

    In order, the function reports: shape, first and last rows, ``info()``,
    dtypes, missing-value counts and percentages, duplicate rows, a preview
    of the target column, the numeric columns, and the number of IQR
    outliers per numeric column. It then draws histograms, a correlation
    heatmap, a bar plot of missing values, and one boxplot per numeric
    column.

    Args:
        df: DataFrame to inspect. It is only read, never modified.
        target_column: Optional name of the target column; only used for
            the preview section.

    Returns:
        dict mapping each numeric column name to the number of rows lying
        outside ``[Q1 - 1.5 * IQR, Q3 + 1.5 * IQR]``. Empty when ``df`` has
        no numeric column.

    Note:
        ``display`` is not defined or imported in the visible code;
        presumably this runs in a Jupyter/IPython session where it is
        available as a builtin -- TODO confirm.
    """

    print("\nüîπ Shape:", df.shape)

    print("\nüîπ First 5 rows:")
    display(df.head())

    print("\nüîπ Last 5 rows:")
    display(df.tail())

    print("\nüîπ Info:")
    # df.info() prints its report itself and returns None, so this also
    # prints a trailing "None".
    print(df.info())

    print("\nüîπ Data types:")
    print(df.dtypes)

    print("\nüîπ Missing values per column:")
    missing_vals = df.isnull().sum()
    # Only columns that actually have missing values are shown.
    display(missing_vals[missing_vals > 0])

    print("\nüîπ Percentage of missing values per column:")
    missing_percent = (df.isnull().mean() * 100).round(2)
    display(missing_percent[missing_percent > 0])

    print("\nüîπ Duplicate rows count:", df.duplicated().sum())
    if df.duplicated().sum() > 0:
        print("üîπ Duplicate rows:")
        # keep=False shows every member of each duplicate group, not just
        # the repeated occurrences counted above.
        display(df[df.duplicated(keep=False)])

    print("\nüîπ Target variable preview:")
    if target_column and target_column in df.columns:
        display(df[[target_column]].head())
    else:
        print("‚ö†Ô∏è Target column not found or not provided.")


    # -------------------------------------------------
    # Detect the numeric columns (done once, reused below)
    # -------------------------------------------------
    numeric_cols = df.select_dtypes(include=np.number).columns

    print("\nüîπ Numeric columns:", list(numeric_cols))



    # -------------------------------------------------
    # Outlier detection (IQR rule, 1.5 * IQR fences)
    # -------------------------------------------------
    outlier_counts = {}

    if len(numeric_cols) > 0:
        for col in numeric_cols:
            Q1 = df[col].quantile(0.25)
            Q3 = df[col].quantile(0.75)
            IQR = Q3 - Q1
            lower_bound = Q1 - 1.5 * IQR
            upper_bound = Q3 + 1.5 * IQR
            # Rows strictly outside the fences count as outliers.
            outliers = df[(df[col] < lower_bound) | (df[col] > upper_bound)]
            outlier_counts[col] = len(outliers)
    else:
        print("\n‚ö†Ô∏è Aucun champ num√©rique ‚Üí impossible de d√©tecter les outliers.")

    print("\nüîπ Number of outliers per numeric column:")
    display(outlier_counts)



    # -------------------------------------------------
    # Histograms of the numeric variables
    # -------------------------------------------------
    if len(numeric_cols) > 0:
        print("\nüìä Distribution des variables num√©riques :")
        df[numeric_cols].hist(bins=30, figsize=(12, 8))
        plt.tight_layout()
        plt.show()
    else:
        print("\n‚ö†Ô∏è Aucun champ num√©rique ‚Üí pas d‚Äôhistogrammes.")



    # -------------------------------------------------
    # Correlation heatmap
    # -------------------------------------------------
    if len(numeric_cols) > 1:
        # Absolute correlations: the sign is discarded, only strength is kept.
        corr = df[numeric_cols].corr().abs()
        # Cells with |corr| < 0.5 are masked (not drawn).
        mask = corr < 0.5

        plt.figure(figsize=(18, 14))
        ax = sns.heatmap(
            corr, mask=mask, cmap="coolwarm", annot=False,
            linewidths=0.5, 
            cbar_kws={'label': 'Force de corr√©lation'}
        )

        plt.title("Heatmap des corr√©lations (seulement |corr| > 0.5)", fontsize=16)

        # Explanatory legend.
        # NOTE(review): the legend text describes negative correlations, but
        # `corr` holds absolute values, so no negative value is ever plotted.
        plt.text(
            x=0.02, y=1.12,
            s=(
                "L√©gende des couleurs :\n"
                "Rouge fonc√© ‚Üí Corr√©lation tr√®s positive (‚âà 0.8 √† 1.0)\n"
                "Bleu fonc√© ‚Üí Corr√©lation tr√®s n√©gative (‚âà -0.8 √† -1.0)\n"
                "Blanc ‚Üí Corr√©lation faible (< 0.5) ou masqu√©e"
            ),
            fontsize=12,
            transform=ax.transAxes,
            verticalalignment='top',
            bbox=dict(boxstyle="round,pad=0.4", fc="white", ec="black", alpha=0.8)
        )

        plt.show()
    else:
        print("\n‚ö†Ô∏è Pas assez de colonnes num√©riques pour une heatmap.")



    # -------------------------------------------------
    # Bar plot of missing values
    # -------------------------------------------------
    missing = df.isnull().sum()
    missing = missing[missing > 0]

    if len(missing) > 0:
        plt.figure(figsize=(10, 5))
        missing.sort_values().plot(kind='barh')
        plt.title("Valeurs manquantes par colonne")
        plt.xlabel("Nombre de valeurs manquantes")
        plt.show()
    else:
        print("\nAucune valeur manquante.")



    # -------------------------------------------------
    # Boxplots to visualise the outliers
    # -------------------------------------------------
    if len(numeric_cols) > 0:
        for col in numeric_cols:
            # Columns with at most one distinct non-null value are skipped
            # and reported instead of being plotted.
            if df[col].dropna().nunique() > 1:
                plt.figure(figsize=(6, 3))
                sns.boxplot(x=df[col])
                plt.title(f"Boxplot ‚Äì {col}")
                plt.show()
            else:
                print(f"Impossible de tracer un boxplot pour {col} (pas assez de valeurs).")
    else:
        print("\n‚ö†Ô∏è Aucun champ num√©rique ‚Üí pas de boxplots.")


    return outlier_counts


## 3. Data Preparation:

In [79]:
def cap_iqr(df, numeric_columns, factor=1.5):
    """Cap the listed columns to their IQR fences and return a new DataFrame.

    For each requested column present in ``df``, values below
    ``Q1 - factor * IQR`` are replaced by that lower fence and values above
    ``Q3 + factor * IQR`` by the upper fence. Requested columns absent from
    ``df`` are skipped silently. The input DataFrame is not modified.

    Args:
        df: Source DataFrame.
        numeric_columns: Iterable of column names to cap.
        factor: Multiplier applied to the IQR to build the fences.

    Returns:
        A copy of ``df`` with the selected columns capped.
    """
    capped = df.copy()
    present = [name for name in numeric_columns if name in capped.columns]

    for name in present:
        first_quartile = capped[name].quantile(0.25)
        third_quartile = capped[name].quantile(0.75)
        spread = third_quartile - first_quartile

        floor = first_quartile - factor * spread
        ceiling = third_quartile + factor * spread

        # Two passes: raise the low tail first, then lower the high tail.
        too_low = capped[name] < floor
        capped[name] = np.where(too_low, floor, capped[name])
        too_high = capped[name] > ceiling
        capped[name] = np.where(too_high, ceiling, capped[name])

    return capped



def data_preparation(df, target_column, apply_capping=False):
    """Clean ``df``, split off the target and fit a preprocessing transformer.

    Steps:
        1. Drop feature columns that contain only NaN.
        2. Drop int64/float64 feature columns holding a single distinct
           non-null value (zero variance).
        3. Separate the target ``y`` from the features ``X``.
        4. Optionally cap numeric features with ``cap_iqr``.
        5. Fit a ``ColumnTransformer`` (standard scaling for numeric
           columns, one-hot encoding for object/category columns) on ``X``
           and transform it.

    A summary is printed. ``display`` is not defined in the visible code;
    presumably it is the IPython/Jupyter builtin -- TODO confirm.

    Args:
        df: Input DataFrame; it is copied, never modified in place.
        target_column: Name of the target column.
        apply_capping: When True, numeric features are capped to their IQR
            fences before the transformer is fitted.

    Returns:
        Tuple ``(X, X_prepared, y, preprocessor, removed_cols)`` where ``X``
        is the (optionally capped) feature frame, ``X_prepared`` the
        transformed features, ``y`` the target series, ``preprocessor`` the
        fitted ``ColumnTransformer`` and ``removed_cols`` the names of the
        dropped columns.

    Raises:
        ValueError: If ``target_column`` is not a column of ``df``.
    """
    # Fail fast: the cleaning steps below never drop the target, so it can
    # only be absent if it was missing from the input in the first place.
    if target_column not in df.columns:
        raise ValueError(
            f"Target column '{target_column}' not found in the DataFrame."
        )

    df = df.copy()
    original_shape = df.shape

    # 1) Remove columns that are 100% NaN (never the target)
    cols_to_drop = [col for col in df.columns
                    if col != target_column and df[col].isna().all()]
    df = df.drop(columns=cols_to_drop)

    # 2) Remove zero-variance numeric columns (never the target).
    # A column is constant when it has at most one distinct non-null value.
    # This is checked with pandas rather than VarianceThreshold, because
    # VarianceThreshold.fit raises ValueError when *no* feature passes the
    # threshold, i.e. when every numeric feature is constant.
    numeric_cols = df.select_dtypes(include=["int64", "float64"]).columns.tolist()
    numeric_cols_no_target = [col for col in numeric_cols if col != target_column]
    zero_var_cols = [col for col in numeric_cols_no_target
                     if df[col].nunique(dropna=True) <= 1]
    df = df.drop(columns=zero_var_cols)

    removed_cols = cols_to_drop + zero_var_cols

    # 3) Separate target
    y = df[target_column]
    X = df.drop(columns=[target_column])

    # Detect column types.
    # NOTE(review): columns of any other dtype (bool, int32, float32,
    # datetime, ...) are in neither list and are therefore left out of
    # X_prepared by ColumnTransformer's default remainder="drop".
    numeric_cols = X.select_dtypes(include=["int64", "float64"]).columns.tolist()
    categorical_cols = X.select_dtypes(include=["object", "category"]).columns.tolist()

    # 4) Apply IQR capping
    if apply_capping:
        X = cap_iqr(X, numeric_cols)

    # 5) Build and fit the preprocessing transformer
    numeric_pipeline = Pipeline([("scaler", StandardScaler())])
    categorical_pipeline = Pipeline([("encoder", OneHotEncoder(handle_unknown="ignore"))])

    preprocessor = ColumnTransformer([
        ("num", numeric_pipeline, numeric_cols),
        ("cat", categorical_pipeline, categorical_cols)
    ])

    X_prepared = preprocessor.fit_transform(X)

    # Output summary
    print("\n=== DATA PREPARATION SUMMARY ===")
    print(f"Shape before preparation: {original_shape}")
    print(f"Shape after preparation: {df.shape}")
    print(f"Columns removed: {removed_cols}")

    print("\nTarget preview:")
    display(y.head())

    print("\nX (before preprocessing):")
    display(X.head())

    print("\nX_prepared shape:", X_prepared.shape)
    print("\nPreprocessor used:")
    print(preprocessor)

    return X, X_prepared, y, preprocessor, removed_cols


## 4. Modeling :

### Detect feature columns and prepare the pipeline:

In [82]:
def build_preprocessor(X):
    """Build an unfitted ``ColumnTransformer`` matching the columns of ``X``.

    Column routing:
        * int64/float64 columns with more than 10 distinct values are
          median-imputed and standard-scaled.
        * object/category columns, plus every int64/float64 column with at
          most 10 distinct values, are imputed with the most frequent value
          and one-hot encoded (dense output, unknown categories ignored).

    Args:
        X: Feature DataFrame used only to decide the column routing.

    Returns:
        The unfitted ``ColumnTransformer``.
    """
    numeric_candidates = X.select_dtypes(include=['int64', 'float64']).columns.tolist()
    categorical_cols = X.select_dtypes(include=['object', 'category']).columns.tolist()

    # Low-cardinality numeric columns are handled as categories instead.
    low_cardinality = [name for name in numeric_candidates if X[name].nunique() <= 10]
    rerouted = set(low_cardinality)
    numeric_cols = [name for name in numeric_candidates if name not in rerouted]
    categorical_cols.extend(low_cardinality)

    scale_steps = [
        ('imputer', SimpleImputer(strategy='median')),
        ('scaler', StandardScaler()),
    ]
    encode_steps = [
        ('imputer', SimpleImputer(strategy='most_frequent')),
        ('onehot', OneHotEncoder(handle_unknown='ignore', sparse_output=False)),
    ]

    return ColumnTransformer([
        ('num', Pipeline(scale_steps), numeric_cols),
        ('cat', Pipeline(encode_steps), categorical_cols),
    ])


In [83]:
def run_for_dataset(name, df_data):
    """Train and evaluate the benchmark models on one dataset.

    The target column and task type are looked up by ``name`` in the
    module-level ``TARGET_COLS`` and ``TASKS`` mappings (defined outside
    the visible code). The data is split 80/20 with ``random_state=42``.
    Random Forest, Extra Trees and Gradient Boosting are each wrapped in a
    preprocessing pipeline, fitted, scored on the held-out part and
    printed. For regression tasks a Random Forest trained on ``log1p(y)``
    is evaluated as well.

    Args:
        name: Dataset key used for the lookups and stored in the results.
        df_data: DataFrame holding the features and the target column.

    Returns:
        DataFrame with one row of metrics per evaluated model.
    """
    print(f"\n=== Dataset: {name} ===")

    target_col = TARGET_COLS[name]
    task = TASKS[name]
    is_regression = task == 'regression'

    X = df_data.drop(columns=[target_col])
    y = df_data[target_col]

    # String labels are turned into integer codes for classification.
    if task == 'classification' and y.dtype == 'object':
        y = pd.factorize(y)[0]

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42
    )

    preprocessor = build_preprocessor(X_train)

    # (label, regressor class, classifier class, extra constructor kwargs)
    candidates = (
        ('RandomForest', RandomForestRegressor, RandomForestClassifier, {'n_jobs': -1}),
        ('ExtraTrees', ExtraTreesRegressor, ExtraTreesClassifier, {'n_jobs': -1}),
        ('GradientBoosting', GradientBoostingRegressor, GradientBoostingClassifier, {}),
    )

    results = []
    for label, regressor_cls, classifier_cls, extra_kwargs in candidates:
        estimator_cls = regressor_cls if is_regression else classifier_cls
        pipe = Pipeline([
            ('pre', preprocessor),
            ('model', estimator_cls(n_estimators=200, random_state=42, **extra_kwargs)),
        ])
        pipe.fit(X_train, y_train)
        y_pred = pipe.predict(X_test)

        # Probabilities are only requested from estimators that expose them.
        if hasattr(pipe.named_steps['model'], "predict_proba"):
            y_score = pipe.predict_proba(X_test)
        else:
            y_score = None

        if is_regression:
            res = evaluate_regression(y_test, y_pred)
        else:
            res = evaluate_classification(y_test, y_pred, y_score)

        res.update({'dataset': name, 'model': label})
        results.append(res)
        print(f'{label} ->', res)

    # Random Forest on log1p(y) -- regression only; predictions are mapped
    # back to the original scale with expm1 before scoring.
    if is_regression:
        rf_log = Pipeline([
            ('pre', preprocessor),
            ('model', RandomForestRegressor(n_estimators=200, random_state=42, n_jobs=-1)),
        ])
        rf_log.fit(X_train, np.log1p(y_train))
        y_pred = np.expm1(rf_log.predict(X_test))

        res = evaluate_regression(y_test, y_pred)
        res.update({'dataset': name, 'model': 'RF-log1p'})
        results.append(res)
        print('RF-log1p ->', res)

    return pd.DataFrame(results)


## 5. Evaluation :

In [85]:
def evaluate_regression(y_true, y_pred):
    """Compute RMSE and R² for regression predictions.

    Args:
        y_true: Ground-truth target values.
        y_pred: Predicted target values.

    Returns:
        dict with keys ``'rmse'`` (root mean squared error) and ``'r2'``
        (coefficient of determination).
    """
    # The ``squared=False`` keyword of mean_squared_error was deprecated in
    # scikit-learn 1.4 and removed in 1.6; taking the square root of the
    # MSE works on every version. For a single-output target (the case in
    # this file, where y is one column) the value is identical.
    mse = mean_squared_error(y_true, y_pred)
    return {
        'rmse': float(np.sqrt(mse)),
        'r2': r2_score(y_true, y_pred)
    }


def evaluate_classification(y_true, y_pred, y_score=None):
    """Compute accuracy, F1 and (when possible) ROC AUC.

    Args:
        y_true: Ground-truth labels.
        y_pred: Predicted labels.
        y_score: Optional scores. Either a 1-D array of positive-class
            scores or a 2-D array with one column per class (as returned by
            ``predict_proba``).

    Returns:
        dict with keys ``'accuracy'``, ``'f1'`` and ``'auc'``. F1 uses
        binary averaging when ``y_true`` holds exactly two distinct labels
        and macro averaging otherwise. ``'auc'`` is None when no scores are
        given or when the AUC cannot be computed.
    """
    acc = accuracy_score(y_true, y_pred)
    n_classes = len(np.unique(y_true))
    f1 = f1_score(y_true, y_pred, average='binary' if n_classes == 2 else 'macro')

    auc = None
    if y_score is not None:
        scores = np.asarray(y_score)
        try:
            if scores.ndim == 2 and scores.shape[1] > 2:
                # Multi-class: score the full probability matrix
                # one-vs-rest instead of a single arbitrary column.
                auc = roc_auc_score(y_true, scores, multi_class='ovr')
            elif scores.ndim == 2:
                # Binary: column 1 holds the positive-class score.
                auc = roc_auc_score(y_true, scores[:, 1])
            else:
                auc = roc_auc_score(y_true, scores)
        except (ValueError, IndexError):
            # AUC is best-effort: roc_auc_score raises ValueError when it
            # is undefined (e.g. a single class in y_true, or a class-count
            # mismatch); IndexError covers a one-column score matrix.
            auc = None

    return {'accuracy': acc, 'f1': f1, 'auc': auc}


## 6. Deployment: