In [45]:
import warnings
warnings.filterwarnings("ignore")

import pandas as pd
import numpy as np
import os
from pathlib import Path

from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.impute import SimpleImputer


In [46]:

# PATH CONFIGURATION
#
# BASE_DIR defaults to the folder this notebook was originally written
# against. Set the RLT_BASE_DIR environment variable before starting the
# kernel to point the notebook at another project root without editing
# this cell.
BASE_DIR = os.environ.get("RLT_BASE_DIR", r"C:\Users\s\RLT")
# Presumably the folder holding the CSV files named in DATASETS - confirm
# against the cell that actually reads them.
DATA_DIR = os.path.join(BASE_DIR, "datasets_augmented")
OUTPUT_DIR = os.path.join(BASE_DIR, "data_prepared")

# Create the output folder up front; exist_ok keeps the cell re-runnable.
os.makedirs(OUTPUT_DIR, exist_ok=True)

print(" DATA_DIR :", DATA_DIR)
print(" OUTPUT_DIR :", OUTPUT_DIR)


 DATA_DIR : C:\Users\s\RLT\datasets_augmented
 OUTPUT_DIR : C:\Users\s\RLT\data_prepared


In [47]:
# Dataset key -> CSV file name. Every file is named "<key>.csv", so the
# mapping is derived from the ordered tuple of keys below.
DATASETS = {
    key: f"{key}.csv"
    for key in (
        "BreastCanDT",
        "auto-mpg",
        "concrete_data",
        "HousingData",
        "ozone",
        "parkinsons",
        "ReplicatedAcousticFeatures-ParkinsonDatabase",
        "sonar",
        "winequality-red",
        "winequality-white",
    )
}

# Dataset key -> name of the column used as the prediction target.
TARGET_COLS = dict(
    [
        ("BreastCanDT", "diagnosis"),
        ("auto-mpg", "mpg"),
        ("concrete_data", "concrete_compressive_strength"),
        ("HousingData", "MEDV"),
        ("ozone", "maxO3"),
        ("parkinsons", "status"),
        ("ReplicatedAcousticFeatures-ParkinsonDatabase", "Status"),
        # NOTE(review): "R" looks like a class label rather than a column
        # name - confirm that sonar.csv really has a header row.
        ("sonar", "R"),
        ("winequality-red", "quality"),
        ("winequality-white", "quality"),
    ]
)

# Dataset key -> learning task, either "classification" or "regression".
TASKS = {
    "BreastCanDT": "classification",
    "auto-mpg": "regression",
    "concrete_data": "regression",
    "HousingData": "regression",
    "ozone": "regression",
    # The target column for this dataset is "status"; in the UCI Parkinsons
    # data that is a 0/1 health indicator, so it is tagged like the other
    # Parkinson dataset below rather than as a regression problem.
    # TODO confirm the augmented CSV keeps that binary coding.
    "parkinsons": "classification",
    "ReplicatedAcousticFeatures-ParkinsonDatabase": "classification",
    "sonar": "classification",
    "winequality-red": "regression",
    "winequality-white": "regression",
}

In [50]:
def load_dataset(file_path):
    """
    Load a CSV dataset, switching to ';' when ',' is not the separator.

    The file is first read with pandas' default comma separator. A
    semicolon-separated file usually does not raise with that setting: it
    comes back as a single column whose header still contains ';'. That case
    is detected explicitly and the file is re-read with ``sep=";"``. The same
    re-read is attempted when the comma parse fails with a parser error.

    Parameters
    ----------
    file_path : str or path-like
        Location of the CSV file. The file may be read twice, so pass a path
        rather than an already-open stream.

    Returns
    -------
    pandas.DataFrame
        The parsed table, with double quotes and surrounding whitespace
        removed from every column name.

    Raises
    ------
    Exception
        Anything other than ``pandas.errors.ParserError`` raised by
        ``pd.read_csv`` (missing file, empty file, ...) propagates unchanged.
    """
    try:
        df = pd.read_csv(file_path)
        # One column whose name contains ';' means the comma parse did not
        # actually split the header.
        needs_semicolon = df.shape[1] == 1 and ";" in str(df.columns[0])
    except pd.errors.ParserError:
        needs_semicolon = True

    if needs_semicolon:
        df = pd.read_csv(file_path, sep=";", engine="python")

    # Remove quotes first so whitespace that sat inside them is stripped too.
    df.columns = [str(c).replace('"', "").strip() for c in df.columns]
    return df

In [51]:
def prepare_regression(df, target):
    """
    Build a standardised feature matrix and a float target vector.

    Processing order:
      1. Rows whose ``target`` value is missing are discarded.
      2. The target column is converted to a float NumPy array.
      3. Every other column goes through ``pd.to_numeric(errors="coerce")``,
         so entries that cannot be parsed as numbers become NaN.
      4. Columns left entirely NaN are dropped.
      5. Remaining gaps are filled with the column median, then the columns
         are standardised with ``StandardScaler``.

    The imputer and the scaler are both fitted on all rows passed in.

    Parameters
    ----------
    df : pandas.DataFrame
        Input table containing the target column and the feature columns.
    target : str
        Name of the target column in ``df``.

    Returns
    -------
    tuple
        ``(X_scaled, y)``: the scaled feature array and the float target
        array, aligned row by row.
    """
    labelled = df.dropna(subset=[target])
    y = labelled[target].values.astype(float)

    features = (
        labelled.drop(columns=[target])
        .apply(pd.to_numeric, errors="coerce")
        .dropna(axis=1, how="all")
    )

    filled = SimpleImputer(strategy="median").fit_transform(features)
    return StandardScaler().fit_transform(filled), y


In [52]:
def prepare_classification(df, target):
    """
    Build a standardised feature matrix and an integer label vector.

    Processing order:
      1. Rows whose ``target`` value is missing are discarded.
      2. Every non-target column goes through
         ``pd.to_numeric(errors="coerce")``, so entries that cannot be parsed
         as numbers become NaN; columns left entirely NaN are dropped.
      3. Remaining gaps are filled with the column median, then the columns
         are standardised with ``StandardScaler``.
      4. Labels with a numeric (or boolean) dtype are cast with
         ``astype(int)``; any other dtype is encoded with ``LabelEncoder``.

    The imputer and the scaler are both fitted on all rows passed in.

    Parameters
    ----------
    df : pandas.DataFrame
        Input table containing the target column and the feature columns.
    target : str
        Name of the label column in ``df``.

    Returns
    -------
    tuple
        ``(X_scaled, y_encoded)``: the scaled feature array and the integer
        label array, aligned row by row.
    """
    labelled = df.dropna(subset=[target])
    y = labelled[target]

    features = (
        labelled.drop(columns=[target])
        .apply(pd.to_numeric, errors="coerce")
        .dropna(axis=1, how="all")
    )

    filled = SimpleImputer(strategy="median").fit_transform(features)
    X_scaled = StandardScaler().fit_transform(filled)

    # Branch on "is it numeric?" rather than "is it object/category?":
    # string extension dtypes are not reported as `object`, and sending text
    # labels through astype(int) would raise.
    if pd.api.types.is_numeric_dtype(y):
        y_encoded = y.values.astype(int)
    else:
        y_encoded = LabelEncoder().fit_transform(y)

    return X_scaled, y_encoded
