In [1]:
import os

# V√©rifier que le dossier data/ existe
data_folder = 'Data'

print("Fichiers dans le dossier 'Data/' :")
print("="*50)

if os.path.exists(data_folder):
    files = os.listdir(data_folder)
    for i, file in enumerate(files, 1):
        print(f"{i}. {file}")
else:
    print("Le dossier 'Data/' n'existe pas !")

Fichiers dans le dossier 'Data/' :
1. .ipynb_checkpoints
2. auto-mpg.csv
3. BreastCanDT.csv
4. concrete_data.csv
5. dataset_scenario1.csv
6. dataset_scenario2.csv
7. dataset_scenario3.csv
8. dataset_scenario4.csv
9. HousingData.csv
10. ozone.csv
11. parkinsons.csv
12. ReplicatedAcousticFeatures-ParkinsonDatabase.csv
13. RLT_PROJECT.ipynb
14. sonar.csv
15. winequality-red.csv
16. winequality-white.csv


In [7]:
"""
============================================================================
AUGMENTATION DES FEATURES √Ä 500
============================================================================
Ce script charge chaque dataset et augmente le nombre de features √† 500
en cr√©ant des features synth√©tiques (signal + bruit, ratio 1:2)
============================================================================
"""

import numpy as np
import pandas as pd
import os
from sklearn.preprocessing import LabelEncoder

# ============================================================================
# CONFIGURATION
# ============================================================================

TARGET_P = 500  # Nombre final de features
SEED = 42
SIGNAL_NOISE_RATIO = 0.33  # Ratio 1:2 (1 part signal, 2 parts bruit)
DATA_FOLDER = 'Data'
OUTPUT_FOLDER = 'datasets_augmented'

# Configuration des datasets
DATASETS_CONFIG = [
    {'name': 'HousingData', 'filepath': 'Data/HousingData.csv', 'target_col': 'MEDV', 'sep': ','},
    {'name': 'BreastCanDT', 'filepath': 'Data/BreastCanDT.csv', 'target_col': 'diagnosis', 'sep': ','},
    {'name': 'parkinsons', 'filepath': 'data/parkinsons.csv', 'target_col': 'status', 'sep': ','},
    {'name': 'sonar', 'filepath': 'data/sonar.csv', 'target_col': 'R', 'sep': ','},
    {'name': 'winequality-white', 'filepath': 'Data/winequality-white.csv', 'target_col': 'quality', 'sep': ';'},
    {'name': 'winequality-red', 'filepath': 'Data/winequality-red.csv', 'target_col': 'quality', 'sep': ','},
    {'name': 'ReplicatedAcousticFeatures-ParkinsonDatabase', 'filepath': 'Data/ReplicatedAcousticFeatures-ParkinsonDatabase.csv', 'target_col': 'Status', 'sep': ','},
    {'name': 'ozone', 'filepath': 'Data/ozone.csv', 'target_col': 'maxO3', 'sep': ','},
    {'name': 'concrete_data', 'filepath': 'Data/concrete_data.csv', 'target_col': 'concrete_compressive_strength', 'sep': ','},
    {'name': 'auto_mpg', 'filepath': 'Data/auto-mpg.csv', 'target_col': 'mpg', 'sep': ','}
]

# ============================================================================
# FONCTION : AUGMENTER FEATURES
# ============================================================================

def augment_features(X, target_p=TARGET_P, seed=SEED):
    """
    Augmente le nombre de features de p_original √† target_p
    
    Pour chaque nouvelle feature :
    - S√©lectionne une feature originale au hasard
    - G√©n√®re du bruit al√©atoire
    - Combine : nouvelle_feature = 1/3 * signal + 2/3 * bruit
    
    Parameters:
    -----------
    X : array (n_samples, p_original)
    target_p : int (500)
    
    Returns:
    --------
    X_augmented : array (n_samples, 500)
    """
    np.random.seed(seed)
    
    n_samples, p_original = X.shape
    n_new = target_p - p_original
    
    if n_new <= 0:
        print(f"      ‚ö†Ô∏è  D√©j√† {p_original} features")
        return X
    
    print(f"      Augmentation : {p_original} ‚Üí {target_p} features")
    
    # Cr√©er matrice pour nouvelles features
    X_new = np.zeros((n_samples, n_new))
    
    for i in range(n_new):
        # S√©lectionner une feature originale au hasard
        idx = np.random.randint(0, p_original)
        signal = X[:, idx]
        
        # G√©n√©rer bruit (m√™me √©chelle que le signal)
        noise_scale = np.std(signal) if np.std(signal) > 0 else 1.0
        noise = np.random.normal(0, noise_scale, n_samples)
        
        # Combiner (ratio 1:2)
        X_new[:, i] = SIGNAL_NOISE_RATIO * signal + (1 - SIGNAL_NOISE_RATIO) * noise
    
    # Coller nouvelles features aux originales
    X_augmented = np.hstack([X, X_new])
    
    return X_augmented

# ============================================================================
# FONCTION : TRAITER UN DATASET
# ============================================================================

def process_dataset(config):
    """
    Charge un dataset et augmente ses features √† 500
    """
    print(f"\n{'='*70}")
    print(f"üìä {config['name']}")
    print(f"{'='*70}")
    
    try:
        # Charger le CSV
        filepath = config['filepath']
        df = pd.read_csv(filepath, sep=config['sep'])
        
        print(f"   Shape originale : {df.shape}")
        
        # S√©parer X et y
        target_col = config['target_col']
        y = df[target_col]
        X_df = df.drop(columns=[target_col])
        
        # Garder SEULEMENT les colonnes num√©riques pour X
        X_df = X_df.select_dtypes(include=[np.number])
        original_column_names = X_df.columns.tolist()
        X = X_df.values
        
        # Encoder y si texte et sauvegarder le mapping
        mapping = None
        if y.dtype == 'object':
            le = LabelEncoder()
            y_encoded = le.fit_transform(y)
            # Cr√©er mapping : {0: "B", 1: "M", ...}
            mapping = {int(i): str(label) for i, label in enumerate(le.classes_)}
            print(f"   Target encode : {mapping}")
            y = y_encoded
        else:
            y = y.values
        
        # Imputation intelligente des valeurs manquantes
        if pd.isna(X).any():
            print(f"   Valeurs manquantes detectees")
            X_df = pd.DataFrame(X)
            
            for col_idx in range(X_df.shape[1]):
                col = X_df.iloc[:, col_idx]
    
            if col.isna().any():
                # V√©rifier si colonne binaire (seulement 0 et 1)
                unique_vals = col.dropna().unique()
                is_binary = set(unique_vals).issubset({0, 1, 0.0, 1.0})
        
            if is_binary:
                # Imputation par mode (valeur la plus fr√©quente)
                mode_val = col.mode()[0]
                X_df.iloc[:, col_idx] = X_df.iloc[:, col_idx].fillna(mode_val)
                print(f"      Colonne {col_idx} (binaire) : mode = {mode_val}")
            else:
                # Imputation par m√©diane pour colonnes num√©riques
                median_val = col.median()
                X_df.iloc[:, col_idx] = X_df.iloc[:, col_idx].fillna(median_val)
                print(f"      Colonne {col_idx} (numerique) : mediane = {median_val:.2f}")

            X = X_df.values

        
        X = X.astype(float)
        y = y.astype(float)
        
        print(f"   X : {X.shape}, y : {y.shape}")
        
        # AUGMENTER LES FEATURES
        X_augmented = augment_features(X, target_p=TARGET_P)
        
        # Sauvegarder
        os.makedirs(OUTPUT_FOLDER, exist_ok=True)
        
        # Cr√©er les noms de colonnes
        n_original = len(original_column_names)
        n_synthetic = X_augmented.shape[1] - n_original
        
        # Noms : [vraies features] + [synthetic_0, synthetic_1, ...]
        column_names = original_column_names + [f'synthetic_{i}' for i in range(n_synthetic)]
        
        output_df = pd.DataFrame(X_augmented, columns=column_names)
        output_df[target_col] = y
        
        output_path = os.path.join(OUTPUT_FOLDER, f"{config['name']}.csv")
        output_df.to_csv(output_path, index=False)
        
        print(f"   ‚úÖ Sauvegard√© : {output_path}")
        
        return {
            'name': config['name'], 
            'status': 'success', 
            'shape': X_augmented.shape,
            'mapping': mapping  # Peut √™tre None si pas d'encoding
        }
        
    except Exception as e:
        print(f"   ‚ùå ERREUR : {e}")
        return {
            'name': config['name'], 
            'status': 'failed', 
            'error': str(e)
        }

# ============================================================================
# EX√âCUTION
# ============================================================================

if __name__ == "__main__":
    print("="*70)
    print("AUGMENTATION DES FEATURES √Ä 500")
    print("="*70)
    
    results = []
    
    for config in DATASETS_CONFIG:
        result = process_dataset(config)
        results.append(result)
    
    # R√©sum√©
    print("\n" + "="*70)
    print("R√âSUM√â")
    print("="*70)
    
    success = [r for r in results if r['status'] == 'success']
    failed = [r for r in results if r['status'] == 'failed']
    
    print(f"\n‚úÖ R√©ussis : {len(success)}/{len(results)}")
    
    for r in success:
        print(f"   {r['name']:<50s} ‚Üí {r['shape']}")
    
    if failed:
        print(f"\n‚ùå √âchou√©s : {len(failed)}")
        for r in failed:
            print(f"   {r['name']} : {r['error']}")
    
    # Sauvegarder les mappings dans un fichier JSON
    mappings = {}
    for r in success:
        if r.get('mapping'):  # Si le dataset avait un target encod√©
            mappings[r['name']] = r['mapping']
    
    if mappings:
        import json
        mapping_path = os.path.join(OUTPUT_FOLDER, 'target_mappings.json')
        with open(mapping_path, 'w', encoding='utf-8') as f:
            json.dump(mappings, f, indent=2, ensure_ascii=False)
        
        print(f"\nüìã Mappings des targets encod√©s : {mapping_path}")
        print("\n   Exemple de contenu :")
        for name, mapping in list(mappings.items())[:3]:
            print(f"   ‚Ä¢ {name}: {mapping}")
    
    print(f"\nüíæ Datasets sauvegard√©s dans : {OUTPUT_FOLDER}/")
    print("\n‚úÖ TERMIN√â !")

AUGMENTATION DES FEATURES √Ä 500

üìä HousingData
   Shape originale : (506, 14)
   Valeurs manquantes detectees
      Colonne 0 (numerique) : mediane = 0.25
      Colonne 1 (numerique) : mediane = 0.00
      Colonne 2 (numerique) : mediane = 9.69
      Colonne 3 (binaire) : mode = 0.0
      Colonne 6 (numerique) : mediane = 76.80
      Colonne 12 (numerique) : mediane = 11.43
   X : (506, 13), y : (506,)
      Augmentation : 13 ‚Üí 500 features


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  X_df.iloc[:, col_idx].fillna(median_val, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  X_df.iloc[:, col_idx].fillna(mode_val, inplace=True)


   ‚úÖ Sauvegard√© : datasets_augmented\HousingData.csv

üìä BreastCanDT
   Shape originale : (569, 33)
   Target encode : {0: 'B', 1: 'M'}
   Valeurs manquantes detectees
   ‚ùå ERREUR : 0

üìä parkinsons
   Shape originale : (195, 24)
   X : (195, 22), y : (195,)
      Augmentation : 22 ‚Üí 500 features
   ‚úÖ Sauvegard√© : datasets_augmented\parkinsons.csv

üìä sonar
   Shape originale : (207, 61)
   Target encode : {0: 'M', 1: 'R'}
   X : (207, 60), y : (207,)
      Augmentation : 60 ‚Üí 500 features
   ‚úÖ Sauvegard√© : datasets_augmented\sonar.csv

üìä winequality-white
   Shape originale : (4898, 12)
   X : (4898, 11), y : (4898,)
      Augmentation : 11 ‚Üí 500 features
   ‚úÖ Sauvegard√© : datasets_augmented\winequality-white.csv

üìä winequality-red
   Shape originale : (1599, 12)
   X : (1599, 11), y : (1599,)
      Augmentation : 11 ‚Üí 500 features
   ‚úÖ Sauvegard√© : datasets_augmented\winequality-red.csv

üìä ReplicatedAcousticFeatures-ParkinsonDatabase
   Shape o