In [2]:
"""
GENERATION DES 4 SCENARIOS DE SIMULATION
Section 4.2 - Simulation Scenarios

Generation des datasets pour chaque scenario avec p = 200, 500, 1000
Sauvegarde en fichiers CSV (sans splits train/test)
"""

import numpy as np
from scipy.stats import norm
import pandas as pd
import os

OUTPUT_FOLDER = 'datasets_augmented'
SEED = 42

def generate_scenario_1(N, p, seed=None):
    """
    Scenario 1: binary classification with independent covariates.

    Model:
        X  ~ Uniform[0, 1]^p
        mu = Phi(10 * (X1 - 1) + 20 * |X2 - 0.5|)
        Y  ~ Bernoulli(mu)

    Only X1 and X2 (columns 0 and 1 of X) drive the response.
    The study design for this scenario uses N = 100.

    Parameters
    ----------
    N : int
        Number of observations (rows of X).
    p : int
        Number of covariates (columns of X).
    seed : int, optional
        When not None, NumPy's global random generator is reseeded with
        this value before any draw is made.

    Returns
    -------
    X : ndarray of shape (N, p)
        Uniform covariates.
    Y : ndarray of shape (N,)
        Bernoulli draws (0 or 1).
    """
    if seed is not None:
        np.random.seed(seed)

    design = np.random.uniform(low=0, high=1, size=(N, p))
    x1, x2 = design[:, 0], design[:, 1]

    # Argument of the probit link, split into its two additive terms.
    linear_part = 10 * (x1 - 1)
    abs_part = 20 * np.abs(x2 - 0.5)
    success_prob = norm.cdf(linear_part + abs_part)

    labels = np.random.binomial(1, success_prob)
    return design, labels

def generate_scenario_2(N, p, seed=None):
    """
    Scenario 2: non-linear regression with independent covariates.

    Model:
        X ~ Uniform[0, 1]^p
        Y = 100 * (X1 - 0.5)^2 * (X2 - 0.25)_+ + epsilon
        epsilon ~ N(0, 1)

    where (z)_+ = max(z, 0). Only X1 and X2 (columns 0 and 1 of X) drive
    the response. The study design for this scenario uses N = 100.

    Parameters
    ----------
    N : int
        Number of observations (rows of X).
    p : int
        Number of covariates (columns of X).
    seed : int, optional
        When not None, NumPy's global random generator is reseeded with
        this value before any draw is made.

    Returns
    -------
    X : ndarray of shape (N, p)
        Uniform covariates.
    Y : ndarray of shape (N,)
        Noisy continuous response.
    """
    if seed is not None:
        np.random.seed(seed)

    design = np.random.uniform(low=0, high=1, size=(N, p))
    x1, x2 = design[:, 0], design[:, 1]

    curvature = 100 * (x1 - 0.5) ** 2
    hinge = np.maximum(x2 - 0.25, 0)  # positive part of (X2 - 0.25)
    signal = curvature * hinge

    noise = np.random.normal(0, 1, N)
    response = signal + noise
    return design, response

def generate_scenario_3(N, p, seed=None):
    """
    Scenario 3: checkerboard-style interaction model with strong correlation.

    Model:
        X ~ N(0, Sigma) with Sigma_ij = 0.9^|i-j|
        Y = 2 * X50 * X100 + 2 * X150 * X200 + epsilon
        epsilon ~ N(0, 1)

    Only X50, X100, X150 and X200 (columns 49, 99, 149 and 199 of X) drive
    the response. The study design for this scenario uses N = 300.

    Parameters
    ----------
    N : int
        Number of observations (rows of X).
    p : int
        Number of covariates (columns of X); must be at least 200 because
        the response reads column 199.
    seed : int, optional
        When not None, NumPy's global random generator is reseeded with
        this value before any draw is made.

    Returns
    -------
    X : ndarray of shape (N, p)
        Correlated Gaussian covariates.
    Y : ndarray of shape (N,)
        Noisy continuous response.

    Raises
    ------
    IndexError
        If p < 200. Raised before any sampling so the failure is immediate;
        IndexError is kept because that is what indexing column 199 of a
        narrower matrix raises.
    """
    if p < 200:
        raise IndexError(
            f"generate_scenario_3 needs p >= 200 "
            f"(the response uses X50, X100, X150, X200); got p={p}"
        )

    if seed is not None:
        np.random.seed(seed)

    # Sigma_ij depends only on the lag |i - j|, so evaluate the p distinct
    # powers once (as plain Python floats) and spread them over the matrix
    # with an integer-array lookup instead of a p x p interpreted loop.
    lag_powers = np.array([0.9 ** lag for lag in range(p)])
    positions = np.arange(p)
    Sigma = lag_powers[np.abs(positions[:, None] - positions[None, :])]

    X = np.random.multivariate_normal(np.zeros(p), Sigma, size=N)
    Y = 2 * X[:, 49] * X[:, 99] + 2 * X[:, 149] * X[:, 199]
    Y += np.random.normal(0, 1, N)

    return X, Y

def generate_scenario_4(N, p, seed=None):
    """
    Scenario 4: linear model with correlated covariates.

    Model:
        X ~ N(0, Sigma) with Sigma_ij = 0.5^|i-j| + 0.2 * I(i = j)
        Y = 2 * X50 + 2 * X100 + 4 * X150 + epsilon
        epsilon ~ N(0, 1)

    Only X50, X100 and X150 (columns 49, 99 and 149 of X) drive the
    response. The study design for this scenario uses N = 200.

    Parameters
    ----------
    N : int
        Number of observations (rows of X).
    p : int
        Number of covariates (columns of X); must be at least 150 because
        the response reads column 149.
    seed : int, optional
        When not None, NumPy's global random generator is reseeded with
        this value before any draw is made.

    Returns
    -------
    X : ndarray of shape (N, p)
        Correlated Gaussian covariates.
    Y : ndarray of shape (N,)
        Noisy continuous response.

    Raises
    ------
    IndexError
        If p < 150. Raised before any sampling so the failure is immediate;
        IndexError is kept because that is what indexing column 149 of a
        narrower matrix raises.
    """
    if p < 150:
        raise IndexError(
            f"generate_scenario_4 needs p >= 150 "
            f"(the response uses X50, X100, X150); got p={p}"
        )

    if seed is not None:
        np.random.seed(seed)

    # The off-diagonal structure depends only on the lag |i - j|, so evaluate
    # the p distinct powers once (as plain Python floats) and spread them
    # over the matrix with an integer-array lookup instead of a p x p
    # interpreted loop.
    lag_powers = np.array([0.5 ** lag for lag in range(p)])
    positions = np.arange(p)
    Sigma = lag_powers[np.abs(positions[:, None] - positions[None, :])]
    # Extra 0.2 on the diagonal only: the I(i = j) term of the model.
    Sigma[positions, positions] += 0.2

    X = np.random.multivariate_normal(np.zeros(p), Sigma, size=N)
    Y = 2 * X[:, 49] + 2 * X[:, 99] + 4 * X[:, 149]
    Y += np.random.normal(0, 1, N)

    return X, Y

def generate_and_save_scenario(scenario_num, scenario_func, N, p_values,
                               *, output_folder=None, seed=None):
    """
    Generate one scenario's dataset for each dimension and save it as CSV.

    For every p in ``p_values`` the generator is called as
    ``scenario_func(N=N, p=p, seed=seed)``, the result is laid out as a
    DataFrame with columns ``X1 .. Xp`` followed by ``Y``, and written to
    ``<output_folder>/scenario<scenario_num>_p<p>.csv`` without the index.
    Progress is printed to stdout.

    Parameters
    ----------
    scenario_num : int
        Scenario identifier, used in the banner and in the file name.
    scenario_func : callable
        Generator accepting keyword arguments ``N``, ``p`` and ``seed`` and
        returning ``(X, Y)`` where X has p columns.
    N : int
        Number of observations passed to the generator.
    p_values : iterable of int
        Dimensions to generate; one CSV file is written per value.
    output_folder : str, optional
        Destination directory (created if missing). Defaults to the
        module-level ``OUTPUT_FOLDER`` when None.
    seed : int, optional
        Seed forwarded to the generator. Defaults to the module-level
        ``SEED`` when None.

    Notes
    -----
    The same seed is forwarded for every value of p, so each file can be
    regenerated on its own.
    """
    # Resolve defaults at call time so later changes to the module constants
    # are honoured.
    if output_folder is None:
        output_folder = OUTPUT_FOLDER
    if seed is None:
        seed = SEED

    print(f"\n{'='*70}")
    print(f"SCENARIO {scenario_num}")
    print(f"{'='*70}")

    # The destination does not depend on p: create it once, up front.
    os.makedirs(output_folder, exist_ok=True)

    for p in p_values:
        print(f"\n  p = {p}")

        # Generate the dataset
        X, Y = scenario_func(N=N, p=p, seed=seed)
        print(f"    Dataset: X shape = {X.shape}, Y shape = {Y.shape}")

        # Build the DataFrame: covariates X1..Xp, then the response Y
        columns = [f"X{i+1}" for i in range(p)]
        df = pd.DataFrame(X, columns=columns)
        df['Y'] = Y

        # Save
        file_path = os.path.join(output_folder, f"scenario{scenario_num}_p{p}.csv")
        df.to_csv(file_path, index=False)

        print(f"    Sauvegarde: {file_path}")

if __name__ == "__main__":
    # Script entry point: generate every scenario at every dimension, then
    # print a summary of the files that were written.
    rule = "=" * 70

    print(rule)
    print("GENERATION DES DATASETS DES 4 SCENARIOS")
    print(rule)

    p_values = [200, 500, 1000]

    # One row per scenario: (scenario number, generator, sample size N).
    scenario_specs = [
        (1, generate_scenario_1, 100),
        (2, generate_scenario_2, 100),
        (3, generate_scenario_3, 300),
        (4, generate_scenario_4, 200),
    ]

    for number, generator, sample_size in scenario_specs:
        generate_and_save_scenario(
            scenario_num=number,
            scenario_func=generator,
            N=sample_size,
            p_values=p_values,
        )

    print("\n" + rule)
    print("RESUME")
    print(rule)
    print(f"\nDossier de sortie: {OUTPUT_FOLDER}/")
    print("\nFichiers generes:")

    # List the expected file names, scenario by scenario.
    for scenario, _generator, _sample_size in scenario_specs:
        for p in p_values:
            print(f"  - scenario{scenario}_p{p}.csv")

    print("\nTotal: 12 fichiers (4 scenarios x 3 dimensions)")
    print("\nTERMINE")

GENERATION DES DATASETS DES 4 SCENARIOS

SCENARIO 1

  p = 200
    Dataset: X shape = (100, 200), Y shape = (100,)
    Sauvegarde: datasets_augmented\scenario1_p200.csv

  p = 500
    Dataset: X shape = (100, 500), Y shape = (100,)
    Sauvegarde: datasets_augmented\scenario1_p500.csv

  p = 1000
    Dataset: X shape = (100, 1000), Y shape = (100,)
    Sauvegarde: datasets_augmented\scenario1_p1000.csv

SCENARIO 2

  p = 200
    Dataset: X shape = (100, 200), Y shape = (100,)
    Sauvegarde: datasets_augmented\scenario2_p200.csv

  p = 500
    Dataset: X shape = (100, 500), Y shape = (100,)
    Sauvegarde: datasets_augmented\scenario2_p500.csv

  p = 1000
    Dataset: X shape = (100, 1000), Y shape = (100,)
    Sauvegarde: datasets_augmented\scenario2_p1000.csv

SCENARIO 3

  p = 200
    Dataset: X shape = (300, 200), Y shape = (300,)
    Sauvegarde: datasets_augmented\scenario3_p200.csv

  p = 500
    Dataset: X shape = (300, 500), Y shape = (300,)
    Sauvegarde: datasets_augmented\s