In [None]:
# Simple Dataset Balance Check
import pandas as pd
import numpy as np
from pathlib import Path
from sklearn.model_selection import train_test_split

# Seed passed as `random_state` to train_test_split so the partition is reproducible.
RANDOM_SEED = 42

# Score columns read from every processed CSV.
FEATURE_COLUMNS = [
    "Depression",
    "Anxiety",
    "Stress",
    "Burnout",
]

# Display label -> CSV path. Every file follows the "<stem>_processed.csv"
# naming pattern, resolved relative to the notebook's working directory.
DATASETS = {
    label: Path(f"{stem}_processed.csv")
    for label, stem in (
        ("D1-Swiss", "D1_Swiss"),
        ("D2-Cultural", "D2_Cultural"),
        ("D3-Academic", "D3_Academic"),
        ("D4-Tech", "D4_Tech"),
    )
}


In [None]:
# Train/test balance report: for each dataset, make a plain (unstratified)
# 80/20 split and compare the per-feature means of the two parts.
for dataset_name, dataset_path in DATASETS.items():
    print(f"\n{dataset_name}:")
    print("-" * 60)

    df = pd.read_csv(dataset_path)
    feature_matrix = df[FEATURE_COLUMNS].values

    # Same split parameters as Autoencoder.ipynb (per the original note).
    train_val_data, test_data = train_test_split(
        feature_matrix, test_size=0.2, random_state=RANDOM_SEED
    )

    # Wrap the arrays back into labelled frames so columns can be addressed by name.
    train_df, test_df = (
        pd.DataFrame(part, columns=FEATURE_COLUMNS)
        for part in (train_val_data, test_data)
    )

    n_total, n_train, n_test = len(feature_matrix), len(train_val_data), len(test_data)
    print(f"Size: {n_total} total | Train: {n_train} (80%) | Test: {n_test} (20%)")

    print("\nTrain/Test Balance Check:")
    for col in FEATURE_COLUMNS:
        train_mean = train_df[col].mean()
        test_mean = test_df[col].mean()
        abs_diff = abs(train_mean - test_mean)

        if abs(train_mean) < 0.1:
            # Train mean is close to zero: a relative difference would blow up,
            # so judge the gap on its absolute size instead.
            within_tolerance = abs_diff < 0.05
            detail = f"Abs diff = {abs_diff:.4f} (Train: {train_mean:.4f}, Test: {test_mean:.4f})"
        else:
            # Otherwise express the gap as a percentage of the train mean.
            mean_diff_pct = abs((train_mean - test_mean) / train_mean * 100)
            within_tolerance = mean_diff_pct < 5
            detail = f"Mean diff = {mean_diff_pct:.2f}% | Abs diff = {abs_diff:.4f} (Train: {train_mean:.3f}, Test: {test_mean:.3f})"

        status = "✓" if within_tolerance else "⚠"
        print(f"  {col}: {status} {detail}")


In [None]:
# Pragmatic Grid Search: Find optimal bin number for stratified splits
import pandas as pd
import numpy as np
from pathlib import Path
from sklearn.model_selection import train_test_split

RANDOM_SEED = 42

FEATURE_COLUMNS = ["Depression", "Anxiety", "Stress", "Burnout"]
DATASETS = {
    "D1-Swiss": Path("D1_Swiss_processed.csv"),
    "D2-Cultural": Path("D2_Cultural_processed.csv"),
    "D3-Academic": Path("D3_Academic_processed.csv"),
    "D4-Tech": Path("D4_Tech_processed.csv"),
}
MAX_BINS_TO_TRY = 6

# Fraction of rows held out as the test split.
TEST_FRACTION = 0.2
# Train means smaller than this (in magnitude) are compared by absolute
# difference; a percentage would be unstable with a near-zero denominator.
NEAR_ZERO_MEAN = 0.1
# Per-feature limits used to pick the ✓ / ⚠ marker in the report.
ABS_DIFF_LIMIT = 0.05
PCT_DIFF_LIMIT = 5


def _stratify_labels(frame, n_bins):
    """Return one stratification label per row of ``frame``.

    Every column in FEATURE_COLUMNS is cut into quantile bins with
    ``pd.qcut(..., q=n_bins, labels=False, duplicates='drop')`` and the
    per-feature bin indices of a row are joined with underscores.
    Because of ``duplicates='drop'`` a column may end up with fewer than
    ``n_bins`` bins when quantile edges coincide (see the pandas qcut docs).

    Raises whatever ``pd.qcut`` raises (the caller handles ValueError).
    """
    bin_frame = pd.DataFrame({
        f'{col}_bin': pd.qcut(frame[col], q=n_bins, labels=False, duplicates='drop')
        for col in FEATURE_COLUMNS
    })
    return bin_frame.apply(lambda row: '_'.join(row.astype(str)), axis=1)


def _feature_imbalances(train_df, test_df):
    """Compare train and test means for every feature.

    Returns a list with one dict per feature holding 'feature',
    'train_mean', 'test_mean', 'diff' and 'metric'. 'metric' is "abs"
    (absolute difference) when the train mean is near zero, otherwise
    "pct" (difference as a percentage of the train mean).
    """
    imbalances = []
    for col in FEATURE_COLUMNS:
        train_mean = train_df[col].mean()
        test_mean = test_df[col].mean()

        if abs(train_mean) < NEAR_ZERO_MEAN:
            normalized_diff = abs(train_mean - test_mean)
            metric = "abs"
        else:
            normalized_diff = abs((train_mean - test_mean) / train_mean * 100)
            metric = "pct"

        imbalances.append({
            'feature': col,
            'train_mean': train_mean,
            'test_mean': test_mean,
            'diff': normalized_diff,
            'metric': metric
        })
    return imbalances


def _print_best_details(best_bins, best_imbalance, best_results):
    """Print the winning bin count and its per-feature train/test comparison."""
    print(f"  ✓ Best: {best_bins} bins (Total imbalance = {best_imbalance:.2f})")
    print(f"\n  Best configuration details ({best_bins} bins):")
    for fi in best_results['feature_imbalances']:
        if fi['metric'] == 'abs':
            status = "✓" if fi['diff'] < ABS_DIFF_LIMIT else "⚠"
            print(f"    {fi['feature']}: {status} {fi['diff']:.4f} abs diff "
                  f"(Train: {fi['train_mean']:.4f}, Test: {fi['test_mean']:.4f})")
        else:
            status = "✓" if fi['diff'] < PCT_DIFF_LIMIT else "⚠"
            print(f"    {fi['feature']}: {status} {fi['diff']:.2f}% diff "
                  f"(Train: {fi['train_mean']:.3f}, Test: {fi['test_mean']:.3f})")


def _print_results_table(results_list):
    """Print one table row per attempted bin count (N/A for failed attempts)."""
    print(f"\n  All results:")
    print(f"  {'Bins':<6} {'Status':<10} {'Total Imbalance':<15} {'Avg Imbalance':<15} {'Max Imbalance':<15}")
    print(f"  {'-'*6} {'-'*10} {'-'*15} {'-'*15} {'-'*15}")
    for r in results_list:
        if r['status'] == 'Success':
            print(f"  {r['bins']:<6} {r['status']:<10} {r['total_imbalance']:<15.2f} "
                  f"{r['avg_imbalance']:<15.2f} {r['max_imbalance']:<15.2f}")
        else:
            print(f"  {r['bins']:<6} {r['status']:<10} {'N/A':<15} {'N/A':<15} {'N/A':<15}")


print("="*80)
print("GRID SEARCH: Finding Optimal Bin Number for Stratified Splits")
print("="*80)
print(f"Testing bins from 2 to {MAX_BINS_TO_TRY}")
print("="*80)

# dataset label -> {'best_bins', 'best_imbalance', 'best_results', 'all_results'}
all_results = {}

for dataset_name, dataset_path in DATASETS.items():
    print(f"\n{'='*80}")
    print(f"{dataset_name}")
    print(f"{'='*80}")

    df = pd.read_csv(dataset_path)
    feature_matrix = df[FEATURE_COLUMNS].values
    dataset_size = len(feature_matrix)

    print(f"Dataset size: {dataset_size}")
    best_bins = None
    best_imbalance = float('inf')
    best_results = None
    results_list = []

    for n_bins in range(2, MAX_BINS_TO_TRY + 1):
        try:
            stratify_labels = _stratify_labels(df, n_bins)

            # A stratified split cannot place a single-row stratum on both
            # sides, so reject this bin count up front with a clear message.
            min_stratum_size = stratify_labels.value_counts().min()
            if min_stratum_size < 2:
                print(f"  {n_bins} bins: ✗ Failed (min stratum size = {min_stratum_size})")
                results_list.append({
                    'bins': n_bins,
                    'status': 'Failed',
                    'reason': f'Min stratum size = {min_stratum_size}'
                })
                continue

            # Perform stratified split
            train_val_data, test_data = train_test_split(
                feature_matrix,
                test_size=TEST_FRACTION,
                random_state=RANDOM_SEED,
                stratify=stratify_labels
            )

            train_df = pd.DataFrame(train_val_data, columns=FEATURE_COLUMNS)
            test_df = pd.DataFrame(test_data, columns=FEATURE_COLUMNS)

            feature_imbalances = _feature_imbalances(train_df, test_df)
            diffs = [fi['diff'] for fi in feature_imbalances]

            # NOTE(review): 'diff' is an absolute difference for near-zero
            # means and a percentage otherwise, so these aggregates add values
            # in different units when a dataset has both kinds of feature.
            total_imbalance = sum(diffs)
            max_imbalance = max([0, *diffs])
            avg_imbalance = total_imbalance / len(FEATURE_COLUMNS)

            print(f"  {n_bins} bins: ✓ Total imbalance = {total_imbalance:.2f} | "
                  f"Max = {max_imbalance:.2f} | Avg = {avg_imbalance:.2f}")

            results_list.append({
                'bins': n_bins,
                'status': 'Success',
                'total_imbalance': total_imbalance,
                'avg_imbalance': avg_imbalance,
                'max_imbalance': max_imbalance,
                'feature_imbalances': feature_imbalances
            })

            # Track best result (strict '<' keeps the smallest bin count on ties)
            if total_imbalance < best_imbalance:
                best_imbalance = total_imbalance
                best_bins = n_bins
                best_results = {
                    'train_df': train_df,
                    'test_df': test_df,
                    'feature_imbalances': feature_imbalances,
                    'total_imbalance': total_imbalance,
                    'avg_imbalance': avg_imbalance,
                    'max_imbalance': max_imbalance
                }

        except (ValueError, KeyError) as e:
            # Binning or splitting rejected this bin count; record it and
            # move on to the next one instead of aborting the whole search.
            error_msg = str(e)
            if "least populated class" in error_msg:
                reason = "Some strata have < 2 samples"
            else:
                reason = error_msg[:40]

            print(f"  {n_bins} bins: ✗ Failed ({reason})")
            results_list.append({
                'bins': n_bins,
                'status': 'Failed',
                'reason': reason
            })
            continue

    # Summary for this dataset
    print(f"\n  Summary:")
    if best_bins is not None:
        _print_best_details(best_bins, best_imbalance, best_results)
    else:
        print(f"  ✗ No successful configuration found")

    # Results table
    _print_results_table(results_list)

    all_results[dataset_name] = {
        'best_bins': best_bins,
        'best_imbalance': best_imbalance,
        'best_results': best_results,
        'all_results': results_list
    }

print(f"\n{'='*80}")
print("GRID SEARCH COMPLETE")
print(f"{'='*80}")
print("\nRecommendations:")
for dataset_name, result in all_results.items():
    # Explicit None check, matching the per-dataset summary above.
    if result['best_bins'] is not None:
        print(f"  {dataset_name}: Use {result['best_bins']} bins "
              f"(Imbalance: {result['best_imbalance']:.2f})")
    else:
        print(f"  {dataset_name}: No working configuration found")
print("="*80)
