In [8]:
# Install required packages for working with Kaggle datasets
# Note: Run this in terminal: pip install kaggle opendatasets
# Or uncomment below to install programmatically:
try:
    import kaggle
    import opendatasets
except ImportError:
    import subprocess
    import sys
    print("Installing kaggle and opendatasets...")
    subprocess.check_call([sys.executable, "-m", "pip", "install", "kaggle", "opendatasets"])
    import kaggle
    import opendatasets

Installing kaggle and opendatasets...


ModuleNotFoundError: No module named 'cgi'

In [9]:
import kaggle
import os 
kaggle.api.dataset_download_files('osmi/mental-health-in-tech-survey', 
                                  path='./data', 
                                  unzip=True)

print("Dataset downloaded successfully!")

Dataset URL: https://www.kaggle.com/datasets/thedevastator/medical-student-mental-health
Dataset downloaded successfully!
Dataset downloaded successfully!


In [1]:
import pandas as pd
import glob
import os 

os.makedirs("./data", exist_ok=True)

data_files = glob.glob('./data/*')
print("Files in the data directory:")
for file in data_files:
    print(f"- {file}")

df = None
for file in data_files:
    if file.endswith('.csv'):
        try:
            print(f"\nTrying to load: {file}")
            
            if 'codebook' in file.lower():
                print("Skipping codebook file")
                continue
            elif 'data' in file.lower():
                df = pd.read_csv(file, encoding='utf-8', on_bad_lines='skip')
                print(f"Successfully loaded: {file}")
                print(f"Shape: {df.shape}")
                print(f"Columns: {df.columns.tolist()}")
                
            else:
                df = pd.read_csv(file, encoding='utf-8', on_bad_lines='skip')
                print(f"Successfully loaded: {file}")
                print(f"Shape: {df.shape}")
                print(f"Columns: {df.columns.tolist()}")
                
                
        except Exception as e:
            print(f"Error loading {file}: {str(e)}")
            continue

if df is not None:
    print("\nFirst 5 rows:")
    print(df.head())
else:
    print("No dataset could be loaded")

Files in the data directory:
- ./data\D1-Swiss.csv
- ./data\D1_Swiss_processed.csv
- ./data\D2-Cultural.csv
- ./data\D3-Academic.csv
- ./data\D4-Tech.csv

Trying to load: ./data\D1-Swiss.csv
Successfully loaded: ./data\D1-Swiss.csv
Shape: (886, 20)
Columns: ['id', 'age', 'year', 'sex', 'glang', 'part', 'job', 'stud_h', 'health', 'psyt', 'jspe', 'qcae_cog', 'qcae_aff', 'amsp', 'erec_mean', 'cesd', 'stai_t', 'mbi_ex', 'mbi_cy', 'mbi_ea']

Trying to load: ./data\D1_Swiss_processed.csv
Successfully loaded: ./data\D1_Swiss_processed.csv
Shape: (886, 56)
Columns: ['id', 'age', 'year', 'sex', 'glang', 'part', 'job', 'stud_h', 'health', 'PSYT_Therapy_Use', 'jspe', 'qcae_cog', 'qcae_aff', 'amsp', 'erec_mean', 'Depression', 'Anxiety', 'Burnout', 'Stress', 'mbi_ea', 'Source_Group', 'Gender', 'Age', 'City', 'Profession', 'Work Pressure', 'CGPA', 'Job Satisfaction', 'Sleep Duration', 'Dietary Habits', 'Degree', 'Have you ever had suicidal thoughts ?', 'Work/Study Hours', 'Family History of Mental I

In [2]:
import numpy as np
import pandas as pd
import sys
import joblib
import re
from pathlib import Path
from sklearn.preprocessing import StandardScaler

DATA_DIR = Path("./data")

# Fixed file paths to match actual filenames in data directory
FILE_PATHS = {
    "D1-Swiss": "D1-Swiss.csv",
    "D2-Cultural": "D2-Cultural.csv",
    "D3-Academic": "D3-Academic.csv",
    "D4-Tech": "D4-Tech.csv",
}

OUTPUT_PATHS = {
    "D1-Swiss": "D1_Swiss_processed.csv",
    "D2-Cultural": "D2_Cultural_processed.csv",
    "D3-Academic": "D3_Academic_processed.csv",
    "D4-Tech": "D4_Tech_processed.csv",
}

COLUMN_MAPPING = {
    "D1-Swiss": {
        "cesd": "Depression",
        "stai_t": "Anxiety",
        "mbi_ex": "Burnout",
        "mbi_cy": "Stress",
        "psyt": "PSYT_Therapy_Use",
    },
    "D2-Cultural": {
        "Do you have Depression?": "Depression",
        "Do you have Anxiety?": "Anxiety",
        "Do you have Panic attack?": "Burnout",
        "Your current year of Study": "Stress",
    },
    "D3-Academic": {
        "Depression": "Depression",
        "Academic Pressure": "Anxiety",
        "Study Satisfaction": "Burnout",
        "Financial Stress": "Stress",
    },
    "D4-Tech": {
        "mental_health_consequence": "Depression",
        "work_interfere": "Anxiety",
        "leave": "Burnout",
        "Age": "Stress",
        "treatment": "H3_Tech_Validation",
    },
}

UNIVERSAL_FEATURES = ["Depression", "Anxiety", "Burnout", "Stress"]

# Process each dataset separately with individual normalization
processed_datasets = {}
missing_sources = []

for source_name, file_name in FILE_PATHS.items():
    file_path = DATA_DIR / file_name

    if not file_path.exists():
        print(f"File not found for {source_name}: {file_path}")
        missing_sources.append(source_name)
        continue

    try:
        print(f"\n{'='*60}")
        print(f"Processing {source_name}")
        print(f"{'='*60}")
        
        df_raw = pd.read_csv(file_path, encoding="utf-8", on_bad_lines="skip")
        print(f"Loaded {source_name} dataset with shape: {df_raw.shape}")

        current_mapping = COLUMN_MAPPING[source_name]
        missing_cols = [src for src in current_mapping if src not in df_raw.columns]
        if missing_cols:
            print(f"ERROR: Missing crucial columns in {file_name}: {missing_cols}")
            missing_sources.append(source_name)
            continue

        df_named = df_raw.rename(columns=current_mapping)

        selected_columns = UNIVERSAL_FEATURES.copy()
        if "PSYT_Therapy_Use" in current_mapping.values():
            selected_columns.append("PSYT_Therapy_Use")
        if "H3_Tech_Validation" in current_mapping.values():
            selected_columns.append("H3_Tech_Validation")

        for col in selected_columns:
            if col not in df_named.columns:
                df_named[col] = np.nan

        df_selected = df_named[selected_columns].copy()

        # Convert categorical columns to numerical
        for col in UNIVERSAL_FEATURES:
            if df_selected[col].dtype == "object":
                print(f"Converting categorical column {col} to numerical in {source_name}")
                
                # First, try to extract year numbers from strings like "year 1", "Year 1", etc.
                if col == "Stress" and source_name == "D2-Cultural":
                    # Extract year number from strings like "year 1", "Year 1", "year 2", etc.
                    # Use regex to extract the number after "year"
                    df_selected[col] = df_selected[col].apply(
                        lambda x: re.search(r'year\s*(\d+)', str(x).lower())
                    ).apply(lambda m: int(m.group(1)) if m else np.nan)
                else:
                    # Standard categorical conversion
                    df_selected[col] = df_selected[col].astype(str).str.lower().replace(
                        {
                            "yes": 1,
                            "no": 0,
                            "often": 1,
                            "rarely": 0,
                            "sometimes": 0.5,
                            "maybe": 0.5,
                            "most of the time": 1,
                            "never": 0,
                            "always": 1,
                            "not sure": 0.5,
                            "high": 1,
                            "low": 0,
                            "medium": 0.5,
                            "somewhat easy": 0.5,
                            "somewhat difficult": 0.5,
                            "very difficult": 1,
                            "very easy": 0,
                            # Year mappings (for D2-Cultural Stress column)
                            "year 1": 1,
                            "year 2": 2,
                            "year 3": 3,
                            "year 4": 4,
                        }
                    )
                    df_selected[col] = pd.to_numeric(df_selected[col], errors="coerce")

        # OUTLIER DETECTION AND REMOVAL (IQR-based, per-dataset, per-feature)
        print(f"\nOutlier Detection for {source_name}...")

        for feature in UNIVERSAL_FEATURES:
            # Work on a numeric copy of the column
            values = pd.to_numeric(df_selected[feature], errors="coerce")

            # Skip if all NaN or constant
            if values.nunique(dropna=True) < 2:
                continue

            Q1 = values.quantile(0.25)
            Q3 = values.quantile(0.75)
            IQR = Q3 - Q1

            if IQR == 0:
                continue

            # IQR bounds
            lower_bound = Q1 - 1.5 * IQR
            upper_bound = Q3 + 1.5 * IQR

            # Domain-specific bounds for D4-Tech Age→Stress
            if source_name == "D4-Tech" and feature == "Stress":
                # Reasonable working range for tech workers
                lower_bound = max(lower_bound, 18)
                upper_bound = min(upper_bound, 80)

            outliers = (values < lower_bound) | (values > upper_bound)
            n_outliers = outliers.sum()

            if n_outliers > 0:
                print(f"  {feature}: Found {n_outliers} outlier(s)")
                print(f"    Bounds: {lower_bound:.2f} to {upper_bound:.2f}")
                unique_outliers = values[outliers].unique()
                for val in sorted(unique_outliers):
                    count_val = (values == val).sum()
                    print(f"    Value {val}: {count_val} row(s)")
                print(f"    → Removing {n_outliers} outlier row(s)")

                # Apply mask back to df_selected
                df_selected = df_selected[~outliers].reset_index(drop=True)

        # Handle missing values (fill with mean of THIS dataset)
        print(f"\nData Cleaning: Handling missing values for {source_name}")
        for feature in UNIVERSAL_FEATURES:
            mean_value = df_selected[feature].mean(skipna=True)
            missing_count = df_selected[feature].isna().sum()
            if missing_count > 0:
                print(f"  {feature}: Filling {missing_count} missing values with mean={mean_value:.4f}")
            df_selected[feature] = df_selected[feature].fillna(mean_value)

        # NORMALIZE EACH DATASET SEPARATELY (this will ensure the prevention of the Stress collapse issue)
        print(f"\nNormalizing {source_name} features using Z-score normalization (per-dataset)")
        scaler = StandardScaler()
        df_selected[UNIVERSAL_FEATURES] = scaler.fit_transform(df_selected[UNIVERSAL_FEATURES])
        
        # Save scaler for this dataset
        scaler_path = f"{source_name.replace('-', '_')}_scaler.joblib"
        joblib.dump(scaler, scaler_path)
        print(f"  Scaler saved as '{scaler_path}'")
        
        # Print normalization stats
        print(f"  Normalized feature statistics:")
        for feat in UNIVERSAL_FEATURES:
            mean_val = df_selected[feat].mean()
            std_val = df_selected[feat].std()
            print(f"    {feat}: mean={mean_val:.6f}, std={std_val:.6f}")
        
        df_selected["Source_Group"] = source_name
        processed_datasets[source_name] = df_selected
        
        # Save individual processed dataset
        output_file = OUTPUT_PATHS[source_name]
        df_selected.to_csv(output_file, index=False)
        print(f"   {source_name} processed dataset saved as '{output_file}'")

    except Exception as exc:
        print(f"Error loading {source_name} dataset: {exc}")
        import traceback
        traceback.print_exc()
        missing_sources.append(source_name)

if not processed_datasets:
    print("\nNo datasets were processed. Exiting.")
    sys.exit(1)

if missing_sources:
    print("\n⚠ Datasets with missing files or errors:")
    for name in missing_sources:
        print(f"  - {name}")

# Combine all processed datasets (already normalized separately)
print(f"\n{'='*60}")
print("Combining all processed datasets")
print(f"{'='*60}")
df_combined = pd.concat(list(processed_datasets.values()), ignore_index=True)
print(f"Combined dataset shape: {df_combined.shape}")

# Save fused dataset
fused_output = "fused_mental_health_dataset.csv"
df_combined.to_csv(fused_output, index=False)
print(f"\n✓ Fused dataset saved as '{fused_output}'")

print(f"\n{'='*60}")
print("Data processing complete!")
print(f"{'='*60}")
print(f"\nProcessed datasets:")
for source_name in processed_datasets.keys():
    print(f"  ✓ {source_name}: {OUTPUT_PATHS[source_name]}")
print(f"\nNote: Each dataset was normalized separately to preserve feature distributions.")



Processing D1-Swiss
Loaded D1-Swiss dataset with shape: (886, 20)

Outlier Detection for D1-Swiss...
  Depression: Found 8 outlier(s)
    Bounds: -15.00 to 49.00
    Value 50: 1 row(s)
    Value 51: 2 row(s)
    Value 52: 1 row(s)
    Value 53: 1 row(s)
    Value 54: 2 row(s)
    Value 56: 1 row(s)
    → Removing 8 outlier row(s)
  Anxiety: Found 4 outlier(s)
    Bounds: 8.88 to 75.88
    Value 76: 3 row(s)
    Value 77: 1 row(s)
    → Removing 4 outlier row(s)
  Stress: Found 3 outlier(s)
    Bounds: -4.50 to 23.50
    Value 24: 3 row(s)
    → Removing 3 outlier row(s)

Data Cleaning: Handling missing values for D1-Swiss

Normalizing D1-Swiss features using Z-score normalization (per-dataset)
  Scaler saved as 'D1_Swiss_scaler.joblib'
  Normalized feature statistics:
    Depression: mean=0.000000, std=1.000575
    Anxiety: mean=-0.000000, std=1.000575
    Burnout: mean=0.000000, std=1.000575
    Stress: mean=-0.000000, std=1.000575
   D1-Swiss processed dataset saved as 'D1_Swiss_pro

  df_selected[col] = df_selected[col].astype(str).str.lower().replace(
  df_selected[col] = df_selected[col].astype(str).str.lower().replace(
  df_selected[col] = df_selected[col].astype(str).str.lower().replace(


Converting categorical column Stress to numerical in D3-Academic

Outlier Detection for D3-Academic...

Data Cleaning: Handling missing values for D3-Academic
  Stress: Filling 3 missing values with mean=3.1399

Normalizing D3-Academic features using Z-score normalization (per-dataset)
  Scaler saved as 'D3_Academic_scaler.joblib'
  Normalized feature statistics:
    Depression: mean=0.000000, std=1.000018
    Anxiety: mean=0.000000, std=1.000018
    Burnout: mean=0.000000, std=1.000018
    Stress: mean=-0.000000, std=1.000018
   D3-Academic processed dataset saved as 'D3_Academic_processed.csv'

Processing D4-Tech
Loaded D4-Tech dataset with shape: (1259, 27)
Converting categorical column Depression to numerical in D4-Tech
Converting categorical column Anxiety to numerical in D4-Tech
Converting categorical column Burnout to numerical in D4-Tech

Outlier Detection for D4-Tech...
  Stress: Found 40 outlier(s)
    Bounds: 18.00 to 49.50
    Value -1726: 1 row(s)
    Value -29: 1 row(s)
 

  df_selected[col] = df_selected[col].astype(str).str.lower().replace(



✓ Fused dataset saved as 'fused_mental_health_dataset.csv'

Data processing complete!

Processed datasets:
  ✓ D1-Swiss: D1_Swiss_processed.csv
  ✓ D2-Cultural: D2_Cultural_processed.csv
  ✓ D3-Academic: D3_Academic_processed.csv
  ✓ D4-Tech: D4_Tech_processed.csv

Note: Each dataset was normalized separately to preserve feature distributions.


In [2]:
!nvidia-smi

/bin/bash: line 1: nvidia-smi: command not found
