In [2]:
# Install required packages for working with Kaggle datasets
%pip install kaggle opendatasets

Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 25.1.1 -> 25.3
[notice] To update, run: python.exe -m pip install --upgrade pip


In [9]:
import kaggle
import os 
kaggle.api.dataset_download_files('osmi/mental-health-in-tech-survey', 
                                  path='./data', 
                                  unzip=True)

print("Dataset downloaded successfully!")

Dataset URL: https://www.kaggle.com/datasets/thedevastator/medical-student-mental-health
Dataset downloaded successfully!
Dataset downloaded successfully!


In [11]:
import pandas as pd
import glob
import os 

os.makedirs("./data", exist_ok=True)

data_files = glob.glob('./data/*')
print("Files in the data directory:")
for file in data_files:
    print(f"- {file}")

df = None
for file in data_files:
    if file.endswith('.csv'):
        try:
            print(f"\nTrying to load: {file}")
            
            if 'codebook' in file.lower():
                print("Skipping codebook file")
                continue
            elif 'data' in file.lower():
                df = pd.read_csv(file, encoding='utf-8', on_bad_lines='skip')
                print(f"Successfully loaded: {file}")
                print(f"Shape: {df.shape}")
                print(f"Columns: {df.columns.tolist()}")
                
            else:
                df = pd.read_csv(file, encoding='utf-8', on_bad_lines='skip')
                print(f"Successfully loaded: {file}")
                print(f"Shape: {df.shape}")
                print(f"Columns: {df.columns.tolist()}")
                
                
        except Exception as e:
            print(f"Error loading {file}: {str(e)}")
            continue

if df is not None:
    print("\nFirst 5 rows:")
    print(df.head())
else:
    print("No dataset could be loaded")

Files in the data directory:
- ./data\Codebook Carrard et al. 2022 MedTeach.csv
- ./data\Data Carrard et al. 2022 MedTeach.csv

Trying to load: ./data\Codebook Carrard et al. 2022 MedTeach.csv
Skipping codebook file

Trying to load: ./data\Data Carrard et al. 2022 MedTeach.csv
Successfully loaded: ./data\Data Carrard et al. 2022 MedTeach.csv
Shape: (886, 20)
Columns: ['id', 'age', 'year', 'sex', 'glang', 'part', 'job', 'stud_h', 'health', 'psyt', 'jspe', 'qcae_cog', 'qcae_aff', 'amsp', 'erec_mean', 'cesd', 'stai_t', 'mbi_ex', 'mbi_cy', 'mbi_ea']

First 5 rows:
   id  age  year  sex  glang  part  job  stud_h  health  psyt  jspe  qcae_cog  \
0   2   18     1    1    120     1    0      56       3     0    88        62   
1   4   26     4    1      1     1    0      20       4     0   109        55   
2   9   21     3    2      1     0    0      36       3     0   106        64   
3  10   21     2    2      1     0    1      51       5     0   101        52   
4  13   21     3    1      1

In [1]:
import numpy as np
import pandas as pd
import sys
import joblib
from pathlib import Path
from sklearn.preprocessing import StandardScaler

DATA_DIR = Path("./data")

FILE_PATHS = {
    "D1-Swiss": "Data Carrard et al. 2022 MedTeach.csv",
    "D2-Cultural": "d2_Malaysian.csv",
    "D3-Academic": "d3_Indian.csv",
    "D4-Tech": "d4_tech_workers.csv",
}

OUTPUT_PATHS = {
    "D1-Swiss": "D1_Swiss_processed.csv",
    "D2-Cultural": "D2_Cultural_processed.csv",
    "D3-Academic": "D3_Academic_processed.csv",
    "D4-Tech": "D4_Tech_processed.csv",
}

COLUMN_MAPPING = {
    "D1-Swiss": {
        "cesd": "Depression",
        "stai_t": "Anxiety",
        "mbi_ex": "Burnout",
        "mbi_cy": "Stress",
        "psyt": "PSYT_Therapy_Use",
    },
    "D2-Cultural": {
        "Do you have Depression?": "Depression",
        "Do you have Anxiety?": "Anxiety",
        "Do you have Panic attack?": "Burnout",
        "Your current year of Study": "Stress",
    },
    "D3-Academic": {
        "Depression": "Depression",
        "Academic Pressure": "Anxiety",
        "Study Satisfaction": "Burnout",
        "Financial Stress": "Stress",
    },
    "D4-Tech": {
        "mental_health_consequence": "Depression",
        "work_interfere": "Anxiety",
        "leave": "Burnout",
        "Age": "Stress",
        "treatment": "H3_Tech_Validation",
    },
}

UNIVERSAL_FEATURES = ["Depression", "Anxiety", "Burnout", "Stress"]

all_data_frames = []
missing_sources = []

for source_name, file_name in FILE_PATHS.items():
    file_path = DATA_DIR / file_name

    if not file_path.exists():
        print(f"File not found for {source_name}: {file_path}")
        missing_sources.append(source_name)
        continue

    try:
        df_raw = pd.read_csv(file_path, encoding="utf-8", on_bad_lines="skip")
        print(f"Loaded {source_name} dataset with shape: {df_raw.shape}")

        current_mapping = COLUMN_MAPPING[source_name]
        missing_cols = [src for src in current_mapping if src not in df_raw.columns]
        if missing_cols:
            print(f"ERROR: Missing crucial columns in {file_name}: {missing_cols}")
            missing_sources.append(source_name)
            continue

        df_named = df_raw.rename(columns=current_mapping)

        selected_columns = UNIVERSAL_FEATURES.copy()
        if "PSYT_Therapy_Use" in current_mapping.values():
            selected_columns.append("PSYT_Therapy_Use")
        if "H3_Tech_Validation" in current_mapping.values():
            selected_columns.append("H3_Tech_Validation")

        for col in selected_columns:
            if col not in df_named.columns:
                df_named[col] = np.nan

        df_selected = df_named[selected_columns].copy()

        for col in UNIVERSAL_FEATURES:
            if df_selected[col].dtype == "object":
                print(f"Converting categorical column {col} to numerical in {source_name}")
                df_selected[col] = df_selected[col].astype(str).str.lower().replace(
                    {
                        "yes": 1,
                        "no": 0,
                        "often": 1,
                        "rarely": 0,
                        "sometimes": 0.5,
                        "maybe": 0.5,
                        "most of the time": 1,
                        "never": 0,
                        "always": 1,
                        "not sure": 0.5,
                        "high": 1,
                        "low": 0,
                        "medium": 0.5,
                        "somewhat easy": 0.5,
                        "somewhat difficult": 0.5,
                        "very difficult": 1,
                        "very easy": 0,
                    }
                )
                df_selected[col] = pd.to_numeric(df_selected[col], errors="coerce")

        df_selected["Source_Group"] = source_name
        all_data_frames.append(df_selected)

    except Exception as exc:
        print(f"Error loading {source_name} dataset: {exc}")
        missing_sources.append(source_name)

if not all_data_frames:
    print("No datasets were processed. Exiting.")
    sys.exit(1)

if missing_sources:
    print("\nDatasets with missing files or errors:")
    for name in missing_sources:
        print(f"- {name}")

df_combined = pd.concat(all_data_frames, ignore_index=True)
print(f"\nCombined dataset shape: {df_combined.shape}")

print("\nData Cleaning: Handling missing values")
for feature in UNIVERSAL_FEATURES:
    mean_value = df_combined[feature].mean(skipna=True)
    df_combined[feature] = df_combined[feature].fillna(mean_value)

print("\nNormalizing Universal Features using Z-score normalization")
scaler = StandardScaler()
df_combined[UNIVERSAL_FEATURES] = scaler.fit_transform(df_combined[UNIVERSAL_FEATURES])
joblib.dump(scaler, "universal_features_scaler.joblib")
print("Normalization complete. Scaler saved as 'universal_features_scaler.joblib'.")

fused_output = "fused_mental_health_dataset.csv"
df_combined.to_csv(fused_output, index=False)
print(f"\nFused dataset saved as '{fused_output}'")

for source_name, output_file in OUTPUT_PATHS.items():
    df_subset = df_combined[df_combined["Source_Group"] == source_name]
    if df_subset.empty:
        print(f"Skipping save for {source_name} because it has no processed rows.")
        continue
    df_subset.to_csv(output_file, index=False)
    print(f"{source_name} processed dataset saved as '{output_file}'")

print("\nData processing complete.")


File not found for D1-Swiss: data\Data Carrard et al. 2022 MedTeach.csv
Loaded D2-Cultural dataset with shape: (101, 11)
Converting categorical column Depression to numerical in D2-Cultural
Converting categorical column Anxiety to numerical in D2-Cultural
Converting categorical column Burnout to numerical in D2-Cultural
Converting categorical column Stress to numerical in D2-Cultural
Loaded D3-Academic dataset with shape: (27901, 18)
Converting categorical column Stress to numerical in D3-Academic


  df_selected[col] = df_selected[col].astype(str).str.lower().replace(
  df_selected[col] = df_selected[col].astype(str).str.lower().replace(
  df_selected[col] = df_selected[col].astype(str).str.lower().replace(


Loaded D4-Tech dataset with shape: (1259, 27)
Converting categorical column Depression to numerical in D4-Tech
Converting categorical column Anxiety to numerical in D4-Tech
Converting categorical column Burnout to numerical in D4-Tech

Datasets with missing files or errors:
- D1-Swiss

Combined dataset shape: (29261, 6)

Data Cleaning: Handling missing values

Normalizing Universal Features using Z-score normalization
Normalization complete. Scaler saved as 'universal_features_scaler.joblib'.


  df_selected[col] = df_selected[col].astype(str).str.lower().replace(



Fused dataset saved as 'fused_mental_health_dataset.csv'
Skipping save for D1-Swiss because it has no processed rows.
D2-Cultural processed dataset saved as 'D2_Cultural_processed.csv'
D3-Academic processed dataset saved as 'D3_Academic_processed.csv'
D4-Tech processed dataset saved as 'D4_Tech_processed.csv'

Data processing complete.
