In [2]:
# Install required packages for working with Kaggle datasets
%pip install kaggle opendatasets

Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 25.1.1 -> 25.3
[notice] To update, run: python.exe -m pip install --upgrade pip


In [9]:
import kaggle
import os 
kaggle.api.dataset_download_files('osmi/mental-health-in-tech-survey', 
                                  path='./data', 
                                  unzip=True)

print("Dataset downloaded successfully!")

Dataset URL: https://www.kaggle.com/datasets/thedevastator/medical-student-mental-health
Dataset downloaded successfully!
Dataset downloaded successfully!


In [11]:
import pandas as pd
import glob
import os 

os.makedirs("./data", exist_ok=True)

data_files = glob.glob('./data/*')
print("Files in the data directory:")
for file in data_files:
    print(f"- {file}")

df = None
for file in data_files:
    if file.endswith('.csv'):
        try:
            print(f"\nTrying to load: {file}")
            
            if 'codebook' in file.lower():
                print("Skipping codebook file")
                continue
            elif 'data' in file.lower():
                df = pd.read_csv(file, encoding='utf-8', on_bad_lines='skip')
                print(f"Successfully loaded: {file}")
                print(f"Shape: {df.shape}")
                print(f"Columns: {df.columns.tolist()}")
                
            else:
                df = pd.read_csv(file, encoding='utf-8', on_bad_lines='skip')
                print(f"Successfully loaded: {file}")
                print(f"Shape: {df.shape}")
                print(f"Columns: {df.columns.tolist()}")
                
                
        except Exception as e:
            print(f"Error loading {file}: {str(e)}")
            continue

if df is not None:
    print("\nFirst 5 rows:")
    print(df.head())
else:
    print("No dataset could be loaded")

Files in the data directory:
- ./data\Codebook Carrard et al. 2022 MedTeach.csv
- ./data\Data Carrard et al. 2022 MedTeach.csv

Trying to load: ./data\Codebook Carrard et al. 2022 MedTeach.csv
Skipping codebook file

Trying to load: ./data\Data Carrard et al. 2022 MedTeach.csv
Successfully loaded: ./data\Data Carrard et al. 2022 MedTeach.csv
Shape: (886, 20)
Columns: ['id', 'age', 'year', 'sex', 'glang', 'part', 'job', 'stud_h', 'health', 'psyt', 'jspe', 'qcae_cog', 'qcae_aff', 'amsp', 'erec_mean', 'cesd', 'stai_t', 'mbi_ex', 'mbi_cy', 'mbi_ea']

First 5 rows:
   id  age  year  sex  glang  part  job  stud_h  health  psyt  jspe  qcae_cog  \
0   2   18     1    1    120     1    0      56       3     0    88        62   
1   4   26     4    1      1     1    0      20       4     0   109        55   
2   9   21     3    2      1     0    0      36       3     0   106        64   
3  10   21     2    2      1     0    1      51       5     0   101        52   
4  13   21     3    1      1

In [None]:
import numpy as np
import pandas as pd
import sys
import joblib
from sklearn.preprocessing import StandardScaler

FILE_PATHS = {
    'D1-Swiss': 'Data Carrard et al. 2022 MedTeach.csv', 
    'D2-Cultural': 'Student Mental health (1).csv',     
    'D3-Academic': 'student_depression_dataset.csv',    
    'D4-Tech': 'survey.csv'                             
}
COLUMN_MAPPING = {
    # D1-Swiss: accurate mappings based on validated scales
    'D1-Swiss':      {'cesd': 'Depression', 'stai_t': 'Anxiety', 'mbi_ex': 'Burnout', 'mbi_cy': 'Stress', 'psyt': 'PSYT_Therapy_Use'}, 
    
    # D2-Cultural: Weak Proxies used for a "Fragmentation Stress Test"
    'D2-Cultural':   {'Do you have Depression?': 'Depression', 'Do you have Anxiety?': 'Anxiety', 'Do you have Panic attack?': 'Burnout', 'Your current year of Study': 'Stress'}, 
    
    'D3-Academic':   {'Depression': 'Depression', 'Academic Pressure': 'Anxiety', 'Study Satisfaction': 'Burnout', 'Financial Stress': 'Stress'}, 
    
    'D4-Tech':       {'mental_health_consequence': 'Depression', 'work_interfere': 'Anxiety', 'leave': 'Burnout', 'Age': 'Stress', 'treatment': 'H3_Tech_Validation'}, 
}

#mapping : translates raw column names into 4 Universal Features
UNIVERSAL_FEATURES = ['Depression', 'Anxiety', 'Burnout', 'Stress']

all_data_frames = []
missing_files = []

for source_name, file_path in FILE_PATHS.items():
    try:
        df = pd.read_csv(os.path.join('./data', file_path), encoding='utf-8', on_bad_lines='skip')
        print(f"Loaded {source_name} dataset with shape: {df.shape}")

        current_mapping = COLUMN_MAPPING[source_name]
        missing_cols = [src for src in current_mapping.keys() if src not in df.columns]
        if missing_cols:
            print(f"ERROR: Missing crucial columns in {file_path}: {missing_cols}")
            missing_files.append(file_path)
            continue
        df = df.rename(columns=current_mapping)

        selected_columns = UNIVERSAL_FEATURES.copy()

        if 'PYST_Therapy_Use' in current_mapping.values():
            selected_columns.append('PSYT_Therapy_Use')
        
        df_selected = df[selected_columns]
        df['Source_Group'] = source_name

        #Convert categorical to numerical where necessary
        for col in UNIVERSAL_FEATURES:
            if df[col].dtype == 'object': 
                print(f"Converting categorical column {col} to numerical in {source_name}")
                df[col] = df[col].astype(str).str.lower().replace({
                     'yes': 1, 'no': 0, 'often': 1, 'rarely': 0, 'sometimes': 0.5, 'maybe': 0.5,
                     'most of the time': 1, 'never': 0, 'always': 1, 'not sure': 0.5,
                     'high': 1, 'low': 0, 'medium': 0.5,
                     'somewhat easy': 0.5, 'somewhat difficult': 0.5, 'very difficult': 1, 'very easy': 0
                })
                df[col] = pd.to_numeric(df[col], errors='coerce')
                
        all_data_frames.append(df)

    except FileNotFoundError:
        print(f"File not found for {source_name}: {file_path}")
        missing_files.append(source_name)
    except Exception as e:
        print(f"Error loading {source_name} dataset: {str(e)}")
        missing_files.append(source_name)

if missing_files:
    print("\nDatasets with missing files or errors:")

df_combined = pd.concat(all_data_frames, ignore_index=True)
print(f"\nCombined dataset shape: {df_combined.shape}")

#Data Cleaning: Handle missing values by dropping rows with any NaNs in Universal Features
print("\nData Cleaning: Handling missing values")
for feature in UNIVERSAL_FEATURES:
    mean_value = df_combined[feature].mean()
    df_combined[feature].fillna(mean_value, inplace=True)

#Z Score Normalization
#Using StandardScaler from sklearn to normalize the Universal Features
print("\nNormalizing Universal Features using Z-score normalization")
scaler = StandardScaler()

df_combined[UNIVERSAL_FEATURES] = scaler.fit_transform(df_combined[UNIVERSAL_FEATURES])
joblib.dump(scaler, 'universal_features_scaler.joblib')
print("Normalization complete. Scaler saved as 'universal_features_scaler.joblib'.")

#Saving the fused dataframe
df_combined.to_csv('fused_mental_health_dataset.csv', index=False)
print("\nFused dataset saved as 'fused_mental_health_dataset.csv'")

df_d1 = df_combined[df_combined['Source_Group'] == 'D1-Swiss']
print(f"\nD1-Swiss dataset shape after processing: {df_d1.shape}")
df_d1.to_csv('D1_Swiss_processed.csv', index=False)
print("D1-Swiss processed dataset saved as 'D1_Swiss_processed.csv'")
print("\nData processing complete.")

Loaded D1-Swiss dataset with shape: (886, 20)
File not found for D2-Cultural: Student Mental health (1).csv
Loaded D3-Academic dataset with shape: (27901, 18)
Converting categorical column Stress to numerical in D3-Academic
Loaded D4-Tech dataset with shape: (1259, 27)
Converting categorical column Depression to numerical in D4-Tech
Converting categorical column Anxiety to numerical in D4-Tech
Converting categorical column Burnout to numerical in D4-Tech

Datasets with missing files or errors:


  df[col] = df[col].astype(str).str.lower().replace({
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df_combined[feature].fillna(mean_value, inplace=True)



Combined dataset shape: (30046, 56)

Data Cleaning: Handling missing values

Normalizing Universal Features using Z-score normalization
Normalization complete. Scaler saved as 'universal_features_scaler.joblib'.

Fused dataset saved as 'fused_mental_health_dataset.csv'

D1-Swiss dataset shape after processing: (886, 56)
D1-Swiss processed dataset saved as 'D1_Swiss_processed.csv'

Data processing complete.

Fused dataset saved as 'fused_mental_health_dataset.csv'

D1-Swiss dataset shape after processing: (886, 56)
D1-Swiss processed dataset saved as 'D1_Swiss_processed.csv'

Data processing complete.
