# Kaggle Playground Series - Exploring Mental Health Data

This is a simplified notebook for the following competition:

[Kaggle Playground Series - s04-e11](https://www.kaggle.com/competitions/playground-series-s4e11/overview)

## Configuration

In [400]:
import pandas as pd
import os
import zipfile
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns

from kaggle import KaggleApi

from scipy.stats import chi2_contingency, randint, ttest_ind, mannwhitneyu, ks_2samp, pointbiserialr
from statsmodels.graphics.mosaicplot import mosaic

from sklearn.preprocessing import StandardScaler, OneHotEncoder, OrdinalEncoder
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import train_test_split

from lightgbm import LGBMClassifier


### Helper Functions

In [401]:
def kaggle_download(competition_name, output_dir):
    """Download a Kaggle competition archive into ``output_dir`` and unpack it.

    Authenticates through ``KaggleApi``, creates ``output_dir`` when it does
    not exist, downloads the competition files there, and then looks for
    ``<competition_name>.zip`` in that directory. When the archive is present
    it is extracted in place and deleted; otherwise a message is printed.
    """
    client = KaggleApi()
    client.authenticate()

    os.makedirs(output_dir, exist_ok=True)
    client.competition_download_files(competition_name, path=output_dir)

    archive = os.path.join(output_dir, f"{competition_name}.zip")
    if not os.path.exists(archive):
        print(f"No zip file found at '{archive}'")
        return

    with zipfile.ZipFile(archive, 'r') as bundle:
        bundle.extractall(output_dir)
    print(f"Extracted all files to '{output_dir}'")
    # The archive is no longer needed once its contents are on disk.
    os.remove(archive)

def categorical_tests(df, feature, label):
    """Print a chi-square test and Cramér's V for ``feature`` against ``label``.

    A contingency table of the two columns is built with ``pd.crosstab`` and
    passed to ``chi2_contingency``. Cramér's V is derived from the chi-square
    statistic, the table total and the smaller table dimension. Nothing is
    returned; the four result lines are written to stdout.
    """
    table = pd.crosstab(df[feature], df[label])
    chi2, p, _dof, _expected = chi2_contingency(table)

    total = table.values.sum()
    smaller_dim = min(table.shape)
    cramers_v = np.sqrt(chi2 / (total * (smaller_dim - 1)))

    report = (
        f"Feature: {feature}",
        f"Chi-Square Statistic: {round(chi2, 2)}",
        f"P-value: {round(p, 10)}",
        f"Cramér's V: {round(cramers_v, 3)}",
    )
    for line in report:
        print(line)

def categorical_exploration(df, feature, label, n):
    """Plot and test the ``n`` most frequent categories of ``feature``.

    Rows whose ``feature`` value is among the first ``n`` entries of
    ``value_counts()`` are kept, drawn as a mosaic plot of ``feature`` against
    ``label``, and then passed to ``categorical_tests``.
    """
    frequent = df[feature].value_counts().index[:n]
    subset = df[df[feature].isin(frequent)]

    _, axes = plt.subplots(figsize=(8, 4))
    mosaic(subset, [feature, label], ax=axes)
    plt.xlabel(feature)
    plt.show()

    # Statistics are computed on the same filtered rows that were plotted.
    categorical_tests(subset, feature, label)

def top_contributors(df, feature, prop):
    """Return the categories of ``feature`` that cover at least ``prop`` of rows.

    Parameters
    ----------
    df : DataFrame containing the column ``feature``.
    feature : name of the column to summarise.
    prop : minimum share of rows (0-1) a category needs to be kept.

    Returns
    -------
    (DataFrame, list)
        A frame with the columns ``feature``, ``"proportion"`` and
        ``"cumulative"`` restricted to the kept categories, and the list of
        those category values. Missing values are counted as a category
        (``dropna=False``). The cumulative share is computed over all
        categories before the filter is applied.
    """
    shares = (
        df[feature]
        .value_counts(normalize=True, dropna=False)
        # pandas < 2.0 names the result after the column and leaves the index
        # unnamed, so name both explicitly instead of relying on the
        # "proportion" label that only pandas >= 2.0 provides.
        .rename("proportion")
        .rename_axis(feature)
        .reset_index()
    )
    shares["cumulative"] = shares["proportion"].cumsum()
    shares = shares[shares["proportion"] >= prop]
    return shares, shares[feature].to_list()

def numeric_exploration(df, feature, label):
    """Plot ``feature`` by ``label`` and print four association tests.

    Draws a boxen plot of ``feature`` grouped by ``label``, then prints, in
    this order: Welch's t-test, the two-sided Mann-Whitney U test, the
    two-sample Kolmogorov-Smirnov test (each comparing the ``label == 0`` and
    ``label == 1`` groups) and the point-biserial correlation.

    Rows with a missing ``feature`` or ``label`` are excluded from the tests;
    otherwise a single NaN would propagate into the reported statistics.
    """
    sns.boxenplot(data=df, x=label, y=feature)

    # The tests below need complete observations.
    complete = df[[feature, label]].dropna()
    group0 = complete.loc[complete[label] == 0, feature]
    group1 = complete.loc[complete[label] == 1, feature]

    # Welch's t-test (unequal variances).
    stat, p_value = ttest_ind(group0, group1, equal_var=False)
    print(f"Statistic: {stat}, p-value: {p_value}")

    # Mann-Whitney U test.
    stat, p_value = mannwhitneyu(group0, group1, alternative='two-sided')
    print(f"Statistic: {stat}, p-value: {p_value}")

    # Two-sample Kolmogorov-Smirnov test.
    stat, p_value = ks_2samp(group0, group1)
    print(f"Statistic: {stat}, p-value: {p_value}")

    # Point-biserial correlation between the feature and the label.
    correlation, p_value = pointbiserialr(complete[feature], complete[label])
    print(f"Correlation: {correlation}, p-value: {p_value}")

### Load, Extract and Transform Data

In [None]:
kaggle_download("playground-series-s4e11", "data")   

In [None]:
# Read the competition training set.
train_df = pd.read_csv("data/train.csv")

# Remove the id column.
train_df = train_df.drop(columns="id")

# Show the dtype of every remaining column.
train_df.dtypes

Rename and reformat column names.

In [None]:
# Short replacements for the three longest survey column names.
rename_dict = {
    "Have you ever had suicidal thoughts ?": "suicidal thoughts",
    "Family History of Mental Illness": "family history",
    "Working Professional or Student": "professional or student",
}

# Apply the short names, then lower-case every column label.
train_df = train_df.rename(columns=rename_dict).rename(columns=str.lower)

train_df.columns.tolist()

## EDA and Feature Engineering

In [405]:
label = "depression"

### Categorical Features

Ignore the name field as a potential feature.

In [None]:
train_df.select_dtypes(include=["object"]).describe()

#### Binary Categories

In [None]:
binary_features = ["gender", "professional or student", "suicidal thoughts", "family history"]
train_df.isnull().sum()

In [None]:
for feature in binary_features:
    categorical_exploration(train_df, feature, label, 2)

#### Sleep Duration

In [409]:
feature = "sleep duration"

In [None]:
categories = train_df[feature].value_counts(normalize=True, dropna=False)
categories

In [None]:
results, categories = top_contributors(train_df, feature, 0.01)
results

Sleep Duration is an inconsistent column; the most common responses (99% of rows combined) are:
- Less than 5 hours (28%)
- 7-8 hours (26%)
- More than 8 hours (23%)
- 5-6 hours (22%)

In [None]:
n_categories = len(categories)
categorical_exploration(train_df, feature, "depression", n_categories)

As the majority of records fall into these four categories, all remaining categories are treated as invalid and replaced with "Unknown" so that they can be handled during preprocessing.

In [None]:
# Keep the frequent responses and label everything else "Unknown".
train_df[feature] = [
    value if value in categories else "Unknown"
    for value in train_df[feature]
]
train_df[feature].value_counts(dropna=False)

#### Dietary Habits

In [414]:
feature = "dietary habits"

In [None]:
categories = train_df[feature].value_counts(normalize=True, dropna=False)
categories

In [None]:
results, categories = top_contributors(train_df, feature, 0.01)
results

Dietary Habits is an inconsistent column; the most common responses (99% of rows combined) are:
- Moderate (35%)
- Unhealthy (32%)
- Healthy (32%)

In [None]:
n_categories = len(categories)
categorical_exploration(train_df, feature, "depression", n_categories)

In [None]:
# Keep the frequent responses and label everything else "Unknown".
train_df[feature] = [
    value if value in categories else "Unknown"
    for value in train_df[feature]
]
train_df[feature].value_counts(dropna=False)

#### Degree

In [419]:
feature = "degree"

In [None]:
categories = train_df[feature].value_counts(normalize=True, dropna=False)
categories

Degree is an inconsistent column with high cardinality; its values will be grouped by their first letter into the following categories:
- C - High School
- B - Bachelors
- M - Masters
- P - PhD
- L - (???)
- U - Unknown

In [421]:
# Reduce each degree to its first letter in a single pass. Non-string values
# and letters outside the known groups both become "Unknown".
train_df[feature] = train_df[feature].map(
    lambda x: x[:1]
    if isinstance(x, str) and x[:1] in ["B", "M", "C", "L", "P"]
    else "Unknown"
)

In [None]:
categories = train_df[feature].value_counts(normalize=True, dropna=False)
categories

In [None]:
results, categories = top_contributors(train_df, feature, 0.01)
results

In [None]:
n_categories = len(categories)
categorical_exploration(train_df, feature, "depression", n_categories)

#### City

In [425]:
feature = "city"

In [None]:
categories = train_df[feature].value_counts(normalize=True, dropna=False)
categories

In [None]:
results, categories = top_contributors(train_df, feature, 0.01)
results

In [None]:
n_categories = len(categories)
categorical_exploration(train_df, feature, "depression", n_categories)

#### Profession

In [429]:
feature = "profession"

In [None]:
categories = train_df[feature].value_counts(normalize=True, dropna=False)
categories

Infill the missing values with "Student".

In [None]:
# Missing professions are filled with "Student".
train_df[feature] = train_df[feature].fillna("Student")

categories = train_df[feature].value_counts(normalize=True, dropna=False)
categories

In [None]:
results, categories = top_contributors(train_df, feature, 0.01)
results

In [None]:
n_categories = len(categories)
categorical_exploration(train_df, feature, "depression", n_categories)

## Numeric Feature Engineering

In [None]:
train_df.select_dtypes(exclude=["object"]).drop("depression", axis=1).describe()

In [None]:
train_df.select_dtypes(exclude=["object"]).drop("depression", axis=1).isnull().sum()

#### Satisfaction (job / study)

In [None]:
categorical_exploration(train_df.sort_values("job satisfaction"), "job satisfaction", "depression", 100)
categorical_exploration(train_df.sort_values("study satisfaction"), "study satisfaction", "depression", 100)

Create a combined satisfaction column.

In [437]:
train_df["satisfaction"] = np.floor(train_df[["study satisfaction", "job satisfaction"]].mean(axis=1))

In [None]:
categorical_exploration(train_df.sort_values("satisfaction"), "satisfaction", "depression", 100)

#### Pressure (academic / work)

In [None]:
categorical_exploration(train_df.sort_values("academic pressure"), "academic pressure", "depression", 100)
categorical_exploration(train_df.sort_values("work pressure"), "work pressure", "depression", 100)

In [440]:
train_df["pressure"] = np.floor(train_df[["academic pressure", "work pressure"]].mean(axis=1))

In [None]:
categorical_exploration(train_df.sort_values("pressure"), "pressure", "depression", 100)

#### Work / Study Hours

In [442]:
feature = "work/study hours"

In [None]:
numeric_exploration(train_df, feature, label)

#### Age

In [444]:
feature = "age"

In [None]:
numeric_exploration(train_df, feature, label)

#### Financial Stress

In [446]:
feature = "financial stress"

In [None]:
numeric_exploration(train_df, feature, label)

In [None]:
subset = train_df[["age", "pressure", "satisfaction", "work/study hours", "financial stress", "depression"]]
sns.heatmap(data=subset.corr(), cmap="coolwarm", annot=True)

In [449]:
# sns.pairplot(data=subset.sample(frac=0.1, replace=False), hue="depression")

## Modelling

### Data Categories

In [450]:
# Ordered levels for every ordinal feature; "Unknown" is always the last level.
ordinal_dict = {
    "dietary habits": [
        "Unhealthy",
        "Moderate",
        "Healthy",
        "Unknown",
    ],
    "sleep duration": [
        "Less than 5 hours",
        "5-6 hours",
        "7-8 hours",
        "More than 8 hours",
        "Unknown",
    ],
    "degree": ["C", "B", "M", "L", "P", "Unknown"],
}

# Columns that are one-hot encoded.
ohe_list = [
    "gender",
    "professional or student",
    "family history",
    "suicidal thoughts",
]

# Columns that are ordinal encoded, in the same order as ordinal_dict.
ord_list = [*ordinal_dict]

# Columns treated as numeric.
num_list = [
    "work/study hours",
    "age",
    "satisfaction",
    "pressure",
    "financial stress",
]

### Data Preparation

In [451]:
def preprocessing(df, ord_dict, rename_dict):
    """Return a feature-engineered copy of a raw competition frame.

    The input frame is left untouched; all changes are made on a copy.

    Steps, in order:
      1. Rename columns with ``rename_dict`` and lower-case every column name.
      2. Build ``pressure`` from ``academic pressure`` / ``work pressure`` and
         ``satisfaction`` from ``study satisfaction`` / ``job satisfaction``
         as the floored row mean; values that are not greater than zero
         (including missing ones) become 0.
      3. Apply the same "not greater than zero becomes 0" rule to
         ``financial stress``.
      4. Drop the four source columns used in step 2.
      5. Reduce ``degree`` to its first character (non-strings become
         ``"Unknown"``).
      6. For every column in ``ord_dict``, replace values that are not among
         the listed levels with ``"Unknown"``.

    Parameters
    ----------
    df : raw DataFrame with the original (not yet renamed) column names.
    ord_dict : mapping of column name to its list of allowed levels.
    rename_dict : mapping of original column names to replacement names.

    Returns
    -------
    DataFrame
        A new frame with the engineered columns.
    """

    def _floor_mean_non_negative(columns):
        # Row mean skips missing values; a row with no values stays NaN and is
        # then mapped to 0 by the comparison below (NaN > 0 is False).
        combined = np.floor(out[columns].mean(axis=1))
        return combined.where(combined > 0, 0)

    # rename() returns a new frame, so the caller's DataFrame is never mutated.
    out = df.rename(columns=rename_dict)
    out.columns = [col.lower() for col in out.columns]

    out["pressure"] = _floor_mean_non_negative(["academic pressure", "work pressure"])
    out["satisfaction"] = _floor_mean_non_negative(["study satisfaction", "job satisfaction"])

    stress = out["financial stress"]
    out["financial stress"] = stress.where(stress > 0, 0)

    out = out.drop(
        columns=["academic pressure", "work pressure", "study satisfaction", "job satisfaction"]
    )

    out["degree"] = out["degree"].apply(lambda x: x[:1] if isinstance(x, str) else "Unknown")
    for feature, vals in ord_dict.items():
        out[feature] = out[feature].where(out[feature].isin(vals), "Unknown")
    return out

### Data Preprocessing

In [452]:
# Numeric columns: fill gaps with the column mean, then standardise.
numeric_preprocess = Pipeline(
    [
        ("imputer", SimpleImputer(strategy="mean")),
        ("scaler", StandardScaler()),
    ]
)

# Nominal columns: fill gaps with the mode, then one-hot encode
# (the first level of each column is dropped).
categorical_preprocess = Pipeline(
    [
        ("imputer", SimpleImputer(strategy="most_frequent")),
        ("ohe", OneHotEncoder(drop="first")),
    ]
)

# Ordinal columns: fill gaps with the mode, then encode using the level
# order declared in ordinal_dict.
ordinal_preprocess = Pipeline(
    [
        ("imputer", SimpleImputer(strategy="most_frequent")),
        ("oe", OrdinalEncoder(categories=list(ordinal_dict.values()))),
    ]
)

# Route each column group to its pipeline.
preprocessor = ColumnTransformer(
    [
        ("num", numeric_preprocess, num_list),
        ("cat", categorical_preprocess, ohe_list),
        ("ord", ordinal_preprocess, ord_list),
    ]
)

In [453]:
# Separate the target column from the predictors.
y = train_df["depression"]
X = train_df.drop(columns="depression")

# Hold out a stratified 20% of the rows for evaluation.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

In [None]:
# Chain the column preprocessing with a LightGBM classifier.
pipeline = Pipeline(
    steps=[
        ("preprocessing", preprocessor),
        ("classifier", LGBMClassifier()),
    ]
)
# `model` and `pipeline` refer to the same estimator object.
model = pipeline

model.fit(X_train, y_train)

In [None]:
model.score(X_test, y_test)

In [456]:
# Read the competition test set and run it through preprocessing().
test_df = preprocessing(pd.read_csv("./data/test.csv"), ordinal_dict, rename_dict)

In [None]:
# Pair every test id with the model's predicted class.
submission = pd.DataFrame(
    {"id": test_df["id"].to_numpy(), "Depression": model.predict(test_df)}
)

submission.head()