# Naive Bayes sanity check
This notebook runs a quick Gaussian Naive Bayes baseline to sanity-check the pipeline and data. It uses simple preprocessing (median impute + scaling for numerics, most-frequent impute + ordinal encoding for categoricals), evaluates with cross-validation and a holdout split, and saves the fitted pipeline.

In [7]:
# Imports
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

from sklearn.model_selection import StratifiedKFold, cross_validate, train_test_split
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OrdinalEncoder
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score, roc_auc_score, classification_report
import joblib
import warnings
# Silence every Python warning for the rest of the kernel session. This keeps
# cell output short, but it also hides any deprecation or data-quality warnings
# raised by the libraries used below.
warnings.filterwarnings('ignore')
# Apply seaborn's default theme to all subsequent matplotlib figures.
sns.set_theme()

In [8]:
# Load dataset and basic cleaning
# Read the training CSV and discard the Name column in the same expression;
# errors='ignore' makes the drop a no-op when the file has no such column.
df = pd.read_csv('train.csv').drop(columns=['Name'], errors='ignore')

# Quick look at the size and the first rows of the cleaned frame.
print('shape:', df.shape)
display(df.head())

shape: (140700, 19)


Unnamed: 0,id,Gender,Age,City,Working Professional or Student,Profession,Academic Pressure,Work Pressure,CGPA,Study Satisfaction,Job Satisfaction,Sleep Duration,Dietary Habits,Degree,Have you ever had suicidal thoughts ?,Work/Study Hours,Financial Stress,Family History of Mental Illness,Depression
0,0,Female,49.0,Ludhiana,Working Professional,Chef,,5.0,,,2.0,More than 8 hours,Healthy,BHM,No,1.0,2.0,No,0
1,1,Male,26.0,Varanasi,Working Professional,Teacher,,4.0,,,3.0,Less than 5 hours,Unhealthy,LLB,Yes,7.0,3.0,No,1
2,2,Male,33.0,Visakhapatnam,Student,,5.0,,8.97,2.0,,5-6 hours,Healthy,B.Pharm,Yes,3.0,1.0,No,1
3,3,Male,22.0,Mumbai,Working Professional,Teacher,,5.0,,,1.0,Less than 5 hours,Moderate,BBA,Yes,10.0,1.0,Yes,1
4,4,Female,30.0,Kanpur,Working Professional,Business Analyst,,1.0,,,1.0,5-6 hours,Unhealthy,BBA,Yes,9.0,4.0,Yes,0


In [9]:
# Prepare features and pipeline
# Separate the target from the features. When the frame has no 'Depression'
# column, every column is kept as a feature and y stays None so that the
# evaluation cell can detect the missing target.
if 'Depression' in df.columns:
    y = df['Depression']
    X = df.drop(columns=['Depression'])
else:
    y = None
    X = df.copy()

# Identify numeric and categorical columns.
# 'id' is numeric but is excluded from the model inputs; since it is listed in
# neither group, the ColumnTransformer below does not pass it to the classifier.
numerical_cols = [col for col in X.select_dtypes(include=[np.number]).columns if col != 'id']
categorical_cols = list(X.select_dtypes(include=['object']).columns)

print('numerical_cols:', numerical_cols)
print('categorical_cols:', categorical_cols)

# Preprocessing for numeric features: fill gaps with the median, then standardize.
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
])

# Preprocessing for categorical features: fill gaps with the most frequent value,
# then map categories to ordinal codes. Categories not seen during fit are encoded as -1.
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('ord', OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1)),
])

# Route each column group through its own transformer.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numerical_cols),
        ('cat', categorical_transformer, categorical_cols),
    ]
)

# Full model: preprocessing followed by a Gaussian Naive Bayes classifier.
clf_nb = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('clf', GaussianNB()),
])
clf_nb

numerical_cols: ['Age', 'Academic Pressure', 'Work Pressure', 'CGPA', 'Study Satisfaction', 'Job Satisfaction', 'Work/Study Hours', 'Financial Stress']
categorical_cols: ['Gender', 'City', 'Working Professional or Student', 'Profession', 'Sleep Duration', 'Dietary Habits', 'Degree', 'Have you ever had suicidal thoughts ?', 'Family History of Mental Illness']


0,1,2
,steps,"[('preprocessor', ...), ('clf', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,transformers,"[('num', ...), ('cat', ...)]"
,remainder,'drop'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,missing_values,
,strategy,'median'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,copy,True
,with_mean,True
,with_std,True

0,1,2
,missing_values,
,strategy,'most_frequent'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,categories,'auto'
,dtype,<class 'numpy.float64'>
,handle_unknown,'use_encoded_value'
,unknown_value,-1
,encoded_missing_value,
,min_frequency,
,max_categories,

0,1,2
,priors,
,var_smoothing,1e-09


In [10]:
# Cross-validate and holdout evaluation
# Both evaluations need labels. y is None when the loaded frame had no
# 'Depression' column, so fail early with a message that says what is missing.
if y is None:
    raise RuntimeError("Target column 'Depression' not found in df; cannot evaluate the model.")

# 5-fold stratified cross-validation with a fixed seed, scoring accuracy and ROC AUC.
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
scoring = ['accuracy', 'roc_auc']
cv_res = cross_validate(clf_nb, X, y, cv=skf, scoring=scoring, n_jobs=1)
print('Accuracy CV: {:.4f} ± {:.4f}'.format(cv_res['test_accuracy'].mean(), cv_res['test_accuracy'].std()))
print('ROC AUC CV: {:.4f} ± {:.4f}'.format(cv_res['test_roc_auc'].mean(), cv_res['test_roc_auc'].std()))

# Holdout
# Stratified 80/20 split with the same seed; the pipeline is fitted on the
# training part only and scored on the validation part.
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)
clf_nb.fit(X_train, y_train)
y_pred = clf_nb.predict(X_val)
# Column 1 of predict_proba holds the probability of the second class, which is
# the score used for ROC AUC. It is skipped when the estimator has no predict_proba.
y_proba = clf_nb.predict_proba(X_val)[:,1] if hasattr(clf_nb, 'predict_proba') else None
print('Holdout accuracy:', accuracy_score(y_val, y_pred))
if y_proba is not None:
    print('Holdout ROC AUC:', roc_auc_score(y_val, y_proba))
print('Classification report:')
print(classification_report(y_val, y_pred))

Accuracy CV: 0.8589 ± 0.0022
ROC AUC CV: 0.9235 ± 0.0018
Holdout accuracy: 0.8602345415778252
Holdout ROC AUC: 0.9231505127472576
Classification report:
              precision    recall  f1-score   support

           0       0.93      0.90      0.91     23027
           1       0.60      0.67      0.64      5113

    accuracy                           0.86     28140
   macro avg       0.76      0.79      0.77     28140
weighted avg       0.87      0.86      0.86     28140

