# Logistic Regression Baseline
This notebook builds a baseline Logistic Regression model using a ColumnTransformer preprocessing pipeline (median imputation + scaling for numeric features, most-frequent imputation + OneHotEncoding for categorical features). We run cross-validation and a holdout evaluation, then save the trained model.

In [21]:
# Imports
import pickle
import warnings

import numpy as np
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, roc_auc_score, classification_report
from sklearn.model_selection import StratifiedKFold, cross_val_score, cross_validate, train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
# NOTE(review): with no category/module argument this silences *every* warning for the
# rest of the kernel session, so any convergence or deprecation warnings raised while
# fitting will not be shown. Consider narrowing the filter to specific categories.
warnings.filterwarnings('ignore')

In [22]:
# Load the training data from 'train.csv' (path is relative to the notebook's working
# directory) into `df`, the frame every later cell starts from.
df = pd.read_csv('train.csv')
# Bare last expression: displays (n_rows, n_columns) as the cell output.
df.shape

(140700, 20)

In [23]:
# Separate the target from the features and group feature columns by dtype.
# 'Name' is discarded when the column exists; errors='ignore' makes the drop a no-op otherwise.
df = df.drop(columns=['Name'], errors='ignore')

# Target vector and feature matrix
y = df['Depression']
X = df.drop(columns=['Depression'])

# Numeric feature names, leaving 'id' out of the list (the column itself stays in X;
# it is simply never handed to a transformer by name).
numerical_cols = [
    col for col in X.select_dtypes(include=[np.number]).columns if col != 'id'
]
# Object-dtype columns are treated as categorical features.
categorical_cols = list(X.select_dtypes(include=['object']).columns)

print('Numerical columns:', numerical_cols)
print('Categorical columns:', categorical_cols)

Numerical columns: ['Age', 'Academic Pressure', 'Work Pressure', 'CGPA', 'Study Satisfaction', 'Job Satisfaction', 'Work/Study Hours', 'Financial Stress']
Categorical columns: ['Gender', 'City', 'Working Professional or Student', 'Profession', 'Sleep Duration', 'Dietary Habits', 'Degree', 'Have you ever had suicidal thoughts ?', 'Family History of Mental Illness']


In [24]:
# Preprocessing + estimator, assembled into a single sklearn Pipeline.

# Numeric branch: fill missing values with the column median, then standardize.
numeric_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
])

# Categorical branch: fill missing values with the most frequent category, then one-hot
# encode into a dense array. handle_unknown='ignore' lets transform() accept categories
# that were not seen during fit instead of raising.
categorical_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('ohe', OneHotEncoder(handle_unknown='ignore', sparse_output=False)),
])

# Route each column group to its branch. Only the columns named in these two lists are
# passed through (ColumnTransformer's default is remainder='drop').
preprocessor = ColumnTransformer([
    ('num', numeric_transformer, numerical_cols),
    ('cat', categorical_transformer, categorical_cols),
])

# Full model: preprocessing followed by logistic regression.
clf = Pipeline([
    ('preprocessor', preprocessor),
    ('clf', LogisticRegression(max_iter=1000)),
])

# Display the pipeline structure as the cell output.
clf

In [25]:
# Cross-validate (Stratified K-Fold) with accuracy and ROC AUC.
#
# Both metrics are computed in ONE cross_validate pass, so the pipeline is fit once per
# fold (5 fits) instead of once per fold per metric (10 fits with two cross_val_score
# calls). The splitter is seeded, so the folds -- and therefore the per-fold scores --
# are the same ones the two separate calls would have produced.
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
cv_results = cross_validate(clf, X, y, cv=skf, scoring=['accuracy', 'roc_auc'])

# With a list of scorer names, results are keyed as 'test_<scorer name>';
# each entry is an array with one score per fold.
acc_scores = cv_results['test_accuracy']
roc_scores = cv_results['test_roc_auc']

print(f'Accuracy CV: {acc_scores.mean():.4f} ± {acc_scores.std():.4f}')
print(f'ROC AUC CV: {roc_scores.mean():.4f} ± {roc_scores.std():.4f}')

Accuracy CV: 0.9387 ± 0.0010
ROC AUC CV: 0.9744 ± 0.0009


In [26]:
# Holdout evaluation: stratified 80/20 train/validation split with a fixed seed.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

# Fit the full pipeline on the training part only, then predict the held-out rows.
clf.fit(X_train, y_train)
y_pred = clf.predict(X_val)

# Probability of the second class (column index 1), when the estimator exposes
# predict_proba; otherwise the ROC AUC line below is skipped.
if hasattr(clf, 'predict_proba'):
    y_proba = clf.predict_proba(X_val)[:, 1]
else:
    y_proba = None

print('Validation accuracy:', accuracy_score(y_val, y_pred))
if y_proba is not None:
    print('Validation ROC AUC:', roc_auc_score(y_val, y_proba))

print('Classification report:')
print(classification_report(y_val, y_pred))

Validation accuracy: 0.937455579246624
Validation ROC AUC: 0.9730392431860723
Classification report:
              precision    recall  f1-score   support

           0       0.96      0.97      0.96     23027
           1       0.84      0.81      0.82      5113

    accuracy                           0.94     28140
   macro avg       0.90      0.89      0.89     28140
weighted avg       0.94      0.94      0.94     28140

