# Logistic Regression Baseline
This notebook builds a baseline Logistic Regression model using a ColumnTransformer preprocessing pipeline (median imputation + scaling for numeric features, most-frequent imputation + OneHotEncoding for categorical features). We run cross-validation and a holdout evaluation, then save the trained model.

In [21]:
# Imports
import pickle
import warnings

import numpy as np
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, roc_auc_score, classification_report
from sklearn.model_selection import StratifiedKFold, cross_val_score, cross_validate, train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler

warnings.filterwarnings('ignore')

In [22]:
# Read the training CSV into a DataFrame and display its (rows, columns) shape
df = pd.read_csv(filepath_or_buffer='train.csv')
df.shape

(140700, 20)

In [23]:
# Separate target from predictors and group the predictor columns by dtype.
# 'Name' is removed first when the dataset contains it.
if 'Name' in df.columns:
    df = df.drop(columns=['Name'])

# Target and features
y = df['Depression']
X = df.drop(columns=['Depression'])

# Numeric predictors, with 'id' left out of the numeric feature list
numerical_cols = [col for col in X.select_dtypes(include=[np.number]).columns if col != 'id']
# Object-dtype predictors are treated as categorical
categorical_cols = list(X.select_dtypes(include=['object']).columns)

print('Numerical columns:', numerical_cols)
print('Categorical columns:', categorical_cols)

Numerical columns: ['Age', 'Academic Pressure', 'Work Pressure', 'CGPA', 'Study Satisfaction', 'Job Satisfaction', 'Work/Study Hours', 'Financial Stress']
Categorical columns: ['Gender', 'City', 'Working Professional or Student', 'Profession', 'Sleep Duration', 'Dietary Habits', 'Degree', 'Have you ever had suicidal thoughts ?', 'Family History of Mental Illness']


In [24]:
# Build preprocessing pipelines

# Numeric columns: fill missing values with the column median, then standardise
numeric_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
])

# Categorical columns: fill missing values with the most frequent level, then one-hot
# encode to a dense array (handle_unknown='ignore' tolerates levels not seen during fit)
categorical_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('ohe', OneHotEncoder(handle_unknown='ignore', sparse_output=False)),
])

# Route each column group to its own transformer
preprocessor = ColumnTransformer([
    ('num', numeric_transformer, numerical_cols),
    ('cat', categorical_transformer, categorical_cols),
])

# Full baseline model: preprocessing followed by logistic regression
clf = Pipeline([
    ('preprocessor', preprocessor),
    ('clf', LogisticRegression(max_iter=1000)),
])

clf

In [25]:
# Cross-validate (Stratified K-Fold) with accuracy and ROC AUC
# Both metrics are scored in a single cross_validate pass, so the pipeline is fitted
# once per fold instead of once per fold per metric. The folds are the same as before
# (fixed random_state), so the reported numbers do not change.
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
cv_results = cross_validate(clf, X, y, cv=skf, scoring=['accuracy', 'roc_auc'])
acc_scores = cv_results['test_accuracy']
roc_scores = cv_results['test_roc_auc']
print(f'Accuracy CV: {acc_scores.mean():.4f} ± {acc_scores.std():.4f}')
print(f'ROC AUC CV: {roc_scores.mean():.4f} ± {roc_scores.std():.4f}')

Accuracy CV: 0.9387 ± 0.0010
ROC AUC CV: 0.9744 ± 0.0009


In [26]:
# Holdout evaluation: stratified 80/20 split, fit on the training part, score on the rest
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

clf.fit(X_train, y_train)
y_pred = clf.predict(X_val)

# Second probability column (class 1), when the model exposes predict_proba
if hasattr(clf, 'predict_proba'):
    y_proba = clf.predict_proba(X_val)[:, 1]
else:
    y_proba = None

print('Validation accuracy:', accuracy_score(y_val, y_pred))
if y_proba is not None:
    print('Validation ROC AUC:', roc_auc_score(y_val, y_proba))

print('Classification report:')
print(classification_report(y_val, y_pred))

Validation accuracy: 0.937455579246624
Validation ROC AUC: 0.9730392431860723
Classification report:
              precision    recall  f1-score   support

           0       0.96      0.97      0.96     23027
           1       0.84      0.81      0.82      5113

    accuracy                           0.94     28140
   macro avg       0.90      0.89      0.89     28140
weighted avg       0.94      0.94      0.94     28140



# Logistic Regression with Feature Selection

In [28]:
# Logistic + feature selection (L1 SelectFromModel)
from sklearn.base import clone
from sklearn.feature_selection import SelectFromModel
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import StratifiedKFold, cross_val_score, train_test_split
from sklearn.pipeline import Pipeline
import joblib
import numpy as np

# Reuse X, y, numerical_cols, categorical_cols and preprocessor defined earlier in the notebook.
# If those names are not available, re-run the prior cells that created them.

# Build a pipeline: preprocessing -> feature selector -> classifier
# Use L1 logistic as the estimator for feature selection (sparse selection)
selector_estimator = LogisticRegression(penalty='l1', solver='saga', C=1.0, max_iter=2000, random_state=42)

pipeline_fs = Pipeline(steps=[
    # Unfitted copy of the earlier ColumnTransformer. Pipeline does not copy its steps,
    # so passing `preprocessor` itself would make pipeline_fs.fit() refit the very
    # object that lives inside the already-fitted baseline `clf`.
    ('preprocessor', clone(preprocessor)),
    # Keep features whose importance (|coefficient|) is at or above the median importance
    ('selector', SelectFromModel(selector_estimator, threshold='median')),
    ('clf', LogisticRegression(max_iter=1000, random_state=42))
])

In [29]:
# Cross-validate
# A single cross_validate call scores both metrics, so the (slow) L1 selector and the
# classifier are fitted once per fold rather than once per fold per metric.
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
cv_fs = cross_validate(pipeline_fs, X, y, cv=skf, scoring=['accuracy', 'roc_auc'], n_jobs=-1)
acc_fs = cv_fs['test_accuracy']
roc_fs = cv_fs['test_roc_auc']

print("Feature-selection Logistic CV accuracy: {:.4f} ± {:.4f}".format(acc_fs.mean(), acc_fs.std()))
print("Feature-selection Logistic CV ROC AUC: {:.4f} ± {:.4f}".format(roc_fs.mean(), roc_fs.std()))

# Holdout training & evaluation
from sklearn.metrics import accuracy_score, roc_auc_score, classification_report

# Stratified 80/20 split with a fixed seed
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

pipeline_fs.fit(X_train, y_train)
y_pred = pipeline_fs.predict(X_val)

# Second probability column (class 1), when the pipeline exposes predict_proba
if hasattr(pipeline_fs, 'predict_proba'):
    y_proba = pipeline_fs.predict_proba(X_val)[:, 1]
else:
    y_proba = None

print("Holdout accuracy (FS):", accuracy_score(y_val, y_pred))
if y_proba is not None:
    print("Holdout ROC AUC (FS):", roc_auc_score(y_val, y_proba))
print(classification_report(y_val, y_pred))


https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-

KeyboardInterrupt: 

# Logistic Regression with Target/Ordinal Encoding for High-Cardinality Categorical Columns

In [None]:
# Target / Ordinal encoding pipeline for high-cardinality categorical variables
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import OrdinalEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import StratifiedKFold, cross_val_score, train_test_split
import joblib
import numpy as np
import pandas as pd

# Bucket the categorical columns by their number of distinct values
CARDINALITY_HIGH = 20    # more levels than this -> "high" cardinality (tweak as needed)
CARDINALITY_MED = 6      # more levels than this, up to CARDINALITY_HIGH -> "medium"

categorical_cols_all = categorical_cols  # from earlier

high_card_cols, medium_card_cols, low_card_cols = [], [], []
for col in categorical_cols_all:
    # dropna=False: a missing value counts as one more distinct level
    n_levels = X[col].nunique(dropna=False)
    if n_levels > CARDINALITY_HIGH:
        high_card_cols.append(col)
    elif n_levels > CARDINALITY_MED:
        medium_card_cols.append(col)
    else:
        low_card_cols.append(col)

print("High-cardinality categorical cols (target-encode):", high_card_cols)
print("Medium-cardinality categorical cols (ordinal-encode):", medium_card_cols)
print("Low-cardinality categorical cols (one-hot):", low_card_cols)

# Try to import TargetEncoder; if not available, provide a safe sklearn-compatible implementation
try:
    from category_encoders import TargetEncoder
    print("Using category_encoders.TargetEncoder")
except Exception:
    print("category_encoders not installed — using simple smoothed target encoding fallback")

    class TargetEncoder(BaseEstimator, TransformerMixin):
        """
        Simple target encoder with additive smoothing.

        Each category of each encoded column is replaced by
        ``(count * category_mean + smoothing * global_mean) / (count + smoothing)``.
        Categories not seen during ``fit`` (including missing values) are mapped to the
        global target mean.

        NOTE: This is a lightweight fallback. For production or advanced smoothing,
        install `category_encoders`.

        Parameters
        ----------
        cols : list of column labels or None
            Columns to encode. ``None`` encodes every column of the frame given to ``fit``.
        smoothing : float
            Weight of the global mean in the smoothed estimate.

        Attributes (set by ``fit``)
        ---------------------------
        maps_ : dict
            ``{column: {category: encoded value}}``.
        global_mean_ : float
            Mean of the target passed to ``fit``.
        """
        def __init__(self, cols=None, smoothing=1.0):
            # Only hyper-parameters are stored here; learned state is created in fit()
            self.cols = cols
            self.smoothing = smoothing

        def fit(self, X, y):
            """Learn the smoothed per-category target means.

            X is converted to a DataFrame; y is any array-like with one value per row
            of X. Rows of X and values of y are paired by position.
            Raises ValueError when X and y differ in length. Returns self.
            """
            X = pd.DataFrame(X)
            # Strip any index from y: pairing by index label would misalign (and produce
            # NaN statistics) when y is a plain array but X carries a shuffled index.
            target = np.asarray(y)
            if len(target) != len(X):
                raise ValueError("X and y must have the same number of rows")
            self.global_mean_ = float(pd.Series(target).mean())
            cols = self.cols if self.cols is not None else X.columns.tolist()
            maps = {}
            for col in cols:
                # Fixed internal column names, so a feature column called 'y' cannot collide
                stats = pd.DataFrame({'key': X[col].to_numpy(), 'target': target})
                agg = stats.groupby('key')['target'].agg(['count', 'mean'])
                # smoothing
                counts = agg['count']
                means = agg['mean']
                smooth = (counts * means + self.smoothing * self.global_mean_) / (counts + self.smoothing)
                maps[col] = smooth.to_dict()
            # Assign in one step so a refit never keeps mappings from an earlier fit
            self.maps_ = maps
            return self

        def transform(self, X):
            """Return a copy of X (as a DataFrame) with the fitted columns replaced by their encodings."""
            X = pd.DataFrame(X).copy()
            for col, mapping in self.maps_.items():
                # map unknown categories to global mean
                X[col] = X[col].map(mapping).fillna(self.global_mean_)
            return X

# Build transformers:
numeric_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])

# For low-cardinality, use OneHotEncoder; for medium, use OrdinalEncoder; for high, use TargetEncoder
from sklearn.preprocessing import OneHotEncoder

# By default OrdinalEncoder encodes missing values as NaN, which LogisticRegression
# rejects. Medium-cardinality columns are therefore imputed first, using the same
# most-frequent strategy as the baseline categorical pipeline.
medium_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('ordinal', OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1))
])

# Only register a transformer for a column group that is non-empty
transformers = []
if len(numerical_cols) > 0:
    transformers.append(('num', numeric_transformer, numerical_cols))
if len(low_card_cols) > 0:
    transformers.append(('low_ohe', OneHotEncoder(handle_unknown='ignore', sparse_output=False), low_card_cols))
if len(medium_card_cols) > 0:
    transformers.append(('med_ord', medium_transformer, medium_card_cols))

# High-cardinality columns are deliberately NOT routed through this ColumnTransformer.
# HighCardTargetConcat (defined below) reads them straight from the input DataFrame,
# target-encodes them and appends the result to this preprocessor's output.
# Passing them through here as well would put the raw string values into the feature
# matrix, and the classifier cannot convert those to floats.
from sklearn.preprocessing import FunctionTransformer

def select_cols(df, cols):
    """Return the `cols` subset of `df` (plain `df[cols]` indexing)."""
    return df[cols]

# remainder='drop' discards every column not listed in `transformers`,
# which includes the high-cardinality columns
preprocessor_te = ColumnTransformer(transformers=transformers, remainder='drop')

# Now build the full pipeline: apply preprocessor -> combine with target-encoded high-card columns (if any) -> classifier
# To keep this simple and sklearn-compatible, we'll build a pipeline that first applies preprocessor_te to get numeric + low/med features,
# and then applies a custom transformer that concatenates the preprocessed array with target-encoded high-card columns (if present).
from sklearn.pipeline import FeatureUnion
from sklearn.base import TransformerMixin

class HighCardTargetConcat(TransformerMixin, BaseEstimator):
    """
    Combine a column preprocessor with target encoding of high-cardinality columns.

    Fit: fits a clone of `preprocessor` and, when `high_cols` is non-empty, a target
    encoder on those columns.
    Transform: returns a numpy array made of the preprocessed part followed by the
    target-encoded high-cardinality columns.

    Parameters
    ----------
    preprocessor : estimator with fit/transform
        Applied to the full input. It is cloned in `fit`, so the object passed in is
        never modified.
    high_cols : list of column labels
        Columns to target-encode. They are read with `X[high_cols]`, so the input must
        support label-based column selection (e.g. a pandas DataFrame).
    smoothing : float
        Forwarded to the target encoder.

    Attributes (set by `fit`)
    -------------------------
    preprocessor_ : fitted clone of `preprocessor`
    target_encoder_ : fitted target encoder, or None when `high_cols` is empty
    """
    def __init__(self, preprocessor, high_cols, smoothing=1.0):
        # Only constructor parameters are stored here; fitted state is created in fit()
        self.preprocessor = preprocessor
        self.high_cols = high_cols
        self.smoothing = smoothing

    def fit(self, X, y=None):
        """Fit the preprocessor clone and (if needed) the target encoder; returns self."""
        from sklearn.base import clone

        # Fit a copy so the constructor argument, which may be shared with other
        # pipelines, stays untouched
        self.preprocessor_ = clone(self.preprocessor).fit(X, y)

        # Reset first so a refit with no high-cardinality columns leaves no stale encoder
        self.target_encoder_ = None
        if len(self.high_cols) > 0:
            # instantiate encoder (either category_encoders.TargetEncoder or our fallback)
            try:
                from category_encoders import TargetEncoder as _TE
                encoder = _TE(cols=self.high_cols, smoothing=self.smoothing)
            except Exception:
                encoder = TargetEncoder(cols=self.high_cols, smoothing=self.smoothing)
            # Fit target encoder only on high-card columns
            encoder.fit(X[self.high_cols], y)
            self.target_encoder_ = encoder
        return self

    def transform(self, X):
        """Return the preprocessed features, with encoded high-cardinality columns appended."""
        preproc_arr = self.preprocessor_.transform(X)
        if self.target_encoder_ is None:
            return preproc_arr
        high_enc = self.target_encoder_.transform(X[self.high_cols])
        # high_enc may be a DataFrame; convert to numpy before stacking along features
        return np.hstack([preproc_arr, np.asarray(high_enc)])

# Assemble the target-encoding model: custom concatenating transformer -> logistic regression
from sklearn.linear_model import LogisticRegression

concat_transformer = HighCardTargetConcat(
    preprocessor=preprocessor_te,
    high_cols=high_card_cols,
    smoothing=10.0,
)

pipeline_te = Pipeline(steps=[
    ('concat', concat_transformer),
    ('clf', LogisticRegression(max_iter=1000, random_state=42)),
])

# Evaluate with CV
# Both metrics come from one cross_validate call, so each fold is fitted once
# instead of once per metric.
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
cv_te = cross_validate(pipeline_te, X, y, cv=skf, scoring=['accuracy', 'roc_auc'], n_jobs=-1)
acc_te = cv_te['test_accuracy']
roc_te = cv_te['test_roc_auc']

print("Target-encoding Logistic CV accuracy: {:.4f} ± {:.4f}".format(acc_te.mean(), acc_te.std()))
print("Target-encoding Logistic CV ROC AUC: {:.4f} ± {:.4f}".format(roc_te.mean(), roc_te.std()))

# Holdout training & evaluation
# Stratified 80/20 split with a fixed seed
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

pipeline_te.fit(X_train, y_train)
y_pred = pipeline_te.predict(X_val)

# Second probability column (class 1), when the pipeline exposes predict_proba
if hasattr(pipeline_te, 'predict_proba'):
    y_proba = pipeline_te.predict_proba(X_val)[:, 1]
else:
    y_proba = None

print("Holdout accuracy (TE):", accuracy_score(y_val, y_pred))
if y_proba is not None:
    print("Holdout ROC AUC (TE):", roc_auc_score(y_val, y_proba))
print(classification_report(y_val, y_pred))


High-cardinality categorical cols (target-encode): ['City', 'Profession', 'Sleep Duration', 'Dietary Habits', 'Degree']
Medium-cardinality categorical cols (ordinal-encode): []
Low-cardinality categorical cols (one-hot): ['Gender', 'Working Professional or Student', 'Have you ever had suicidal thoughts ?', 'Family History of Mental Illness']
Using category_encoders.TargetEncoder


TypeError: OneHotEncoder.__init__() got an unexpected keyword argument 'sparse'