# Logistic Regression with Ordinal Encoding
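This notebook builds a Logistic Regression model that uses ordinal encoding for categorical variables, which is useful when you want a compact representation (one integer column per feature) rather than one-hot encoding. Note that the integer codes impose an arbitrary order on the categories, which a linear model treats as a numeric magnitude. The notebook covers preprocessing, cross-validation, holdout evaluation, and saving the fitted model.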

In [1]:
# Imports
import warnings

import joblib
import numpy as np
import pandas as pd
import sklearn
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, roc_auc_score, classification_report
from sklearn.model_selection import StratifiedKFold, cross_validate, train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OrdinalEncoder
# Suppress every warning from this point on to keep cell output clean.
# NOTE(review): this blanket filter also hides scikit-learn warnings (e.g. solver
# convergence or deprecations) — consider narrowing it to specific categories.
warnings.filterwarnings('ignore')

In [None]:
# Load data
# Read the training table; the 'Name' column is discarded when it is present.
df = pd.read_csv('train.csv').drop(columns=['Name'], errors='ignore')

# Separate the target ('Depression') from the feature columns.
y, X = df['Depression'], df.drop(columns=['Depression'])

# Quick sanity output: feature-matrix shape and relative class frequencies.
print('X shape: {}'.format(X.shape))
print('y distribution:', y.value_counts(normalize=True), sep='\n')

In [None]:
# Identify columns
# Numeric features: every numeric-dtype column except the 'id' column.
numerical_cols = [
    col for col in X.select_dtypes(include=[np.number]).columns if col != 'id'
]
# Categorical features: every object-dtype column.
categorical_cols = list(X.select_dtypes(include=['object']).columns)

print('numerical_cols: {}'.format(numerical_cols))
print('categorical_cols: {}'.format(categorical_cols))

In [None]:
# Preprocessing: median impute + scale for numerics, most_frequent + OrdinalEncoder for categoricals

# Numeric branch: fill missing values with the column median, then standardize.
numeric_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
])

# Categorical branch: fill missing values with the most frequent category, then map
# each category to an integer code; categories not seen during fit are encoded as -1.
categorical_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('ord', OrdinalEncoder(unknown_value=-1, handle_unknown='use_encoded_value')),
])

# Route each column group through its branch. Columns in neither list are not
# passed on to the classifier (ColumnTransformer's default remainder is 'drop').
preprocessor = ColumnTransformer([
    ('num', numeric_transformer, numerical_cols),
    ('cat', categorical_transformer, categorical_cols),
])

# Full model: preprocessing followed by logistic regression.
pipeline_ord = Pipeline([
    ('preprocessor', preprocessor),
    ('clf', LogisticRegression(random_state=42, max_iter=1000)),
])
pipeline_ord

In [None]:
# Cross-validate (single cross_validate call)
# 5-fold stratified CV with shuffling; the whole pipeline is refit inside each fold.
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
scoring = ['accuracy', 'roc_auc']
cv_res = cross_validate(pipeline_ord, X, y, scoring=scoring, cv=skf, n_jobs=1)

# Report mean ± standard deviation across folds for each metric.
for metric_label, metric_key in (('Accuracy', 'test_accuracy'), ('ROC AUC', 'test_roc_auc')):
    fold_scores = cv_res[metric_key]
    print('{} CV: {:.4f} ± {:.4f}'.format(metric_label, fold_scores.mean(), fold_scores.std()))

In [None]:
# Holdout evaluation and save
# Stratified 80/20 split so the validation set keeps the class proportions of y.
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)
pipeline_ord.fit(X_train, y_train)

# Hard-label predictions for accuracy and the classification report.
y_pred = pipeline_ord.predict(X_val)
# Scores for ROC AUC: column 1 of predict_proba (the second entry of classes_).
# NOTE(review): assumes a binary target — confirm against the data.
# The hasattr guard keeps the cell usable if the final estimator is swapped for
# one that does not expose predict_proba.
y_proba = pipeline_ord.predict_proba(X_val)[:, 1] if hasattr(pipeline_ord, 'predict_proba') else None

print('Validation accuracy:', accuracy_score(y_val, y_pred))
if y_proba is not None:
    print('Validation ROC AUC:', roc_auc_score(y_val, y_proba))
print('Classification report:')
print(classification_report(y_val, y_pred))

# Save pipeline
# NOTE: the persisted pipeline is the one fitted on X_train only (it is not refit
# on all of X). The scikit-learn version is stored alongside it so a loader can
# detect a version mismatch before using the model.
joblib.dump(
    {
        'model': pipeline_ord,
        'meta': {'method': 'OrdinalEncoding', 'sklearn_version': sklearn.__version__},
    },
    'logistic_ord.joblib',
)
print('Saved logistic_ord.joblib')