Notebook setup & imports

In [1]:
import os
import joblib
import pandas as pd
import numpy as np

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix


Paths & configuration

In [2]:
# Class names for the 5-class task. Note: this list is not referenced by the
# cells shown in this notebook, and its order differs from the sorted order
# scikit-learn uses when reporting metrics.
LABELS = ["normal", "stress", "anxiety", "depression", "suicidal"]

# Directories, relative to the notebook's working directory.
DATA_DIR = "../data/processed"
MODEL_DIR = "../models/baseline"

# Pre-split CSV files used for fitting and validation.
TRAIN_FILE = DATA_DIR + "/train_5class.csv"
VAL_FILE = DATA_DIR + "/val_5class.csv"

# Create the output directory up front so the save step cannot fail on a
# missing folder; exist_ok makes re-running this cell harmless.
os.makedirs(MODEL_DIR, exist_ok=True)


Load processed data

In [3]:
# Read both pre-split CSVs in one pass; the order of the paths matches the
# order of the names being unpacked.
train_df, val_df = (pd.read_csv(csv_path) for csv_path in (TRAIN_FILE, VAL_FILE))

# Quick sanity check on the (rows, columns) of each split.
print("Train size:", train_df.shape)
print("Validation size:", val_df.shape)

# Last expression of the cell: rich display of the first training rows.
train_df.head()


Train size: (38999, 2)
Validation size: (9750, 2)


Unnamed: 0,text,label
0,@agpublic have you seen this link? me thinks y...,normal
1,"hey guys, i need advice on a situation that ha...",depression
2,life is fucking hard that s it we care which h...,depression
3,"heart attacks, can be avoided by these importa...",anxiety
4,suicide i do not know what to do i was having ...,depression


Split features & labels

In [4]:
# Pull the raw text (features) and class names (targets) out of each frame
# as plain Python lists, which is what the vectoriser and classifier receive.
X_train, y_train = train_df["text"].tolist(), train_df["label"].tolist()
X_val, y_val = val_df["text"].tolist(), val_df["label"].tolist()


TF-IDF vectorization

In [5]:
# Unigram + bigram TF-IDF features:
#   - min_df=5   drops terms seen in fewer than 5 training documents
#   - max_df=0.9 drops terms present in more than 90% of training documents
#   - max_features caps the vocabulary at 50,000 terms
tfidf = TfidfVectorizer(
    ngram_range=(1, 2),
    min_df=5,
    max_df=0.9,
    max_features=50_000,
)

# Learn the vocabulary and IDF weights from the training split only, then
# reuse that fitted vocabulary to encode the validation split.
X_train_vec = tfidf.fit_transform(X_train)
X_val_vec = tfidf.transform(X_val)

print("TF-IDF train shape:", X_train_vec.shape)
print("TF-IDF val shape:", X_val_vec.shape)


TF-IDF train shape: (38999, 50000)
TF-IDF val shape: (9750, 50000)


Train Logistic Regression

In [6]:
# Baseline classifier. class_weight="balanced" reweights samples inversely to
# class frequency, which matters here because the classes are uneven in size.
# max_iter is raised to 1000 to give lbfgs room to converge on the sparse
# TF-IDF features.
logreg = LogisticRegression(
    solver="lbfgs",
    class_weight="balanced",
    max_iter=1000,
    n_jobs=-1,
)

# Kept as the last expression so the fitted estimator is displayed by the cell.
logreg.fit(X_train_vec, y_train)


Evaluation

In [7]:
# Predict class names for the held-out validation split.
y_pred = logreg.predict(X_val_vec)

# Per-class precision / recall / F1, reported to four decimal places.
report_text = classification_report(y_val, y_pred, digits=4)
print(" Classification Report (Baseline)\n")
print(report_text)

# No explicit `labels` are passed, so classes appear in sorted order
# (anxiety, depression, normal, stress, suicidal).
# Rows are true classes, columns are predicted classes.
conf_mat = confusion_matrix(y_val, y_pred)
print("Confusion Matrix (Baseline)\n")
print(conf_mat)


 Classification Report (Baseline)

              precision    recall  f1-score   support

     anxiety     0.7673    0.8242    0.7947       768
  depression     0.8065    0.6536    0.7220      3080
      normal     0.9005    0.9345    0.9172      3254
      stress     0.5663    0.7679    0.6519       517
    suicidal     0.6840    0.7546    0.7175      2131

    accuracy                         0.7889      9750
   macro avg     0.7449    0.7870    0.7607      9750
weighted avg     0.7953    0.7889    0.7882      9750

Confusion Matrix (Baseline)

[[ 633   33   40   58    4]
 [ 106 2013  160  104  697]
 [  28   37 3041  117   31]
 [  50   42   17  397   11]
 [   8  371  119   25 1608]]


Save baseline

In [8]:
# Persist the classifier together with the vectoriser it was fitted on;
# both are needed to score new text later.
model_path = f"{MODEL_DIR}/baseline_logreg.pkl"
vectorizer_path = f"{MODEL_DIR}/tfidf_vectorizer.pkl"

joblib.dump(logreg, model_path)
joblib.dump(tfidf, vectorizer_path)

print("Baseline model and vectorizer saved")


Baseline model and vectorizer saved


In [None]:
# Re-read the validation split through the VAL_FILE constant from the
# configuration cell instead of repeating the literal path. pandas is already
# imported in the notebook's import cell, so it is not re-imported here.
df = pd.read_csv(VAL_FILE)