# import required libraries

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report,accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from utils.text_cleaner import TextCleanerNB, TextCleanerTFIDF

In [35]:
from sklearn.svm import LinearSVC
from sklearn.model_selection import GridSearchCV

# Import dataset

In [2]:
df = pd.read_csv("./data/mental_health_imbalanced.csv")
df.head(2).T

Unnamed: 0,0,1
Unique_ID,0.0,1.0
text,oh my gosh,"trouble sleeping, confused mind, restless hear..."
status,Anxiety,Anxiety


# Explore dataset

In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 49612 entries, 0 to 49611
Data columns (total 3 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   Unique_ID  40012 non-null  float64
 1   text       49612 non-null  object 
 2   status     49612 non-null  object 
dtypes: float64(1), object(2)
memory usage: 1.1+ MB


# remove redundant feature

In [4]:
df = df.drop("Unique_ID",axis=1)

# remove duplicate records from dataset

In [5]:
if df.duplicated().sum() > 0:
    df = df.drop_duplicates()

# row count on each label to check class imbalance

In [6]:
df["status"].value_counts(normalize=True)

status
Normal        0.370769
Depression    0.291472
Suicidal      0.228863
Anxiety       0.108896
Name: proportion, dtype: float64

# Separate X and y variable

In [7]:
X = df.drop(["status"],axis=1)
y = df["status"]

# Train and test data split 

In [8]:
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.2,stratify=y,random_state=42)

In [9]:
cleaner = TextCleanerTFIDF()
X_clean = cleaner.transform(X_train)

print(len(X_train), len(X_clean))   # must match


39164 39164


# Build pipeline with all steps 

In [36]:
# pipeline = Pipeline(
#     steps=[
#         ("cleaner", TextCleanerTFIDF()),
#         ("vectorizer", TfidfVectorizer(ngram_range=(1, 2),min_df=3,sublinear_tf=True,norm="l2")),
#         ("model", LogisticRegression(max_iter=1000,class_weight="balanced",C=0.7,n_jobs=-1))
#     ]
# )


pipeline = Pipeline(
    steps=[
        ("cleaner", TextCleanerTFIDF()),
        ("vectorizer", TfidfVectorizer(ngram_range=(1, 1),min_df=5,max_df=0.9,max_features=30000,sublinear_tf=True)),
        ("model", LogisticRegression(max_iter=2000,C=0.3,solver="liblinear",class_weight="balanced"
        ))
    ]
)

param_grid = {
    "vectorizer__min_df": [2, 3, 5],
    "vectorizer__max_features": [20000, 30000, 40000],
    "model__C": [0.2, 0.5, 1.0]
}

grid = GridSearchCV(
    pipeline,
    param_grid=param_grid,
    scoring="f1_macro",
    cv=3, # 3 fold
    n_jobs=-1,
    verbose=2
)

grid.fit(X_train, y_train)

Fitting 3 folds for each of 27 candidates, totalling 81 fits


In [39]:
model = grid.best_estimator_

In [37]:
grid.best_params_

{'model__C': 1.0, 'vectorizer__max_features': 20000, 'vectorizer__min_df': 5}

In [42]:
grid.best_score_

np.float64(0.7624799842546054)

In [20]:
pipeline2 = Pipeline(
    steps=[
        ("cleaner", TextCleanerTFIDF()),
        ("vectorizer", TfidfVectorizer(ngram_range=(1, 2),min_df=3,sublinear_tf=True,norm="l2")),
        ("model", LinearSVC(class_weight="balanced",C=1.0))
    ]
)
# it increasing the gap and causes overfitting so it's rejected

In [29]:
pipeline.fit(X_train,y_train)

In [32]:
# pipeline2.fit(X_train,y_train)

# classification report on train data(LR)

In [40]:
y_train_pred = model.predict(X_train)
print(classification_report(y_train,y_train_pred))

              precision    recall  f1-score   support

     Anxiety       0.82      0.89      0.85      4265
  Depression       0.82      0.70      0.76     11415
      Normal       0.89      0.95      0.92     14521
    Suicidal       0.74      0.76      0.75      8963

    accuracy                           0.83     39164
   macro avg       0.82      0.83      0.82     39164
weighted avg       0.83      0.83      0.83     39164



# classification report on test data (LR)

In [41]:
y_test_pred = model.predict(X_test)
print(classification_report(y_test,y_test_pred))

              precision    recall  f1-score   support

     Anxiety       0.77      0.81      0.79      1066
  Depression       0.75      0.66      0.70      2854
      Normal       0.88      0.94      0.91      3630
    Suicidal       0.69      0.71      0.70      2241

    accuracy                           0.79      9791
   macro avg       0.77      0.78      0.78      9791
weighted avg       0.79      0.79      0.79      9791



# classification report on train data(SVM)

In [33]:
# y_train_pred = pipeline2.predict(X_train)
# print(classification_report(y_train,y_train_pred))

# not required at the end because we already rejected

# classification report on test data (SVM)

In [34]:
# y_test_pred = pipeline2.predict(X_test)
# print(classification_report(y_test,y_test_pred))

# not required at the end because we already rejected

In [6]:
import sklearn, pandas, numpy, fastapi, uvicorn, joblib, pydantic, emoji, contractions

libs = {
    "scikit-learn": sklearn.__version__,
    "pandas": pandas.__version__,
    "numpy": numpy.__version__,
    "fastapi": fastapi.__version__,
    "uvicorn": uvicorn.__version__,
    "joblib": joblib.__version__,
    "pydantic": pydantic.__version__,
    "emoji": emoji.__version__
    
}

libs

{'scikit-learn': '1.6.1',
 'pandas': '2.2.3',
 'numpy': '2.1.3',
 'fastapi': '0.127.1',
 'uvicorn': '0.40.0',
 'joblib': '1.4.2',
 'pydantic': '2.10.3',
 'emoji': '2.15.0'}

In [5]:
import contractions
print(contractions.__version__)

AttributeError: module 'contractions' has no attribute '__version__'