In [1]:
import pandas as pd
import re
import joblib
import nltk
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from nltk.corpus import stopwords

nltk.download('stopwords')

# Load dataset
df = pd.read_csv("C:/Users/sagni/Downloads/Mental Health Detection/abcnews-date-text.csv")

# 🧠 Assign multi-class labels using keywords
def assign_label(text):
    text = text.lower()
    if any(word in text for word in ['depression', 'depressed', 'hopeless', 'worthless']):
        return 1  # Depression
    elif any(word in text for word in ['anxiety', 'anxious', 'nervous', 'panic']):
        return 2  # Anxiety
    elif any(word in text for word in ['stress', 'stressed', 'pressure', 'overwhelmed']):
        return 3  # Stress
    else:
        return 0  # Normal

df['label'] = df['headline_text'].apply(assign_label)

# 🧹 Clean text
def clean_text(text):
    text = re.sub(r"http\S+|www\S+|https\S+", '', text, flags=re.MULTILINE)
    text = re.sub(r'\@\w+|\#','', text)
    text = re.sub(r'[^\w\s]', '', text)
    text = re.sub(r'\d+', '', text)
    text = text.lower()
    stop_words = set(stopwords.words('english'))
    text = " ".join([word for word in text.split() if word not in stop_words])
    return text

df['clean_text'] = df['headline_text'].apply(clean_text)

# 📊 Show label counts
print("Label Distribution:\n", df['label'].value_counts())

# TF-IDF Vectorization
vectorizer = TfidfVectorizer(max_features=5000)
X = vectorizer.fit_transform(df['clean_text'])
y = df['label']

# Split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# 🧠 Train Logistic Regression (multi-class)
clf = LogisticRegression(max_iter=200, class_weight='balanced', multi_class='multinomial', solver='lbfgs')
clf.fit(X_train, y_train)

# Evaluate
y_pred = clf.predict(X_test)
print("\n✅ Accuracy:", clf.score(X_test, y_test))
print("\n📊 Classification Report:\n", classification_report(y_test, y_pred, target_names=['Normal', 'Depression', 'Anxiety', 'Stress']))

# Save model & vectorizer
joblib.dump(clf, "C:/Users/sagni/Downloads/Mental Health Detection/multiclass_model.joblib")
joblib.dump(vectorizer, "C:/Users/sagni/Downloads/Mental Health Detection/multiclass_vectorizer.joblib")
print("✅ Model & vectorizer saved.")


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\sagni\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Label Distribution:
 label
0    1239018
3       3754
2        931
1        481
Name: count, dtype: int64





✅ Accuracy: 0.9535639796332539

📊 Classification Report:
               precision    recall  f1-score   support

      Normal       1.00      0.95      0.98    247758
  Depression       0.13      0.79      0.22       111
     Anxiety       0.02      0.50      0.03       176
      Stress       0.12      0.84      0.21       792

    accuracy                           0.95    248837
   macro avg       0.31      0.77      0.36    248837
weighted avg       1.00      0.95      0.97    248837

✅ Model & vectorizer saved.
