In [2]:
!pip install nltk

Collecting nltk
  Downloading nltk-3.9.1-py3-none-any.whl.metadata (2.9 kB)
Downloading nltk-3.9.1-py3-none-any.whl (1.5 MB)
   ---------------------------------------- 0.0/1.5 MB ? eta -:--:--
   ---------------------------------------- 0.0/1.5 MB ? eta -:--:--
   ---------------------------------------- 0.0/1.5 MB ? eta -:--:--
   ------------- -------------------------- 0.5/1.5 MB 1.3 MB/s eta 0:00:01
   --------------------------- ------------ 1.0/1.5 MB 2.0 MB/s eta 0:00:01
   ---------------------------------- ----- 1.3/1.5 MB 1.7 MB/s eta 0:00:01
   ---------------------------------------- 1.5/1.5 MB 1.7 MB/s eta 0:00:00
Installing collected packages: nltk
Successfully installed nltk-3.9.1


In [4]:
import pandas as pd
import re
import joblib
import nltk
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from nltk.corpus import stopwords

# Download NLTK stopwords
nltk.download('stopwords')

# Load dataset
df = pd.read_csv("C:/Users/sagni/Downloads/Mental Health Detection/abcnews-date-text.csv")

# Add binary label based on presence of mental health keywords
keywords = ['depression', 'anxiety', 'suicide', 'mental', 'stress', 'sad', 'lonely', 'panic']
df['label'] = df['headline_text'].apply(lambda x: 1 if any(word in x.lower() for word in keywords) else 0)

# Clean text function
def clean_text(text):
    text = re.sub(r"http\S+|www\S+|https\S+", '', text, flags=re.MULTILINE)
    text = re.sub(r'\@\w+|\#','', text)
    text = re.sub(r'[^\w\s]', '', text)
    text = re.sub(r'\d+', '', text)
    text = text.lower()
    stop_words = set(stopwords.words('english'))
    text = " ".join([word for word in text.split() if word not in stop_words])
    return text

# Apply cleaning
df['clean_text'] = df['headline_text'].apply(clean_text)

# TF-IDF vectorization (sparse)
vectorizer = TfidfVectorizer(max_features=5000)
X = vectorizer.fit_transform(df['clean_text'])

# Labels
y = df['label']

# Train/test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train Logistic Regression (memory-efficient)
clf = LogisticRegression(max_iter=200)
clf.fit(X_train, y_train)

# Evaluation
print("✅ Accuracy on test set:", clf.score(X_test, y_test))
print("\n🧾 Classification Report:\n", classification_report(y_test, clf.predict(X_test)))

# Save model and vectorizer
model_path = "C:/Users/sagni/Downloads/Mental Health Detection/logistic_model.joblib"
vec_path = "C:/Users/sagni/Downloads/Mental Health Detection/tfidf_vectorizer.joblib"
joblib.dump(clf, model_path)
joblib.dump(vectorizer, vec_path)
print(f"✅ Model saved to {model_path}")
print(f"✅ Vectorizer saved to {vec_path}")


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\sagni\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


✅ Accuracy on test set: 0.9976128951884166

🧾 Classification Report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00    246551
           1       1.00      0.74      0.85      2286

    accuracy                           1.00    248837
   macro avg       1.00      0.87      0.92    248837
weighted avg       1.00      1.00      1.00    248837

✅ Model saved to C:/Users/sagni/Downloads/Mental Health Detection/logistic_model.joblib
✅ Vectorizer saved to C:/Users/sagni/Downloads/Mental Health Detection/tfidf_vectorizer.joblib
