In [2]:
# 1️⃣ Import libraries
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

In [3]:
# 2️⃣ Load TF-IDF features
tfidf_file = 'D:/mental_health_project/data/processed/TFIDF_Features.csv'
X = pd.read_csv(tfidf_file)

In [4]:
# 3️⃣ Load target labels
cleaned_file = 'D:/mental_health_project/data/processed/Combined_Data_Cleaned.csv'
df = pd.read_csv(cleaned_file)
y = df['status']  # Target variable

In [5]:
# 4️⃣ Encode target labels if needed
le = LabelEncoder()
y_encoded = le.fit_transform(y)
print("Classes:", le.classes_)

Classes: ['Anxiety' 'Bipolar' 'Depression' 'Normal' 'Personality disorder' 'Stress'
 'Suicidal']


In [6]:
# 5️⃣ Split dataset into train and test
X_train, X_test, y_train, y_test = train_test_split(X, y_encoded, test_size=0.2, random_state=42, stratify=y_encoded)

In [7]:
# 6️⃣ Initialize and train a Logistic Regression model
model = LogisticRegression(max_iter=1000)
model.fit(X_train, y_train)

In [8]:
# 6️⃣ Initialize and train a Logistic Regression model
model = LogisticRegression(max_iter=1000)
model.fit(X_train, y_train)

In [9]:
# 7️⃣ Make predictions
y_pred = model.predict(X_test)

In [10]:

# 8️⃣ Evaluate the model
print("Accuracy:", accuracy_score(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))
print("\nConfusion Matrix:\n", confusion_matrix(y_test, y_pred))

Accuracy: 0.744982868330886

Classification Report:
               precision    recall  f1-score   support

           0       0.82      0.72      0.77       723
           1       0.86      0.62      0.72       500
           2       0.68      0.71      0.69      3018
           3       0.83      0.96      0.89      3208
           4       0.83      0.32      0.46       179
           5       0.69      0.37      0.48       459
           6       0.66      0.63      0.65      2128

    accuracy                           0.74     10215
   macro avg       0.77      0.62      0.66     10215
weighted avg       0.74      0.74      0.74     10215


Confusion Matrix:
 [[ 519    7   94   80    3   16    4]
 [  10  309  104   49    4   10   14]
 [  51   22 2143  186    3   19  594]
 [  10    1   65 3066    1   18   47]
 [   3    5   61   36   57   10    7]
 [  36    8  102  107    1  168   37]
 [   2    9  603  163    0    3 1348]]


In [11]:
# 9️⃣ Optional: Save the trained model
import joblib
model_file = 'D:/mental_health_project/results/logistic_regression_model.pkl'
joblib.dump(model, model_file)
print(f"Trained model saved to: {model_file}")

Trained model saved to: D:/mental_health_project/results/logistic_regression_model.pkl


In [15]:
from sklearn.feature_extraction.text import TfidfVectorizer
import joblib
import os

# Make sure results folder exists
os.makedirs("D:/mental_health_project/results", exist_ok=True)

# Create TF-IDF vectorizer
vectorizer = TfidfVectorizer()
X_train_tfidf = vectorizer.fit_transform(X_train)  # X_train = your training statements

# Save vectorizer
vectorizer_file = 'D:/mental_health_project/results/vectorizer.pkl'
joblib.dump(vectorizer, vectorizer_file)
print(f"Vectorizer saved to: {vectorizer_file}")


Vectorizer saved to: D:/mental_health_project/results/vectorizer.pkl


In [25]:
import pandas as pd

# Load your cleaned CSV
df_cleaned = pd.read_csv("D:\mental_health_project\data\processed\Combined_Data_Cleaned.csv")  # use your actual path


In [27]:
print(df_cleaned['cleaned_statement'].isnull().sum())


79


In [28]:
df_cleaned['cleaned_statement'] = df_cleaned['cleaned_statement'].fillna("")


In [33]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
import joblib

# Load cleaned dataset
df_cleaned = pd.read_csv("D:\mental_health_project\data\processed\Combined_Data_Cleaned.csv")

# Strip any spaces from column names
df_cleaned.columns = df_cleaned.columns.str.strip()

# Drop rows with empty statements
df_cleaned = df_cleaned.dropna(subset=['cleaned_statement'])

# Features and labels
X_train_text = df_cleaned['cleaned_statement']
y_train = df_cleaned['status']  # target column

# TF-IDF vectorizer
vectorizer = TfidfVectorizer(max_features=5000)
X_train = vectorizer.fit_transform(X_train_text)

# Train Logistic Regression
model = LogisticRegression()
model.fit(X_train, y_train)

# Save model and vectorizer
joblib.dump(model, "model.pkl")
joblib.dump(vectorizer, "vectorizer.pkl")


STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


['vectorizer.pkl']

In [37]:
import os
print(os.getcwd())


d:\mental_health_project\notebooks


In [38]:
import os
import joblib

# Define project-level results folder
project_root = os.path.abspath(os.path.join(os.getcwd(), ".."))  # go one level up from notebooks
results_path = os.path.join(project_root, "results")

# Make sure results folder exists
os.makedirs(results_path, exist_ok=True)

# Save model and vectorizer in project-level results folder
joblib.dump(model, os.path.join(results_path, "model.pkl"))
joblib.dump(vectorizer, os.path.join(results_path, "vectorizer.pkl"))

print(f"Model and vectorizer saved successfully in {results_path}/")


Model and vectorizer saved successfully in d:\mental_health_project\results/


In [31]:
print(df_cleaned.columns)


Index(['post_id', 'statement', 'status', 'cleaned_statement'], dtype='object')


In [39]:
#model_file = 'D:/mental_health_project/results/logistic_regression_model.pkl'
#joblib.dump(model, model_file)
#print(f"Trained model saved to: {model_file}")


In [40]:
'''import joblib

# Load the vectorizer you saved earlier
vectorizer_file = 'D:/mental_health_project/results/vectorizer.pkl'
vectorizer = joblib.load(vectorizer_file)'''


"import joblib\n\n# Load the vectorizer you saved earlier\nvectorizer_file = 'D:/mental_health_project/results/vectorizer.pkl'\nvectorizer = joblib.load(vectorizer_file)"

In [None]:
# Transform test data using the trained vectorizer
#X_test_tfidf = vectorizer.transform(X_test)


In [42]:
#y_pred = model.predict(X_test_tfidf)


In [5]:
import pandas as pd

# Load your dataset (adjust path and filename)
df = pd.read_csv("D:/mental_health_project/data/processed/Combined_Data_Cleaned.csv")  # or wherever your processed data is

# Assume the column with text is 'text'
X_train = df['text']  # this becomes your training data


KeyError: 'text'

In [6]:
print(df.columns)

Index(['post_id', 'statement', 'status', 'cleaned_statement'], dtype='object')


In [8]:
# -----------------------------
# TF-IDF Vectorizer: Fit and Save
# -----------------------------
import pandas as pd
import os
import pickle
from sklearn.feature_extraction.text import TfidfVectorizer

# 1️⃣ Load your processed dataset
df = pd.read_csv("D:/mental_health_project/data/processed/Combined_Data_Cleaned.csv")

# 2️⃣ Use the correct text column
X_train = df['cleaned_statement']  # your training text

# 3️⃣ Create and fit the vectorizer
vectorizer = TfidfVectorizer()
X_train = df['cleaned_statement'].dropna()
vectorizer.fit(X_train)

# 4️⃣ Ensure the folder exists
os.makedirs("src/models", exist_ok=True)

# 5️⃣ Save the fitted vectorizer
with open("src/models/tfidf_vectorizer.pkl", "wb") as f:
    pickle.dump(vectorizer, f)

print("TF-IDF vectorizer saved successfully at src/models/tfidf_vectorizer.pkl!")


TF-IDF vectorizer saved successfully at src/models/tfidf_vectorizer.pkl!


In [9]:
import os
print(os.path.abspath("src/models"))


d:\mental_health_project\notebooks\src\models


In [11]:
import os
os.getcwd()


'd:\\mental_health_project\\notebooks'

In [12]:
os.makedirs("src/models", exist_ok=True)


In [16]:
import os
import pickle
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression

# Paths
BASE_DIR = os.path.abspath(os.path.join(os.getcwd(), ".."))
MODEL_DIR = os.path.join(BASE_DIR, "src", "models")
os.makedirs(MODEL_DIR, exist_ok=True)

# Load processed data
df = pd.read_csv(os.path.join(BASE_DIR, "data", "processed", "Combined_Data_Cleaned.csv"))

# Drop rows with NaN in either text or label
df_clean = df.dropna(subset=['cleaned_statement', 'status'])
X_train = df_clean['cleaned_statement']
y_train = df_clean['status']

# Fit vectorizer
vectorizer = TfidfVectorizer()
X_train_tfidf = vectorizer.fit_transform(X_train)

# Train model
model = LogisticRegression()
model.fit(X_train_tfidf, y_train)

# Save vectorizer
with open(os.path.join(MODEL_DIR, "tfidf_vectorizer.pkl"), "wb") as f:
    pickle.dump(vectorizer, f)

# Save model
with open(os.path.join(MODEL_DIR, "trained_model.pkl"), "wb") as f:
    pickle.dump(model, f)

# Verify saved files
print("Saved files in src/models/:", os.listdir(MODEL_DIR))
print("Number of samples:", X_train.shape[0], "labels:", y_train.shape[0])


Saved files in src/models/: ['tfidf_vectorizer.pkl', 'trained_model.pkl']
Number of samples: 50994 labels: 50994


STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
