In [2]:
import pandas as pd
import numpy as np 

In [3]:
df=pd.read_csv('mental_health.csv')

In [4]:
df.head()

Unnamed: 0.1,Unnamed: 0,text,title,target
0,0,Welcome to /r/depression's check-in post - a p...,"Regular check-in post, with information about ...",1
1,1,We understand that most people who reply immed...,Our most-broken and least-understood rules is ...,1
2,2,Anyone else just miss physical touch? I crave ...,"I haven’t been touched, or even hugged, in so ...",1
3,3,I’m just so ashamed. Everyone and everything f...,Being Depressed is Embarrassing,1
4,4,I really need a friend. I don't even have a si...,I'm desperate for a friend and to feel loved b...,1


In [5]:
df=df.drop(columns=['Unnamed: 0'])

In [6]:
df

Unnamed: 0,text,title,target
0,Welcome to /r/depression's check-in post - a p...,"Regular check-in post, with information about ...",1
1,We understand that most people who reply immed...,Our most-broken and least-understood rules is ...,1
2,Anyone else just miss physical touch? I crave ...,"I haven’t been touched, or even hugged, in so ...",1
3,I’m just so ashamed. Everyone and everything f...,Being Depressed is Embarrassing,1
4,I really need a friend. I don't even have a si...,I'm desperate for a friend and to feel loved b...,1
...,...,...,...
5952,I’ve (24M) dealt with depression/anxiety for y...,Nobody takes me seriously,4
5953,"""I don't feel very good, it's like I don't be...",selfishness,4
5954,"I can't sleep most of the nights, meds didn't ...",Is there any way to sleep better?,4
5955,"Hi, all. I have to give a presentation at work...",Public speaking tips?,4


In [7]:
df['text'][0]

'Welcome to /r/depression\'s check-in post - a place to take a moment and share what is going on and how you are doing. If you have an accomplishment you want to talk about (these shouldn\'t be standalone posts in the sub as they violate the "role model" rule, but are welcome here), or are having a tough time but prefer not to make your own post, this is a place you can share.\n\n-----\n\nOur subreddit rules are located in the sidebar (you can also always access them at https://www.reddit.com/r/depression/about/rules) - since all of them exist for important safety reasons, we ask everyone here to read and follow them. Please click \'report\' on any harmful content you see here - we always want to know and deal as soon as we can.\n\nWe also have several wikis there for help with finding and giving support:\n\nhttps://www.reddit.com/r/depression/wiki/what_is_depression provides guidance about what is and isn\'t a depressive disorder, guidance on the complex nature of the illnesses that a

In [8]:
df['title'][0]

'Regular check-in post, with information about our rules and wikis'

In [9]:
df=df.drop(columns=['title'])

In [10]:
df

Unnamed: 0,text,target
0,Welcome to /r/depression's check-in post - a p...,1
1,We understand that most people who reply immed...,1
2,Anyone else just miss physical touch? I crave ...,1
3,I’m just so ashamed. Everyone and everything f...,1
4,I really need a friend. I don't even have a si...,1
...,...,...
5952,I’ve (24M) dealt with depression/anxiety for y...,4
5953,"""I don't feel very good, it's like I don't be...",4
5954,"I can't sleep most of the nights, meds didn't ...",4
5955,"Hi, all. I have to give a presentation at work...",4


In [11]:
df.isnull().sum()

text      350
target      0
dtype: int64

In [12]:
df.shape

(5957, 2)

In [13]:
df=df.dropna()

In [14]:
df.shape

(5607, 2)

ok so now null values are dropped 

In [15]:
df['target'].value_counts()

target
1    1202
4    1144
0    1099
2    1085
3    1077
Name: count, dtype: int64

from the dataset its given that 
* 0 = Stress
* 1 = Depression
* 2 = Bipolar disorder
* 3 = Personality disorder
* 4 = Anxiety

and one more thing is that data is balanced in all the class so now the model will not be biased towards a particular class.

In [16]:
df.head()

Unnamed: 0,text,target
0,Welcome to /r/depression's check-in post - a p...,1
1,We understand that most people who reply immed...,1
2,Anyone else just miss physical touch? I crave ...,1
3,I’m just so ashamed. Everyone and everything f...,1
4,I really need a friend. I don't even have a si...,1


# Data Cleaning 

In [17]:
import re 
def clean_text(text):
    text=text.lower()
    text=re.sub(r"https\S+|www\S+"," ",text)  #it will reove all the links 
    text=re.sub(r"\S+@\S+"," ",text)   # it will remove all emails 
    text=re.sub(r"[^a-zA-Z]"," ",text) #after removing email, it will only keep words expect it it will remove it all 
    text=re.sub(r"\s+"," ",text).strip()#and for extraspaces we need only 1 extraspace
    return text
df['cleaned']=df['text'].apply(clean_text)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['cleaned']=df['text'].apply(clean_text)


In [18]:
df['cleaned'][0]

'welcome to r depression s check in post a place to take a moment and share what is going on and how you are doing if you have an accomplishment you want to talk about these shouldn t be standalone posts in the sub as they violate the role model rule but are welcome here or are having a tough time but prefer not to make your own post this is a place you can share our subreddit rules are located in the sidebar you can also always access them at since all of them exist for important safety reasons we ask everyone here to read and follow them please click report on any harmful content you see here we always want to know and deal as soon as we can we also have several wikis there for help with finding and giving support provides guidance about what is and isn t a depressive disorder guidance on the complex nature of the illnesses that are usually grouped under the depression label and redirect information for common off topic issues offers information on the nature and value of peer suppor

now the data looks clean now we will perfom nlp operations 

# Natural Language Processing 

# Tokenization

In [22]:
#tokenization 
import nltk
from nltk.tokenize import word_tokenize, sent_tokenize
nltk.download("punkt")
nltk.download("punkt_tab")

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\KIIT\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\KIIT\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt_tab.zip.


True

In [23]:
df['tokens']=df['cleaned'].apply(word_tokenize)
df['tokens']

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['tokens']=df['cleaned'].apply(word_tokenize)


0       [welcome, to, r, depression, s, check, in, pos...
1       [we, understand, that, most, people, who, repl...
2       [anyone, else, just, miss, physical, touch, i,...
3       [i, m, just, so, ashamed, everyone, and, every...
4       [i, really, need, a, friend, i, don, t, even, ...
                              ...                        
5952    [i, ve, m, dealt, with, depression, anxiety, f...
5953    [i, don, t, feel, very, good, it, s, like, i, ...
5954    [i, can, t, sleep, most, of, the, nights, meds...
5955    [hi, all, i, have, to, give, a, presentation, ...
5956    [it, s, not, about, being, scared, i, didn, t,...
Name: tokens, Length: 5607, dtype: object

In [24]:
df['tokens'][0]

['welcome',
 'to',
 'r',
 'depression',
 's',
 'check',
 'in',
 'post',
 'a',
 'place',
 'to',
 'take',
 'a',
 'moment',
 'and',
 'share',
 'what',
 'is',
 'going',
 'on',
 'and',
 'how',
 'you',
 'are',
 'doing',
 'if',
 'you',
 'have',
 'an',
 'accomplishment',
 'you',
 'want',
 'to',
 'talk',
 'about',
 'these',
 'shouldn',
 't',
 'be',
 'standalone',
 'posts',
 'in',
 'the',
 'sub',
 'as',
 'they',
 'violate',
 'the',
 'role',
 'model',
 'rule',
 'but',
 'are',
 'welcome',
 'here',
 'or',
 'are',
 'having',
 'a',
 'tough',
 'time',
 'but',
 'prefer',
 'not',
 'to',
 'make',
 'your',
 'own',
 'post',
 'this',
 'is',
 'a',
 'place',
 'you',
 'can',
 'share',
 'our',
 'subreddit',
 'rules',
 'are',
 'located',
 'in',
 'the',
 'sidebar',
 'you',
 'can',
 'also',
 'always',
 'access',
 'them',
 'at',
 'since',
 'all',
 'of',
 'them',
 'exist',
 'for',
 'important',
 'safety',
 'reasons',
 'we',
 'ask',
 'everyone',
 'here',
 'to',
 'read',
 'and',
 'follow',
 'them',
 'please',
 'click'

# Stopword removal 

In [25]:
#now after tokenization lets remove the stopwords 
from nltk.corpus import stopwords 
nltk.download('stopwords')
stop_words=set(stopwords.words('english'))

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\KIIT\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.


In [31]:
df['tokens_no_stop']=df['tokens'].apply(lambda x: [w for w in x if w not in stop_words])
df['tokens_no_stop']

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['tokens_no_stop']=df['tokens'].apply(lambda x: [w for w in x if w not in stop_words])


0       [welcome, r, depression, check, post, place, t...
1       [understand, people, reply, immediately, op, i...
2       [anyone, else, miss, physical, touch, crave, b...
3       [ashamed, everyone, everything, feels, far, aw...
4       [really, need, friend, even, single, best, fri...
                              ...                        
5952    [dealt, depression, anxiety, years, used, grea...
5953    [feel, good, like, belong, world, think, ever,...
5954                          [sleep, nights, meds, help]
5955    [hi, give, presentation, work, next, week, min...
5956    [scared, lock, door, something, really, scared...
Name: tokens_no_stop, Length: 5607, dtype: object

In [33]:
df['tokens_no_stop'][0]

['welcome',
 'r',
 'depression',
 'check',
 'post',
 'place',
 'take',
 'moment',
 'share',
 'going',
 'accomplishment',
 'want',
 'talk',
 'standalone',
 'posts',
 'sub',
 'violate',
 'role',
 'model',
 'rule',
 'welcome',
 'tough',
 'time',
 'prefer',
 'make',
 'post',
 'place',
 'share',
 'subreddit',
 'rules',
 'located',
 'sidebar',
 'also',
 'always',
 'access',
 'since',
 'exist',
 'important',
 'safety',
 'reasons',
 'ask',
 'everyone',
 'read',
 'follow',
 'please',
 'click',
 'report',
 'harmful',
 'content',
 'see',
 'always',
 'want',
 'know',
 'deal',
 'soon',
 'also',
 'several',
 'wikis',
 'help',
 'finding',
 'giving',
 'support',
 'provides',
 'guidance',
 'depressive',
 'disorder',
 'guidance',
 'complex',
 'nature',
 'illnesses',
 'usually',
 'grouped',
 'depression',
 'label',
 'redirect',
 'information',
 'common',
 'topic',
 'issues',
 'offers',
 'information',
 'nature',
 'value',
 'peer',
 'support',
 'mental',
 'health',
 'issues',
 'general',
 'lots',
 'guidan

In [37]:
#great now its time for stemming and lemmatization 
from nltk.stem import PorterStemmer,WordNetLemmatizer
nltk.download('wordnet')
nltk.download('omw-1.4')

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\KIIT\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\KIIT\AppData\Roaming\nltk_data...


True

In [39]:
stemmer=PorterStemmer()
lemmatizer=WordNetLemmatizer()

#stemming 
df['stem']=df['tokens_no_stop'].apply(lambda x : [stemmer.stem(w)for w in x ])

df['lemma']=df['stem'].apply(lambda x: [lemmatizer.lemmatize(w) for w in x ])
df['lemma']=df['lemma'].apply(lambda x : ' '.join(x)) #joining the list that we get after lemmatization into words 

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['stem']=df['tokens_no_stop'].apply(lambda x : [stemmer.stem(w)for w in x ])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['lemma']=df['stem'].apply(lambda x: [lemmatizer.lemmatize(w) for w in x ])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['lemma']=df['lemma'].apply(lambda x : ' '.joi

In [40]:
df['lemma'][0]

'welcom r depress check post place take moment share go accomplish want talk standalon post sub violat role model rule welcom tough time prefer make post place share subreddit rule locat sidebar also alway access sinc exist import safeti reason ask everyon read follow plea click report harm content see alway want know deal soon also sever wiki help find give support provid guidanc depress disord guidanc complex natur ill usual group depress label redirect inform common topic issu offer inform natur valu peer support mental health issu gener lot guidanc learn usual help give peer support ysk type rule violat frequent see interf peopl get safe relev support peopl break privat contact rule never trust anyon tri get privat convers respons post see help post show understand basic principl peer support especi select give help wiki explain role model e achiev advic post expert free zone peer support mean rule know internet cultur celebr brag achiev brag good intent noth like ever accept conte

# Vectorization 

In [54]:
from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer =TfidfVectorizer()
X=vectorizer.fit_transform(df['lemma'])
y=df['target']

In [44]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score 

In [45]:
X_train, X_test, y_train, y_test = train_test_split( X, y, test_size=0.2, random_state=42, stratify=y)

In [76]:
model = LogisticRegression(max_iter=250)
model.fit(X_train, y_train)


0,1,2
,penalty,'l2'
,dual,False
,tol,0.0001
,C,1.0
,fit_intercept,True
,intercept_scaling,1
,class_weight,
,random_state,
,solver,'lbfgs'
,max_iter,250


In [77]:
y_pred = model.predict(X_test)


print("Accuracy:", accuracy_score(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))


Accuracy: 0.767379679144385

Classification Report:
               precision    recall  f1-score   support

           0       0.78      0.81      0.79       220
           1       0.71      0.71      0.71       241
           2       0.87      0.77      0.82       217
           3       0.74      0.81      0.78       215
           4       0.75      0.74      0.74       229

    accuracy                           0.77      1122
   macro avg       0.77      0.77      0.77      1122
weighted avg       0.77      0.77      0.77      1122



In [78]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score

In [79]:
rf = RandomForestClassifier(
    n_estimators=500,      # number of trees
    max_depth=None,        # let trees grow fully
    random_state=42,
)

In [80]:
rf.fit(X_train, y_train)

# Predictions
y_pred = rf.predict(X_test)

# Evaluation
print("Accuracy:", accuracy_score(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))

Accuracy: 0.8083778966131907

Classification Report:
               precision    recall  f1-score   support

           0       0.82      0.85      0.83       220
           1       0.73      0.76      0.74       241
           2       0.95      0.83      0.88       217
           3       0.80      0.80      0.80       215
           4       0.78      0.80      0.79       229

    accuracy                           0.81      1122
   macro avg       0.81      0.81      0.81      1122
weighted avg       0.81      0.81      0.81      1122



In [81]:
# Label mapping
label_map = {
    0: "Stress",
    1: "Depression",
    2: "Bipolar disorder",
    3: "Personality disorder",
    4: "Anxiety"
}

In [82]:
examples = [
    "Yesterday I felt like I could conquer the world, today I can’t even get out of bed.",
    "I feel so overwhelmed with work and life right now.",
    "Lately I’ve lost interest in everything, I just feel empty.",
    "My mood swings are out of control, one moment I’m up, the next I’m down.",
    "I can’t seem to connect with people, I feel like no one understands me.",
    "My heart races and I keep worrying about the smallest things."
]

In [83]:
example_features = vectorizer.transform(examples)
predictions = rf.predict(example_features)

for text, label in zip(examples, predictions):
    print(f"Text: {text}\nPredicted Label: {label_map[label]}\n")

Text: Yesterday I felt like I could conquer the world, today I can’t even get out of bed.
Predicted Label: Personality disorder

Text: I feel so overwhelmed with work and life right now.
Predicted Label: Depression

Text: Lately I’ve lost interest in everything, I just feel empty.
Predicted Label: Stress

Text: My mood swings are out of control, one moment I’m up, the next I’m down.
Predicted Label: Stress

Text: I can’t seem to connect with people, I feel like no one understands me.
Predicted Label: Depression

Text: My heart races and I keep worrying about the smallest things.
Predicted Label: Stress



# Saving the model 

In [84]:
import joblib

# Save model with compression
joblib.dump(rf, "mental_health_model.joblib", compress=3)

# Save vectorizer with compression
joblib.dump(vectorizer, "mental_health_vectorizer.joblib", compress=3)


['mental_health_vectorizer.joblib']

In [88]:
model = joblib.load("mental_health_model.joblib")
vectorizer = joblib.load("mental_health_vectorizer.joblib")

# Example prediction
new_texts = ["I keep worrying about things that may never happen."]
new_features = loaded_vectorizer.transform(new_texts)
prediction = loaded_model.predict(new_features)

print("Prediction:", label_map[prediction[0]])

Prediction: Stress
