# Depression Detection Using NLP

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [2]:
## Import dataset and read that dataset

In [3]:
# Load the raw CSV, decoding the file as Latin-1, and preview the first rows.
dataset = pd.read_csv(
    "dataset.csv",
    encoding="latin-1",
)
dataset.head()

Unnamed: 0.1,Unnamed: 0,statement,status
0,0,oh my gosh,Anxiety
1,1,"trouble sleeping, confused mind, restless hear...",Anxiety
2,2,"All wrong, back off dear, forward doubt. Stay ...",Anxiety
3,3,I've shifted my focus to something else but I'...,Anxiety
4,4,"I'm restless and restless, it's been a month n...",Anxiety


In [4]:
## get the information of dataset

In [5]:
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 53043 entries, 0 to 53042
Data columns (total 3 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   Unnamed: 0  53043 non-null  int64 
 1   statement   52681 non-null  object
 2   status      53043 non-null  object
dtypes: int64(1), object(2)
memory usage: 1.2+ MB


In [6]:
## check the null values of the dataset

In [7]:
dataset.isnull().sum()

Unnamed: 0      0
statement     362
status          0
dtype: int64

In [8]:
## Split the whole dataset into  train and test set

In [9]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows as the test set; the fixed seed keeps the split reproducible.
trainset, testset = train_test_split(
    dataset,
    test_size=0.2,
    random_state=42,
)

In [10]:
trainset.shape, testset.shape

((42434, 3), (10609, 3))

In [11]:
trainset.head()

Unnamed: 0.1,Unnamed: 0,statement,status
52275,52275,How do you make eye contact with people? I rea...,Anxiety
49099,49099,"Balancing school, social life and work How to ...",Stress
10636,10636,weed made me motivated and happy again. motivated,Depression
23796,23796,24/7 i cringe at my past thoughts omg its so p...,Suicidal
48863,48863,,Stress


In [12]:
testset.head()

Unnamed: 0.1,Unnamed: 0,statement,status
22261,22261,Just as the the title says. I feel like one is...,Depression
41400,41400,a blackened sky encroached tugging behind it m...,Depression
20065,20065,"It gives you insomnia, which in turn makes you...",Depression
30036,30036,"Hello all, I'm a new submitter to this channel...",Normal
780,780,Thank God the CB is over for Eid,Normal


## Data Preprocessing

In [13]:
## Drop the unnecessary columns

In [14]:
# Drop the 'Unnamed: 0' column, keeping only the statement text and its status label.
new_train_set = trainset.drop(columns=['Unnamed: 0'])
new_train_set.head()

Unnamed: 0,statement,status
52275,How do you make eye contact with people? I rea...,Anxiety
49099,"Balancing school, social life and work How to ...",Stress
10636,weed made me motivated and happy again. motivated,Depression
23796,24/7 i cringe at my past thoughts omg its so p...,Suicidal
48863,,Stress


In [15]:
new_train_set.info()

<class 'pandas.core.frame.DataFrame'>
Index: 42434 entries, 52275 to 15795
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   statement  42157 non-null  object
 1   status     42434 non-null  object
dtypes: object(2)
memory usage: 994.5+ KB


In [16]:
## check value counts of status (the target label)

In [17]:
new_train_set['status'].value_counts()

status
Normal                  13024
Depression              12304
Suicidal                 8635
Anxiety                  3109
Bipolar                  2297
Stress                   2112
Personality disorder      953
Name: count, dtype: int64

In [18]:
new_train_set['statement'].info()

<class 'pandas.core.series.Series'>
Index: 42434 entries, 52275 to 15795
Series name: statement
Non-Null Count  Dtype 
--------------  ----- 
42157 non-null  object
dtypes: object(1)
memory usage: 663.0+ KB


In [19]:
## Fill the missing values

In [20]:
# Most frequent statement in the training split; used as the fill value for missing text.
statement_modes = new_train_set['statement'].mode()
statement_column = statement_modes[0]
statement_column

'what do you mean?'

In [21]:
# Impute missing statements with the most frequent statement computed above.
# NOTE(review): every formerly-missing row now carries the same sentence regardless
# of its status label; dropping those rows may be more appropriate - TODO confirm.
new_train_set['statement'] = new_train_set['statement'].fillna(value=statement_column)
new_train_set['statement'].head()

52275    How do you make eye contact with people? I rea...
49099    Balancing school, social life and work How to ...
10636    weed made me motivated and happy again. motivated
23796    24/7 i cringe at my past thoughts omg its so p...
48863                                    what do you mean?
Name: statement, dtype: object

In [22]:
##  check the null values of statement column after filling  the null values

In [23]:
new_train_set['statement'].isnull().sum()

0

In [24]:
new_train_set.info()

<class 'pandas.core.frame.DataFrame'>
Index: 42434 entries, 52275 to 15795
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   statement  42434 non-null  object
 1   status     42434 non-null  object
dtypes: object(2)
memory usage: 994.5+ KB


In [25]:
## text preprocessing

In [26]:
import string
import re

In [27]:
new_train_set['statement'].head()

52275    How do you make eye contact with people? I rea...
49099    Balancing school, social life and work How to ...
10636    weed made me motivated and happy again. motivated
23796    24/7 i cringe at my past thoughts omg its so p...
48863                                    what do you mean?
Name: statement, dtype: object

In [28]:
## convert the uppercase letters into lowercase letters

In [29]:
# Lowercase every whitespace-separated token; tokens are re-joined with TWO spaces here.
new_train_set['statement'] = new_train_set['statement'].apply(
    lambda text: "  ".join(token.lower() for token in text.split())
)

In [30]:
new_train_set['statement'].head()

52275    how  do  you  make  eye  contact  with  people...
49099    balancing  school,  social  life  and  work  h...
10636    weed  made  me  motivated  and  happy  again. ...
23796    24/7  i  cringe  at  my  past  thoughts  omg  ...
48863                                 what  do  you  mean?
Name: statement, dtype: object

In [31]:
## Remove the links

In [32]:
# Strip URLs token by token: within each whitespace-separated token, everything from
# "http://" / "https://" onwards is removed, then tokens are re-joined with one space.
new_train_set['statement'] = new_train_set['statement'].apply(
    lambda text: " ".join(
        re.sub(r'https?:\/\/.*[\r\n]*', '', token, flags=re.MULTILINE)
        for token in text.split()
    )
)

In [33]:
new_train_set['statement'].head()

52275    how do you make eye contact with people? i rea...
49099    balancing school, social life and work how to ...
10636    weed made me motivated and happy again. motivated
23796    24/7 i cringe at my past thoughts omg its so p...
48863                                    what do you mean?
Name: statement, dtype: object

In [34]:
## Remove punctuation marks

In [35]:
string.punctuation

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

In [36]:
def remove_punctuation(text):
    """Return ``text`` with every character of ``string.punctuation`` deleted.

    Punctuation is removed outright (not replaced by a space), so the characters
    on either side of a removed mark become adjacent.
    """
    # A translation table that maps each punctuation character to "delete".
    deletion_table = str.maketrans('', '', string.punctuation)
    return text.translate(deletion_table)
# Run the punctuation stripper over every statement and store the result back.
statements_without_punctuation = new_train_set['statement'].apply(remove_punctuation)
new_train_set['statement'] = statements_without_punctuation
new_train_set['statement']

52275    how do you make eye contact with people i real...
49099    balancing school social life and work how to d...
10636     weed made me motivated and happy again motivated
23796    247 i cringe at my past thoughts omg its so pa...
48863                                     what do you mean
                               ...                        
11284    i used to be a very confident cheerful person ...
44732    spanx except i missed last week s lee and now ...
38158    tl dr have been abused mentally and physically...
860                           do i sell the tutorial again
15795    i am not religious but do believe in god but t...
Name: statement, Length: 42434, dtype: object

In [37]:
## remove the numbers from text

In [38]:
# Delete every run of digits from the statements.
# The pattern is a raw string so that `\d` reaches the regex engine untouched; in a
# plain string literal `\d` is an invalid escape sequence and Python emits a warning
# for it when the cell is compiled.
new_train_set['statement'] = new_train_set['statement'].str.replace(r'\d+', '', regex=True)
new_train_set['statement'].head()

  new_train_set['statement']=new_train_set['statement'].str.replace('\d+','',regex=True)


52275    how do you make eye contact with people i real...
49099    balancing school social life and work how to d...
10636     weed made me motivated and happy again motivated
23796     i cringe at my past thoughts omg its so painf...
48863                                     what do you mean
Name: statement, dtype: object

In [39]:
## Remove stop words

In [40]:
import nltk

In [41]:
## download the stop words

In [42]:
# Fetch the NLTK 'stopwords' package into the project-local model/stopwords/ directory;
# the English word list is read back from that directory in a later cell.
nltk.download('stopwords', download_dir="model/stopwords/")

[nltk_data] Downloading package stopwords to model/stopwords/...
[nltk_data]   Package stopwords is already up-to-date!


True

In [43]:
## load the english language stopwords

In [44]:
# Read the downloaded English stopword file into a list, one stopword per line.
with open("model/stopwords/corpora/stopwords/english", 'r') as stopword_file:
    stopword_text = stopword_file.read()
stopwords = stopword_text.splitlines()

In [45]:
stopwords

['a',
 'about',
 'above',
 'after',
 'again',
 'against',
 'ain',
 'all',
 'am',
 'an',
 'and',
 'any',
 'are',
 'aren',
 "aren't",
 'as',
 'at',
 'be',
 'because',
 'been',
 'before',
 'being',
 'below',
 'between',
 'both',
 'but',
 'by',
 'can',
 'couldn',
 "couldn't",
 'd',
 'did',
 'didn',
 "didn't",
 'do',
 'does',
 'doesn',
 "doesn't",
 'doing',
 'don',
 "don't",
 'down',
 'during',
 'each',
 'few',
 'for',
 'from',
 'further',
 'had',
 'hadn',
 "hadn't",
 'has',
 'hasn',
 "hasn't",
 'have',
 'haven',
 "haven't",
 'having',
 'he',
 "he'd",
 "he'll",
 'her',
 'here',
 'hers',
 'herself',
 "he's",
 'him',
 'himself',
 'his',
 'how',
 'i',
 "i'd",
 'if',
 "i'll",
 "i'm",
 'in',
 'into',
 'is',
 'isn',
 "isn't",
 'it',
 "it'd",
 "it'll",
 "it's",
 'its',
 'itself',
 "i've",
 'just',
 'll',
 'm',
 'ma',
 'me',
 'mightn',
 "mightn't",
 'more',
 'most',
 'mustn',
 "mustn't",
 'my',
 'myself',
 'needn',
 "needn't",
 'no',
 'nor',
 'not',
 'now',
 'o',
 'of',
 'off',
 'on',
 'once',
 'on

In [46]:
# Drop stopwords from every statement.
# Membership is tested against a set built once: `stopwords` is a list, so testing
# each token against it directly scans the whole list for every token in the corpus.
# The filtered text is identical; only the lookup cost changes.
# NOTE(review): punctuation has already been stripped from the statements, so
# apostrophe-bearing stopwords such as "i'm" cannot match their stripped forms
# ("im") - confirm whether that is intended.
stopword_set = set(stopwords)
new_train_set['statement'] = new_train_set['statement'].apply(
    lambda text: ' '.join(token for token in text.split() if token not in stopword_set)
)

In [47]:
new_train_set['statement'].head()

52275    make eye contact people really struggle person...
49099    balancing school social life work balance scho...
10636                  weed made motivated happy motivated
23796    cringe past thoughts omg painful want die badl...
48863                                                 mean
Name: statement, dtype: object

In [48]:
## Remove the inflectional forms from text

#### Inflectional forms mean that the same word can appear in different grammatical forms. To reduce this variation, we can use stemming or lemmatization. If the text dataset is very large and speed is important, stemming is usually preferred

In [49]:
from nltk.stem import PorterStemmer
# Single stemmer instance, reused for every token when the statements are stemmed.
ps=PorterStemmer()

In [50]:
# Reduce each whitespace-separated token to its Porter stem and re-join with one space.
new_train_set['statement'] = new_train_set['statement'].apply(
    lambda text: ' '.join(map(ps.stem, text.split()))
)

In [51]:
new_train_set['statement'].head()

52275    make eye contact peopl realli struggl person a...
49099    balanc school social life work balanc school w...
10636                          weed made motiv happi motiv
23796    cring past thought omg pain want die badli cri...
48863                                                 mean
Name: statement, dtype: object

In [52]:
## print the  preprocessed trainset data

In [53]:
new_train_set.head()

Unnamed: 0,statement,status
52275,make eye contact peopl realli struggl person a...,Anxiety
49099,balanc school social life work balanc school w...,Stress
10636,weed made motiv happi motiv,Depression
23796,cring past thought omg pain want die badli cri...,Suicidal
48863,mean,Stress


In [54]:
## Convert the preprocessed data into numerical vectors

In [55]:
## Building the vocabulary

In [56]:
from collections import Counter
# Token-frequency table; starts empty and is filled from the preprocessed statements.
vocab=Counter()

In [57]:
vocab

Counter()

In [58]:
# Count how often each token occurs across all preprocessed statements.
# The counter is emptied first so that re-running this cell does not add the same
# counts a second time; on a fresh run the counter is already empty and this is a no-op.
vocab.clear()
for sentence in new_train_set['statement']:
    vocab.update(sentence.split())

In [59]:
vocab

Counter({'feel': 36621,
         'like': 31690,
         'want': 26861,
         'get': 23458,
         'go': 21061,
         'know': 20392,
         'life': 18005,
         'time': 17079,
         'even': 15286,
         'think': 14088,
         'peopl': 13575,
         'would': 13498,
         'year': 13306,
         'day': 13144,
         'thing': 13098,
         'realli': 13074,
         'one': 12492,
         'tri': 12443,
         'cannot': 12344,
         'make': 12327,
         'depress': 11565,
         'im': 11033,
         'help': 10877,
         'work': 10768,
         'friend': 10603,
         'take': 9627,
         'much': 9561,
         'never': 9477,
         'live': 9461,
         'fuck': 8463,
         'start': 8360,
         'need': 7887,
         'could': 7843,
         'thought': 7761,
         'back': 7634,
         'anymor': 7599,
         'talk': 7497,
         'anxieti': 7370,
         'see': 7306,
         'anyth': 7222,
         'way': 7188,
         'still':

In [61]:
## check the length of vocabulary

In [62]:
len(vocab)

52368

In [63]:
## save the vocabulary

In [64]:
def save_vocabulary(text, file_name):
    """Write the items of ``text`` to ``file_name``, one item per line.

    Args:
        text: Iterable of strings. Iterating a mapping such as a ``Counter``
            yields its keys, so only the tokens (not their counts) are written.
        file_name: Destination path; an existing file is overwritten.

    Raises:
        UnicodeEncodeError: If an item contains a character Latin-1 cannot encode.
        OSError: If the file cannot be opened for writing.
    """
    data = "\n".join(text)
    # The context manager closes the handle even when write() raises, so a failed
    # write no longer leaves the file open.
    with open(file_name, 'w', encoding='latin-1') as vocab_file:
        vocab_file.write(data)
# Persist the vocabulary: iterating the Counter yields its keys, so the file holds
# one token per line without the counts.
save_vocabulary(vocab,'model/vocabulary.txt')

In [65]:
## separate the target and independent variables from trainset

In [66]:
# Features are the preprocessed statement text; the target is the status label.
x, y = new_train_set['statement'], new_train_set['status']

In [67]:
x.head()

52275    make eye contact peopl realli struggl person a...
49099    balanc school social life work balanc school w...
10636                          weed made motiv happi motiv
23796    cring past thought omg pain want die badli cri...
48863                                                 mean
Name: statement, dtype: object

In [68]:
y.head()

52275       Anxiety
49099        Stress
10636    Depression
23796      Suicidal
48863        Stress
Name: status, dtype: object