# **Sentiment Analysis for Mental Health**

### **Importing Dependencies**

In [37]:
# Print all the outputs in a cell
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

import warnings
warnings.simplefilter("ignore")
warnings.filterwarnings("ignore", category=UserWarning)
warnings.filterwarnings("ignore", category=FutureWarning)
warnings.filterwarnings("ignore", category=RuntimeWarning)
warnings.filterwarnings("ignore", category=DeprecationWarning)
warnings.filterwarnings("ignore", message=".*no associated frequency information.*")
warnings.filterwarnings("ignore", message=".*Maximum Likelihood optimization failed to converge.*")

import os

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.pyplot as pyplot
import seaborn as sns
import re

# NLTK Libraries
import nltk
from nltk.corpus import stopwords

# Scikit-Learn Libraries 
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Keras / Tensorflow Libraries
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import gensim
from keras.models import Sequential
from keras.layers import Dense, LSTM, Embedding, Dropout, Input
from keras.initializers import Constant

### **Dataset Setup**

**Step 1: Setting the Directory**

In [2]:
# Base Directory
base_dir = '/Users/reemikadas/Desktop/SCU MSBA/Github/Data_Science/NLP_Projects/NLP | Sentiment Analysis for Mental Health'

# Data Directory
data_dir = os.path.join(base_dir, 'Data')
data_csv_path = os.path.join(data_dir, 'sentiment_health_cleaned.csv')

**Step 2: Loading and Reading the Datasets**

In [3]:
# Loading the DataFrame
df = pd.read_csv(data_csv_path)

# Glimpse of the first 5 rows
df.head()

Unnamed: 0,unique_id,statement,status
0,0,oh my gosh,Anxiety
1,1,"trouble sleeping, confused mind, restless hear...",Anxiety
2,2,"All wrong, back off dear, forward doubt. Stay ...",Anxiety
3,3,I've shifted my focus to something else but I'...,Anxiety
4,4,"I'm restless and restless, it's been a month n...",Anxiety


### **Text Preprocessing [Lowercase, Stopwords Removal, Punctuations, Special Characters, Digits, URLS]**

**Example**

----------------------------------------------------------------------------

In [4]:
# Text sample
text = df['statement'].values[3]
text

"I've shifted my focus to something else but I'm still worried"

In [5]:
# Lowercase the sentence
text = text.lower()
text

"i've shifted my focus to something else but i'm still worried"

In [6]:
# Stopwords
stop_words = set(stopwords.words('english'))

# View 20 stopwords
stop_words

{'a',
 'about',
 'above',
 'after',
 'again',
 'against',
 'ain',
 'all',
 'am',
 'an',
 'and',
 'any',
 'are',
 'aren',
 "aren't",
 'as',
 'at',
 'be',
 'because',
 'been',
 'before',
 'being',
 'below',
 'between',
 'both',
 'but',
 'by',
 'can',
 'couldn',
 "couldn't",
 'd',
 'did',
 'didn',
 "didn't",
 'do',
 'does',
 'doesn',
 "doesn't",
 'doing',
 'don',
 "don't",
 'down',
 'during',
 'each',
 'few',
 'for',
 'from',
 'further',
 'had',
 'hadn',
 "hadn't",
 'has',
 'hasn',
 "hasn't",
 'have',
 'haven',
 "haven't",
 'having',
 'he',
 "he'd",
 "he'll",
 "he's",
 'her',
 'here',
 'hers',
 'herself',
 'him',
 'himself',
 'his',
 'how',
 'i',
 "i'd",
 "i'll",
 "i'm",
 "i've",
 'if',
 'in',
 'into',
 'is',
 'isn',
 "isn't",
 'it',
 "it'd",
 "it'll",
 "it's",
 'its',
 'itself',
 'just',
 'll',
 'm',
 'ma',
 'me',
 'mightn',
 "mightn't",
 'more',
 'most',
 'mustn',
 "mustn't",
 'my',
 'myself',
 'needn',
 "needn't",
 'no',
 'nor',
 'not',
 'now',
 'o',
 'of',
 'off',
 'on',
 'once',
 'on

In [7]:
# Remove Stopwords
text = ' '.join([w for w in text.split() if w not in stop_words])
text

'shifted focus something else still worried'

In [8]:
# Remove URLs (if any)
text = re.sub(r'http\S+|www\S+|https\S+', '', text, flags=re.MULTILINE)
text

'shifted focus something else still worried'

In [9]:
# Remove Special characters and numbers
text = re.sub(r'[^a-z\s]', '', text)
text

'shifted focus something else still worried'

In [10]:
# Remove Whitespace
text = re.sub(r'\s+', ' ', text).strip()
text

'shifted focus something else still worried'

---------------------------------------------------------------------------------------------------------------

Define function to apply text preprocessing on whole datasets.

In [11]:
# Text Preprocessing function

def text_preprocess(text):
    # Lowercase
    text = text.lower()

    # Remove stopwords
    text = ' '.join([word for word in text.split() if word not in stop_words])
    
    # Remove URLs
    text = re.sub(r'http\S+|www\S+|https\S+', '', text, flags=re.MULTILINE)
    
    # Remove special characters and numbers
    text = re.sub(r'[^a-z\s]', '', text)
    
    # Remove extra spaces
    text = re.sub(r'\s+', ' ', text).strip()
    
    return text

In [12]:
df['clean_text'] = df['statement'].apply(text_preprocess)

# Glimpse of the data
df.head(10)

Unnamed: 0,unique_id,statement,status,clean_text
0,0,oh my gosh,Anxiety,oh gosh
1,1,"trouble sleeping, confused mind, restless hear...",Anxiety,trouble sleeping confused mind restless heart ...
2,2,"All wrong, back off dear, forward doubt. Stay ...",Anxiety,wrong back dear forward doubt stay restless re...
3,3,I've shifted my focus to something else but I'...,Anxiety,shifted focus something else still worried
4,4,"I'm restless and restless, it's been a month n...",Anxiety,restless restless month now boy mean
5,5,"every break, you must be nervous, like somethi...",Anxiety,every break must nervous like something wrong ...
6,6,"I feel scared, anxious, what can I do? And may...",Anxiety,feel scared anxious do may family us protected
7,7,Have you ever felt nervous but didn't know why?,Anxiety,ever felt nervous know why
8,8,"I haven't slept well for 2 days, it's like I'm...",Anxiety,slept well days like restless huh
9,9,"I'm really worried, I want to cry.",Anxiety,really worried want cry


In [13]:
# Convert the Sentiment Status text to numbers --> eg: 'Anxiety:0', 'Bipolar:1', etc.
label_encoder = LabelEncoder()
df['label_encoded'] = label_encoder.fit_transform(df['status'])

# check mapping
label_map = dict(zip(label_encoder.classes_, label_encoder.transform(label_encoder.classes_)))
print(label_map)

{'Anxiety': np.int64(0), 'Bipolar': np.int64(1), 'Depression': np.int64(2), 'Normal': np.int64(3), 'Personality disorder': np.int64(4), 'Stress': np.int64(5), 'Suicidal': np.int64(6)}


In [14]:
# Glimpse of the data
df.head()

Unnamed: 0,unique_id,statement,status,clean_text,label_encoded
0,0,oh my gosh,Anxiety,oh gosh,0
1,1,"trouble sleeping, confused mind, restless hear...",Anxiety,trouble sleeping confused mind restless heart ...,0
2,2,"All wrong, back off dear, forward doubt. Stay ...",Anxiety,wrong back dear forward doubt stay restless re...,0
3,3,I've shifted my focus to something else but I'...,Anxiety,shifted focus something else still worried,0
4,4,"I'm restless and restless, it's been a month n...",Anxiety,restless restless month now boy mean,0


In [15]:
# Keep the required columns (clean_text, label_encoded)
clean_df = df[['clean_text', 'label_encoded']]

# Glimpse of the Clean Df
clean_df.head()

Unnamed: 0,clean_text,label_encoded
0,oh gosh,0
1,trouble sleeping confused mind restless heart ...,0
2,wrong back dear forward doubt stay restless re...,0
3,shifted focus something else still worried,0
4,restless restless month now boy mean,0


### **Split the Data**

In [16]:
# Define independent (X) and dependent (y) variables
X = clean_df['clean_text']
y = clean_df['label_encoded']

# Split the data into Train and test Data

# Train + Temp (Val + Test)
X_train, X_temp, y_train, y_temp = train_test_split(
                                                    X, y,
                                                    test_size = 0.3,
                                                    stratify = y, random_state = 42
                                                    )

# Val + Test
X_val, X_test, y_val, y_test = train_test_split(
                                                    X_temp, y_temp,
                                                    test_size = 0.5,
                                                    stratify = y_temp, random_state = 42
                                                    )

# Print the size of train / val / test sets
print("Train Set: {}".format(len(X_train)))
print("Validation Set: {}".format(len(X_val)))
print("Test Set: {}".format(len(X_test)))

Train Set: 36876
Validation Set: 7902
Test Set: 7903


### **Text Preprocessing [Tokenization, Padding]**

**(a) Tokenization**

- **'total_words'** denotes the top maximum words size
- **'OOV'**: Stands for 'Out-of-Vocabulary'. These are words that don’t appear in the tokenizer’s vocabulary (i.e., words that weren’t seen during training or are beyond the num_words limit). Any word not in the top num_words gets replaced by OOV often represented as '1'.

In [17]:
# Tokenizer
total_words = 20000

tokenizer = Tokenizer(num_words = total_words, oov_token = "<OOV>")
tokenizer.fit_on_texts(X_train)

# Vocabulary Size
vocab_size = len(tokenizer.word_index)
print(vocab_size)

59722


In [18]:
# Print the words based high to low frequency
print(tokenizer.word_index)



In [19]:
# Convert text to sequences
X_train_seq = tokenizer.texts_to_sequences(X_train)
X_val_seq = tokenizer.texts_to_sequences(X_val)
X_test_seq = tokenizer.texts_to_sequences(X_test)

# Few Sample Texts (first 5 Statements)
print(X_train_seq[:5])

[[180, 1586, 145, 27, 6, 185, 624, 172, 232, 141, 5266, 453, 814, 1885, 1826, 21, 97, 55, 1170, 29, 131, 221, 2772, 2, 550, 719, 290, 27, 2, 278, 168, 27, 473, 212, 200, 295, 52, 130, 263, 2773, 41, 194, 17, 2, 80, 125, 143, 182, 9, 1184, 80, 187, 2230, 8639, 118, 150, 142, 143, 72, 269, 165, 2536, 245, 34, 118, 80, 790, 325, 45, 283, 470, 38, 502, 118, 4562, 612, 172, 535, 163, 1440, 1738, 34, 376, 34, 143, 118, 55, 41, 1420, 266, 34, 17, 143, 182, 2, 165, 1, 192, 93, 3, 2, 2, 172, 61, 165, 2, 501, 681, 143, 185, 7, 342, 567, 329, 139, 93, 417, 1325, 3, 73, 650, 17, 93, 34, 7, 158, 17, 143, 110, 7, 23, 491, 91, 1639, 126, 33, 1119, 2, 1977, 1119, 1269, 20, 719, 35, 491, 171, 146, 73, 2475, 126, 632, 314, 41, 222, 1229, 90, 9147, 159, 41, 470, 182, 17, 143], [1, 1, 1, 2095, 1], [57, 831, 619, 67, 190, 20, 9, 150, 2, 4, 47, 161, 8, 105, 4, 36, 752, 2, 436, 600, 35, 536, 8, 659, 5846, 495, 135, 18, 74, 5, 122, 850, 203, 1392, 43, 65, 1, 105, 40, 16, 115, 277], [424, 353], [48, 390, 148]]

In [20]:
word_to_id = tokenizer.word_index
id_to_word = {value:key for key, value in word_to_id.items()}

# Example
print(word_to_id['life'])
print(id_to_word[6])

6
life


**(b) Padding**

In [21]:
num_tokens = [len(tokens) for tokens in X_train_seq]
num_tokens = np.array(num_tokens)

print("Average Tokens per Statement: {}".format(np.mean(num_tokens)))
print("Max Tokens in a Statement: {}".format(np.max(num_tokens)))

Average Tokens per Statement: 53.22857685215316
Max Tokens in a Statement: 3780


In [22]:
# Maximum length of words in a Statements = Average + 2 * Standard Deviations
max_tokens = np.mean(num_tokens) + 2 * np.std(num_tokens)
max_tokens = int(max_tokens)
print("Maximum number of tokens per statement: {}".format(max_tokens))

Maximum number of tokens per statement: 208


In [23]:
np.sum(num_tokens < max_tokens)/ len(num_tokens) 

np.float64(0.962956936761037)

This covers about 96% of the data-set.

In [24]:
# Padding
X_train_pad = pad_sequences(X_train_seq,
                            maxlen = max_tokens,
                            padding = 'pre', # Adding 0 at the beginning (words < 208)
                            truncating = 'pre' # Removing words from the beginning (words > 208)
                           )

X_val_pad = pad_sequences(X_val_seq,
                          maxlen = max_tokens,
                          padding = 'pre', # Adding 0 at the beginning (words < 208)
                          truncating = 'pre' # Removing words from the beginning (words > 208)
                          )

X_test_pad = pad_sequences(X_test_seq,
                           maxlen = max_tokens,
                           padding = 'pre', # Adding 0 at the beginning (words < 208)
                           truncating = 'pre' # Removing words from the beginning (words > 208)
                          )

# Print the shape after padding
print("Train Data Shape after Padding: {}".format(X_train_pad.shape))
print("Validation Data Shape after Padding: {}".format(X_val_pad.shape))
print("Test Data Shape after Padding: {}".format(X_test_pad.shape))

Train Data Shape after Padding: (36876, 208)
Validation Data Shape after Padding: (7902, 208)
Test Data Shape after Padding: (7903, 208)


In [25]:
# First Statement (Original Clean Text)
X_train.iloc[0]

'put context though things life probably explain another post little summary kid nearly raped cousin never told family grew anxiety parents bit strict like physically harm saying things like stupid kind things child i understand mother also mental issues cured good relationship think like started thinking suicide sad time yo started self harming mitigate thoughts left sometimes suicide thought came mind studied college still thoughts started dating girl everything okay whenever got fight thoughts appeared moving another country start graduate studies still girlfriend still suicide thoughts family good terms close still think suicide sad like mind imagines place happy feel like like another live mind like oh yeah suicide probably get there despite fact actually happy whatever doing feel lot pressure think happy still get times think suicide tried get help psychiatrist first session wanted take lithium like wtf lithium causes much harm something psychiatrist used give lot patients wanted

In [26]:
# First Statement before Padding
np.array(X_train_seq[0])

array([ 180, 1586,  145,   27,    6,  185,  624,  172,  232,  141, 5266,
        453,  814, 1885, 1826,   21,   97,   55, 1170,   29,  131,  221,
       2772,    2,  550,  719,  290,   27,    2,  278,  168,   27,  473,
        212,  200,  295,   52,  130,  263, 2773,   41,  194,   17,    2,
         80,  125,  143,  182,    9, 1184,   80,  187, 2230, 8639,  118,
        150,  142,  143,   72,  269,  165, 2536,  245,   34,  118,   80,
        790,  325,   45,  283,  470,   38,  502,  118, 4562,  612,  172,
        535,  163, 1440, 1738,   34,  376,   34,  143,  118,   55,   41,
       1420,  266,   34,   17,  143,  182,    2,  165,    1,  192,   93,
          3,    2,    2,  172,   61,  165,    2,  501,  681,  143,  185,
          7,  342,  567,  329,  139,   93,  417, 1325,    3,   73,  650,
         17,   93,   34,    7,  158,   17,  143,  110,    7,   23,  491,
         91, 1639,  126,   33, 1119,    2, 1977, 1119, 1269,   20,  719,
         35,  491,  171,  146,   73, 2475,  126,  6

In [27]:
# First Statement after Padding
np.array(X_train_pad[0])

array([   0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,  180, 1586,  145,   27,    6,  185,  624,  172,  232,  141,
       5266,  453,  814, 1885, 1826,   21,   97,   55, 1170,   29,  131,
        221, 2772,    2,  550,  719,  290,   27,    2,  278,  168,   27,
        473,  212,  200,  295,   52,  130,  263, 2773,   41,  194,   17,
          2,   80,  125,  143,  182,    9, 1184,   80,  187, 2230, 8639,
        118,  150,  142,  143,   72,  269,  165, 2536,  245,   34,  118,
         80,  790,  325,   45,  283,  470,   38,  502,  118, 4562,  612,
        172,  535,  163, 1440, 1738,   34,  376,   34,  143,  118,   55,
         41, 1420,  266,   34,   17,  143,  182,    2,  165,    1,  192,
         93,    3,    2,    2,  172,   61,  165,   

### **Feature Extraction [Word2Vec Embedding - Custom]**

In [28]:
# Creating word list
X_train_wordlist = [[id_to_word[token] for token in review_tokens] for review_tokens in X_train_seq]
X_test_wordlist = [[id_to_word[token] for token in review_tokens] for review_tokens in X_test_seq]

# Print the first statement
print(X_train_wordlist[0])

['put', 'context', 'though', 'things', 'life', 'probably', 'explain', 'another', 'post', 'little', 'summary', 'kid', 'nearly', 'raped', 'cousin', 'never', 'told', 'family', 'grew', 'anxiety', 'parents', 'bit', 'strict', 'like', 'physically', 'harm', 'saying', 'things', 'like', 'stupid', 'kind', 'things', 'child', 'i', 'understand', 'mother', 'also', 'mental', 'issues', 'cured', 'good', 'relationship', 'think', 'like', 'started', 'thinking', 'suicide', 'sad', 'time', 'yo', 'started', 'self', 'harming', 'mitigate', 'thoughts', 'left', 'sometimes', 'suicide', 'thought', 'came', 'mind', 'studied', 'college', 'still', 'thoughts', 'started', 'dating', 'girl', 'everything', 'okay', 'whenever', 'got', 'fight', 'thoughts', 'appeared', 'moving', 'another', 'country', 'start', 'graduate', 'studies', 'still', 'girlfriend', 'still', 'suicide', 'thoughts', 'family', 'good', 'terms', 'close', 'still', 'think', 'suicide', 'sad', 'like', 'mind', '<OOV>', 'place', 'happy', 'feel', 'like', 'like', 'anoth

In [29]:
# Defining the dimension of embedding
emb_dim = 32

# Initiate the embedding model
emb_model = gensim.models.Word2Vec(
                                    sentences = X_train_wordlist,
                                    vector_size = emb_dim,
                                    window = 5, # 5 words before and after the concentrated word
                                    sg = 1, # Skip-grams order doesn't matter
                                  )

Exception ignored in: 'gensim.models.word2vec_inner.our_dot_float'
Exception ignored in: 'gensim.models.word2vec_inner.our_dot_float'
Exception ignored in: 'gensim.models.word2vec_inner.our_dot_float'
Exception ignored in: 'gensim.models.word2vec_inner.our_dot_float'
Exception ignored in: 'gensim.models.word2vec_inner.our_dot_float'
Exception ignored in: 'gensim.models.word2vec_inner.our_dot_float'


In [30]:
# Unique words learned by Word2Vec
len(list(emb_model.wv.key_to_index))

13569

In [31]:
# Example: Similar words wrt 'Mental'
emb_model.wv.most_similar('mental')

[('health', 0.9223236441612244),
 ('illness', 0.8671400547027588),
 ('illnesses', 0.845429003238678),
 ('deteriorating', 0.8392342925071716),
 ('declining', 0.8335012197494507),
 ('issuesi', 0.8199567794799805),
 ('deteriorated', 0.8198568224906921),
 ('professionals', 0.8110730051994324),
 ('struggles', 0.8092007040977478),
 ('issues', 0.8018561601638794)]

In [32]:
# Vector for word 'Mental'
emb_model.wv.get_vector('mental')

array([-0.45597315, -1.1682503 ,  0.24300416, -0.2569011 , -0.36377355,
       -0.38862866, -0.03923269,  0.7131887 , -0.35326797, -0.04084317,
        0.11066858, -0.72754675,  0.37980685, -0.38714114,  0.17921518,
       -0.06977737, -0.0227848 , -0.37315232, -0.11375482, -0.01515991,
        1.2692807 ,  0.41472718, -0.44012523, -0.4018503 ,  0.2562331 ,
        0.59069747,  0.15839998,  0.08688874,  0.09960798, -0.5656362 ,
       -0.02651503,  0.524574  ], dtype=float32)

In [34]:
# Embedding Matrix
emb_matrix = np.zeros((total_words, emb_dim))

for idx, key in enumerate(emb_model.wv.key_to_index):
    token_id = word_to_id[key]
    emb_matrix[token_id] = emb_model.wv.get_vector(key)

In [35]:
# Word 'Mental'
word_to_id['mental']

130

In [36]:
emb_matrix[130]

array([-0.45597315, -1.16825032,  0.24300416, -0.25690109, -0.36377355,
       -0.38862866, -0.03923269,  0.71318871, -0.35326797, -0.04084317,
        0.11066858, -0.72754675,  0.37980685, -0.38714114,  0.17921518,
       -0.06977737, -0.0227848 , -0.37315232, -0.11375482, -0.01515991,
        1.26928067,  0.41472718, -0.44012523, -0.40185031,  0.2562331 ,
        0.59069747,  0.15839998,  0.08688874,  0.09960798, -0.56563622,
       -0.02651503,  0.52457398])

### **Model Development**

#### **Recurrent Neural Network (LSTM)**

In [39]:
# --- LSTM Architecture ---
# 1. max_tokens = 208
# 2. total_words = 20000
# 3. emb_dim = 32
lstm_node_1 = 64
num_classes = clean_df['label_encoded'].nunique() # 7 classes

# Initiate the Model
lstm_model = Sequential()

# Input Layer
lstm_model.add(Input(shape = (max_tokens,)))

# Embedding Layer
lstm_model.add(Embedding(
                            input_dim = total_words,
                            output_dim = emb_dim,
                            embeddings_initializer = Constant(emb_matrix),
                            trainable = False,
                            name = 'embedding'
                        ))

# LSTM Layer 1
lstm_model.add(LSTM(units = lstm_node_1, return_sequences = False))

# Output Layer
lstm_model.add(Dense(num_classes, activation = 'softmax'))

# Print the Model Summary
lstm_model.summary()

**Evaluate on Test Data**

### Post Analysis

#### **Misclassification by the Model**

#### **Correct Classification by the Model**