In [2]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings('ignore')
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report,roc_auc_score

In [3]:
df = pd.read_csv('/Users/aadityajoshi/Downloads/depression_dataset_reddit_cleaned.csv')

In [7]:
df.head()

Unnamed: 0,clean_text,is_depression
0,we understand that most people who reply immed...,1
1,welcome to r depression s check in post a plac...,1
2,anyone else instead of sleeping more when depr...,1
3,i ve kind of stuffed around a lot in my life d...,1
4,sleep is my greatest and most comforting escap...,1


In [8]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7731 entries, 0 to 7730
Data columns (total 2 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   clean_text     7731 non-null   object
 1   is_depression  7731 non-null   int64 
dtypes: int64(1), object(1)
memory usage: 120.9+ KB


In [4]:
from ydata_profiling import ProfileReport
report = ProfileReport(df)
report

Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]

Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]



### Dropping Null,blanks and duplicates

In [3]:
df.dropna(inplace=True)

In [4]:
df.drop_duplicates(inplace=True)

In [5]:
blanks = []
for i,text,lbl in df.itertuples():
    if type(text) == str:
        if text.isspace():
            blanks.append(i)

In [18]:
blanks

[]

In [19]:
df['len'] = df['clean_text'].apply(len)

In [30]:
df['len'].describe()

count     7650.000000
mean       358.488497
std        685.553453
min          3.000000
25%         58.000000
50%        109.000000
75%        361.750000
max      19822.000000
Name: len, dtype: float64

In [23]:
df['clean_text'][10]

've struggling depression long time severe instance depersonalization scared badly m entirely sure fact spent hour lying floor feeling like body didn t belong real extremely unsettling feel mentally physically trying process happened difficult ha kind comfort advice depersonalization d appreciate'

In [20]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 7650 entries, 0 to 7730
Data columns (total 3 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   clean_text     7650 non-null   object
 1   is_depression  7650 non-null   int64 
 2   len            7650 non-null   int64 
dtypes: int64(2), object(1)
memory usage: 239.1+ KB


### Removing stopwords and punctuation

In [6]:
import spacy
nlp = spacy.load('en_core_web_lg')

In [7]:
def clean_text(text):
    doc = nlp(text)
    return ' '.join([token.text.lower()
                     for token in doc if not token.is_stop and not token.is_punct])

In [8]:
df['clean_text'] = df['clean_text'].apply(clean_text)

### Using Stemmed as well as lemmatized form to see which works better

In [8]:
#stemming
from nltk.stem import PorterStemmer
stemmer = PorterStemmer()
df['stem_text'] = df['clean_text'].apply(lambda txt : stemmer.stem(txt))

In [9]:
#lemmatization
def lemmatize(text):
    doc = nlp(text)
    return ' '.join([token.lemma_ for token in doc])

In [10]:
df['lemma_text'] = df['clean_text'].apply(lemmatize)

#### Splitting texts

In [13]:
text_raw = df['clean_text']
text_stem = df['stem_text']
text_lemm = df['lemma_text']
label = df['is_depression']

In [14]:
text_raw_train,text_raw_test,label_train,label_test = train_test_split(text_raw,label,test_size=0.25,random_state=42)

In [15]:
text_stem_train,text_stem_test,label_train,label_test = train_test_split(text_stem,label,test_size=0.25,random_state=42)

In [16]:
text_lemm_train,text_lemm_test,label_train,label_test = train_test_split(text_lemm,label,test_size=0.25,random_state=42)

#### LSTM

In [20]:
max_len = df[df['len'] == df['len'].max()]['len'].values[0]
max_len

19822

In [21]:
from tensorflow.keras.preprocessing.text import Tokenizer
tokenizer = Tokenizer()
tokenizer.fit_on_texts(text_lemm_train)

In [22]:
from tensorflow.keras.preprocessing.sequence import pad_sequences
def seq_padding(tokenizer,text,padding,max_len):
    sequence = tokenizer.texts_to_sequences(text)
    pad_seq = pad_sequences(sequence,truncating=padding,maxlen=max_len)
    return pad_seq

In [23]:
text_lemm_train = seq_padding(tokenizer,text_lemm_train,'post',max_len)
text_lemm_test = seq_padding(tokenizer,text_lemm_test,'post',max_len)

In [24]:
vocabulary_size = len(tokenizer.word_counts)

In [25]:
df['lemm_len'] = df['lemma_text'].apply(len)
df['lemm_len'].describe()

count     7731.000000
mean       350.440047
std        672.559290
min          3.000000
25%         56.000000
50%        107.000000
75%        351.500000
max      19203.000000
Name: lemm_len, dtype: float64

In [26]:
def create_model_2(vocabulary_size,seq_len):
    model = Sequential()
    model.add(Embedding(vocabulary_size,seq_len))
    model.add(LSTM(16))
    model.add(Dense(8,activation='relu'))
    model.add(Dense(4,activation='relu'))
    model.add(Dense(2,activation='relu'))
    model.add(Dense(1,activation='sigmoid'))
    model.compile(loss='binary_crossentropy',optimizer='adam',metrics=['accuracy'])
    model.summary()
    return model

In [52]:
model_lstm = create_model_2(vocabulary_size,100)

In [53]:
model_lstm.fit(text_lemm_train,label_train,batch_size=32,epochs=3)

Epoch 1/3
[1m180/180[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m134s[0m 739ms/step - accuracy: 0.7938 - loss: 0.5532
Epoch 2/3
[1m180/180[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m132s[0m 733ms/step - accuracy: 0.9655 - loss: 0.3360
Epoch 3/3
[1m180/180[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m133s[0m 736ms/step - accuracy: 0.9753 - loss: 0.2841


<keras.src.callbacks.history.History at 0x417252250>

In [54]:
prediction = (model_lstm.predict(text_lemm_test)>0.5).astype("int32")

[1m60/60[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12s[0m 208ms/step


In [55]:
print(classification_report(label_test,prediction))

              precision    recall  f1-score   support

           0       0.95      0.98      0.97       962
           1       0.98      0.95      0.96       951

    accuracy                           0.97      1913
   macro avg       0.97      0.97      0.97      1913
weighted avg       0.97      0.97      0.97      1913



In [61]:
def create_model_3(vocabulary_size,seq_len):
    model = Sequential()
    model.add(Embedding(vocabulary_size,seq_len))
    model.add(Bidirectional(LSTM(16)))
    model.add(Dense(16,activation='relu'))
    model.add(Dense(8,activation='relu'))
    model.add(Dense(4,activation='relu'))
    model.add(Dense(2,activation='relu'))
    model.add(Dense(1,activation='sigmoid'))
    model.compile(loss='binary_crossentropy',optimizer='adam',metrics=['accuracy'])
    model.summary()
    return model

In [62]:
model_bilstm = create_model_3(vocabulary_size,128)

In [64]:
model_bilstm.fit(text_lemm_train,label_train,batch_size=32,epochs=2)

Epoch 1/2
[1m180/180[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m206s[0m 1s/step - accuracy: 0.8193 - loss: 0.5274
Epoch 2/2
[1m180/180[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m207s[0m 1s/step - accuracy: 0.9699 - loss: 0.3340


<keras.src.callbacks.history.History at 0x3e3530a90>

In [65]:
prediction_bilstm = (model_bilstm.predict(text_lemm_test) > 0.5).astype("int32")
print(classification_report(label_test,prediction_bilstm))

[1m60/60[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m22s[0m 358ms/step
              precision    recall  f1-score   support

           0       0.96      0.98      0.97       962
           1       0.98      0.95      0.97       951

    accuracy                           0.97      1913
   macro avg       0.97      0.97      0.97      1913
weighted avg       0.97      0.97      0.97      1913



In [67]:
def create_model_GRU(vocabulary_size,seq_len):
    model = Sequential()
    model.add(Embedding(vocabulary_size,seq_len))
    model.add(GRU(16))
    model.add(Dense(16,activation='relu'))
    model.add(Dense(8,activation='relu'))
    model.add(Dense(4,activation='relu'))
    model.add(Dense(2,activation='relu'))
    model.add(Dense(1,activation='sigmoid'))
    model.compile(loss='binary_crossentropy',optimizer='adam',metrics=['accuracy'])
    model.summary()
    return model

In [68]:
model_gru = create_model_GRU(vocabulary_size,128)

In [69]:
model_gru.fit(text_lemm_train,label_train,batch_size=32,epochs=2)

Epoch 1/2
[1m180/180[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m129s[0m 706ms/step - accuracy: 0.6916 - loss: 0.6225
Epoch 2/2
[1m180/180[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m126s[0m 698ms/step - accuracy: 0.9690 - loss: 0.3401


<keras.src.callbacks.history.History at 0x4b4cba690>

In [71]:
prediction_gru = (model_gru.predict(text_lemm_test) > 0.5).astype("int32")

[1m60/60[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12s[0m 201ms/step


In [72]:
print(classification_report(label_test,prediction_gru))

              precision    recall  f1-score   support

           0       0.95      0.99      0.97       962
           1       0.98      0.94      0.96       951

    accuracy                           0.96      1913
   macro avg       0.97      0.96      0.96      1913
weighted avg       0.97      0.96      0.96      1913



### using spacy's built in vectorizer (Glove) as embedding layer

In [27]:
#creating a weight matrix
num_words = vocabulary_size + 1  
vector_size = nlp.vocab.vectors.shape[1]  
weight_matrix = np.zeros((num_words, vector_size))

In [28]:
for word, index in tokenizer.word_index.items():
    if index < num_words:
        if word in nlp.vocab:
            weight_matrix[index] = nlp(word).vector
        else:
            weight_matrix[index] = np.zeros(vector_size)

#### Creating new model with weight matrix

In [29]:
from keras.models import Sequential
from keras.layers import Embedding, LSTM, Dense
from keras.initializers import Constant

def create_model_wrd2vec(vocabulary_size, embedding_dim, embedding_matrix):
    model = Sequential()
    model.add(Embedding(input_dim=vocabulary_size,
                        output_dim=embedding_dim,
                        embeddings_initializer=Constant(embedding_matrix),
                        trainable=False))
    model.add(LSTM(16, dropout=0.2, recurrent_dropout=0.2))
    model.add(Dense(8, activation='relu'))
    model.add(Dense(4, activation='relu'))
    model.add(Dense(2, activation='relu'))
    model.add(Dense(1, activation='sigmoid'))  # Sigmoid for binary classification

    model.compile(loss='binary_crossentropy',
                  optimizer='adam',
                  metrics=['accuracy'])
    
    model.summary()
    
    return model


In [30]:
model_wrd2vec_lstm = create_model_wrd2vec(vocabulary_size+1,vector_size,weight_matrix)

2024-06-22 14:06:44.093356: I metal_plugin/src/device/metal_device.cc:1154] Metal device set to: Apple M3 Pro
2024-06-22 14:06:44.093377: I metal_plugin/src/device/metal_device.cc:296] systemMemory: 18.00 GB
2024-06-22 14:06:44.093386: I metal_plugin/src/device/metal_device.cc:313] maxCacheSize: 6.00 GB
2024-06-22 14:06:44.093400: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:305] Could not identify NUMA node of platform GPU ID 0, defaulting to 0. Your kernel may not have been built with NUMA support.
2024-06-22 14:06:44.093408: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:271] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:0 with 0 MB memory) -> physical PluggableDevice (device: 0, name: METAL, pci bus id: <undefined>)


In [31]:
from tensorflow.keras.backend import clear_session
clear_session()
import tensorflow as tf

In [None]:
gpus = tf.config.experimental.list_physical_devices('GPU')
if gpus:
    try:
        for gpu in gpus:
            tf.config.experimental.set_memory_growth(gpu, True)
        tf.config.experimental.set_visible_devices(gpus, 'GPU')
    except RuntimeError as e:
        print(e)

# Verify GPU configuration
print("Num GPUs Available: ", len(tf.config.experimental.list_physical_devices('GPU')))

# Log device placement
tf.debugging.set_log_device_placement(True)
with tf.device('/GPU:0'):
    model_wrd2vec_lstm.fit(text_lemm_train,label_train)

Physical devices cannot be modified after being initialized
Num GPUs Available:  1


2024-06-22 14:06:45.018572: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:117] Plugin optimizer for device_type GPU is enabled.


In [40]:
label_train

<tf.Tensor: shape=(5737,), dtype=float32, numpy=array([1., 0., 0., ..., 1., 0., 0.], dtype=float32)>

In [39]:
label_train = np.array(label_train)
label_train = tf.convert_to_tensor(label_train, dtype=tf.float32)