In [57]:
import pandas as pd
import numpy as np
import re
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import confusion_matrix, accuracy_score, f1_score, classification_report

import nltk, keras, string, html, math
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.corpus import stopwords, wordnet
from nltk.stem import WordNetLemmatizer
from sklearn.preprocessing import LabelEncoder
from collections import Counter, defaultdict
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

In [58]:
pd.set_option('display.max_colwidth', None)

In [59]:
# read data, assign column names, skip header
data = pd.read_csv("depression_dataset_reddit_cleaned.csv", names=["text", "is_depression"], header=0)

In [60]:
# look at some samples
print("depression:\n" + 
      str(data[data["is_depression"] == 1].sample(n=3).text.tolist()))
print("not depression:\n" + 
      str(data[data["is_depression"] == 0].sample(n=3).text.tolist()))

depression:
['frasar9 tonyrouf edgar ojwang njeriwanyina superiority take your depression else where how you keep a conversation with a person with inferiority complex explaining your mental status wee go and cut tree', 'ha anyone used benadryl for anxiety attack i started using this because i refuse to be on benzos and find it work just enough to make it more manageable and wanted to know if others did too', 'thinking of starting it i have bad anxiety and think i have situational depression what s everyone s favorite medication and why i m nervous to start something for fear i ll gain weight or lose my sex drive something that doesn t usually cause that would be ideal']
not depression:
['still have a lot of work to catch up on', 'in bed i suddenly feel i wish ma wa here w me goodnight twitterfam', 'tomfelton late night suckkk gym always make me feel better though photoshoot for what']


In [61]:
# class balance
print(data[data["is_depression"] == 1].count())
print(data[data["is_depression"] == 0].count())

text             3831
is_depression    3831
dtype: int64
text             3900
is_depression    3900
dtype: int64


In [62]:
# the samples suggest, the id_depression texts are longer than the others. Is that really so?
# print average length, min length and max length of elements shows that length could be an important factor
print("depression:\n"
      "avg: " + str(sum(map(len, data[data["is_depression"] == 1].text))/float(len(data[data["is_depression"] == 1].text))) + "\n" +
      "min: " + str(min(map(len, data[data["is_depression"] == 1].text))) + "\n" +
      "max: " + str(max(map(len, data[data["is_depression"] == 1].text))) + "\n"
      )
print("not depression:\n"
      "avg: " + str(sum(map(len, data[data["is_depression"] == 0].text))/float(len(data[data["is_depression"] == 1].text))) + "\n" +
      "min: " + str(min(map(len, data[data["is_depression"] == 0].text))) + "\n" +
      "max: " + str(max(map(len, data[data["is_depression"] == 0].text))) + "\n"
      )

depression:
avg: 658.299138606108
min: 3
max: 19822

not depression:
avg: 70.97572435395458
min: 7
max: 144


In [63]:
def preprocess(text):
    # remove extra blanks
    re.sub(r'\t{2,}', ' ', text)
    # lowercase
    text = text.lower()
    # TODO other stuff
    return text
    
data["text"] = data["text"].apply(preprocess)

In [64]:
# feature extraction
# vectorizer does tokenization, data already lowercased
# https://scikit-learn.org/stable/modules/feature_extraction.html
# https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.CountVectorizer.html#sklearn.feature_extraction.text.CountVectorizer
X = data['text'].to_numpy()
y = data['is_depression'].to_numpy()
# Bag of words
vectorizer = CountVectorizer(analyzer='word', ngram_range=(1, 1))
X_BOW = vectorizer.fit_transform(X)

In [65]:
# train test split
X_train, X_test, y_train, y_test = train_test_split(X_BOW, y, stratify=y)

In [66]:
# train
# https://scikit-learn.org/stable/modules/generated/sklearn.naive_bayes.MultinomialNB.html#sklearn.naive_bayes.MultinomialNB
clf = MultinomialNB()
clf.fit(X_train, y_train)
# predict
y_hat = clf.predict(X_test)

In [67]:
# evaluate
# https://scikit-learn.org/stable/modules/generated/sklearn.metrics.confusion_matrix.html
# https://scikit-learn.org/stable/modules/generated/sklearn.metrics.accuracy_score.html#sklearn.metrics.accuracy_score
# https://scikit-learn.org/stable/modules/generated/sklearn.metrics.f1_score.html#sklearn.metrics.f1_score
tn, fp, fn, tp = confusion_matrix(y_test, y_hat).ravel()
accuracy = accuracy_score(y_test, y_hat)
f1 = f1_score(y_test, y_hat, zero_division=1.0)
print(tn, fp, fn, tp, accuracy, f1)

743 232 45 913 0.8566994309363684 0.8682834046600095


In [68]:
# trying to encode the documents using TFIDF instead of words
# https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.TfidfTransformer.html#sklearn.feature_extraction.text.TfidfTransformer
# encode
vectorizer = TfidfVectorizer()
X_TFIDF = vectorizer.fit_transform(X)
# split
X_train, X_test, y_train, y_test = train_test_split(X_TFIDF, y, stratify=y)
# train
clf = MultinomialNB()
clf.fit(X_train, y_train)
# predict
y_hat = clf.predict(X_test)
# evaluate
tn, fp, fn, tp = confusion_matrix(y_test, y_hat).ravel()
accuracy = accuracy_score(y_test, y_hat)
f1 = f1_score(y_test, y_hat, zero_division=1.0)
print(tn, fp, fn, tp, accuracy, f1)

555 420 17 941 0.7739265390584583 0.8115567054764986


In [69]:
# this is a little experiment: how well would a model perform, which is only trained on length of text?
X = data["text"].apply(len).to_numpy().reshape(-1,1)
y = data["is_depression"].to_numpy()
# train test split
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y)
# train
clf = MultinomialNB()
clf.fit(X_train, y_train)
# predict
y_hat = clf.predict(X_test)
# evaluate
tn, fp, fn, tp = confusion_matrix(y_test, y_hat).ravel()
accuracy = accuracy_score(y_test, y_hat)
f1 = f1_score(y_test, y_hat, zero_division=1.0)
# it just thinks everything is not depression. so 50%. works horrible.
# we could try adding some more features if we feel like it, but not right now.
print(tn, fp, fn, tp, accuracy, f1)
print(np.unique(y_hat, return_counts=True))

975 0 958 0 0.5043973098810139 0.0
(array([0]), array([1933]))


In [70]:
# TODO optional: word2vec for naive bayes
# TODO optional: spacy similarties cause it's fun
# TODO optional: spacy transformer cause it's cool

In [71]:
# LSTM
# using this resource
# https://towardsdatascience.com/naive-bayes-and-lstm-based-classifier-models-63d521a48c20

In [72]:
X = data['text'].to_numpy()
y = data['is_depression'].to_numpy()
# train test split
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y)

In [73]:
# Hyperparameters of the model
oov_tok = '<OOK>'
embedding_dim = 100
max_length = 150
padding_type='post'
trunc_type='post'

# tokenizes sentences
tokenizer = Tokenizer()
tokenizer.fit_on_texts(X_train)

# vocabulary size
word_index = tokenizer.word_index
vocab_size = len(tokenizer.word_index) + 1

# converts train dataset to sequence and pads sequences
train_sequences = tokenizer.texts_to_sequences(X_train)
train_padded = pad_sequences(train_sequences, padding='post', maxlen=max_length)

# converts Test dataset to sequence and pads sequences
test_sequences = tokenizer.texts_to_sequences(X_test)
test_padded = pad_sequences(test_sequences, padding='post', maxlen=max_length)

In [74]:
# model initialization
model = keras.Sequential([
    keras.layers.Embedding(vocab_size, embedding_dim, input_length=max_length),
    keras.layers.Bidirectional(keras.layers.LSTM(64)),
    keras.layers.Dense(24, activation='relu'),
    keras.layers.Dense(1, activation='sigmoid')
])

# compiles model
model.compile(loss='binary_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])

# model summary
model.summary()

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_1 (Embedding)     (None, 150, 100)          1609400   
                                                                 
 bidirectional_1 (Bidirecti  (None, 128)               84480     
 onal)                                                           
                                                                 
 dense_2 (Dense)             (None, 24)                3096      
                                                                 
 dense_3 (Dense)             (None, 1)                 25        
                                                                 
Total params: 1697001 (6.47 MB)
Trainable params: 1697001 (6.47 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [75]:
#training the model
num_epochs = 5
history = model.fit(train_padded, y_train, 
                    epochs=num_epochs, verbose=1, 
                    validation_split=0.1)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [76]:
# Gets probabilities
prediction = model.predict(test_padded)
print("The probabilities are - ", prediction, sep='\n')

# Gets labels based on probability 1 if p>= 0.5 else 0
for each in prediction:
    if each[0] >=0.5:
        each[0] = 1
    else:
        each[0] = 0
prediction = prediction.astype('int32') 
print("\nThe labels are - ", prediction, sep='\n')

# Calculates accuracy on Test data
print("\nThe accuracy of the model is ", accuracy_score(y_test, prediction))
print("\nThe accuracy and other metrics are \n", classification_report(y_test, prediction, labels=[0, 1]),sep='\n')

The probabilities are - 
[[9.9991107e-01]
 [1.7032092e-02]
 [1.1283369e-03]
 ...
 [1.9983333e-04]
 [3.2477101e-04]
 [9.9999464e-01]]

The labels are - 
[[1]
 [0]
 [0]
 ...
 [0]
 [0]
 [1]]

The accuracy of the model is  0.958613554061045

The accuracy and other metrics are 

              precision    recall  f1-score   support

           0       0.95      0.97      0.96       975
           1       0.97      0.95      0.96       958

    accuracy                           0.96      1933
   macro avg       0.96      0.96      0.96      1933
weighted avg       0.96      0.96      0.96      1933


In [77]:
# trying some recent posts I am getting from r/depression and non depression related subreddits
sentence = ["Our most-broken and least-understood rules is \"helpers may not invite private contact as a first resort\", so we've made a new wiki to explain it", 
            "Idk why but everyone seems depressed these days (including me).. Is it a phase that everyone goes through? Or is it just that our generation is fucked?", 
            "For everyone that needs to hear it I love you no matter what and just keep up the hard work. Stuff will get better in the near future so keep your heads up",
            "Guys I have a 2 in 1 laptop, Fedora was working incredibly from a USB Flashdrive compared to other distros, it was fast and everything worked just fine, but then I updated it and it became really slow, almost unusable... Any recommendations?"]

# converts to a sequence
test_sequences = tokenizer.texts_to_sequences(sentence)

# pads the sequence
test_padded = pad_sequences(test_sequences, padding='post', maxlen=max_length)

# Gets probabilities
prediction = model.predict(test_padded)
print("The probabilities are - ", prediction, sep='\n')

# Gets labels based on probability 1 if p>= 0.5 else 0
for each in prediction:
    if each[0] >=0.5:
        each[0] = 1
    else:
        each[0] = 0
prediction = prediction.astype('int32') 
print("\nThe labels are - ", prediction, sep='\n')

The probabilities are - 
[[7.0789293e-04]
 [9.9979615e-01]
 [9.7954142e-01]
 [9.9921525e-01]]

The labels are - 
[[0]
 [1]
 [1]
 [1]]
