In [102]:
import pandas as pd
import numpy as np
import re
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import confusion_matrix, accuracy_score, f1_score

In [103]:
pd.set_option('display.max_colwidth', None)

In [104]:
# read data, assign column names, skip header
data = pd.read_csv("depression_dataset_reddit_cleaned.csv", names=["text", "is_depression"], header=0)

In [105]:
# look at some samples
print("depression:\n" + 
      str(data[data["is_depression"] == 1].sample(n=3).text.tolist()))
print("not depression:\n" + 
      str(data[data["is_depression"] == 0].sample(n=3).text.tolist()))

depression:
['i ve been sad for a couple of year now this is because of my height i am already 9 year old m but i am stuck at i feel like i am not a man because of this hence why i am extremely sad and developed body dysmorphia is this reason of mine just or am i overreacting', 'it will be two year this november since my brother died from a fentanyl overdose this completely shook up the family dynamic i moved back home to be closer to them about a year before his death while i am happy i did get to spend his last birthday with him since he is gone and the family is all split up now i hate living here i used to make double doing the type of work that i do here where i last lived my job is actually financially draining me i am a caregiver aid for disabled kid and have been for over year but ever since my brother died i find myself in very dark place then i get really angry for a second because i know he is gone and never coming back then throw in the caregiver fatigue with the grieving d

In [106]:
# class balance
print(data[data["is_depression"] == 1].count())
print(data[data["is_depression"] == 0].count())

text             3831
is_depression    3831
dtype: int64
text             3900
is_depression    3900
dtype: int64


In [107]:
# the samples suggest, the id_depression texts are longer than the others. Is that really so?
# print average length, min length and max length of elements shows that length could be an important factor
print("depression:\n"
      "avg: " + str(sum(map(len, data[data["is_depression"] == 1].text))/float(len(data[data["is_depression"] == 1].text))) + "\n" +
      "min: " + str(min(map(len, data[data["is_depression"] == 1].text))) + "\n" +
      "max: " + str(max(map(len, data[data["is_depression"] == 1].text))) + "\n"
      )
print("not depression:\n"
      "avg: " + str(sum(map(len, data[data["is_depression"] == 0].text))/float(len(data[data["is_depression"] == 1].text))) + "\n" +
      "min: " + str(min(map(len, data[data["is_depression"] == 0].text))) + "\n" +
      "max: " + str(max(map(len, data[data["is_depression"] == 0].text))) + "\n"
      )

depression:
avg: 658.299138606108
min: 3
max: 19822

not depression:
avg: 70.97572435395458
min: 7
max: 144


In [108]:
def preprocess(text):
    # remove extra blanks
    re.sub(r'\t{2,}', ' ', text)
    # lowercase
    text = text.lower()
    # TODO other stuff
    return text
    
data["text"] = data["text"].apply(preprocess)

In [109]:
# feature extraction
# vectorizer does tokenization, data already lowercased
# https://scikit-learn.org/stable/modules/feature_extraction.html
# https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.CountVectorizer.html#sklearn.feature_extraction.text.CountVectorizer
X = data['text'].to_numpy()
y = data['is_depression'].to_numpy()
# Bag of words
vectorizer = CountVectorizer(analyzer='word', ngram_range=(1, 1))
X_BOW = vectorizer.fit_transform(X)

In [110]:
# train test split
X_train, X_test, y_train, y_test = train_test_split(X_BOW, y)

In [111]:
# train
# https://scikit-learn.org/stable/modules/generated/sklearn.naive_bayes.MultinomialNB.html#sklearn.naive_bayes.MultinomialNB
clf = MultinomialNB()
clf.fit(X_train, y_train)
# predict
y_hat = clf.predict(X_test)

In [112]:
# evaluate
# https://scikit-learn.org/stable/modules/generated/sklearn.metrics.confusion_matrix.html
# https://scikit-learn.org/stable/modules/generated/sklearn.metrics.accuracy_score.html#sklearn.metrics.accuracy_score
# https://scikit-learn.org/stable/modules/generated/sklearn.metrics.f1_score.html#sklearn.metrics.f1_score
tn, fp, fn, tp = confusion_matrix(y_test, y_hat).ravel()
accuracy = accuracy_score(y_test, y_hat)
f1 = f1_score(y_test, y_hat, zero_division=1.0)
print(tn, fp, fn, tp, accuracy, f1)

746 243 56 888 0.8453181583031557 0.8559036144578313


In [113]:
# trying to encode the documents using TFIDF instead of words
# https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.TfidfTransformer.html#sklearn.feature_extraction.text.TfidfTransformer
# encode
vectorizer = TfidfVectorizer()
X_TFIDF = vectorizer.fit_transform(X)
# split
X_train, X_test, y_train, y_test = train_test_split(X_TFIDF, y)
# train
clf = MultinomialNB()
clf.fit(X_train, y_train)
# predict
y_hat = clf.predict(X_test)
# evaluate
tn, fp, fn, tp = confusion_matrix(y_test, y_hat).ravel()
accuracy = accuracy_score(y_test, y_hat)
f1 = f1_score(y_test, y_hat, zero_division=1.0)
print(tn, fp, fn, tp, accuracy, f1)

565 432 15 921 0.768753233316089 0.8047182175622541


In [114]:
# this is a little experiment: how well would a model perform, which is only trained on length of text?
X = data["text"].apply(len).to_numpy().reshape(-1,1)
y = data["is_depression"].to_numpy()
# train test split
X_train, X_test, y_train, y_test = train_test_split(X, y)
# train
clf = MultinomialNB()
clf.fit(X_train, y_train)
# predict
y_hat = clf.predict(X_test)
# evaluate
tn, fp, fn, tp = confusion_matrix(y_test, y_hat).ravel()
accuracy = accuracy_score(y_test, y_hat)
f1 = f1_score(y_test, y_hat, zero_division=1.0)
# it just thinks everything is not depression. so 50%. works horrible.
# we could try adding some more features if we feel like it, but not right now.
print(tn, fp, fn, tp, accuracy, f1)
print(np.unique(y_hat, return_counts=True))

934 0 999 0 0.4831867563372995 0.0
(array([0], dtype=int64), array([1933], dtype=int64))
