# Reddit Depression Final Project
Link to the paper: https://dl.acm.org/doi/pdf/10.1145/3578503.3583621

Read through the paper fully before starting the assignment!

In [1]:
!pip install happiestfuntokenizing
!pip install transformers


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.3[0m[39;49m -> [0m[32;49m23.3.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.3[0m[39;49m -> [0m[32;49m23.3.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


In [2]:
import numpy as np
import pandas as pd
import pickle
from os.path import exists
from multiprocessing import Pool

from sklearn.model_selection import cross_validate, cross_val_score, KFold
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import CountVectorizer

from happiestfuntokenizing.happiestfuntokenizing import Tokenizer

from tqdm.auto import tqdm

import gensim
from gensim import corpora
from gensim.models import LdaMulticore
import multiprocessing

from transformers import RobertaTokenizer, RobertaModel
import torch


# Optional Google Colab setup: uncomment to mount Google Drive.
# from google.colab import drive
# drive.mount('/content/drive')

# Prefix for every data/cache file this notebook reads or writes. It is joined to
# file names by plain string concatenation, so it must end with a path separator.
# FILEPATH = 'drive/MyDrive/CSCI 1460/Final Project/'
FILEPATH = './'

  from .autonotebook import tqdm as notebook_tqdm


## Preprocessing

In [3]:
# Dictionary mapping each symptom to the subreddits that represent it.
# Note: "selfhelp" and "whatsbotheringyou" are listed under two symptoms.
symptom_classifier = {
    "Anger": ["Anger"],
    "Anhedonia": ["anhedonia", "DeadBedrooms"],
    "Anxiety": ["Anxiety", "AnxietyDepression", "HealthAnxiety", "PanicAttack"],
    "Concentration deficit": ["DecisionMaking", "shouldi"],
    "Disordered eating": ["bingeeating", "BingeEatingDisorder", "EatingDisorders", "eating_disorders", "EDAnonymous"],
    "Fatigue": ["chronicfatigue", "Fatigue"],
    "Loneliness": ["ForeverAlone", "lonely"],
    "Sad mood": ["cry", "grief", "sad", "Sadness"],
    "Self-loathing": ["AvPD", "SelfHate", "selfhelp", "socialanxiety", "whatsbotheringyou"],
    "Sleep problem": ["insomnia", "sleep"],
    "Somatic complaint": ["cfs", "ChronicPain", "Constipation", "EssentialTremor", "headaches", "ibs", "tinnitus"],
    "Suicidal thoughts and attempts": ["AdultSelfHarm", "selfharm", "SuicideWatch"],
    "Worthlessness": ["Guilt", "Pessimism", "selfhelp", "whatsbotheringyou"]
}

# List of depression subreddits in the paper: the per-symptom lists above,
# concatenated in symptom order (subreddits shared by two symptoms appear twice).
depression_subreddits = [
    subreddit
    for symptom_subreddits in symptom_classifier.values()
    for subreddit in symptom_subreddits
]

In [4]:
def load(filename):
  """Read the pickle at FILEPATH + filename and wrap its contents in a DataFrame.

  NOTE(review): unpickling can execute arbitrary code; only use on trusted files.
  """
  path = FILEPATH + filename
  with open(path, 'rb') as handle:
    unpickled = pickle.load(handle)
  return pd.DataFrame(unpickled)

In [5]:
def build_symptom(raw_df):
  """Return the rows of raw_df posted in a depression subreddit, re-indexed from 0.

  No minimum-length filter is applied to the posts.
  """
  in_depression_subreddit = raw_df['subreddit'].isin(depression_subreddits)
  return raw_df.loc[in_depression_subreddit].reset_index(drop=True)

def build_control(raw_df, symptom_df):
  """Build the control set from posts by symptom-set authors in other subreddits.

  A post is kept when its author appears in symptom_df, its subreddit is not a
  depression subreddit, and its 'created_utc' is at least 180 days before that
  author's earliest 'created_utc' in symptom_df. The result carries an extra
  'min_date' column holding that per-author cutoff and is re-indexed from 0.
  No minimum-length filter is applied to the posts.
  """
  # assumes created_utc is a Unix timestamp in seconds — TODO confirm
  seconds_in_180_days = 180*24*60*60

  # candidate posts: known symptom authors, non-depression subreddits
  by_symptom_author = raw_df['author'].isin(symptom_df['author'])
  outside_depression = ~raw_df['subreddit'].isin(depression_subreddits)
  candidates = raw_df.loc[by_symptom_author & outside_depression]

  # per-author cutoff: earliest symptom post minus 180 days
  cutoffs = (
      symptom_df[['author', 'created_utc']]
      .groupby('author')
      .min()
      .rename(columns={'created_utc': 'min_date'})
  )
  cutoffs['min_date'] = cutoffs['min_date'] - seconds_in_180_days

  candidates = candidates.merge(cutoffs, how="inner", on="author")
  early_enough = candidates['created_utc'] <= candidates['min_date']
  return candidates.loc[early_enough].reset_index(drop=True)


def dataset_generation():
  """Load student.pkl and return the (symptom_df, control_df) pair built from it.

  Rows whose author is '[deleted]' are dropped before either set is built.
  """
  rawdata = load("student.pkl")
  has_known_author = rawdata['author'] != '[deleted]'
  rawdata = rawdata.loc[has_known_author].reset_index(drop=True)

  symptom_df = build_symptom(rawdata)
  return symptom_df, build_control(rawdata, symptom_df)

In [6]:
# Shared tokenizer instance used by tokenize_sentence.
tokenizer = Tokenizer()

def tokenize_sentence(sentence):
  """Return the tokens the shared tokenizer produces for one sentence."""
  tokens = tokenizer.tokenize(sentence)
  return tokens

def tokenize_in_parallel(corpus, processes=4, chunksize=1):
  """Tokenize an entire corpus with a pool of worker processes.

  Args:
    corpus: sized iterable of texts (len() is used for the progress bar).
    processes: number of worker processes (default 4, the previous fixed value).
    chunksize: number of texts sent to a worker per task; larger values reduce
      inter-process overhead on big corpora (default 1, the previous behaviour).

  Returns:
    A list with one tokenize_sentence result per text, in input order.
  """
  with Pool(processes=processes) as pool:
    # imap yields results in input order, so results[i] belongs to corpus[i]
    tokenized = pool.imap(tokenize_sentence, corpus, chunksize)
    results = list(tqdm(tokenized, total=len(corpus)))

  return results

Build symptom & control datasets and tokenize all the documents. Save/load the dataset.

In [7]:
# Build and cache the tokenized dataset on first run; reuse the cache afterwards.
if not exists(f"{FILEPATH}dataset.pkl"):
  symptom_df, control_df = dataset_generation()
  # symptom posts first, then control posts, with a fresh 0..n-1 index
  dataset = pd.concat(
      [frame[['text', 'subreddit']] for frame in (symptom_df, control_df)],
      ignore_index=True)
  dataset['tokenized'] = tokenize_in_parallel(dataset['text'])

  # the with-block closes the file on exit
  with open(f"{FILEPATH}dataset.pkl", 'wb') as f:
    pickle.dump(dataset, f)
else:
  dataset = load("dataset.pkl")

100%|████████████████████████████████████████████████████████████████████████████████| 98536/98536 [00:36<00:00, 2689.90it/s]


In [8]:
def stop_words():
  """Find the top 100 words in the Reddit dataset to use as stop words.

  Loads student.pkl, tokenizes every post, keeps only purely alphabetic tokens
  and counts them over the whole corpus.

  Returns:
    List of the 100 most frequent words, most frequent first.
  """
  reddit_data = load("student.pkl")

  # tokenize entire corpus; tokens are re-joined with spaces for CountVectorizer
  word_tokenizer = Tokenizer()
  reddit_data['tokenized'] = reddit_data['text'].apply(
      lambda text: " ".join([t for t in word_tokenizer.tokenize(text) if t.isalpha()]))

  # Create a document-term matrix using CountVectorizer.
  # The default token_pattern only matches tokens of two or more characters,
  # which would silently exclude very frequent one-letter words ("i", "a")
  # from the stop-word list, so match tokens of any length instead.
  vectorizer = CountVectorizer(max_features=100, token_pattern=r"(?u)\b\w+\b")
  dtm = vectorizer.fit_transform(reddit_data['tokenized'])

  # Find the most frequent words (column sums, sorted descending)
  word_counts = np.array(dtm.sum(axis=0)).flatten()
  word_indices = word_counts.argsort()[::-1]

  # Extract the top 100 words; fetch the vocabulary once instead of per word
  feature_names = vectorizer.get_feature_names_out()
  return [feature_names[i] for i in word_indices[:100]]

Load/build a list of stopwords for the reddit dataset. Stopwords are the top 100 words in the entire reddit dataset.

In [9]:
# Compute and cache the stop-word list on first run (one word per line);
# reuse the cached file afterwards.
if not exists(FILEPATH + "stopwords.txt"):
  stopwords = stop_words()
  with open(FILEPATH + "stopwords.txt", 'w') as file:
    file.writelines(string + '\n' for string in stopwords)
else:
  with open(FILEPATH + "stopwords.txt", 'r') as file:
    stopwords = [line.strip() for line in file]
# set for fast membership tests when filtering tokens
stopwords = set(stopwords)

## Reddit Topics with LDA

 - Don't use MALLET (as the paper does), use some other LDA implementation.

In [10]:
def lda_vectorizer(tokenized_docs, num_topics=200):
  """
  Builds a list of LDA topic-probability vectors for a corpus.

  Args:
    tokenized_docs: list of documents, each a list of tokens. It is iterated
      twice, so it must not be a one-shot generator.
    num_topics: number of LDA topics, i.e. the length of every vector.

  Returns:
    A list with one entry per input document (length of output == length of
    input). Each entry is a numpy array of length num_topics, or None when the
    document has no tokens in the dictionary (empty bag-of-words).
  """
  dictionary = corpora.Dictionary(tokenized_docs)
  corpus = [dictionary.doc2bow(doc) for doc in tokenized_docs]
  print("Built Corpus")

  lda_model = LdaMulticore(
      corpus,
      num_topics=num_topics,
      id2word=dictionary,
      # passes=10,
      # alpha=5,
      workers=multiprocessing.cpu_count()
      )
  print("Built LDA Model")

  vectors = []
  print("Building Vectors...")
  for doc_bow in tqdm(corpus):
      if len(doc_bow) > 0:
        # gensim raises minimum_probability to a small positive floor, so topics
        # with a vanishing probability can be missing from the returned list.
        # Fill a dense vector by topic id so every vector has num_topics entries
        # and each position always refers to the same topic.
        probability_vector = np.zeros(num_topics)
        for topic_id, topic_prob in lda_model.get_document_topics(doc_bow, minimum_probability=0.0):
          probability_vector[topic_id] = topic_prob
        vectors.append(probability_vector)
      else:
        # keep a placeholder so indices stay aligned with the input
        vectors.append(None)

  return vectors

Remove stopwords and build LDA vectors. Load/save as appropriate



In [18]:
# Load cached LDA vectors if present; otherwise build and cache them.
if exists(FILEPATH + 'lda_vectors.pkl'):
  # The loaded list must be bound to lda_vectors: later cells index into it.
  with open(f"{FILEPATH}lda_vectors.pkl", 'rb') as f:
    lda_vectors = pickle.load(f)
else:
  # drop stop words from every tokenized document before fitting LDA
  no_stop = [
      [word for word in sentence if word not in stopwords]
      for sentence in dataset['tokenized']
  ]

  lda_vectors = lda_vectorizer(no_stop)

  with open(f"{FILEPATH}lda_vectors.pkl", 'wb') as f:
    pickle.dump(lda_vectors, f)

## RoBERTa Embeddings

Use GPU if possible

In [19]:
# Prefer the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)

cuda


In [20]:
def roberta_vectorizer(tokenized_docs):
  """
  Builds a list of RoBERTa vectors for a corpus, one document at a time.

  Each document's tokens are joined with spaces, encoded with the pretrained
  "roberta-base" tokenizer (with truncation) and passed through the model on
  `device`. The vector is the mean of the last hidden state over dim 1,
  squeezed; presumably that is the sequence axis — TODO confirm.

  Returns a list of torch tensors with length of output == length of input.
  """
  checkpoint = "roberta-base"
  roberta_tokenizer = RobertaTokenizer.from_pretrained(checkpoint)
  roberta = RobertaModel.from_pretrained(checkpoint)
  roberta.to(device)

  def embed(doc):
    """Encode one tokenized document and mean-pool its last hidden state."""
    encoded = roberta_tokenizer(
        " ".join(doc), return_tensors="pt", padding=True, truncation=True)
    encoded.to(device)

    # inference only: no gradients needed
    with torch.no_grad():
      last_hidden = roberta(**encoded).last_hidden_state

    return torch.mean(last_hidden, dim=1).squeeze()

  return [embed(doc) for doc in tqdm(tokenized_docs)]

Build RoBERTa vectors. Load them from disk if already cached; otherwise compute and save them.

In [None]:
# Load cached RoBERTa vectors if present; otherwise build and cache them.
if exists(FILEPATH + 'roberta_vectors.pkl'):
  # The loaded list must be bound to roberta_vectors: later cells index into it.
  with open(f"{FILEPATH}roberta_vectors.pkl", 'rb') as f:
    roberta_vectors = pickle.load(f)
else:
  roberta_vectors = roberta_vectorizer(dataset['tokenized'])

  # move tensors off the compute device and store them as numpy arrays
  roberta_vectors = [vector.cpu().numpy() for vector in roberta_vectors]

  with open(f"{FILEPATH}roberta_vectors.pkl", 'wb') as f:
    pickle.dump(roberta_vectors, f)

vocab.json: 100%|█████████████████████████████████████████████████████████████████████████| 899k/899k [00:00<00:00, 33.7MB/s]
merges.txt: 100%|█████████████████████████████████████████████████████████████████████████| 456k/456k [00:00<00:00, 24.6MB/s]
tokenizer.json: 100%|███████████████████████████████████████████████████████████████████| 1.36M/1.36M [00:00<00:00, 27.9MB/s]
config.json: 100%|██████████████████████████████████████████████████████████████████████████| 481/481 [00:00<00:00, 2.12MB/s]
model.safetensors: 100%|███████████████████████████████████████████████████████████████████| 499M/499M [00:04<00:00, 115MB/s]
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
 19%|███████████████▌                                                          

Print the number of LDA and RoBERTa vectors for each symptom and for the control set, as a sanity check of data preprocessing and vectorization.

In [None]:
# Per-symptom vector counts. Documents whose LDA vector is None are not counted
# on the LDA side.
for symptom, subreddits in symptom_classifier.items():
  symptom_idx = dataset[dataset['subreddit'].isin(subreddits)].index

  symptom_lda = [vec for vec in (lda_vectors[i] for i in symptom_idx) if vec is not None]
  symptom_roberta = [roberta_vectors[i] for i in symptom_idx]

  print(symptom, f"LDA {len(symptom_lda)}", f"roBERTa {len(symptom_roberta)}", "", sep="\n")

# Control set: every post outside the depression subreddits.
control_idx = dataset[~dataset['subreddit'].isin(depression_subreddits)].index

control_lda = [vec for vec in (lda_vectors[i] for i in control_idx) if vec is not None]
control_roberta = [roberta_vectors[i] for i in control_idx]

print("Control", f"LDA {len(control_lda)}", f"roBERTa {len(control_roberta)}", "", sep="\n")

## Main

In [None]:
def main(X, y, n_splits=5, random_state=None):
  """
  Evaluate a random forest with shuffled k-fold cross validation (ROC AUC).

  Args:
    X: feature vectors, one per sample.
    y: binary labels aligned with X.
    n_splits: number of cross-validation folds (default 5).
    random_state: optional seed passed to both the classifier and the fold
      shuffling; the default None keeps the runs unseeded.

  Returns:
    (train_score, test_score): mean training and testing ROC AUC over the folds.
  """

  rf_classifier = RandomForestClassifier(random_state=random_state)
  cv = KFold(n_splits=n_splits, shuffle=True, random_state=random_state)
  results = cross_validate(
      rf_classifier,
      X=X, y=y, cv=cv,
      scoring='roc_auc',
      return_train_score=True
      )

  # Average over however many folds were run instead of a hard-coded 5
  train_score = np.mean(results['train_score'])
  test_score = np.mean(results['test_score'])

  # Print training and testing scores
  print(f"Training Score: {train_score:.4f}")
  print(f"Testing Score: {test_score:.4f}")
  return train_score, test_score


In [None]:
# UNUSED
def supplement_symptoms(symptom):
  """Uses keywords to supplement symptoms with very few posts.

  Returns a boolean mask over `dataset` selecting posts in any depression
  subreddit whose text contains one of the symptom's keywords, or False when
  the symptom has no keywords configured.
  """
  keywords_by_symptom = {
      'Anger': ['anger'],
      'Concentration deficit': ['concentrate', 'focus'],
      'Fatigue': ['fatigue', 'tired'],
  }
  keywords = keywords_by_symptom.get(symptom)
  if keywords is None:
    return False

  in_depression_subreddit = dataset['subreddit'].isin(depression_subreddits)

  # OR together one substring match per keyword
  mentions_keyword = dataset['text'].str.contains(keywords[0])
  for keyword in keywords[1:]:
    mentions_keyword = mentions_keyword | dataset['text'].str.contains(keyword)

  return in_depression_subreddit & mentions_keyword

Train and evaluate Random Forest Classifier for each symptom in the dataset (exclude symptoms with very few posts)

In [None]:
results = []

# Control class: every post outside the depression subreddits. Documents with
# no LDA vector (None) are left out of the LDA feature set only.
control_idx = dataset[~dataset['subreddit'].isin(depression_subreddits)].index

control_lda = [vec for vec in (lda_vectors[i] for i in control_idx) if vec is not None]
control_roberta = [roberta_vectors[i] for i in control_idx]

for symptom, subreddits in tqdm(symptom_classifier.items()):
  # symptoms excluded from evaluation (very few posts)
  if symptom == 'Fatigue' or symptom == 'Concentration deficit':
    continue

  # posts from this symptom's subreddits (keyword supplementation is not used)
  symptom_idx = dataset[dataset['subreddit'].isin(subreddits)].index

  symptom_lda = [vec for vec in (lda_vectors[i] for i in symptom_idx) if vec is not None]
  symptom_roberta = [roberta_vectors[i] for i in symptom_idx]

  # control posts are class 0, symptom posts are class 1
  X_lda = control_lda + symptom_lda
  y_lda = [0] * len(control_lda) + [1] * len(symptom_lda)
  X_roberta = control_roberta + symptom_roberta
  y_roberta = [0] * len(control_roberta) + [1] * len(symptom_roberta)

  # main returns (train_score, test_score); only the test score is recorded
  print(symptom, "LDA")
  lda_score = main(X_lda, y_lda)[1]
  print()
  print(symptom, "roBERTa")
  roberta_score = main(X_roberta, y_roberta)[1]
  print()

  results.append({
      "Symptom": symptom,
      "LDA": lda_score,
      "roBERTa": roberta_score
  })

Display results

In [None]:
# One row per evaluated symptom; save next to the other artifacts, then display.
result_df = pd.DataFrame(results)
result_df.to_csv(f"{FILEPATH}results.csv")
result_df