# Reddit Depression Final Project
Link to the paper: https://dl.acm.org/doi/pdf/10.1145/3578503.3583621

Read through the paper fully before starting the assignment!

In [1]:
!pip install dlatk
!pip install happiestfuntokenizing
!pip install gensim
!pip install transformers

Collecting dlatk
  Downloading dlatk-1.2.7.tar.gz (20.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m20.6/20.6 MB[0m [31m1.8 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting mysqlclient<=2.1.1 (from dlatk)
  Downloading mysqlclient-2.1.1.tar.gz (88 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m88.1/88.1 kB[0m [31m11.2 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting nltk<=3.7,>=3.1 (from dlatk)
  Downloading nltk-3.7-py3-none-any.whl (1.5 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.5/1.5 MB[0m [31m26.6 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting numpy<=1.23.1 (from dlatk)
  Downloading numpy-1.23.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (17.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m17.0/17.0 MB[0m [31m23.7 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting panda

Collecting happiestfuntokenizing
  Downloading happiestfuntokenizing-0.0.7.tar.gz (6.3 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: happiestfuntokenizing
  Building wheel for happiestfuntokenizing (setup.py) ... [?25l[?25hdone
  Created wheel for happiestfuntokenizing: filename=happiestfuntokenizing-0.0.7-py3-none-any.whl size=6710 sha256=999dc5fae0039f7acb6accb3d9cbaaa6400bbd2768d97550e4bd0e07248ec52c
  Stored in directory: /root/.cache/pip/wheels/bf/c9/4d/310f0c60855eb7b428558f29d93cf464dbb64c1b8628753395
Successfully built happiestfuntokenizing
Installing collected packages: happiestfuntokenizing
Successfully installed happiestfuntokenizing-0.0.7
Collecting transformers
  Downloading transformers-4.35.0-py3-none-any.whl (7.9 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.9/7.9 MB[0m [31m61.9 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.16.4 (from transformers)
  Downloading huggingface

In [2]:
import numpy as np
import pandas as pd
from sklearn.model_selection import cross_validate, cross_val_score, KFold
from sklearn.ensemble import RandomForestClassifier
import pickle
from collections import Counter
import gensim
import gensim.corpora as corpora
from transformers import RobertaTokenizer, RobertaModel
import torch
from gensim.models.ldamulticore import LdaMulticore
from happiestfuntokenizing.happiestfuntokenizing import Tokenizer
from sklearn.metrics import roc_auc_score


from google.colab import drive
drive.mount('/content/drive')

FILEPATH = 'drive/MyDrive/1460HW/student.pkl'

Mounted at /content/drive


## Preprocessing

In [3]:
def load(file_path):
    """
    Load and return the data from a pickle file.

    Parameters:
    file_path (str): The path to the .pkl file to be loaded.

    Returns:
    data: The data loaded from the .pkl file.
    """
    with open(file_path, 'rb') as file:
        data = pickle.load(file)
    return data

# call the function to display the dataset
data = load(FILEPATH)
data.head()

Unnamed: 0,text,author,subreddit,created_utc,date
0,does your life feel like a waste mines not a c...,trademeple,depression,1504920055,2017-09
1,Just relapsed again. Any advice I just got to ...,kenny818,NoFap,1507890053,2017-10
2,Audio and mic not working? So I have a HyperX ...,psyjinks,techsupport,1513558467,2017-12
3,PG&amp;E: Mylar balloon causes outage in centr...,Majnum,nottheonion,1499573023,2017-07
4,Um... Forward?,OldManoftheNorth,memes,1516842851,2018-01


In [4]:
# get the dataset shape
data.shape

(1958158, 5)

In [5]:
# List of depression subreddits in the paper
depression_subreddits = ["Anger",
    "anhedonia", "DeadBedrooms",
    "Anxiety", "AnxietyDepression", "HealthAnxiety", "PanicAttack",
    "DecisionMaking", "shouldi",
    "bingeeating", "BingeEatingDisorder", "EatingDisorders", "eating_disorders", "EDAnonymous",
    "chronicfatigue", "Fatigue",
    "ForeverAlone", "lonely",
    "cry", "grief", "sad", "Sadness",
    "AvPD", "SelfHate", "selfhelp", "socialanxiety", "whatsbotheringyou",
    "insomnia", "sleep",
    "cfs", "ChronicPain", "Constipation", "EssentialTremor", "headaches", "ibs", "tinnitus",
    "AdultSelfHarm", "selfharm", "SuicideWatch",
    "Guilt", "Pessimism", "selfhelp", "whatsbotheringyou"
]

In [6]:
# Map the subreddits to each sympton
subreddit_to_sympton = {
    "Anger": "Anger",
    "anhedonia": "Anhedonia",
    "DeadBedrooms": "Anhedonia",
    "Anxiety": "Anxiety",
    "AnxietyDepression": "Anxiety",
    "HealthAnxiety": "Anxiety",
    "PanicAttack": "Anxiety",
    "DecisionMaking": "Concentration deficit",
    "shouldi": "Concentration deficit",
    "bingeeating": "Disordered eating",
    "BingeEatingDisorder": "Disordered eating",
    "EatingDisorders": "Disordered eating",
    "eating_disorders": "Disordered eating",
    "EDAnonymous": "Disordered eating",
    "chronicfatigue": "Fatigue",
    "Fatigue": "Fatigue",
    "ForeverAlone": "Loneliness",
    "lonely": "Loneliness",
    "cry": "Sad mood",
    "grief": "Sad mood",
    "sad": "Sad mood",
    "Sadness": "Sad mood",
    "AvPD": "Self-loathing",
    "SelfHate": "Self-loathing",
    "selfhelp": "Self-loathing",
    "selfhelp": "Worthlessness",
    "socialanxiety": "Self-loathing",
    "whatsbotheringyou": "Self-loathing",
    "whatsbotheringyou": "Worthlessness",
    "insomnia": "Sleep problem",
    "sleep": "Sleep problem",
    "cfs": "Somatic complaint",
    "ChronicPain": "Somatic complaint",
    "Constipation": "Somatic complaint",
    "EssentialTremor": "Somatic complaint",
    "headaches": "Somatic complaint",
    "ibs": "Somatic complaint",
    "tinnitus": "Somatic complaint",
    "AdultSelfHarm": "Suicidal thoughts and attempts",
    "selfharm": "Suicidal thoughts and attempts",
    "SuicideWatch": "Suicidal thoughts and attempts",
    "Guilt": "Worthlessness",
    "Pessimism": "Worthlessness",
}


# Custom function to get symptom
def get_symptom(subreddit):
    return subreddit_to_sympton.get(subreddit, 'Control')

# Use apply to create the new column 'symptom'
data['symptom'] = data['subreddit'].apply(get_symptom)



In [7]:
print(data["symptom"].unique())

['Control' 'Suicidal thoughts and attempts' 'Loneliness' 'Anxiety'
 'Anhedonia' 'Sleep problem' 'Self-loathing' 'Sad mood'
 'Somatic complaint' 'Disordered eating' 'Worthlessness' 'Anger'
 'Concentration deficit' 'Fatigue']


In [8]:
def dataset_generation(data, depression_subs):
    """
    Build control and symptom datasets.

    Parameters:
    - data: pandas DataFrame containing the Reddit posts data
    - depression_subs: list of subreddits related to depression symptoms

    Returns:
    - symptom_df: DataFrame containing posts from depression-related subreddits
    - control_df: DataFrame containing control posts that are at least 180 days older than the author's first depression-related post
    """
    # Filter out symptom dataset where symptom not equal to control
    symptom_df = data[data['symptom']!= 'Control']

     # Convert 'created_utc' to datetime
    data['date'] = pd.to_datetime(data['created_utc'], unit='s')

    # Identify authors with depression-related posts and find the earliest post's created_utc
    depression_authors = symptom_df.groupby('author')['created_utc'].min()

    # Map the minimum created_utc of depression posts for each author back to the original dataframe
    data['min_depression_utc'] = data['author'].map(depression_authors)

    # Calculate the time difference in days between each post and the earliest depression post for the same author
    data['days_diff'] = (data['created_utc'] - data['min_depression_utc']) / (60 * 60 * 24)

    # Filter for control posts that are at least 180 days older than the author's earliest depression-related post
    control_df = data[(data['symptom'] == 'Control') & (data['days_diff'] <= -180)]


    # Return the symptom and control dataframes
    return symptom_df, control_df


symptom_data, control_data = dataset_generation(data, depression_subreddits)



In [9]:
# Check the shape of symptom and control dataset
print('sympton dataset shape:', symptom_data.shape, 'control dataset shape:', control_data.shape)

sympton dataset shape: (94514, 6) control dataset shape: (4369, 8)


In [10]:
control_data.head()

Unnamed: 0,text,author,subreddit,created_utc,date,symptom,min_depression_utc,days_diff
315,"Man, I do love me some Bandicoot crash.",BuddermanTheAmazing,crappyoffbrands,1499236239,2017-07-05 06:30:39,Control,1517145000.0,-207.282731
651,How good is this PC for my 700-750$ budget? Wa...,WildernessExploring,buildmeapc,1501296261,2017-07-29 02:44:21,Control,1517346000.0,-185.764525
730,When is the price of gpus going down? I know t...,NeighborhoodPizzaGuy,pcmasterrace,1500082729,2017-07-15 01:38:49,Control,1516768000.0,-193.110938
1354,Our service is not available in your area. Hey...,xDEDANx,njpw,1499941432,2017-07-13 10:23:52,Control,1515534000.0,-180.474722
1598,Wow,baby_kicked,indianpeoplefacebook,1500924182,2017-07-24 19:23:02,Control,1517022000.0,-186.314271


In [11]:
symptom_data.head()

Unnamed: 0,text,author,subreddit,created_utc,date,symptom
20,"i'm trying hi, i'm sorry if my writing is bad,...",n90300118,SuicideWatch,1510374743,ression,Suicidal thoughts and attempts
39,Only friend has been blanking me for what feel...,Throwaway34qwas,lonely,1505308711,ression,Loneliness
67,Study hall social anxiety bruh We had a study ...,Shwin280,Anxiety,1515634258,ression,Anxiety
72,Positive Thoughts For You - We Are Happy To Pu...,pthinkimag,Anxiety,1515944819,ression,Anxiety
79,Starting from a blowup mattress Today was a ve...,MyCrazyLove,SuicideWatch,1516594948,ression,Suicidal thoughts and attempts


In [12]:
# Assuming symptom_data is your DataFrame and it has more than 900 rows
#symptom_data = symptom_data.sample(n=200, random_state=42)
#control_data = control_data.sample(n=100, random_state=42)

In [13]:
# Tokenize the text data

def tokenize(data):

  """Tokenize each message in the DataFrame."""

  tokenizer = Tokenizer(preserve_case=False)

  # Apply tokenization to each message
  data['tokens'] = data['text'].apply(tokenizer.tokenize)

  return data


In [14]:
# Find the top 100 words as stop word
def stop_words(data, n=100):
    """
    Find top n words from Reddit dataset to use as stop words.

    Parameters:
    - data: pandas DataFrame containing the Reddit posts data.
    - n: Number of top words to return. Default is 100.

    Returns:
    - top_n_words: List containing the top n words.
    """
    # Assuming data['text'] contains the text of the posts.
    text = data['text'].str.cat(sep=' ').lower()  # Combine all text and lowercase it.

    # Tokenize the text
    tokenizer = Tokenizer(preserve_case=False)
    tokens = tokenizer.tokenize(text)

    # Create a frequency distribution of the tokens
    word_freq = Counter(tokens)

    # Select the top n words
    top_n_words = [word for word, freq in word_freq.most_common(n)]

    return top_n_words



##Dataframe for LDA and RoBERTa features, symptom

In [15]:
# Combine control_data and symptom_data
control_data = control_data[['symptom', 'text']]
symptom_data = symptom_data[['symptom', 'text']]

# Now concatenate them
combined_data = pd.concat([symptom_data, control_data], ignore_index=True)


# Initialize an empty DataFrame to store features
features_df = pd.DataFrame()

# Add the symptoms and text columns from your original data to this new DataFrame
features_df['symptom'] = combined_data['symptom']
features_df['text'] = combined_data['text']
features_df['label'] = None


# You can add empty columns for LDA and RoBERTa features which you will fill later
features_df['lda_features'] = None
features_df['roberta_features'] = None

# # Lable 'Control' as 0
# features_df['label'] = features_df['symptom'].apply(lambda x: 0 if x == 'Control' else None)




In [17]:
features_df['symptom'].unique()

array(['Suicidal thoughts and attempts', 'Loneliness', 'Anxiety',
       'Anhedonia', 'Sleep problem', 'Self-loathing', 'Sad mood',
       'Somatic complaint', 'Disordered eating', 'Worthlessness', 'Anger',
       'Concentration deficit', 'Fatigue', 'Control'], dtype=object)

## RoBERTa Embeddings

In [None]:
# TODO: Your RoBERTa code!


# Function to get the 10th layer embeddings from the RoBERTa model for each text in features_df
def populate_RoBERTa_embeddings(features_df):

    # Load pre-trained model tokenizer (vocabulary)
    tokenizer = RobertaTokenizer.from_pretrained('roberta-base')

    # Load pre-trained model (weights)
    model = RobertaModel.from_pretrained('roberta-base', output_hidden_states=True)

    # Ensure model is in evaluation mode, which deactivates dropout
    model.eval()

    # Initialize an empty list to store the embeddings
    roberta_features_list = []

    # Process all texts to get embeddings
    texts = features_df['text'].tolist()
    encoded_input = tokenizer(texts, padding=True, truncation=True, return_tensors='pt')
    with torch.no_grad():
        outputs = model(**encoded_input)

    # Retrieve the embeddings from the 10th layer
    layer_embeddings = outputs.hidden_states[10]  # Indexing starts from 0, so 10 means the 11th layer

    # Calculate the mean of all token embeddings for each text
    for embedding in layer_embeddings:
        mean_embedding = torch.mean(embedding, dim=0)
        roberta_features_list.append(mean_embedding.detach().cpu().numpy().tolist())

    # Assign the embeddings to the 'roberta_features' column
    features_df['roberta_features'] = roberta_features_list

    return features_df

# Example usage:
features_df = populate_RoBERTa_embeddings(features_df)


Downloading (…)olve/main/vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

Downloading (…)olve/main/merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/481 [00:00<?, ?B/s]

Downloading model.safetensors:   0%|          | 0.00/499M [00:00<?, ?B/s]

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
features_df.head(10)

## Reddit Topics with LDA

 - Don't use MALLET (as the paper does), use some other LDA implementation.

In [None]:
# TODO: Your LDA code!

# Function to tokenize and remove stop words
def preprocess_data(tokenize_data):
    # Tokenize the data
    tokenize_data = tokenize(tokenize_data)
    # Get the top 100 words
    top_100_stop_words = stop_words(tokenize_data, n=100)
    # Remove the top 100 words
    tokenize_data['remove_stop_word_tokens'] = tokenize_data['tokens'].apply(lambda tokens: [token for token in tokens if token not in top_100_stop_words])
    return tokenize_data

# Function to implement LDA feature
def lda_for_each_symptom(features_df):
    lda_models = {}

    # Convert symptoms into a list
    symptoms = features_df['symptom'].unique().tolist()

    # Preprocess data by tokenize and removing stop words
    features_df = preprocess_data(features_df)

    # Run LDA for each symptom
    for symptom in symptoms:
        current_data = features_df[features_df['symptom'] == symptom]
        if current_data.empty:
            print(f"No data available for symptom '{symptom}'.")
            continue

        # Create a dictionary and corpus for the current symptom
        id2word_current = corpora.Dictionary(current_data['remove_stop_word_tokens'])
        # id2word_current.filter_extremes(no_below=5, no_above=0.5, keep_n=100000)
        if len(id2word_current) == 0:
            print(f"No words left after filtering for symptom '{symptom}'.")
            continue

        corpus_current = [id2word_current.doc2bow(text) for text in current_data['remove_stop_word_tokens']]
        if not corpus_current:
            print(f"No corpus could be built for symptom '{symptom}'.")
            continue

        # Create LDA model for the current symptom
        lda_model_current = LdaMulticore(corpus=corpus_current,
                                         id2word=id2word_current,
                                         num_topics=200,
                                         passes=10,
                                         workers=2)

        # Save the model for the current symptom
        lda_models[symptom] = lda_model_current

        # Populate the 'lda_features' column for the corresponding symptom
        for index, row in features_df[features_df['symptom'] == symptom].iterrows():
            bow = id2word_current.doc2bow(row['remove_stop_word_tokens'])
            features_df.at[index, 'lda_features'] = lda_model_current.get_document_topics(bow)

    return features_df, lda_models

features_df, lda_models = lda_for_each_symptom(features_df)


In [None]:

def preprocess_data(tokenize_data):
    # Assume tokenize is a function you've defined elsewhere to tokenize the data
    tokenize_data = tokenize(tokenize_data)
    # Assume stop_words is a function you've defined to get the top N stop words
    top_100_stop_words = stop_words(tokenize_data, n=100)
    tokenize_data['remove_stop_word_tokens'] = tokenize_data['tokens'].apply(
        lambda tokens: [token for token in tokens if token not in top_100_stop_words]
    )
    return tokenize_data

def lda_for_each_symptom(features_df):
    lda_models = {}
    symptoms = features_df['symptom'].unique().tolist()
    features_df = preprocess_data(features_df)

    # We first find out the maximum number of topics a document might have
    # to initialize our feature vectors. We'll use this value to create
    # feature vectors of uniform length.
    num_topics = 200  # This should match the number of topics you use in LDA

    for symptom in symptoms:
        current_data = features_df[features_df['symptom'] == symptom]
        if current_data.empty:
            print(f"No data available for symptom '{symptom}'.")
            continue

        id2word_current = corpora.Dictionary(current_data['remove_stop_word_tokens'])
        if len(id2word_current) == 0:
            print(f"No words left after filtering for symptom '{symptom}'.")
            continue

        corpus_current = [id2word_current.doc2bow(text) for text in current_data['remove_stop_word_tokens']]
        if not corpus_current:
            print(f"No corpus could be built for symptom '{symptom}'.")
            continue

        lda_model_current = LdaMulticore(corpus=corpus_current,
                                         id2word=id2word_current,
                                         num_topics=num_topics,
                                         passes=10,
                                         workers=2)
        lda_models[symptom] = lda_model_current

        # Now we process the documents to create uniform LDA feature vectors
        for index, row in features_df[features_df['symptom'] == symptom].iterrows():
            bow = id2word_current.doc2bow(row['remove_stop_word_tokens'])
            lda_features = lda_model_current.get_document_topics(bow, minimum_probability=0)

            # Initialize a vector of zeros with length equal to the number of topics
            feature_vector = np.zeros(num_topics)

            # Populate the feature vector using the LDA features (topic ID, topic proportion)
            for topic_id, topic_prob in lda_features:
                feature_vector[topic_id] = topic_prob

            # Assign this feature vector to the dataframe
            features_df.at[index, 'lda_features'] = feature_vector.tolist()

    return features_df, lda_models

# Use the function
features_df, lda_models = lda_for_each_symptom(features_df)


In [None]:
features_df.head(10)

## Main

In [None]:
import warnings
warnings.filterwarnings('ignore')

def main(features_df):
    # Initialize the classifier
    rf_classifier = RandomForestClassifier(n_estimators=100, random_state=42)
    cv = KFold(n_splits=5, shuffle=True, random_state=42)

    # Initialize a DataFrame to store the scores
    scores_df = pd.DataFrame(columns=['Symptom', 'LDA-Feature Score', 'RoBERTa Score'])

    # Loop over each symptom
    for symptom in features_df['symptom'].unique():
        # Create a new dataframe for the specific symptom vs control
        symptom_df = features_df[(features_df['symptom'] == symptom) | (features_df['symptom'] == 'Control')].copy()

        # Update the labels in the dataframe: 1 for the symptom, 0 for control
        symptom_df['label'] = (symptom_df['symptom'] == symptom).astype(int)

        # Extract the LDA and RoBERTa features
        X_lda = np.stack(symptom_df['lda_features'].values)
        X_roberta = np.stack(symptom_df['roberta_features'].values)
        y = symptom_df['label'].values

        # Perform cross-validation for LDA features
        lda_scores = cross_val_score(rf_classifier, X_lda, y, cv=cv, scoring='roc_auc')
        lda_mean_score = lda_scores.mean()

        # Perform cross-validation for RoBERTa features
        roberta_scores = cross_val_score(rf_classifier, X_roberta, y, cv=cv, scoring='roc_auc')
        roberta_mean_score = roberta_scores.mean()

        # Append the scores to the DataFrame
        scores_df = scores_df.append({
            'Symptom': symptom,
            'LDA-Feature Score': lda_mean_score,
            'RoBERTa Score': roberta_mean_score
        }, ignore_index=True)

    return scores_df

main(features_df)
