# Reddit Depression Final Project
Link to the paper: https://dl.acm.org/doi/pdf/10.1145/3578503.3583621

Read through the paper fully before starting the assignment!

In [1]:
!pip install dlatk
!pip install happiestfuntokenizing
!pip install gensim
!pip install transformers



In [2]:
import numpy as np
import pandas as pd
from sklearn.model_selection import cross_validate, cross_val_score, KFold
from sklearn.ensemble import RandomForestClassifier
import pickle
from collections import Counter
import gensim
import gensim.corpora as corpora
from transformers import RobertaTokenizer, RobertaModel
import torch
from gensim.models.ldamulticore import LdaMulticore
from happiestfuntokenizing.happiestfuntokenizing import Tokenizer
from sklearn.metrics import roc_auc_score
from collections import Counter


from google.colab import drive
drive.mount('/content/drive')

FILEPATH = 'drive/MyDrive/1460HW/student.pkl'

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


## Preprocessing

In [3]:
def load(file_path):
    """
    Load and return the data from a pickle file.

    Parameters:
    file_path (str): The path to the .pkl file to be loaded.

    Returns:
    data: The data loaded from the .pkl file.
    """
    with open(file_path, 'rb') as file:
        data = pickle.load(file)
    return data

# call the function to display the dataset
data = load(FILEPATH)
data.head()

Unnamed: 0,text,author,subreddit,created_utc,date
0,does your life feel like a waste mines not a c...,trademeple,depression,1504920055,2017-09
1,Just relapsed again. Any advice I just got to ...,kenny818,NoFap,1507890053,2017-10
2,Audio and mic not working? So I have a HyperX ...,psyjinks,techsupport,1513558467,2017-12
3,PG&amp;E: Mylar balloon causes outage in centr...,Majnum,nottheonion,1499573023,2017-07
4,Um... Forward?,OldManoftheNorth,memes,1516842851,2018-01


In [4]:
# get the dataset shape
data.shape

(1958158, 5)

In [5]:
# List of depression subreddits in the paper
depression_subreddits = ["Anger",
    "anhedonia", "DeadBedrooms",
    "Anxiety", "AnxietyDepression", "HealthAnxiety", "PanicAttack",
    "DecisionMaking", "shouldi",
    "bingeeating", "BingeEatingDisorder", "EatingDisorders", "eating_disorders", "EDAnonymous",
    "chronicfatigue", "Fatigue",
    "ForeverAlone", "lonely",
    "cry", "grief", "sad", "Sadness",
    "AvPD", "SelfHate", "selfhelp", "socialanxiety", "whatsbotheringyou",
    "insomnia", "sleep",
    "cfs", "ChronicPain", "Constipation", "EssentialTremor", "headaches", "ibs", "tinnitus",
    "AdultSelfHarm", "selfharm", "SuicideWatch",
    "Guilt", "Pessimism", "selfhelp", "whatsbotheringyou"
]

In [6]:
# Map the subreddits to each sympton
subreddit_to_sympton = {
    "Anger": "Anger",
    "anhedonia": "Anhedonia",
    "DeadBedrooms": "Anhedonia",
    "Anxiety": "Anxiety",
    "AnxietyDepression": "Anxiety",
    "HealthAnxiety": "Anxiety",
    "PanicAttack": "Anxiety",
    "DecisionMaking": "Concentration deficit",
    "shouldi": "Concentration deficit",
    "bingeeating": "Disordered eating",
    "BingeEatingDisorder": "Disordered eating",
    "EatingDisorders": "Disordered eating",
    "eating_disorders": "Disordered eating",
    "EDAnonymous": "Disordered eating",
    "chronicfatigue": "Fatigue",
    "Fatigue": "Fatigue",
    "ForeverAlone": "Loneliness",
    "lonely": "Loneliness",
    "cry": "Sad mood",
    "grief": "Sad mood",
    "sad": "Sad mood",
    "Sadness": "Sad mood",
    "AvPD": "Self-loathing",
    "SelfHate": "Self-loathing",
    "selfhelp": "Self-loathing",
    "selfhelp": "Worthlessness",
    "socialanxiety": "Self-loathing",
    "whatsbotheringyou": "Self-loathing",
    "whatsbotheringyou": "Worthlessness",
    "insomnia": "Sleep problem",
    "sleep": "Sleep problem",
    "cfs": "Somatic complaint",
    "ChronicPain": "Somatic complaint",
    "Constipation": "Somatic complaint",
    "EssentialTremor": "Somatic complaint",
    "headaches": "Somatic complaint",
    "ibs": "Somatic complaint",
    "tinnitus": "Somatic complaint",
    "AdultSelfHarm": "Suicidal thoughts and attempts",
    "selfharm": "Suicidal thoughts and attempts",
    "SuicideWatch": "Suicidal thoughts and attempts",
    "Guilt": "Worthlessness",
    "Pessimism": "Worthlessness",
}


# Custom function to get symptom
def get_symptom(subreddit):
    return subreddit_to_sympton.get(subreddit, 'Control')

# Use apply to create the new column 'symptom'
data['symptom'] = data['subreddit'].apply(get_symptom)



In [7]:
print(data["symptom"].unique())

['Control' 'Suicidal thoughts and attempts' 'Loneliness' 'Anxiety'
 'Anhedonia' 'Sleep problem' 'Self-loathing' 'Sad mood'
 'Somatic complaint' 'Disordered eating' 'Worthlessness' 'Anger'
 'Concentration deficit' 'Fatigue']


In [8]:
def dataset_generation(data, depression_subs):
    """
    Build control and symptom datasets.

    Parameters:
    - data: pandas DataFrame containing the Reddit posts data
    - depression_subs: list of subreddits related to depression symptoms

    Returns:
    - symptom_df: DataFrame containing posts from depression-related subreddits
    - control_df: DataFrame containing control posts that are at least 180 days older than the author's first depression-related post
    """
    # Filter out symptom dataset where symptom not equal to control
    symptom_df = data[data['symptom']!= 'Control']

     # Convert 'created_utc' to datetime
    data['date'] = pd.to_datetime(data['created_utc'], unit='s')

    # Identify authors with depression-related posts and find the earliest post's created_utc
    depression_authors = symptom_df.groupby('author')['created_utc'].min()

    # Map the minimum created_utc of depression posts for each author back to the original dataframe
    data['min_depression_utc'] = data['author'].map(depression_authors)

    # Calculate the time difference in days between each post and the earliest depression post for the same author
    data['days_diff'] = (data['created_utc'] - data['min_depression_utc']) / (60 * 60 * 24)

    # Filter for control posts that are at least 180 days older than the author's earliest depression-related post
    control_df = data[(data['symptom'] == 'Control') & (data['days_diff'] <= -180)]


    # Return the symptom and control dataframes
    return symptom_df, control_df


symptom_data, control_data = dataset_generation(data, depression_subreddits)



In [9]:
# Check the shape of symptom and control dataset
print('sympton dataset shape:', symptom_data.shape, 'control dataset shape:', control_data.shape)

sympton dataset shape: (94514, 6) control dataset shape: (4369, 8)


In [10]:
control_data.head()

Unnamed: 0,text,author,subreddit,created_utc,date,symptom,min_depression_utc,days_diff
315,"Man, I do love me some Bandicoot crash.",BuddermanTheAmazing,crappyoffbrands,1499236239,2017-07-05 06:30:39,Control,1517145000.0,-207.282731
651,How good is this PC for my 700-750$ budget? Wa...,WildernessExploring,buildmeapc,1501296261,2017-07-29 02:44:21,Control,1517346000.0,-185.764525
730,When is the price of gpus going down? I know t...,NeighborhoodPizzaGuy,pcmasterrace,1500082729,2017-07-15 01:38:49,Control,1516768000.0,-193.110938
1354,Our service is not available in your area. Hey...,xDEDANx,njpw,1499941432,2017-07-13 10:23:52,Control,1515534000.0,-180.474722
1598,Wow,baby_kicked,indianpeoplefacebook,1500924182,2017-07-24 19:23:02,Control,1517022000.0,-186.314271


In [11]:
symptom_data.head()

Unnamed: 0,text,author,subreddit,created_utc,date,symptom
20,"i'm trying hi, i'm sorry if my writing is bad,...",n90300118,SuicideWatch,1510374743,ression,Suicidal thoughts and attempts
39,Only friend has been blanking me for what feel...,Throwaway34qwas,lonely,1505308711,ression,Loneliness
67,Study hall social anxiety bruh We had a study ...,Shwin280,Anxiety,1515634258,ression,Anxiety
72,Positive Thoughts For You - We Are Happy To Pu...,pthinkimag,Anxiety,1515944819,ression,Anxiety
79,Starting from a blowup mattress Today was a ve...,MyCrazyLove,SuicideWatch,1516594948,ression,Suicidal thoughts and attempts


In [12]:
# Assuming symptom_data is your DataFrame and it has more than 900 rows
#symptom_data = symptom_data.sample(n=200, random_state=42)
#control_data = control_data.sample(n=100, random_state=42)

In [13]:
# Tokenize the text data

def tokenize(data):
  """Tokenize each message in the DataFrame."""

  tokenizer = Tokenizer(preserve_case=False)

  # Apply tokenization to each message
  data['tokens'] = data['text'].apply(tokenizer.tokenize)

  return data



In [14]:
def stop_words(data, n=100):
    """
    Find top n words from the dataset to use as stop words.

    Parameters:
    - data: pandas DataFrame containing the dataset.
    - n: Number of top words to return. Default is 100.

    Returns:
    - top_n_words: List containing the top n words.
    """

    # Flatten the list of lists of tokens into a single list
    all_tokens = [token for sublist in data['tokens'].tolist() for token in sublist]

    # Create a frequency distribution of the tokens
    word_freq = Counter(all_tokens)

    # Select the top n words
    top_n_words = [word for word, freq in word_freq.most_common(n)]

    return top_n_words



##Dataframe for LDA and RoBERTa features, symptom

In [15]:
# Combine control_data and symptom_data
control_data = control_data[['symptom', 'text']]
symptom_data = symptom_data[['symptom', 'text']]

# Now concatenate them
combined_data = pd.concat([symptom_data, control_data], ignore_index=True)


# Initialize an empty DataFrame to store features
features_df = pd.DataFrame()

# Add the symptoms and text columns from your original data to this new DataFrame
features_df['symptom'] = combined_data['symptom']
features_df['text'] = combined_data['text']
features_df['label'] = None


# You can add empty columns for LDA and RoBERTa features which you will fill later
features_df['lda_features'] = None
features_df['roberta_features'] = None

# # Lable 'Control' as 0
# features_df['label'] = features_df['symptom'].apply(lambda x: 0 if x == 'Control' else None)




In [16]:
features_df['symptom'].unique()

array(['Suicidal thoughts and attempts', 'Loneliness', 'Anxiety',
       'Anhedonia', 'Sleep problem', 'Self-loathing', 'Sad mood',
       'Somatic complaint', 'Disordered eating', 'Worthlessness', 'Anger',
       'Concentration deficit', 'Fatigue', 'Control'], dtype=object)

## Reddit Topics with LDA

 - Don't use MALLET (as the paper does), use some other LDA implementation.

In [17]:
# TODO: Your LDA code!
# Function to tokenize and remove stop words

def preprocess_data(tokenize_data):
    # Tokenize the data
    tokenize_data = tokenize(tokenize_data)
    #Get the top 100 words
    top_100_stop_words = stop_words(tokenize_data, n=100)
    # Remove the top 100 words
    tokenize_data['remove_stop_word_tokens'] = tokenize_data['tokens'].apply(lambda tokens: [token for token in tokens if token not in top_100_stop_words])

    return tokenize_data

# Function to implement LDA feature
def lda_for_each_symptom(features_df):
    lda_models = {}

    # Convert symptoms into a list
    symptoms = features_df['symptom'].unique().tolist()

    # Preprocess data by tokenize and removing stop words
    features_df = preprocess_data(features_df)

    # Run LDA for each symptom
    for symptom in symptoms:
        current_data = features_df[features_df['symptom'] == symptom]
        if current_data.empty:
            print(f"No data available for symptom '{symptom}'.")
            continue

        # Create a dictionary and corpus for the current symptom
        id2word_current = corpora.Dictionary(current_data['remove_stop_word_tokens'])
        # id2word_current.filter_extremes(no_below=5, no_above=0.5, keep_n=100000)
        if len(id2word_current) == 0:
            print(f"No words left after filtering for symptom '{symptom}'.")
            continue

        corpus_current = [id2word_current.doc2bow(text) for text in current_data['remove_stop_word_tokens']]
        if not corpus_current:
            print(f"No corpus could be built for symptom '{symptom}'.")
            continue

        # Create LDA model for the current symptom
        lda_model_current = LdaMulticore(corpus=corpus_current,
                                         id2word=id2word_current,
                                         num_topics=200,
                                         passes=10
                                        )

        # # Save the model for the current symptom
        # lda_models[symptom] = lda_model_current

        # Populate the 'lda_features' column for the corresponding symptom
        for index, row in features_df[features_df['symptom'] == symptom].iterrows():
            bow = id2word_current.doc2bow(row['remove_stop_word_tokens'])
            features_df.at[index, 'lda_features'] = lda_model_current.get_document_topics(bow)

    return features_df

features_df = lda_for_each_symptom(features_df)


In [20]:
features_df.to_csv('features_df_after_lda.csv', index = False)

In [18]:
features_df.head(10)

Unnamed: 0,symptom,text,label,lda_features,roberta_features,tokens,remove_stop_word_tokens
0,Suicidal thoughts and attempts,"i'm trying hi, i'm sorry if my writing is bad,...",,"[(2, 0.011878904), (4, 0.013924762), (8, 0.015...",,"[i'm, trying, hi, ,, i'm, sorry, if, my, writi...","[trying, hi, sorry, writing, bad, headache, di..."
1,Loneliness,Only friend has been blanking me for what feel...,,"[(13, 0.03809449), (42, 0.03952012), (79, 0.20...",,"[only, friend, has, been, blanking, me, for, w...","[friend, blanking, feels, months, complete, sh..."
2,Anxiety,Study hall social anxiety bruh We had a study ...,,"[(23, 0.012622987), (25, 0.01223013), (38, 0.0...",,"[study, hall, social, anxiety, bruh, we, had, ...","[study, hall, social, bruh, study, hall, gym, ..."
3,Anxiety,Positive Thoughts For You - We Are Happy To Pu...,,"[(25, 0.2010017), (81, 0.34892222), (132, 0.25...",,"[positive, thoughts, for, you, -, we, are, hap...","[positive, thoughts, happy, publish, !]"
4,Suicidal thoughts and attempts,Starting from a blowup mattress Today was a ve...,,"[(15, 0.042051014), (17, 0.017431594), (26, 0....",,"[starting, from, a, blowup, mattress, today, w...","[starting, blowup, mattress, today, very, “, d..."
5,Anhedonia,Love Language Opposites In the process of divo...,,"[(1, 0.16765498), (30, 0.031218914), (88, 0.02...",,"[love, language, opposites, in, the, process, ...","[love, language, opposites, process, divorce, ..."
6,Loneliness,2meirl42meirl4meirl,,"[(104, 0.50239164)]",,[2meirl42meirl4meirl],[2meirl42meirl4meirl]
7,Loneliness,It's almost 6AM and i can't sleep as always Yo...,,"[(6, 0.20872423), (13, 0.26478657), (21, 0.090...",,"[it's, almost, 6am, and, i, can't, sleep, as, ...","[almost, 6am, sleep, always, tired, spend, you..."
8,Sleep problem,Valerian root not working anymore...? Need her...,,"[(5, 0.02913489), (9, 0.014847244), (23, 0.037...",,"[valerian, root, not, working, anymore, ..., ?...","[valerian, root, working, anymore, ..., need, ..."
9,Suicidal thoughts and attempts,Probably the worst news i ever had received so...,,"[(2, 0.04122791), (13, 0.02863178), (15, 0.033...",,"[probably, the, worst, news, i, ever, had, rec...","[probably, worst, news, ever, received, far, m..."


## RoBERTa Embeddings

In [22]:
# TODO: Your RoBERTa code!

# Assuming device is defined at the top-level of your script
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Initialize the RoBERTa model and tokenizer if they are not already initialized
model = RobertaModel.from_pretrained('roberta-base', output_hidden_states=True).to(device)
tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
model.eval()

def populate_RoBERTa_embeddings(features_df):
    # Convert tokens back to text strings because the RoBERTa tokenizer requires strings as input
    texts = [' '.join(tokens) for tokens in features_df['tokens']]

    # Encoding the tokenized text to RoBERTa's expected format
    # Set is_split_into_words=True if the input is pre-tokenized
    encoded_input = tokenizer(texts, is_split_into_words=True, padding=True, truncation=True, return_tensors='pt').to(device)

    with torch.no_grad():
        outputs = model(**encoded_input)

    # Retrieve the embeddings from the 10th layer and calculate the mean
    roberta_features_list = [torch.mean(layer_embedding, dim=0).detach().cpu().numpy()
                             for layer_embedding in outputs.hidden_states[10]]

    # Assign the embeddings to the 'roberta_features' column
    features_df['roberta_features'] = roberta_features_list

    return features_df

# Usage
features_df = populate_RoBERTa_embeddings(features_df)




Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


ValueError: ignored

In [None]:
features_df.head(10)

## Main

In [None]:
import warnings
warnings.filterwarnings('ignore')

def main(features_df):
    # Initialize the classifier
    rf_classifier = RandomForestClassifier(n_estimators=100, random_state=42)
    cv = KFold(n_splits=5, shuffle=True, random_state=42)

    # Initialize a DataFrame to store the scores
    scores_df = pd.DataFrame(columns=['Symptom', 'LDA-Feature Score', 'RoBERTa Score'])

    # Loop over each symptom
    for symptom in features_df['symptom'].unique():
        # Create a new dataframe for the specific symptom vs control
        symptom_df = features_df[(features_df['symptom'] == symptom) | (features_df['symptom'] == 'Control')].copy()

        # Update the labels in the dataframe: 1 for the symptom, 0 for control
        symptom_df['label'] = (symptom_df['symptom'] == symptom).astype(int)

        # Extract the LDA and RoBERTa features
        X_lda = np.stack(symptom_df['lda_features'].values)
        X_roberta = np.stack(symptom_df['roberta_features'].values)
        y = symptom_df['label'].values

        # Perform cross-validation for LDA features
        lda_scores = cross_val_score(rf_classifier, X_lda, y, cv=cv, scoring='roc_auc')
        lda_mean_score = lda_scores.mean()

        # Perform cross-validation for RoBERTa features
        roberta_scores = cross_val_score(rf_classifier, X_roberta, y, cv=cv, scoring='roc_auc')
        roberta_mean_score = roberta_scores.mean()

        # Append the scores to the DataFrame
        scores_df = scores_df.append({
            'Symptom': symptom,
            'LDA-Feature Score': lda_mean_score,
            'RoBERTa Score': roberta_mean_score
        }, ignore_index=True)

    return scores_df

main(features_df)
