# Reddit Depression Detection
Link to the paper: https://dl.acm.org/doi/pdf/10.1145/3578503.3583621

In [None]:
import pickle
import numpy as np
import pandas as pd
from sklearn.model_selection import cross_validate, cross_val_score, KFold
from sklearn.ensemble import RandomForestClassifier

# Mount Google Drive at /content/drive so the pickled dataset stored there can be read.
from google.colab import drive
drive.mount('/content/drive')

# Location of the pickled dataset that load() opens.
# NOTE(review): the path is relative, so it resolves against the current working
# directory; it only lines up with the mount point above when that directory is
# /content -- confirm for the runtime being used.
FILEPATH = 'drive/MyDrive/CSCI 1460/Final Project: Reddit Depression/student.pkl'

Mounted at /content/drive


## Preprocessing

In [None]:
def load():
  """Read the pickled dataset stored at FILEPATH.

  Returns:
    A pandas DataFrame built from the unpickled object.
  """
  # NOTE: unpickling can execute arbitrary code; only load files from a trusted source.
  with open(FILEPATH, "rb") as pickle_file:
    return pd.DataFrame(pd.read_pickle(pickle_file))

In [None]:
# Load and examine the dataset
dataset = load()
# DataFrame.info() prints its summary itself and returns None, so it is called
# directly; wrapping it in print() would emit a stray "None" line after the summary.
dataset.info()
print(dataset.head())

<class 'pandas.core.frame.DataFrame'>
Index: 1958158 entries, 0 to 1969753
Data columns (total 5 columns):
 #   Column       Dtype 
---  ------       ----- 
 0   text         object
 1   author       object
 2   subreddit    object
 3   created_utc  int64 
 4   date         object
dtypes: int64(1), object(4)
memory usage: 89.6+ MB
None
                                                text            author  \
0  does your life feel like a waste mines not a c...        trademeple   
1  Just relapsed again. Any advice I just got to ...          kenny818   
2  Audio and mic not working? So I have a HyperX ...          psyjinks   
3  PG&amp;E: Mylar balloon causes outage in centr...            Majnum   
4                                    Um... Forward?   OldManoftheNorth   

     subreddit  created_utc     date  
0   depression   1504920055  2017-09  
1        NoFap   1507890053  2017-10  
2  techsupport   1513558467  2017-12  
3  nottheonion   1499573023  2017-07  
4        memes   15168

In [None]:
# Symptom mapping in the paper: each symptom name maps to the subreddits whose
# posts are treated as examples of that symptom. A subreddit may appear under
# more than one symptom ("selfhelp" and "whatsbotheringyou" do).
symptom_to_subreddits = {
    "Anger": ["Anger"],
    "Anhedonia": ["anhedonia", "DeadBedrooms"],
    "Anxiety": ["Anxiety", "AnxietyDepression", "HealthAnxiety", "PanicAttack"],
    "Concentration deficit": ["DecisionMaking", "shouldi"],
    "Disordered eating": ["bingeeating", "BingeEatingDisorder", "EatingDisorders", "eating_disorders", "EDAnonymous"],
    "Fatigue": ["chronicfatigue", "Fatigue"],
    "Loneliness": ["ForeverAlone", "lonely"],
    "Sad mood": ["cry", "grief", "sad", "Sadness"],
    "Self-loathing": ["AvPD", "SelfHate", "selfhelp", "socialanxiety", "whatsbotheringyou"],
    "Sleep problem": ["insomnia", "sleep"],
    "Somatic complaint": ["cfs", "ChronicPain", "Constipation", "EssentialTremor", "headaches", "ibs", "tinnitus"],
    "Suicidal thoughts and attempts": ["AdultSelfHarm", "selfharm", "SuicideWatch"],
    "Worthlessness": ["Guilt", "Pessimism", "selfhelp", "whatsbotheringyou"],
}

# List of depression subreddits in the paper, derived from the mapping above so
# the two can never drift apart. dict.fromkeys() drops the subreddits shared by
# several symptoms while keeping first-seen order.
depression_subreddits = list(dict.fromkeys(
    subreddit
    for subreddit_list in symptom_to_subreddits.values()
    for subreddit in subreddit_list
))

In [None]:
from datetime import timedelta

def dataset_generation(dataset, symptom_to_subreddits, depression_subreddits):
    """Build the per-symptom datasets and the control dataset.

    The input frame is not modified: all conversions happen on a copy, so the
    function can be called repeatedly on the same raw frame.

    Args:
        dataset: DataFrame with at least the columns 'author', 'subreddit' and
            'created_utc' (epoch seconds).
        symptom_to_subreddits: dict mapping a symptom name to a list of
            subreddit names (matched case-insensitively).
        depression_subreddits: iterable of all mental-health subreddit names
            (matched case-insensitively).

    Returns:
        (symptom_datasets, control_dataset) where
        - symptom_datasets maps each symptom to the rows posted in one of its
          subreddits, and
        - control_dataset holds rows posted outside the mental-health
          subreddits at least 180 days before the author's earliest
          mental-health post. Authors with no mental-health post are excluded,
          because the comparison against a missing timestamp is False. It
          carries an extra 'earliest_mental_health_post' column.
    """
    # assign() returns a new frame, leaving the caller's DataFrame untouched.
    posts = dataset.assign(
        created_utc=pd.to_datetime(dataset['created_utc'], unit='s'),
        subreddit=dataset['subreddit'].str.lower(),
    )

    # Standardize the subreddit names the same way as the column above.
    depression_subreddits = [sub.lower().strip() for sub in depression_subreddits]

    # Create datasets for each symptom
    symptom_datasets = {
        symptom: posts[posts['subreddit'].isin([sub.lower().strip() for sub in subreddit_list])]
        for symptom, subreddit_list in symptom_to_subreddits.items()
    }

    # Identify the earliest mental health post for each author
    is_mental_health = posts['subreddit'].isin(depression_subreddits)
    earliest_mh_post = (
        posts[is_mental_health]
        .groupby('author')['created_utc']
        .min()
        .rename('earliest_mental_health_post')
        .reset_index()
    )

    # Left merge keeps every post; authors without a mental-health post get NaT.
    merged = posts.merge(earliest_mh_post, on='author', how='left')

    # Create the control dataset
    cutoff = merged['earliest_mental_health_post'] - timedelta(days=180)
    control_dataset = merged[
        (~merged['subreddit'].isin(depression_subreddits)) &
        (merged['created_utc'] <= cutoff)
    ]

    return symptom_datasets, control_dataset

In [None]:
# Build the symptom/control datasets and print their sizes plus a short preview of each.
symptom_datasets, control_dataset = dataset_generation(dataset, symptom_to_subreddits, depression_subreddits)
total_symptom_posts = sum(map(len, symptom_datasets.values()))
print(f"Total posts in symptom datasets: {total_symptom_posts}")
print(f"Total posts in control dataset: {len(control_dataset)}")
for heading, frame in (
    ("\nSample dataset for 'Anger':", symptom_datasets["Anger"]),
    ("\nSample control dataset:", control_dataset),
):
    print(heading)
    print(frame.head())

Total posts in symptom datasets: 96264
Total posts in control dataset: 4369

Sample dataset for 'Anger':
                                                    text       author  \
5338   Advice on dealing with anger? Normally I'm a c...  NeedHelpaaa   
5594   I've been to anger management 10 times all it'...  WARHULK6661   
11246                     Ripping heads off :) [removed]   Belimawr23   
13284  Things that piss me off most. Being lonely. \n...    souptrees   
20424                          Weird black guy [removed]    ArtDeco83   

      subreddit         created_utc     date  
5338      anger 2017-09-17 20:59:35  ression  
5594      anger 2017-11-24 00:39:25  ression  
11246     anger 2017-10-04 09:05:55  ression  
13284     anger 2017-12-13 01:31:56  ression  
20424     anger 2017-08-26 22:28:04  ression  

Sample control dataset:
                                                   text                author  \
315            Man, I do love me some Bandicoot crash.    BuddermanT

In [None]:
!pip install happiestfuntokenizing
from happiestfuntokenizing.happiestfuntokenizing import Tokenizer

def tokenize(symptom_data, control_data):
    """Tokenize the 'text' column of every symptom dataset and of the control dataset.

    Args:
        symptom_data: dict mapping a symptom name to a DataFrame with a 'text' column.
        control_data: DataFrame with a 'text' column.

    Returns:
        (tokenized_symptom_data, tokenized_control_data): a dict mapping each
        symptom to a list of token lists (one per post, in row order), and a
        list of token lists for the control posts.
    """
    tokenizer = Tokenizer()

    def tokenize_posts(posts):
        # Iterate the column directly: iterrows() would build a Series for every
        # row only to read a single field from it.
        return [tokenizer.tokenize(text) for text in posts["text"]]

    # Tokenize symptom datasets
    tokenized_symptom_data = {
        symptom: tokenize_posts(posts) for symptom, posts in symptom_data.items()
    }

    # Tokenize control dataset
    tokenized_control_data = tokenize_posts(control_data)

    return tokenized_symptom_data, tokenized_control_data

Collecting happiestfuntokenizing
  Downloading happiestfuntokenizing-0.0.7.tar.gz (6.3 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: happiestfuntokenizing
  Building wheel for happiestfuntokenizing (setup.py) ... [?25l[?25hdone
  Created wheel for happiestfuntokenizing: filename=happiestfuntokenizing-0.0.7-py3-none-any.whl size=6711 sha256=436638b5160158fd9e30e573b1172c2fb0f12dfce20ebd4626c91fdc90a2857c
  Stored in directory: /root/.cache/pip/wheels/bf/c9/4d/310f0c60855eb7b428558f29d93cf464dbb64c1b8628753395
Successfully built happiestfuntokenizing
Installing collected packages: happiestfuntokenizing
Successfully installed happiestfuntokenizing-0.0.7


In [None]:
# Tokenize both datasets and preview the first two tokenized posts of each.
tokenized_symptom_data, tokenized_control_data = tokenize(symptom_datasets, control_dataset)
for heading, tokenized_posts in (
    ("Tokens for 'Anxiety':", tokenized_symptom_data["Anxiety"]),
    ("\nTokens for control dataset (first 2 posts):", tokenized_control_data),
):
    print(heading)
    print(tokenized_posts[:2])

Tokens for 'Anxiety':
[['study', 'hall', 'social', 'anxiety', 'bruh', 'we', 'had', 'a', 'study', 'hall', 'in', 'gym', 'again', ',', 'as', 'usual', '.', 'i', 'usually', 'love', 'these', 'since', 'me', 'and', 'my', '2', 'friends', 'in', 'that', 'class', 'can', 'get', 'a', 'table', 'to', 'ourselves', 'and', 'be', 'loud', 'and', 'stupid', 'and', 'just', 'laugh', 'and', 'stuff', '.', 'but', 'today', 'i', 'guess', 'there', 'weren', '’', 't', 'as', 'many', 'tables', ',', 'so', 'we', 'shared', 'with', 'a', 'group', 'of', '“', 'popular', '”', 'girls', '.', 'this', 'wasn', '’', 't', 'a', 'big', 'deal', 'for', 'my', 'other', 'friends', ',', 'but', 'for', 'me', 'it', 'was', 'a', 'fucking', 'nightmaaare', '.', 'i', 'just', 'didn', '’', 't', 'have', 'the', 'confidence', 'to', 'speak', 'and', 'tell', 'jokes', 'and', 'stuff', '.', 'i', 'was', 'actually', 'sweating', 'at', 'the', 'presence', 'of', 'these', 'other', 'people', '.', 'i', 'was', 'avoiding', 'eye', 'contact', 'and', 'trying', 'to', 'face', 

In [None]:
from collections import Counter

def stop_words(tokenized_symptom_posts, tokenized_control_posts, num_stop_words=100):
    """Remove the most frequent control-dataset tokens from all posts.

    The `num_stop_words` most common tokens across the control posts are
    treated as stop words and filtered out of both the symptom posts and the
    control posts.

    Args:
        tokenized_symptom_posts: dict mapping a symptom to a list of token lists.
        tokenized_control_posts: list of token lists.
        num_stop_words: how many of the most frequent control tokens to drop
            (default 100).

    Returns:
        (filtered_symptom_posts, filtered_control_posts) with the same
        structure as the inputs; posts may become empty lists but are never
        dropped, so post counts and order are preserved.
    """
    # Count every token occurrence across the control posts.
    token_frequencies = Counter(
        word for post in tokenized_control_posts for word in post
    )

    # Set membership keeps the per-token filter O(1).
    common_tokens = {word for word, _ in token_frequencies.most_common(num_stop_words)}

    def drop_common(post_tokens):
        return [token for token in post_tokens if token not in common_tokens]

    # Filter out stop words from symptom posts
    filtered_symptom_posts = {
        symptom: [drop_common(post_tokens) for post_tokens in posts]
        for symptom, posts in tokenized_symptom_posts.items()
    }

    # Filter out stop words from control posts
    filtered_control_posts = [
        drop_common(post_tokens) for post_tokens in tokenized_control_posts
    ]

    return filtered_symptom_posts, filtered_control_posts

In [None]:
# Strip stop words from both datasets and preview the first two filtered posts of each.
filtered_symptom_posts, filtered_control_posts = stop_words(tokenized_symptom_data, tokenized_control_data)
for heading, filtered_posts in (
    ("\nFiltered tokens for loneliness posts (first 2 posts):", filtered_symptom_posts["Loneliness"]),
    ("\nFiltered tokens for control posts (first 2 posts):", filtered_control_posts),
):
    print(heading)
    print(filtered_posts[:2])


Filtered tokens for loneliness posts (first 2 posts):
[['friend', 'blanking', 'feels', 'months', 'feel', 'complete', "sh't", 'asking', 'advice', 'frustration', 'here', 'feel', 'goddamn', 'alone', 'trying', 'job', 'year', 'finally', 'got', 'interview', 'nobody', 'tell', 'feel', 'always', 'try', 'hard', 'make', 'friends', 'gappy', 'need', 'whenever', 'lowest', "they're", 'never', 'school', 'nobody', 'college', 'nobody', 'someone', 'wants', 'friends', 'everything', 'fucks', 'after', 'two', 'years', 'luck', '..'], ['2meirl42meirl4meirl']]

Filtered tokens for control posts (first 2 posts):
[['man', 'love', 'bandicoot', 'crash'], ['pc', '700-750', '$', 'budget', 'gaming', 'high', 'ultra', 'settings', 'thanks', 'https://www.youtube.com/watch', 'https://www.youtube.com/watch', 'v', '=', 'y_ulqrs', '76xs', 't', '=', '110s']]


## Reddit Topics with LDA

 - Don't use MALLET (as the paper does); use a different LDA implementation instead.

In [None]:
from gensim.models import LdaModel
from gensim.corpora import Dictionary
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import KFold, cross_val_score
from sklearn.feature_extraction.text import CountVectorizer
import numpy as np

# Symptoms that evaluate_lda() skips: no classifier is trained and no AUC is reported for them.
IGNORED_SYMPTOMS = {"Fatigue", "Concentration deficit", "Suicidal thoughts and attempts"}

def prepare_documents(filtered_symptom_posts, filtered_control_posts):
    """
    Turn token lists into space-joined document strings for LDA.

    Symptom posts come first (in the dict's iteration order, post order kept
    within each symptom), followed by the control posts.
    """
    documents = []
    for symptom_posts in filtered_symptom_posts.values():
        for post_tokens in symptom_posts:
            documents.append(" ".join(post_tokens))
    for post_tokens in filtered_control_posts:
        documents.append(" ".join(post_tokens))
    return documents

def generate_lda_inputs(documents, num_topics=10, random_state=42):
    """
    Build the gensim dictionary and bag-of-words corpus, then train an LDA model.

    The corpus is produced with the same Dictionary that is passed to the model
    as `id2word`, so token ids seen during training match the ids produced by
    `dictionary.doc2bow` at inference time. (Building the corpus from a
    separately fitted vectorizer would give each word a different id than the
    dictionary does.)

    Args:
        documents: list of whitespace-joined token strings.
        num_topics: number of LDA topics (default 10).
        random_state: seed passed to LdaModel so repeated runs are reproducible.

    Returns:
        (lda_model, corpus, dictionary)
    """
    # Documents are already tokenized and space-joined, so split() recovers the tokens.
    tokenized_docs = [doc.split() for doc in documents]
    dictionary = Dictionary(tokenized_docs)

    # Create the corpus in the dictionary's own id space.
    corpus = [dictionary.doc2bow(tokens) for tokens in tokenized_docs]

    # Train the LDA model
    lda_model = LdaModel(
        corpus=corpus,
        id2word=dictionary,
        num_topics=num_topics,
        random_state=random_state,
    )
    return lda_model, corpus, dictionary

def compute_topic_matrix(lda_model, documents, dictionary):
    """
    Compute the document-topic probability matrix for the given documents.

    Args:
        lda_model: trained LDA model exposing `num_topics` and
            `get_document_topics(bow, minimum_probability=...)`.
        documents: list of whitespace-joined token strings.
        dictionary: object whose `doc2bow(tokens)` maps tokens to the model's
            bag-of-words format.

    Returns:
        A dense array of shape (len(documents), lda_model.num_topics). Topics
        the model omits for a document are left at 0.0, so every row has the
        same width, and an empty document list yields a (0, num_topics) array
        that can still be stacked with other matrices.
    """
    topic_matrix = np.zeros((len(documents), lda_model.num_topics))
    for row, doc in enumerate(documents):
        bow = dictionary.doc2bow(doc.split())
        # Place each probability by topic id instead of relying on the model
        # returning every topic in order.
        for topic_id, prob in lda_model.get_document_topics(bow, minimum_probability=0):
            topic_matrix[row, topic_id] = prob
    return topic_matrix

def evaluate_lda(lda_model, dictionary, filtered_symptom_posts, filtered_control_posts):
    """
    Score symptom-vs-control classification on LDA topic features.

    For every symptom not listed in IGNORED_SYMPTOMS, a random forest is
    cross-validated (5 shuffled folds, ROC AUC) on the topic distributions of
    that symptom's posts (label 1) stacked on top of the control posts (label 0).

    Returns:
        dict mapping each evaluated symptom to
        {"mean_train_auc": ..., "mean_test_auc": ...}.
    """
    def topic_features(token_lists):
        joined = [" ".join(tokens) for tokens in token_lists]
        return compute_topic_matrix(lda_model, joined, dictionary)

    # Control features are shared by every symptom, so compute them once.
    control_topic_matrix = topic_features(filtered_control_posts)
    n_control = len(control_topic_matrix)

    auc_results = {}
    for symptom, posts in filtered_symptom_posts.items():
        if symptom not in IGNORED_SYMPTOMS:
            symptom_topic_matrix = topic_features(posts)
            n_symptom = len(symptom_topic_matrix)

            # Symptom rows first, then control rows; labels follow the same order.
            features = np.vstack([symptom_topic_matrix, control_topic_matrix])
            labels = np.concatenate([np.ones(n_symptom), np.zeros(n_control)])

            cv_scores = cross_validate(
                RandomForestClassifier(random_state=42),
                X=features,
                y=labels,
                cv=KFold(n_splits=5, shuffle=True, random_state=42),
                scoring='roc_auc',
                return_train_score=True,
            )

            auc_results[symptom] = {
                "mean_train_auc": np.mean(cv_scores["train_score"]),
                "mean_test_auc": np.mean(cv_scores["test_score"]),
            }

    return auc_results

# Build the LDA documents and train the topic model on them
all_documents = prepare_documents(filtered_symptom_posts, filtered_control_posts)
lda_model, lda_corpus, lda_dictionary = generate_lda_inputs(all_documents)

# Cross-validate one symptom-vs-control classifier per symptom
final_auc_results = evaluate_lda(lda_model, lda_dictionary, filtered_symptom_posts, filtered_control_posts)

# Report the mean test AUC of every evaluated symptom
print("\nFinal Results:")
for symptom in final_auc_results:
    mean_test_auc = final_auc_results[symptom]['mean_test_auc']
    print(f"{symptom}: Mean AUC = {mean_test_auc:.4f}")


Final Results:
Anger: Mean AUC = 0.7438
Anhedonia: Mean AUC = 0.8370
Anxiety: Mean AUC = 0.7417
Disordered eating: Mean AUC = 0.7416
Loneliness: Mean AUC = 0.6505
Sad mood: Mean AUC = 0.6287
Self-loathing: Mean AUC = 0.6831
Sleep problem: Mean AUC = 0.7634
Somatic complaint: Mean AUC = 0.7090
Worthlessness: Mean AUC = 0.6066


## RoBERTa Embeddings

In [None]:
from transformers import AutoTokenizer, AutoModel
import torch
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import KFold, cross_validate
from tqdm import tqdm

# Symptoms that calculate_auc_scores() skips. This rebinds the name already set
# in the LDA section to the same three values.
IGNORED_SYMPTOMS = {"Fatigue", "Concentration deficit", "Suicidal thoughts and attempts"}

def load_roberta_model():
    """
    Load the DistilRoBERTa tokenizer and model.

    The model is configured to return all hidden states, switched to eval
    mode, and moved to the GPU when one is available (otherwise it stays on
    the CPU instead of raising).

    Returns:
        (tokenizer, model)
    """
    tokenizer = AutoTokenizer.from_pretrained("distilroberta-base")
    model = AutoModel.from_pretrained("distilroberta-base", output_hidden_states=True)
    # Fall back to CPU so loading does not fail on runtimes without CUDA.
    device = "cuda" if torch.cuda.is_available() else "cpu"
    model.to(device)
    model.eval()
    return tokenizer, model

def compute_document_embeddings(documents, tokenizer, model, max_tokens=512, batch_size=32):
    """
    Compute one embedding per document using DistilRoBERTa.

    Each document embedding is the mean of the `hidden_states[5]` vectors over
    the document's real tokens. Padding positions are excluded from the mean
    via the attention mask, so the result does not depend on how much padding
    a batch needed.

    Args:
        documents: list of document strings.
        tokenizer: Hugging Face tokenizer matching `model`.
        model: Hugging Face model loaded with output_hidden_states=True.
        max_tokens: documents are truncated to this many tokens.
        batch_size: number of documents encoded per forward pass.

    Returns:
        Array of shape (len(documents), hidden_size); (0, hidden_size) when
        `documents` is empty.
    """
    documents = list(documents)
    # Run on whatever device the model already lives on instead of assuming CUDA.
    device = next(model.parameters()).device

    embeddings = []
    with tqdm(total=len(documents), desc="Processing documents") as progress:
        for start in range(0, len(documents), batch_size):
            batch = documents[start:start + batch_size]
            encoded_input = tokenizer(
                batch,
                padding=True,  # pad only up to the longest document in the batch
                truncation=True,
                max_length=max_tokens,
                return_tensors="pt"
            )
            input_ids = encoded_input["input_ids"].to(device)
            attention_mask = encoded_input["attention_mask"].to(device)

            with torch.no_grad():
                output = model(input_ids, attention_mask=attention_mask)
                layer_hidden_state = output.hidden_states[5]  # Extract 5th layer embeddings

                # Masked mean pooling: zero out pad positions, then divide by
                # the number of real tokens in each document.
                mask = attention_mask.unsqueeze(-1).to(layer_hidden_state.dtype)
                token_sum = (layer_hidden_state * mask).sum(dim=1)
                token_count = mask.sum(dim=1).clamp(min=1)
                embeddings.append((token_sum / token_count).cpu().numpy())

            progress.update(len(batch))

    if not embeddings:
        # Keep a 2-D shape so the result can still be stacked with other matrices.
        return np.empty((0, model.config.hidden_size))
    return np.concatenate(embeddings, axis=0)

def calculate_auc_scores(symptom_data, control_data, tokenizer, model):
    """
    Score symptom-vs-control classification on DistilRoBERTa embeddings.

    For every symptom not listed in IGNORED_SYMPTOMS, a random forest is
    cross-validated (5 shuffled folds, ROC AUC) on the embeddings of that
    symptom's posts (label 1) stacked on top of the control posts (label 0).

    Returns:
        dict mapping each evaluated symptom to
        {"mean_train_auc": ..., "mean_test_auc": ...}.
    """
    def embed(token_lists):
        joined = [" ".join(tokens) for tokens in token_lists]
        return compute_document_embeddings(joined, tokenizer, model)

    # Control embeddings are shared by every symptom, so compute them once.
    control_embeddings = embed(control_data)
    n_control = len(control_embeddings)

    auc_results = {}
    for symptom, posts in symptom_data.items():
        if symptom not in IGNORED_SYMPTOMS:
            symptom_embeddings = embed(posts)
            n_symptom = len(symptom_embeddings)

            # Symptom rows first, then control rows; labels follow the same order.
            features = np.vstack([symptom_embeddings, control_embeddings])
            labels = np.concatenate([np.ones(n_symptom), np.zeros(n_control)])

            cv_scores = cross_validate(
                RandomForestClassifier(random_state=42),
                X=features,
                y=labels,
                cv=KFold(n_splits=5, shuffle=True, random_state=42),
                scoring="roc_auc",
                return_train_score=True,
            )

            auc_results[symptom] = {
                "mean_train_auc": np.mean(cv_scores["train_score"]),
                "mean_test_auc": np.mean(cv_scores["test_score"]),
            }

    return auc_results

# Load model and tokenizer
tokenizer, model = load_roberta_model()

# Compute embeddings and AUC scores
# The inputs are the stop-word-filtered token lists; calculate_auc_scores()
# joins them back into strings before embedding. The result maps each evaluated
# symptom to its mean train and test AUC.
symptom_mean_aucs = calculate_auc_scores(filtered_symptom_posts, filtered_control_posts, tokenizer, model)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/480 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/331M [00:00<?, ?B/s]

Processing documents: 100%|██████████| 4369/4369 [01:04<00:00, 67.38it/s]
Processing documents: 100%|██████████| 555/555 [00:08<00:00, 63.02it/s]
Processing documents: 100%|██████████| 5934/5934 [01:37<00:00, 60.76it/s]
Processing documents: 100%|██████████| 24514/24514 [06:34<00:00, 62.06it/s]
Processing documents: 100%|██████████| 1789/1789 [00:29<00:00, 60.26it/s]
Processing documents: 100%|██████████| 11535/11535 [03:03<00:00, 62.83it/s]
Processing documents: 100%|██████████| 2222/2222 [00:36<00:00, 60.74it/s]
Processing documents: 100%|██████████| 9865/9865 [02:39<00:00, 61.94it/s]
Processing documents: 100%|██████████| 3184/3184 [00:51<00:00, 61.53it/s]
Processing documents: 100%|██████████| 8330/8330 [02:14<00:00, 61.99it/s]
Processing documents: 100%|██████████| 1805/1805 [00:29<00:00, 61.67it/s]


In [None]:
# Report the mean train/test AUC of every evaluated symptom
print("\nDistilRoBERTa Results:")
for symptom in symptom_mean_aucs:
    scores = symptom_mean_aucs[symptom]
    train_auc, test_auc = scores['mean_train_auc'], scores['mean_test_auc']
    print(f"{symptom}: Mean Train AUC = {train_auc:.4f}, Mean Test AUC = {test_auc:.4f}")


DistilRoBERTa Results:
Anger: Mean Train AUC = 1.0000, Mean Test AUC = 0.9214
Anhedonia: Mean Train AUC = 1.0000, Mean Test AUC = 0.9465
Anxiety: Mean Train AUC = 1.0000, Mean Test AUC = 0.9468
Disordered eating: Mean Train AUC = 1.0000, Mean Test AUC = 0.9466
Loneliness: Mean Train AUC = 0.9999, Mean Test AUC = 0.9021
Sad mood: Mean Train AUC = 1.0000, Mean Test AUC = 0.9059
Self-loathing: Mean Train AUC = 1.0000, Mean Test AUC = 0.9158
Sleep problem: Mean Train AUC = 1.0000, Mean Test AUC = 0.9545
Somatic complaint: Mean Train AUC = 1.0000, Mean Test AUC = 0.9183
Worthlessness: Mean Train AUC = 1.0000, Mean Test AUC = 0.8900


## Main

In [None]:
def main():
    """Run the whole pipeline end to end: load, preprocess, then evaluate LDA and DistilRoBERTa."""

    def report(heading, auc_scores):
        # Print one train/test AUC line per evaluated symptom under the heading.
        print(heading)
        for symptom, scores in auc_scores.items():
            print(f"{symptom}: Train AUC = {scores['mean_train_auc']:.4f}, Test AUC = {scores['mean_test_auc']:.4f}")

    # --- Preprocessing ---
    print("Loading dataset.")
    raw_posts = load()

    print("\nGenerating symptom and control datasets.")
    symptom_frames, control_frame = dataset_generation(raw_posts, symptom_to_subreddits, depression_subreddits)

    print("\nTokenizing posts.")
    symptom_tokens, control_tokens = tokenize(symptom_frames, control_frame)

    print("\nFiltering out stop words.")
    symptom_filtered, control_filtered = stop_words(symptom_tokens, control_tokens)

    # --- LDA topic features ---
    print("\nPreparing documents for LDA.")
    lda_documents = prepare_documents(symptom_filtered, control_filtered)

    print("\nTraining LDA model and generating inputs.")
    topic_model, _, topic_dictionary = generate_lda_inputs(lda_documents)

    print("\nEvaluating LDA AUC scores.")
    lda_auc_scores = evaluate_lda(topic_model, topic_dictionary, symptom_filtered, control_filtered)
    report("\nLDA Results:", lda_auc_scores)

    # --- DistilRoBERTa embedding features ---
    print("\nLoading DistilRoBERTa model.")
    roberta_tokenizer, roberta_model = load_roberta_model()

    print("\nEvaluating DistilRoBERTa AUC scores.")
    roberta_auc_scores = calculate_auc_scores(symptom_filtered, control_filtered, roberta_tokenizer, roberta_model)
    report("\nDistilRoBERTa Results:", roberta_auc_scores)

main()

Loading dataset.

Generating symptom and control datasets.

Tokenizing posts.

Filtering out stop words.

Preparing documents for LDA.

Training LDA model and generating inputs.

Evaluating LDA AUC scores.

LDA Results:
Anger: Train AUC = 0.9562, Test AUC = 0.6928
Anhedonia: Train AUC = 0.9764, Test AUC = 0.8319
Anxiety: Train AUC = 0.9835, Test AUC = 0.7413
Disordered eating: Train AUC = 0.9603, Test AUC = 0.7462
Loneliness: Train AUC = 0.9588, Test AUC = 0.6389
Sad mood: Train AUC = 0.9487, Test AUC = 0.6490
Self-loathing: Train AUC = 0.9706, Test AUC = 0.6826
Sleep problem: Train AUC = 0.9712, Test AUC = 0.7792
Somatic complaint: Train AUC = 0.9757, Test AUC = 0.7142
Worthlessness: Train AUC = 0.9420, Test AUC = 0.6121

Loading DistilRoBERTa model.

Evaluating DistilRoBERTa AUC scores.


Processing documents: 100%|██████████| 4369/4369 [01:10<00:00, 62.13it/s]
Processing documents: 100%|██████████| 555/555 [00:08<00:00, 61.77it/s]
Processing documents: 100%|██████████| 5934/5934 [01:39<00:00, 59.82it/s]
Processing documents: 100%|██████████| 24514/24514 [06:36<00:00, 61.77it/s]
Processing documents: 100%|██████████| 1789/1789 [00:30<00:00, 59.50it/s]
Processing documents: 100%|██████████| 11535/11535 [03:06<00:00, 61.90it/s]
Processing documents: 100%|██████████| 2222/2222 [00:36<00:00, 60.07it/s]
Processing documents: 100%|██████████| 9865/9865 [02:41<00:00, 61.14it/s]
Processing documents: 100%|██████████| 3184/3184 [00:52<00:00, 60.71it/s]
Processing documents: 100%|██████████| 8330/8330 [02:15<00:00, 61.34it/s]
Processing documents: 100%|██████████| 1805/1805 [00:29<00:00, 60.67it/s]



DistilRoBERTa Results:
Anger: Train AUC = 1.0000, Test AUC = 0.9214
Anhedonia: Train AUC = 1.0000, Test AUC = 0.9465
Anxiety: Train AUC = 1.0000, Test AUC = 0.9468
Disordered eating: Train AUC = 1.0000, Test AUC = 0.9466
Loneliness: Train AUC = 0.9999, Test AUC = 0.9021
Sad mood: Train AUC = 1.0000, Test AUC = 0.9059
Self-loathing: Train AUC = 1.0000, Test AUC = 0.9158
Sleep problem: Train AUC = 1.0000, Test AUC = 0.9545
Somatic complaint: Train AUC = 1.0000, Test AUC = 0.9183
Worthlessness: Train AUC = 1.0000, Test AUC = 0.8900
