### Finding subreddit names using fuzzy matching (fuzzywuzzy)
Subreddit search results are fuzzy-matched against a list of base phrases; the example below uses the phrase "mental health".

In [2]:
import os

import praw
from fuzzywuzzy import fuzz
from fuzzywuzzy import process

# Reddit API credentials are read from environment variables so that no secret
# is stored in the notebook. Set REDDIT_CLIENT_ID and REDDIT_CLIENT_SECRET
# before running this cell; a missing variable raises KeyError.
user_agent = "Scraper 1.0 by /u/anonymous20042007"
reddit = praw.Reddit(
    client_id=os.environ["REDDIT_CLIENT_ID"],
    client_secret=os.environ["REDDIT_CLIENT_SECRET"],
    user_agent=user_agent,
)
base_words = ["mental health"]

found_subreddits = set()
# Lower-cased names of the accepted subreddits, kept alongside the set above
# so the case-insensitive duplicate check does not rebuild a list per result.
found_lowercase = set()
num_subreddits_to_find = 10

print("Searching for subreddits...")

for word in base_words:
    # Stop issuing new searches once enough subreddits have been collected.
    if len(found_subreddits) >= num_subreddits_to_find:
        break
    try:
        for subreddit in reddit.subreddits.search(query=word, limit=None):
            display_name = subreddit.display_name
            if display_name.lower() in found_lowercase:
                continue
            # Score the name against every base phrase and keep the best one.
            match = process.extractOne(display_name, base_words, scorer=fuzz.partial_ratio)
            if not match or match[1] < 70:
                continue
            # Report the phrase the score belongs to, which is not necessarily
            # the keyword that was used for the search request.
            matched_word, score = match[0], match[1]
            found_subreddits.add(display_name)
            found_lowercase.add(display_name.lower())
            print(f"Found potential subreddit: {display_name} (Matched on: '{matched_word}', Fuzzy Score: {score})")
            if len(found_subreddits) >= num_subreddits_to_find:
                break
        print(f"Finished searching with keyword: '{word}'")
    except Exception as e:
        # Best effort: report the failure and carry on with the next keyword.
        print(f"Error during search with keyword '{word}': {e}")

print("\nFound Subreddits:")
if found_subreddits:
    # Sorted so the listing is reproducible (set iteration order is arbitrary).
    for subreddit_name in sorted(found_subreddits, key=str.lower)[:num_subreddits_to_find]:
        print(subreddit_name)
else:
    print("No relevant subreddits found based on the keywords.")

Searching for subreddits...
Found potential subreddit: mentalhealth (Matched on: 'mental health', Fuzzy Score: 92)
Found potential subreddit: MentalHealthSupport (Matched on: 'mental health', Fuzzy Score: 92)
Found potential subreddit: MentalHealthUK (Matched on: 'mental health', Fuzzy Score: 92)
Found potential subreddit: MentalHealthPH (Matched on: 'mental health', Fuzzy Score: 92)
Found potential subreddit: MensMentalHealth (Matched on: 'mental health', Fuzzy Score: 96)
Found potential subreddit: MentalHealthPros (Matched on: 'mental health', Fuzzy Score: 92)
Found potential subreddit: HolisticMentalHealth (Matched on: 'mental health', Fuzzy Score: 96)
Found potential subreddit: BlackMentalHealth (Matched on: 'mental health', Fuzzy Score: 96)
Found potential subreddit: MentalHealthIsland (Matched on: 'mental health', Fuzzy Score: 92)
Found potential subreddit: MentalHealthHelp (Matched on: 'mental health', Fuzzy Score: 92)
Finished searching with keyword: 'mental health'

Found Subr

### Applying fastText Language Classifier to filter out texts  
Texts are kept when the classifier's confidence is at least 0.5 or the predicted language is English (`en`); all other texts are filtered out.

In [4]:
import pandas as pd
import fasttext
import requests

# Load the fastText language-identification model from the working directory.
# If loading fails with ValueError, download the model file and load it again.
try:
    model = fasttext.load_model('lid.176.bin')
except ValueError:
    print("Downloading fastText language identification model...")
    url = "https://dl.fbaipublicfiles.com/fasttext/supervised-models/lid.176.bin"
    # The context manager releases the connection; the timeout prevents the
    # cell from hanging forever on a stalled connection.
    with requests.get(url, stream=True, timeout=60) as response:
        # Fail loudly on HTTP errors instead of saving an error page as the model.
        response.raise_for_status()
        with open('lid.176.bin', 'wb') as f:
            for chunk in response.iter_content(chunk_size=8192):
                f.write(chunk)
    model = fasttext.load_model('lid.176.bin')
    print("Downloaded and loaded the fastText language identification model.")

In [6]:
import pandas as pd
import fasttext
import requests

# Load the fastText language-identification model from the local 'lid.176.bin'
# file; classify_language() below reads it through this module-level name.
model = fasttext.load_model('lid.176.bin')

def classify_language(text):
    """Predict the language of ``text`` with the module-level fastText ``model``.

    Args:
        text: The text to classify. Newlines are replaced by spaces before
            prediction, because fastText's ``predict`` processes one line at a
            time and raises ``ValueError`` for input containing ``\\n``; without
            this, multi-line texts would always come back unclassified.

    Returns:
        A ``(language_code, confidence)`` tuple, where ``language_code`` is the
        top prediction with its ``__label__`` prefix removed. ``(None, 0.0)``
        is returned when ``text`` is not a string or the prediction fails.
    """
    if not isinstance(text, str):
        return None, 0.0
    single_line = text.replace('\n', ' ')
    try:
        predictions = model.predict(single_line, k=1)
        language_code = predictions[0][0].replace('__label__', '')
        confidence = predictions[1][0]
        return language_code, confidence
    except Exception:
        # Best effort: an unclassifiable text is reported as (None, 0.0)
        # rather than aborting the whole DataFrame pass.
        return None, 0.0

# Load the pre-processed posts and tag every row with a predicted language
# and the classifier's confidence in that prediction.
df = pd.read_csv("processed_reddit_data.csv")

text_column_name = "proctext"

language_predictions = df[text_column_name].apply(classify_language)
df[['language', 'language_confidence']] = language_predictions.apply(pd.Series)

# A row is kept when the prediction is confident (>= 0.5) OR the predicted
# language is English; the mask is computed once and reused for both splits.
keep_mask = (df['language_confidence'] >= 0.5) | (df['language'] == 'en')

english_texts_df = df[keep_mask].copy()
removed_df = df[~keep_mask].copy()

removed_count = len(removed_df)

print(f"Original DataFrame shape: {df.shape}")
print(f"DataFrame shape after filtering (confidence >= 0.5 OR language == 'en'): {english_texts_df.shape}")
print(f"Number of texts removed: {removed_count}")

Original DataFrame shape: (4928, 14)
DataFrame shape after filtering (confidence >= 0.5 OR language == 'en'): (4810, 14)
Number of texts removed: 118


### Using Named Entity Recognition to find names and contact details
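Person names are detected with spaCy's named-entity recogniser, and e-mail addresses and phone numbers with regular expressions. If any are found, they will be redacted.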

In [23]:
import pandas as pd
import spacy
import re

# Load the spaCy "en_core_web_sm" pipeline; extract_personal_info() below
# reads its entity annotations through this module-level name.
nlp = spacy.load("en_core_web_sm")

def extract_personal_info(text):
    """Find person names, e-mail addresses and phone numbers in ``text``.

    Args:
        text: The text to scan. Anything that is not a ``str`` (for example a
            missing value) yields three empty lists.

    Returns:
        A ``(names, emails, phones)`` tuple of lists of strings. ``names`` are
        the entities the module-level spaCy pipeline ``nlp`` labels "PERSON";
        ``emails`` and ``phones`` are the full regular-expression matches.
    """
    if not isinstance(text, str):
        return [], [], []

    doc = nlp(text)
    names = [ent.text for ent in doc.ents if ent.label_ == "PERSON"]
    emails = re.findall(r"[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}", text)
    # The optional country-code prefix must be a NON-capturing group: with a
    # capturing group re.findall returns only that group's text (usually '')
    # instead of the matched phone number.
    phones = re.findall(r"(?:\+\d{1,2}\s?)?\(?\d{3}\)?[\s.-]?\d{3}[\s.-]?\d{4}", text)
    return names, emails, phones

# Scan the raw post bodies for personal information and report how many
# names, e-mail addresses and phone numbers were detected in total.
df = pd.read_csv("data_before_processing.csv")
text_column = "selftext"
extracted_info = df[text_column].apply(extract_personal_info)
df[['names', 'emails', 'phones']] = extracted_info.apply(pd.Series)

# Each cell holds a list of matches, so the total is the sum of list lengths.
name_count, email_count, phone_count = (
    df[column].map(len).sum() for column in ('names', 'emails', 'phones')
)

print(f"Number of names found: {name_count}")
print(f"Number of emails found: {email_count}")
print(f"Number of phone numbers found: {phone_count}")

Number of names found: 1102
Number of emails found: 12
Number of phone numbers found: 6


### Topic Clustering/Modelling

##### Using Top2Vec

In [60]:
# Top2Vec
from top2vec import Top2Vec

# Fit Top2Vec on the raw post bodies; missing bodies become empty strings so
# that every row contributes a document.
df = pd.read_csv("data_before_processing.csv")
df = df.assign(selftext=df["selftext"].fillna(""))
docs = df["selftext"].tolist()
# NOTE: this rebinds `model`, the name the language-classification cells use
# for the fastText model; re-run those cells before calling classify_language.
model = Top2Vec(docs)

2025-04-05 16:06:44,677 - top2vec - INFO - Pre-processing documents for training
2025-04-05 16:06:46,070 - top2vec - INFO - Downloading all-MiniLM-L6-v2 model
2025-04-05 16:06:49,189 - top2vec - INFO - Creating joint document/word embedding
2025-04-05 16:07:59,177 - top2vec - INFO - Creating lower dimension embedding of documents
2025-04-05 16:08:09,320 - top2vec - INFO - Finding dense areas of documents
2025-04-05 16:08:09,500 - top2vec - INFO - Finding topics


In [61]:
# Unpack the two values returned by get_topic_sizes() and show the first one.
topic_sizes, topic_nums = model.get_topic_sizes()
print(topic_sizes)

[4744  184]


In [62]:
# Show the topic numbers that accompany the sizes unpacked from get_topic_sizes().
print(topic_nums)

[0 1]


In [63]:
# Request 2 topics, matching the two topic sizes printed by the earlier cell.
# NOTE(review): the count is hard-coded; confirm it still matches after a re-fit.
topic_words, word_scores, topic_nums = model.get_topics(2)

In [64]:
# Print each topic number followed by its word list (scores are not shown).
for num, words in zip(topic_nums, topic_words):
    print(num)
    print(f"words: {words}")

0
words: ['depression' 'coping' 'depressive' 'depressed' 'anxiety' 'relapse'
 'suicidal' 'therapy' 'overcome' 'suicide' 'antidepressants' 'relapsed'
 'quitting' 'suffer' 'addiction' 'overwhelmed' 'therapist' 'bipolar'
 'recovery' 'psychiatrist' 'miserable' 'suffering' 'addicted' 'ocd'
 'anxious' 'dying' 'hopeless' 'advice' 'cope' 'stress' 'survive' 'self'
 'cravings' 'psych' 'struggle' 'quit' 'disorder' 'chronic' 'insomnia'
 'alone' 'struggling' 'stressed' 'diagnosed' 'heal' 'sober' 'stressful'
 'addict' 'medications' 'panic' 'medication']
1
words: ['so' 'er' 'yeah' 'oh' 'he' 'but' 'and' 'we' 'th' 'for' 'fucking' 'good'
 'of' 'she' 'that' 'my' 'its' 'ok' 'son' 've' 'okay' 'they' 'says' 'as'
 'im' 'this' 'now' 'their' 'great' 'then' 'it' 'wanna' 'isn' 'hey' 'well'
 'right' 'll' 'don' 'you' 'still' 'aren' 'last' 'to' 'the' 'or' 'other'
 'man' 'notice' 'after' 'actually']


##### Using BERTopic

In [29]:
from bertopic import BERTopic
from sentence_transformers import SentenceTransformer
import pandas as pd

# Fit BERTopic on the pre-processed texts, skipping rows without text.
df = pd.read_csv("processed_reddit_data.csv").dropna(subset=['proctext'])

# Keep only documents with more than three whitespace-separated tokens.
docs = [doc for doc in df['proctext'].tolist() if len(doc.split()) > 3]

embedding_model = SentenceTransformer("all-MiniLM-L6-v2")
topic_model = BERTopic(
    embedding_model=embedding_model,
    n_gram_range=(1, 2),
    min_topic_size=10,
)

topics, probs = topic_model.fit_transform(docs)
print(topic_model.get_topic_info())

    Topic  Count                                      Name  \
0      -1   2196                    -1_feel_like_want_know   
1       0    762                   0_addict_drug_smoke_get   
2       1    186                      1_hate_life_want_die   
3       2    175           2_relationship_friend_feel_love   
4       3    172                       3_job_work_get_want   
5       4    132                     4_take_anxieti_mg_day   
6       5    116                   5_depress_feel_like_get   
7       6    113                6_feel_like_feel like_know   
8       7    104        7_mental_mental health_help_health   
9       8     67          8_heart_chest_symptom_heart rate   
10      9     62                   9_friend_feel_talk_like   
11     10     42                10_father_dad_parent_would   
12     11     41       11_panic_attack_panic attack_breath   
13     12     39               12_adhd_depress_get_diagnos   
14     13     34               13_sleep_night_anxieti_wake   
15     1

In [30]:
# Inspect topic -1 (the outlier topic) to see which words dominate it.
topic_1_words = topic_model.get_topic(-1)
print("Top words in the Outlier Topic:")
print(topic_1_words)

# Each entry is a (word, score) pair; keep only the word.
words_to_remove_from_topic_1 = [pair[0] for pair in topic_1_words]
print("\nWords to potentially remove:")
print(words_to_remove_from_topic_1)

Top words in the Outlier Topic:
[('feel', 0.011628962657444803), ('like', 0.01106535872297469), ('want', 0.009689918554817883), ('know', 0.009243674272186912), ('get', 0.009082980626051802), ('go', 0.008760037316339111), ('life', 0.007680984181544208), ('even', 0.0076293154999353355), ('would', 0.007204829020743607), ('thing', 0.0071073999414292)]

Words to potentially remove:
['feel', 'like', 'want', 'know', 'get', 'go', 'life', 'even', 'would', 'thing']


In [45]:
from bertopic import BERTopic
from sentence_transformers import SentenceTransformer
import pandas as pd
import re
from nltk.corpus import stopwords

# Build the document list from the language-filtered frame, skipping rows
# without text and documents of three tokens or fewer.
english_texts_df = english_texts_df.dropna(subset=['proctext'])
docs = [doc for doc in english_texts_df['proctext'].tolist() if len(doc.split()) > 3]

# NLTK's English stop words plus the dominant words of the outlier topic
# printed by the previous cell.
stop_words = set(stopwords.words('english'))
custom_stopwords = ['feel', 'like', 'want', 'know', 'get', 'go', 'life', 'even', 'would', 'thing']

all_stopwords = stop_words | set(custom_stopwords)

def remove_stopwords(text):
    """Lower-case ``text`` and drop stop words and non-alphanumeric tokens.

    Tokens are split on whitespace. A token is kept only when it is not in the
    module-level ``all_stopwords`` set and consists solely of alphanumeric
    characters. The kept tokens are returned joined by single spaces.
    """
    kept_tokens = []
    for token in text.lower().split():
        if token in all_stopwords:
            continue
        if not token.isalnum():
            continue
        kept_tokens.append(token)
    return " ".join(kept_tokens)

# Re-fit BERTopic on the stop-word-free documents.
docs_no_stopwords = list(map(remove_stopwords, docs))

embedding_model = SentenceTransformer("all-MiniLM-L6-v2")
topic_model = BERTopic(
    embedding_model=embedding_model,
    n_gram_range=(1, 2),
    min_topic_size=10,
)

topics, probs = topic_model.fit_transform(docs_no_stopwords)
print(topic_model.get_topic_info())

    Topic  Count                                      Name  \
0      -1   2417                  -1_time_think_realli_tri   
1       0    771                   0_addict_drug_smoke_use   
2       1    186         1_heart_panic_attack_panic attack   
3       2    148                2_take_dose_effect_anxieti   
4       3     98           3_relationship_love_talk_realli   
5       4     82                4_depress_help_time_realli   
6       5     69                   5_friend_talk_peopl_say   
7       6     65                      6_mom_tri_dad_famili   
8       7     63              7_tire_tire tire_live_anymor   
9       8     59                   8_cut_wrist_kill_suicid   
10      9     58                   9_father_dad_parent_mom   
11     10     58              10_hate_hate hate_peopl_fuck   
12     11     52  11_mental health_health_mental_therapist   
13     12     49                 12_day_today_tomorrow_die   
14     13     41             13_job_work_interview_compani   
15     1

### Collecting outreach posts

In [46]:
!pip install emoji

Defaulting to user installation because normal site-packages is not writeable


In [60]:
import pandas as pd
import re
import emoji
from transformers import pipeline

# Load the scraped comments and discard rows that have no body text.
reddit_df = pd.read_csv("reddit_comments.csv").dropna(subset=['body'])

# Phrases whose presence suggests a comment is offering or seeking support.
outreach_words_llm = [
    "reach out",
    "connect with",
    "support group",
    "talk to someone",
    "help line",
    "crisis line",
    "community support",
    "peer support",
    "available to chat",
    "if you need to talk",
    "we are here for you",
    "offering support",
    "seeking support",
    "you are not alone",
    "mental health support",
    "get help",
    "find support",
    "join us",
    "open to talking",
    "need someone to listen",
    "safe space",
    "helpline number",
    "text line",
    "call us",
    "message me",
    "online support",
    "local resources",
    "find a therapist",
]

def remove_emojis(text):
    """Return ``text`` with every emoji replaced by an empty string."""
    return emoji.replace_emoji(text, '')

# Store an emoji-free copy of each comment body in a new column.
reddit_df['body_no_emoji'] = reddit_df['body'].map(remove_emojis)

In [61]:
def identify_potential_outreach_posts(df, text_column='body_no_emoji', outreach_words=None):
    """Score each row by how many distinct outreach phrases its text contains.

    Args:
        df: DataFrame holding the texts. It is modified in place: an
            ``outreach_confidence`` column is added or overwritten.
        text_column: Name of the column to scan. Values are converted with
            ``str()`` and lower-cased before matching.
        outreach_words: Iterable of phrases to look for. Defaults to the
            module-level ``outreach_words_llm`` list, looked up at call time.

    Returns:
        The same ``df`` object, with ``outreach_confidence`` set to the number
        of phrases that occur in the row's text as whole words (case-insensitive).
    """
    if outreach_words is None:
        outreach_words = outreach_words_llm

    # Compile every phrase once instead of rebuilding the pattern per row.
    patterns = [
        re.compile(r'\b' + re.escape(word.lower()) + r'\b')
        for word in outreach_words
    ]

    def count_matches(value):
        text = str(value).lower()
        return sum(1 for pattern in patterns if pattern.search(text))

    # Positional, element-wise scoring. Assigning through df.loc[index, ...]
    # row by row would overwrite every row sharing a duplicated index label.
    df['outreach_confidence'] = df[text_column].map(count_matches).astype('int64')
    return df

# Score every comment and preview the first few rows with their scores.
potential_outreach_df = identify_potential_outreach_posts(reddit_df)
preview_columns = ['body', 'body_no_emoji', 'outreach_confidence']
print("Potential Outreach Posts (based on keywords):\n", potential_outreach_df[preview_columns].head())

Potential Outreach Posts (based on keywords):
                                                 body  \
0  Hi everyone, I am recruiting participants for ...   
1  \ni literally feel like my mind cannot underst...   
2  if you live in Canada and need help with anxie...   
3  We are seeking individuals with depression and...   
4  I texted a suicide/crisis hotline because I ha...   

                                       body_no_emoji  outreach_confidence  
0  Hi everyone, I am recruiting participants for ...                    0  
1  \ni literally feel like my mind cannot underst...                    0  
2  if you live in Canada and need help with anxie...                    0  
3  We are seeking individuals with depression and...                    0  
4  I texted a suicide/crisis hotline because I ha...                    0  


In [62]:
# Keep only the comments that matched more than one outreach phrase.
is_high_confidence = potential_outreach_df['outreach_confidence'] > 1
high_confidence_df = potential_outreach_df[is_high_confidence]
print(high_confidence_df[['body', 'body_no_emoji', 'outreach_confidence']].head())

                                                  body  \
121  I (17) used to see a therapist for about a yea...   
201  Hello this is me trying to reach out to create...   
497  Starting now and for the next couple of days, ...   
695  I know theres something wrong with me, but i d...   
700  I haven't posted on Reddit before and I don't ...   

                                         body_no_emoji  outreach_confidence  
121  I (17) used to see a therapist for about a yea...                    2  
201  Hello this is me trying to reach out to create...                    3  
497  Starting now and for the next couple of days, ...                    2  
695  I know theres something wrong with me, but i d...                    2  
700  I haven't posted on Reddit before and I don't ...                    2  
