# BERTopic
### Extensive documentation on https://maartengr.github.io/BERTopic/index.html

In [67]:
from bertopic import BERTopic

In [63]:
import pandas as pd
import re
from sklearn.feature_extraction.text import CountVectorizer
from umap import UMAP

In [10]:
df_posts = pd.read_excel('../data/raw/posts_20220712.xlsx')
df_comments = pd.read_excel('../data/raw/comments_20220712.xlsx')

In [11]:
df_posts['type'] = 'post'
df_comments['type'] = 'comment'

In [12]:
df_posts.head()

Unnamed: 0.1,Unnamed: 0,query,subreddit,id,author,title,date,body,n_comments,upvotes,type
0,122,science,statin,690frt,[deleted],A new study helps debunk the widespread belief...,1493819266,[deleted],0,0,post
1,1688,skeptic,statin,3yhkcr,FutureFormerRedditor,Are statin drugs the cause of widespread decli...,1451280370,,2,0,post
2,357,keto,statin,6kynru,Cetaphil911,Blood test help...,1499069941,I do not have the specific numbers right now b...,13,0,post
3,734,Health,statin,7t4yk,quakerorts,High-dose statin therapy promotes tumor growth,1233170813,,0,0,post
4,1100,Paleo,statin,qnmcy,zak_on_reddit,On 3/8/12 Limpdick Limbaugh is praising Paleo ...,1331231247,On THU 3/8 I'm listening to the Hillbilly Hero...,4,0,post


In [13]:
df_comments.head()

Unnamed: 0.1,Unnamed: 0,query,subreddit,id,author,title,body,date,link_id,parent_id,upvotes,type
0,0,ketoscience,statin,icnalcx,dirtyloophole,,I completely agree that there is a lot of BS Y...,1655424021,t3_rnqlnj,t1_i9t0ktq,1,comment
1,1,ketoscience,statin,i9su4gy,Etadenod,,bravo. this is the best answer why you should ...,1653396317,t3_rnqlnj,t1_hpxgazc,1,comment
2,2,ketoscience,statin,i9stx7r,Etadenod,,Evidence supports the idea that statin therapy...,1653396204,t3_rnqlnj,t1_hptsfui,1,comment
3,3,ketoscience,statin,i8gq5ls,Triabolical_,,There is unfortunately not great research on t...,1652456719,t3_uo0wco,t1_i8fwjc8,1,comment
4,4,ketoscience,statin,i60eyz4,sdavis484,,If you take a statin? Weird correlation.,1650814542,t3_uawfdn,t1_i60c4fz,1,comment


In [15]:
# Stack posts and comments into a single frame. `DataFrame.append` was
# deprecated in pandas 1.4 and removed in 2.0; `pd.concat` is the supported
# equivalent. As with `append`, each frame keeps its own row labels, so index
# values repeat across posts and comments.
df = pd.concat([df_posts, df_comments])

In [16]:
df.shape

(10553, 13)

In [17]:
df.columns

Index(['Unnamed: 0', 'query', 'subreddit', 'id', 'author', 'title', 'date',
       'body', 'n_comments', 'upvotes', 'type', 'link_id', 'parent_id'],
      dtype='object')

In [18]:
# Positional relabel of the 13 columns: 'Unnamed: 0' becomes 'index', and the
# 'query' / 'subreddit' labels trade places (in the raw previews the subreddit
# names sit under 'query' and the search term under 'subreddit').
relabelled_columns = [
    'index', 'subreddit', 'query', 'id', 'author', 'title', 'date',
    'body', 'n_comments', 'upvotes', 'type', 'link_id', 'parent_id',
]
df.columns = relabelled_columns

In [19]:
df.head(10)

Unnamed: 0,index,subreddit,query,id,author,title,date,body,n_comments,upvotes,type,link_id,parent_id
0,122,science,statin,690frt,[deleted],A new study helps debunk the widespread belief...,1493819266,[deleted],0.0,0,post,,
1,1688,skeptic,statin,3yhkcr,FutureFormerRedditor,Are statin drugs the cause of widespread decli...,1451280370,,2.0,0,post,,
2,357,keto,statin,6kynru,Cetaphil911,Blood test help...,1499069941,I do not have the specific numbers right now b...,13.0,0,post,,
3,734,Health,statin,7t4yk,quakerorts,High-dose statin therapy promotes tumor growth,1233170813,,0.0,0,post,,
4,1100,Paleo,statin,qnmcy,zak_on_reddit,On 3/8/12 Limpdick Limbaugh is praising Paleo ...,1331231247,On THU 3/8 I'm listening to the Hillbilly Hero...,4.0,0,post,,
5,1096,Paleo,statin,25oqkv,x3000gtx,Does anyone take statins?,1400213170,I only take statins so I can eat 5x more grass...,7.0,0,post,,
6,1092,Paleo,statin,4pogrr,Polskihammer,[Question] I need your help r/paleo to lower m...,1466793418,It's not too sky high. 150/90 today. I know th...,7.0,0,post,,
7,1408,Cholesterol,statin,cy2or9,amazingfacts1990,Are you using these drugs for cholesterol ? Th...,1567294360,The Zetia or Ezetimibe is a drug used to treat...,1.0,0,post,,
8,44,ketoscience,statin,d5dtyw,GlobeOShimmer,Did someone say statin?,1568707351,,0.0,0,post,,
9,273,keto,statin,bggy6m,namtrag,Lipid panel results and low dose statins,1556029294,my blood work 9 months in is horrible for many...,33.0,0,post,,


In [20]:
df.subreddit.value_counts()

keto                   2375
Cholesterol            2242
diabetes                888
science                 753
ketoscience             573
nutrition               507
news                    471
ScientificNutrition     446
todayilearned           402
conspiracy              388
Supplements             371
Health                  321
PlantBasedDiet          233
askscience              168
COVID19                 144
Paleo                   115
longevity                78
skeptic                  69
stopusingstatins          9
Name: subreddit, dtype: int64

In [21]:
# Fill empty cells, combine title and body, then strip URLs, newlines and
# HTML-escaped '>' characters from the combined text.
df['body'] = df['body'].fillna('')
df['title'] = df['title'].fillna('')
df['combi'] = df['title'] + '. ' + df['body']
# `regex` is passed explicitly: pandas >= 2.0 defaults `Series.str.replace` to
# literal matching, which would silently leave URLs and newlines in place.
# Raw strings avoid the invalid "\S" escape-sequence warning.
df['combi'] = (
    df['combi']
    .str.replace(r"http\S+", "", regex=True)   # drop URLs
    .str.replace(r"\n", " ", regex=True)       # newline character -> space
    .str.replace("&gt;", "", regex=False)      # HTML-escaped '>'
)

In [22]:
df['combi'].describe()

count     10553
unique    10474
top          . 
freq         11
Name: combi, dtype: object

In [23]:
# Keep the cleaned documents both as a Series (`texts`) and as a plain list
texts = df['combi']
texts_list = texts.tolist()

In [24]:
# Encode every document once up front, so the embeddings can be reused across
# topic-model runs instead of being recomputed each time.
from sentence_transformers import SentenceTransformer

embedding_model_name = "all-MiniLM-L6-v2"
sentence_model = SentenceTransformer(embedding_model_name)
embeddings = sentence_model.encode(texts_list, show_progress_bar=True)

Batches:   0%|          | 0/330 [00:00<?, ?it/s]

In [137]:
# If needed, this can be used to increase or decrease cluster size by changing the 'min_samples' parameter.
# A higher number means fewer clusters. Standard is 10.
#from hdbscan import HDBSCAN

#hdbscan_model = HDBSCAN(min_cluster_size=10, metric='euclidean', 
#                        cluster_selection_method='eom', prediction_data=True, min_samples=10)

In [73]:
# Fit the topic model on the precomputed sentence-transformers embeddings.
# UMAP is constructed explicitly so that a random seed can be supplied;
# nr_topics="auto" activates an automatic topic reduction method.
umap_model = UMAP(
    n_neighbors=30,
    n_components=5,
    min_dist=0.0,
    metric='cosine',
    random_state=42,
)
topic_model = BERTopic(
    umap_model=umap_model,
    nr_topics="auto",
    calculate_probabilities=True,
)
topics, probs = topic_model.fit_transform(texts_list, embeddings)

In [105]:
# Or train model on new embeddings (run either the previous cell or this cell)
#vectorizer_model = CountVectorizer(stop_words="english")
#topic_model = BERTopic(vectorizer_model=vectorizer_model, verbose=True, calculate_probabilities=True)
#topics, probs = topic_model.fit_transform(texts_list)

Batches:   0%|          | 0/330 [00:00<?, ?it/s]

2022-07-14 15:40:36,437 - BERTopic - Transformed documents to Embeddings
2022-07-14 15:40:51,102 - BERTopic - Reduced dimensionality
2022-07-14 15:40:51,661 - BERTopic - Clustered reduced embeddings


In [74]:
# Run this cell to load a pretrained topic model!
#topic_model = BERTopic.load("../models/topic_model_reduced_topics")
#probs = topic_model.hdbscan_model.probabilities_
#topics = topic_model._map_predictions(topic_model.hdbscan_model.labels_)

In [77]:
topic_model.get_topic_info()

Unnamed: 0,Topic,Count,Name
0,-1,4382,-1_the_of_to_and
1,0,2797,0_my_and_to_keto
2,1,233,1_statin_you_nation_take
3,2,224,2_crestor_side_it_my
4,3,194,3_statin_you_on_doctor
...,...,...,...
60,59,12,59_diabetes_risk_n4683_invite
61,60,11,60_women_ci_men_vascular
62,61,11,61_them_dont_prescription_pharmacist
63,62,10,62_lipitor_zocor_crestor_killer


In [76]:
topic_model.visualize_hierarchy()

In [34]:
#topic_model.visualize_heatmap()

In [75]:
topic_model.visualize_topics()

In [250]:
# manual assignment of topics to clusters
#df_topics['clusters'][df_topics['topics']==70] = 7

In [49]:
# Check if any topics are unassigned
#df_topics['topics'][df_topics['clusters']==0].unique()

In [78]:
topic_model.get_representative_docs(37)

[". Even before COVID-19, statins produce more adverse side effects than help in most people.  Sorry but this is totally inaccurate. While every drug has adverse effects (if it doesn't, then it doesn't work) statins are among the more well tolerated drugs and have shown mortality benefits both in the long term of CAD and HLD management as well is in the short term such as after acute MI. The rate of myalgia is low but not negligible, and is often manageable by a change in dosing or a change to a different statin. Overt myositis and rhabdomyolysis are rare events.",
 '. There is a lot of talk about the nocebo effect. A really interesting application of this effect which has been recently studied is in the drug class of statins. Statins are associated with the potential of causing a variety of myotoxic effects such as myalgia and very rarely rhabdomyolysis. This side effect is fairly well known by statin takers and millions of people have this effect.   There was a great study published 

In [145]:
#topic_model.save('topic_model_reduced_topics_2')

## Try with bigger n_gram range

In [79]:
# Recompute the topic keywords with a wider n-gram range (1- to 4-grams),
# dropping English stop words.
from sklearn.feature_extraction.text import CountVectorizer

vectorizer_model = CountVectorizer(stop_words="english", ngram_range=(1, 4))
topic_model.update_topics(
    texts_list,
    topics,
    vectorizer_model=vectorizer_model,
)

In [80]:
topic_model.visualize_hierarchy()

In [146]:
# Create document with topics and their keywords
topic_info_df = topic_model.get_topic_info()
topic_info_df.to_csv('topic_info_df_reduced_topics_2.csv')

## Assign outliers to topics

In [100]:
import numpy as np

# A document is assigned to its most probable topic when that probability
# reaches the threshold; otherwise it stays an outlier (-1).
probability_threshold = 0.01

new_topics = []
for prob in probs:
    best_topic = np.argmax(prob)
    if prob[best_topic] >= probability_threshold:
        new_topics.append(best_topic)
    else:
        new_topics.append(-1)

In [112]:
# Print number of outliers before and after assignment
print('Outliers before: ', topics.count(-1), '\nOutliers after: ', new_topics.count(-1))

Outliers before:  4382 
Outliers after:  1094


In [97]:
# Test whether there are texts to which multiple topics could be assigned.
# Current threshold: a runner-up topic counts when its probability is at least
# 90% of that document's highest topic probability. At most the five most
# probable topics are reported per document; documents without a close
# runner-up get an empty list.
closeness_ratio = 0.9
max_candidates = 5

multiple_topics = []
for doc_probs in probs:
    # Rank this document's own probabilities, highest first, sorting only once.
    ranked = sorted(doc_probs, reverse=True)[:max_candidates]
    candidates = []
    # Skip documents with a single topic or a best probability of 0: there is
    # no meaningful "close second" in either case.
    if len(ranked) > 1 and ranked[0] > 0:
        cutoff = ranked[0] * closeness_ratio
        # `ranked` is descending, so the values passing the cutoff always form
        # a prefix that starts with the top probability.
        close = [p for p in ranked if p >= cutoff]
        if len(close) > 1:
            candidates = close
    multiple_topics.append(candidates)

In [99]:
# prints the instances with multiple topics. If empty, none of the texts have multiple topics.
for i in multiple_topics:
    if i:
        print(i)

## Create df from topic model

In [148]:
# Group the documents by their assigned topic: {topic id: [documents]}
topic_docs = dict((topic_id, []) for topic_id in set(topics))
for assigned_topic, document in zip(topics, texts_list):
    topic_docs[assigned_topic].append(document)

In [149]:
# Flatten the per-topic grouping into three parallel lists (document, topic id,
# topic keywords), ordered by topic id.
docs_list = []
topics_list = []
keywords_list = []

# Iterate over the topic ids that actually occur rather than assuming they run
# contiguously from -1; a missing id (e.g. no outlier topic) would otherwise
# raise a KeyError.
for topic_id in sorted(topic_docs):
    topic_keywords = topic_model.get_topic(topic_id)  # looked up once per topic
    for doc in topic_docs[topic_id]:
        docs_list.append(doc)
        topics_list.append(topic_id)
        keywords_list.append(topic_keywords)

In [150]:
df_topics = pd.DataFrame({'texts':docs_list, 'topics':topics_list, 'keywords':keywords_list})

In [252]:
df_topics.to_csv('df_topics_reduced_topics.csv')

## Try with k-means instead of HDBScan (to reduce outliers)

In [43]:
from sklearn.cluster import KMeans

# Use k-means as the clustering step in place of HDBSCAN.
# k-means starts from random centroids, so the seed is fixed (same value as the
# UMAP seed used for the main model) to make the cluster assignment repeatable
# for a given set of reduced embeddings.
# NOTE(review): no seeded UMAP is passed to this BERTopic instance, so the
# dimensionality reduction may still vary between runs — confirm if full
# reproducibility is needed.
cluster_model = KMeans(n_clusters=70, random_state=42)
k_means_topic_model = BERTopic(hdbscan_model=cluster_model)

In [44]:
k_means_topics, k_means_probs = k_means_topic_model.fit_transform(texts_list, embeddings)

In [45]:
k_means_topic_model.visualize_topics()

In [46]:
k_means_topic_model.visualize_hierarchy()

## Try with seeded topics

In [26]:
# Seed word lists, one inner list per theme the model should be nudged towards.
seed_topic_list = [
    ['myalgia', 'muscle', 'pain', 'weakness', 'creatine kinase', 'CK', 'nausea', 'vomiting', 'indigestion',
     'liver enzymes', 'LFT', 'AST', 'ALT', 'memory loss', 'brain fog', 'irritable', 'dementia', 'poison', 'kill',
     'side effect', 'infection'],
    ['reduce', 'lower', 'improve', 'cholesterol', 'LDL', 'HDL', 'stroke', 'heart attack', 'artery', 'clean',
     'triglyceride'],
    ['diet', 'exercise', 'carbs', 'carbohydrates', 'fat', 'red meat', 'keto', 'fasting'],
    ['expensive', 'price', 'profit', 'insurance', 'bill', 'money', 'poor'],
    ['cardiologist', 'primary care', 'dentist'],
    ['dose', 'interaction', 'other medications', 'antibiotic'],
    ['diabetes', 'obesity', 'stroke'],
]

seeded_topic_model = BERTopic(seed_topic_list=seed_topic_list)
# Fit on the plain list of documents, as the other fit_transform calls in this
# notebook do. `texts` is a pandas Series whose row labels repeat (posts and
# comments were stacked without re-indexing), which is a fragile input here.
seeded_topics, seeded_probs = seeded_topic_model.fit_transform(texts_list)

2022-06-27 17:29:47,174 - BERTopic - Transformed documents to Embeddings
2022-06-27 17:29:57,294 - BERTopic - Reduced dimensionality
2022-06-27 17:29:57,365 - BERTopic - Clustered reduced embeddings


In [28]:
seeded_topic_model.get_topic_info()

Unnamed: 0,Topic,Count,Name
0,-1,827,-1_to_and_the_my
1,0,157,0_statins_statin_the_prevention
2,1,89,1_statins_cholesterol_of_the
3,2,81,2_ldlc_of_the_risk
4,3,63,3_my_cholesterol_and_ldl
5,4,56,4_my_ldl_paleo_and
6,5,53,5_10mg_my_dose_rosuvastatin
7,6,49,6_keto_my_ldl_cholesterol
8,7,41,7_drugs_drug_the_industry
10,8,39,8_yeast_rice_red_cholesterol


In [29]:
# Group the documents by the topic the *seeded* model assigned to them
# (`seeded_topics`), not by the assignments of the main model (`topics`).
seeded_topic_docs = {topic: [] for topic in set(seeded_topics)}
for topic, doc in zip(seeded_topics, texts_list):
    seeded_topic_docs[topic].append(doc)

In [30]:
# Flatten the seeded grouping into three parallel lists (document, topic id,
# topic keywords), ordered by topic id.
seeded_docs_list = []
seeded_topics_list = []
seeded_keywords_list = []

# Walk the topic ids that actually occur instead of assuming they run
# contiguously from -1, and read the keywords from the seeded model so they
# describe the seeded topics rather than those of the main `topic_model`.
for topic_id in sorted(seeded_topic_docs):
    topic_keywords = seeded_topic_model.get_topic(topic_id)  # once per topic
    for doc in seeded_topic_docs[topic_id]:
        seeded_docs_list.append(doc)
        seeded_topics_list.append(topic_id)
        seeded_keywords_list.append(topic_keywords)

In [31]:
seeded_df_topics = pd.DataFrame({'texts':seeded_docs_list, 'topics':seeded_topics_list, 'keywords':seeded_keywords_list})

In [32]:
seeded_df_topics.to_csv('seeded_df_topics.csv')

## Try with r/cholesterol instead of other dataset

In [33]:
from pathlib import Path

# Read the r/cholesterol exports from the project's data folder rather than a
# user-specific absolute path, so the notebook can run on other machines.
# NOTE(review): assumes these files sit next to the other raw exports in
# ../data/raw — adjust `chol_data_dir` if they live elsewhere.
chol_data_dir = Path('../data/raw')
df_chol_posts = pd.read_excel(chol_data_dir / 'posts_rcholesterol_20220623.xlsx')
df_chol_comments = pd.read_excel(chol_data_dir / 'comments_rcholesterol_20220623.xlsx')

In [34]:
df_chol_comments.columns

Index(['Unnamed: 0', 'query', 'subreddit', 'id', 'author', 'title', 'body',
       'date', 'link_id', 'parent_id', 'upvotes'],
      dtype='object')

In [35]:
# Build the cleaned 'combi' text for the r/cholesterol posts:
# title + '. ' + body, URLs removed, lower-cased, tokens starting with '@'
# dropped and whitespace collapsed to single spaces.
df_chol_posts['body'] = df_chol_posts['body'].fillna('')
# Also fill missing titles: NaN + str stays NaN, which the text cleaning below
# cannot process.
df_chol_posts['title'] = df_chol_posts['title'].fillna('')
chol_combi = df_chol_posts['title'] + '. ' + df_chol_posts['body']
# Vectorised string methods replace the row-wise DataFrame.apply calls.
chol_combi = chol_combi.str.replace(r"http\S+", "", regex=True).str.lower()
df_chol_posts['combi'] = chol_combi.map(
    lambda text: " ".join(token for token in text.split() if not token.startswith("@"))
)

In [36]:
# Documents for the r/cholesterol model: cleaned post texts followed by the raw
# comment bodies. `Series.append` was removed in pandas 2.0, so the two Series
# are joined with `pd.concat`. Missing comment bodies become empty strings,
# mirroring the fillna applied to post bodies, so the list holds only strings.
chol_texts = pd.concat([df_chol_posts['combi'], df_chol_comments['body'].fillna('')])
chol_texts_list = chol_texts.to_list()

In [37]:
# Fit a separate topic model on the r/cholesterol texts. No precomputed
# embeddings are passed, so BERTopic embeds the documents itself.
# NOTE: `vectorizer_model` is not defined in this cell; it is the
# CountVectorizer created earlier in the notebook (the n-gram section), so that
# cell must have been run first.
chol_topic_model = BERTopic(vectorizer_model=vectorizer_model, verbose=True)
chol_topics, chol_probs = chol_topic_model.fit_transform(chol_texts_list)

Batches:   0%|          | 0/83 [00:00<?, ?it/s]

2022-06-27 17:30:53,236 - BERTopic - Transformed documents to Embeddings
2022-06-27 17:31:05,722 - BERTopic - Reduced dimensionality
2022-06-27 17:31:05,824 - BERTopic - Clustered reduced embeddings


In [39]:
chol_topic_model.get_topic_info()

Unnamed: 0,Topic,Count,Name
0,-1,1026,-1_cholesterol_statin_ldl_high
1,0,166,0_ldl_statin_diet_high
2,1,149,1_crestor_taking_effects_ive
3,2,70,2_cholesterol_statin_high_high cholesterol
4,3,64,3_im_cholesterol_years_diet
5,4,61,4_cholesterol_diet_like_high
6,5,59,5_statin_life_taking statin_statin statin
7,6,55,6_statin_diet_months_exercise
8,7,52,7_lipitor_80mg_effects_different
9,8,50,8_cholesterol_pain_heart_statins


## Use below to search through topics

In [40]:
# get top words for specific topic
topic_model.get_topic(topic =0)

[('statins', 0.0443514906529007),
 ('statin', 0.03552898941863199),
 ('cardiovascular', 0.017146035487514713),
 ('primary prevention', 0.015819716478343872),
 ('prevention', 0.014476579661165027),
 ('statin discontinuation', 0.014028100402538966),
 ('study', 0.014015372996630563),
 ('event', 0.013959535781059256),
 ('discontinuation', 0.013747472771007617),
 ('primary', 0.01270304876821225)]

In [41]:
# get representative documents for specific topic
topic_model.get_representative_docs(topic=0)

["statins' flawed studies and false advertising.",
 'esselstyn: the problems with statin drugs .',
 'statin wars: have we been misled about the evidence? a narrative review [summary].']