In [6]:
!pip install bertopic



In [7]:
import os
import pickle

import nltk
import pandas as pd
from bertopic import BERTopic
from hdbscan import HDBSCAN
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import CountVectorizer
from umap import UMAP
# Fetch NLTK's 'punkt' data package; per the log output this is a no-op when the
# package is already up to date. Presumably needed by word_tokenize — confirm.
nltk.download('punkt')

[nltk_data] Downloading package punkt to /Users/sam/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [12]:
class topic_modeling():
    """Convenience wrapper that runs BERTopic on the ``text`` column of a CSV file.

    Attributes:
        name: Base path (without the ".csv" suffix) the data was read from.
        df: DataFrame read from ``<filename>.csv``.
        cluster_df: Per-document topic assignments ("docs", "topics"); empty
            until ``save_models`` or ``load_model`` has been called.
        df_info: Topic summary table; empty until ``save_models`` or
            ``load_model`` has been called.
        topics: Topic id per document from the last fit ('' before fitting).
        topic_model: The BERTopic instance ('' before fitting/loading).
        probs: Second value returned by ``fit_transform`` (0 before fitting).
    """

    def __init__(self, filename):
        """Read ``filename + ".csv"`` and initialise empty result holders.

        Args:
            filename: Path to the input CSV *without* the ".csv" extension.
        """
        self.name = filename
        self.df = pd.read_csv(filename+".csv")
        self.cluster_df = pd.DataFrame()
        self.df_info = pd.DataFrame()
        self.topics = ''
        self.topic_model = ''
        self.probs = 0

    def get_dataframe(self):
        """Return the DataFrame that was read from the input CSV."""
        return self.df

    def display(self):
        """Render the first rows of the input data.

        Relies on the ``display`` function that IPython/Jupyter makes
        available in the notebook namespace.
        """
        display(self.df.head())

    #Function to do topic modeling using BERT
    def bert_topic_modeling(self, min_cluster_size = 2500,min_samples=1000,n_neighbors=200,seed_topic_list=None):
        """Fit a seeded BERTopic model on ``self.df['text']``.

        The fitted model is stored in ``self.topic_model`` and the values
        returned by ``fit_transform`` in ``self.topics`` and ``self.probs``.

        Args:
            min_cluster_size: Forwarded to HDBSCAN.
            min_samples: Forwarded to HDBSCAN.
            n_neighbors: Forwarded to UMAP.
            seed_topic_list: Optional list of keyword lists used to seed the
                topics. When None, the built-in two-topic seed list is used.
        """
        if seed_topic_list is None:
            seed_topic_list = [["trump", "donald", "conservative"],
                               ["biden", "joe", "china", "blue","liberal"]]
        vectorizer_model = CountVectorizer(stop_words="english")
        # Honour the caller's n_neighbors instead of a hard-coded value;
        # random_state is fixed so repeated runs use the same seed.
        umap_model = UMAP(n_neighbors=n_neighbors, n_components=10, min_dist=0.0, metric='cosine', random_state=42)
        hdbscan_model = HDBSCAN(min_cluster_size=min_cluster_size, min_samples=min_samples, metric='euclidean', cluster_selection_method='eom', prediction_data=True)
        self.topic_model = BERTopic(vectorizer_model=vectorizer_model,language="english", calculate_probabilities=True, verbose=True, hdbscan_model = hdbscan_model, umap_model = umap_model, seed_topic_list=seed_topic_list)
        # BERTopic documents its input as a list of strings, so hand it a plain
        # list rather than a pandas Series (whose index could be non-default).
        documents = self.df['text'].tolist()
        self.topics, self.probs = self.topic_model.fit_transform(documents)

    def save_models(self, name):
        """Save the fitted model, the per-document topics and the topic summary.

        Writes three artefacts under ``Output/``: the model itself (``name``),
        ``<name>_dataframe.csv`` (docs + topic ids) and ``<name>_info.csv``
        (the table returned by ``get_topic_info``).

        Args:
            name: Base name used for all three output files.
        """
        base_path = "Output/"+name
        # Make sure the target directory exists before anything is written.
        os.makedirs(os.path.dirname(base_path) or ".", exist_ok=True)
        self.topic_model.save(base_path)
        self.cluster_df = pd.DataFrame(data={"docs":self.df['text'],"topics":self.topics})
        self.cluster_df.to_csv(base_path+"_dataframe"+".csv")
        self.df_info = self.topic_model.get_topic_info()
        self.df_info.to_csv(base_path+"_info"+".csv")

    def load_model(self,filename):
        """Reload the artefacts previously written by ``save_models``.

        The two CSV files are always read. The BERTopic model is restored as
        well when a saved model exists at ``Output/<filename>``; otherwise
        ``self.topic_model`` is left untouched.

        Args:
            filename: The base name that was passed to ``save_models``.
        """
        base_path = "Output/"+filename
        if os.path.exists(base_path):
            # NOTE(review): BERTopic's default serialization is pickle-based,
            # so only load model files from a trusted source.
            self.topic_model = BERTopic.load(base_path)
        self.cluster_df = pd.read_csv(base_path+"_dataframe"+".csv")
        self.df_info = pd.read_csv(base_path+"_info"+".csv")

    def analyze_topics(self,n,visualise_topics=False,visualize_hierarchy=False):
        """Display the topic overview and the keywords of a single topic.

        Args:
            n: Topic id passed to ``get_topic``.
            visualise_topics: When True, also render ``visualize_topics()``.
            visualize_hierarchy: When True, also render ``visualize_hierarchy()``.
        """
        print("Displaying the types of topics")
        display(self.topic_model.get_topic_info())
        print("Displaying the import topic keyword probability")
        display(self.topic_model.get_topic(n))
        # The visualize_* calls only return a figure; inside a method it must be
        # passed to display() explicitly or nothing is rendered.
        if visualise_topics:
            display(self.topic_model.visualize_topics())
        if visualize_hierarchy:
            display(self.topic_model.visualize_hierarchy())

## Topic Modeling

In [13]:
# Base path of the input CSV; the class appends ".csv" itself.
DATA_BASENAME = 'Output/trimmed_pre_election_punctuation_CNN_20'

# Load the data into the helper object and preview the first rows.
obj = topic_modeling(filename=DATA_BASENAME)
obj.display()

Unnamed: 0.2,Unnamed: 0.1,Unnamed: 0,videoId,commentId,text,likeCount,totalReplyCount,author,timeline
0,0,0,-74iHBJy4xU,UgxoHVgvHDNLkt6TXDh4AaABAg,vote early! the majority of america is sick an...,4,1,Frank Merfalen,2020-10-19 00:00:01+00:00
1,1,1,xmW00oZoNcM,UgxLObImDQVnPGNXRbB4AaABAg,putting them in the worst possible light . \nr...,0,0,morning coffee,2020-10-19 00:00:24+00:00
2,2,2,oTLNgK2j-Bk,UgwDsb_F81D0Ce9W-Od4AaABAg,debates hands down ! the fly landed on mike pe...,1,0,Roman Ward,2020-10-19 00:01:11+00:00
3,3,3,-74iHBJy4xU,Ugw_1P5ss3TXcMU5P6J4AaABAg,joe biden does not do anything how he going to...,0,0,Hmong Shee Yee,2020-10-19 00:01:17+00:00
4,4,4,dC6PxLJ3dDU,UgyfpHlIaf_iVhpX8ER4AaABAg,what part of mail in voting and absentee votin...,0,0,Allan Cresswell,2020-10-19 00:01:19+00:00


In [None]:
# Fit the topic model with the default parameters spelled out explicitly.
obj.bert_topic_modeling(
    min_cluster_size=2500,
    min_samples=1000,
    n_neighbors=200,
)

In [14]:
# Persist the fitted model together with the per-document and per-topic CSVs.
obj.save_models(name="topic_model_2500")

In [None]:
# Optional: instead of re-fitting, uncomment to reload previously saved outputs.
# obj.load_model("topic_model_2500")

In [None]:
# Show the topic overview and the keyword weights of topic 0; both plots stay off.
obj.analyze_topics(n=0, visualise_topics=False, visualize_hierarchy=False)

In [None]:
# Keep a handle on the input DataFrame that the object read from the CSV.
df = obj.get_dataframe()

In [None]:
# Topic assignments live in obj.cluster_df (built by save_models / load_model);
# the frame returned by get_dataframe() never gets a 'topics' column, so
# indexing it with 'topics' would raise a KeyError.
obj.cluster_df['topics'].value_counts()