In [None]:
!pip install bertopic

In [None]:
from bertopic import BERTopic
from umap import UMAP
from hdbscan import HDBSCAN
import pandas as pd
import pickle
from sklearn.feature_extraction.text import CountVectorizer
from nltk.tokenize import word_tokenize
import nltk
# Download NLTK's 'punkt' resource at import time.
# NOTE(review): presumably required by word_tokenize (imported above); none of the
# visible cells call it — confirm it is still needed.
nltk.download('punkt')

In [None]:
class topic_modeling():
    """Convenience wrapper that runs a BERTopic pipeline over one CSV corpus.

    ``<filename>.csv`` is read when the object is created; its ``text`` column
    supplies the documents for topic modelling.

    Attributes set in ``__init__``:
        name: the ``filename`` argument (without the ``.csv`` suffix).
        df: DataFrame read from ``<filename>.csv``.
        cluster_df: document/topic table; empty until ``save_models`` or
            ``load_model`` fills it.
        df_info: topic info table; empty until ``save_models`` or
            ``load_model`` fills it.
        topics, probs: results of ``fit_transform``; placeholders (``''`` and
            ``0``) until ``bert_topic_modeling`` has run.
        topic_model: the BERTopic instance; placeholder ``''`` until
            ``bert_topic_modeling`` or ``load_model`` sets it.

    The class calls ``display(...)``, which is expected to be the function that
    IPython/Jupyter makes available in notebook cells.
    """

    def __init__(self, filename):
        """Read ``filename + ".csv"`` and initialise empty result holders."""
        self.name = filename
        self.df = pd.read_csv(filename+".csv")
        self.cluster_df = pd.DataFrame()
        self.df_info = pd.DataFrame()
        self.topics = ''
        self.topic_model = ''
        self.probs = 0

    def get_dataframe(self):
        """Return the DataFrame loaded from the input CSV."""
        return self.df

    def display(self):
        """Show the first rows of the input DataFrame in the notebook."""
        display(self.df.head())

    def bert_topic_modeling(self, min_cluster_size=2500, min_samples=1000, n_neighbors=200):
        """Build a BERTopic model and fit it on ``self.df['text']``.

        Args:
            min_cluster_size: forwarded to ``HDBSCAN(min_cluster_size=...)``.
            min_samples: forwarded to ``HDBSCAN(min_samples=...)``.
            n_neighbors: forwarded to ``UMAP(n_neighbors=...)``.

        Side effects:
            Sets ``self.topic_model`` to the new model and ``self.topics`` /
            ``self.probs`` to the two values returned by ``fit_transform``.
        """
        vectorizer_model = CountVectorizer(stop_words="english")
        # Honour the caller's n_neighbors; it used to be hard-coded to 200 here,
        # which silently ignored the argument.
        umap_model = UMAP(n_neighbors=n_neighbors, n_components=10, min_dist=0.0, metric='cosine', random_state=42)
        hdbscan_model = HDBSCAN(min_cluster_size=min_cluster_size, min_samples=min_samples, metric='euclidean', cluster_selection_method='eom', prediction_data=True)
        # Seed word lists passed to BERTopic as seed_topic_list.
        seed_topic_list = [["trump", "donald", "conservative"],
                           ["biden", "joe", "china", "blue", "liberal"]]
        self.topic_model = BERTopic(vectorizer_model=vectorizer_model, language="english", calculate_probabilities=True, verbose=True, hdbscan_model=hdbscan_model, umap_model=umap_model, seed_topic_list=seed_topic_list)
        self.topics, self.probs = self.topic_model.fit_transform(self.df['text'])

    def save_models(self, name):
        """Persist the fitted model and two CSV summaries.

        Writes three artefacts:
            ``name``                    - the model, via ``topic_model.save``.
            ``name + "_dataframe.csv"`` - columns ``docs`` and ``topics``.
            ``name + "_info.csv"``      - the table from ``get_topic_info()``.

        Also stores the two tables on ``self.cluster_df`` and ``self.df_info``.
        Both CSVs are written with the DataFrame index included.
        """
        self.topic_model.save(name)
        self.cluster_df = pd.DataFrame(data={"docs":self.df['text'],"topics":self.topics})
        self.cluster_df.to_csv(name+"_dataframe"+".csv")
        self.df_info = self.topic_model.get_topic_info()
        self.df_info.to_csv(name+"_info"+".csv")

    def load_model(self, filename, restore_model=True):
        """Reload the artefacts written by ``save_models(filename)``.

        Args:
            filename: the same base name that was given to ``save_models``.
            restore_model: when True (default), also reload the BERTopic model
                from ``filename`` if that path exists. Pass False to read only
                the two CSV tables.

        Side effects:
            Sets ``self.cluster_df`` and ``self.df_info`` from the CSV files,
            and ``self.topic_model`` when a saved model is found.
        """
        import os
        import warnings

        self.cluster_df = pd.read_csv(filename+"_dataframe"+".csv")
        self.df_info = pd.read_csv(filename+"_info"+".csv")

        if not restore_model:
            return
        if os.path.exists(filename):
            # NOTE(security): depending on how the model was saved, loading it may
            # unpickle data - only load model files you trust.
            self.topic_model = BERTopic.load(filename)
        else:
            # Keep whatever model is already in memory, but say so instead of
            # failing later with an obscure error.
            warnings.warn(
                "No saved BERTopic model found at %r; topic_model was left unchanged." % (filename,)
            )

    def analyze_topics(self, n, visualise_topics=False, visualize_hierarchy=False):
        """Display the topic overview and the keywords of one topic.

        Args:
            n: topic id passed to ``topic_model.get_topic``.
            visualise_topics: when True, also display ``visualize_topics()``.
            visualize_hierarchy: when True, also display ``visualize_hierarchy()``.
        """
        print("Displaying the types of topics")
        display(self.topic_model.get_topic_info())
        print("Displaying the import topic keyword probability")
        display(self.topic_model.get_topic(n))
        # A bare expression inside a method is never rendered by the notebook, so
        # the objects returned by the visualize_* calls must go through display().
        if visualise_topics:
            display(self.topic_model.visualize_topics())
        if visualize_hierarchy:
            display(self.topic_model.visualize_hierarchy())

## Topic Modeling

In [None]:
# Load 'trimmed_pre_election_punctuation_CNN_20.csv' (the class appends ".csv")
# and preview its first rows.
obj = topic_modeling('trimmed_pre_election_punctuation_CNN_20')
obj.display()

In [None]:
# Fit the topic model on the 'text' column of the loaded CSV.
obj.bert_topic_modeling() # Default parameters (min_cluster_size=2500, min_samples=1000, n_neighbors=200)

In [None]:
# Writes the model to "topic_model_2500" plus "topic_model_2500_dataframe.csv"
# and "topic_model_2500_info.csv".
obj.save_models("topic_model_2500") # Saving

In [None]:
# Reads "topic_model_2500_dataframe.csv" and "topic_model_2500_info.csv" back into obj.
obj.load_model("topic_model_2500") # Loading

In [None]:
# Show the topic overview table and the keywords of topic 0.
obj.analyze_topics(0) # n=0; defaults: visualise_topics=False, visualize_hierarchy=False

In [None]:
# Re-read the document/topic table that save_models wrote to disk.
df = pd.read_csv("topic_model_2500_dataframe.csv")

In [None]:
# Number of documents assigned to each topic id.
df['topics'].value_counts()