In [None]:
import sys

# Make the project-local modules under ../libs importable from this notebook
# (presumably where model_evaluation_helpers and visualization_helpers live).
# Guarded so that re-running this cell does not keep appending duplicates.
if '../libs' not in sys.path:
    sys.path.append('../libs')

In [None]:
import pandas as pd
import numpy as np
from bertopic import BERTopic
from umap import UMAP
from hdbscan import HDBSCAN
from sklearn.feature_extraction.text import CountVectorizer

In [None]:
from model_evaluation_helpers import *
from visualization_helpers import *

In [None]:
%load_ext autoreload
%autoreload 2

### Load Data

In [None]:
# Location of the annotated tweets CSV.
path_to_annotations = "../dummy_data/text/annotated_tweets.csv"

# Read the annotations, expose `nodeID` as `tweet_id`, and derive a combined
# "<topic> - <stance>" string label for every row.
annotations = (
    pd.read_csv(path_to_annotations)
    .rename(columns={'nodeID': 'tweet_id'})
    .assign(
        label=lambda frame: frame['topic'].astype(str) + ' - ' + frame['stance'].astype(str)
    )
)

In [None]:
# CSV file with the embeddings to evaluate (here: the baseline embeddings).
path_to_model_embeddings = "../dummy_data/embeds/baseline_embs.csv"

In [None]:
# Load ids, labels, documents and embeddings through the project helper
# (presumably star-imported from model_evaluation_helpers — confirm).
# NOTE(review): assumes the four returned sequences are aligned row by row — confirm in the helper.
ids, labels, docs, embs = bertopic_load_embeddings(path=path_to_model_embeddings, test_df=annotations)

### Run BERTopic

##### UMAP and HDBSCAN parameters

In [None]:
# Keyword arguments forwarded to the UMAP constructor.
umap_args = dict(
    n_neighbors=15,
    n_components=10,
    random_state=42,  # fixed seed so the projection is reproducible
)

# Keyword arguments forwarded to the HDBSCAN constructor.
hdbscan_args = dict(
    min_cluster_size=15,
    min_samples=5,
    metric='euclidean',
    cluster_selection_method='leaf',
)

In [None]:
# Text preprocessing for the topic representations: drop English stop words and
# keep unigrams/bigrams that occur in at least two documents.
vectorizer_model = CountVectorizer(
    ngram_range=(1, 2),
    min_df=2,
    stop_words="english",
)

# Clustering and dimensionality-reduction components, built from the
# `hdbscan_args` / `umap_args` parameter dicts.
hdbscan_model = HDBSCAN(**hdbscan_args)
umap_model = UMAP(**umap_args)

In [None]:
# BERTopic model.
# Only configured here: the model is fitted by the `fit_transform(docs, embs)`
# call that follows, so chaining `.fit(docs, embs)` onto the constructor would
# run UMAP and HDBSCAN twice on the same data for no benefit.
topic_model = BERTopic(
    umap_model=umap_model,
    hdbscan_model=hdbscan_model,
    vectorizer_model=vectorizer_model,
    # Hyperparameters
    top_n_words=10,
    verbose=True,
)

In [None]:
# Fit on the documents together with their precomputed embeddings and get back
# the topic assignments and their probabilities.
topics, probs = topic_model.fit_transform(docs, embs)

In [None]:
# One row per document: its text, tweet id and label, plus the topic assigned
# by the model and the probability that came with it.
doc_to_topic = pd.DataFrame(docs, columns=['doc']).assign(
    tweet_id=ids,
    label=labels,
    topic=topics,
    prob=probs,
)