# Implementing BERTopic to identify Topic Labels

In [199]:
from collections import Counter
import itertools

from bertopic import BERTopic
from bertopic.vectorizers import ClassTfidfTransformer
import pandas as pd
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import numpy as np
from sklearn.feature_extraction.text import (
    CountVectorizer,
    TfidfVectorizer)
from hdbscan import HDBSCAN

In [67]:
# Load the processed reviews; the path is relative to the directory the
# notebook is run from.
data = pd.read_csv("../data/processed/reviews.csv")
# The "partially_cleaned_text" column, as a plain Python list, is the corpus
# the topic model is fitted on.
# NOTE(review): `text` is also the name of scikit-learn's
# `sklearn.feature_extraction.text` module; keep the two apart so this list
# is never mistaken for (or shadowed by) the module.
text = list(data["partially_cleaned_text"])

In [122]:
# Word-count representation with scikit-learn's built-in English stop words
# removed.
vectorizer_model = CountVectorizer(stop_words="english")
# Class-based TF-IDF weighting with the "reduce frequent words" option on.
ctfidf_model = ClassTfidfTransformer(reduce_frequent_words=True)
# Clustering model handed to BERTopic; clusters need at least 50 documents.
hdbscan_model = HDBSCAN(min_cluster_size=50,
                        metric='euclidean',
                        cluster_selection_method='leaf',
                        prediction_data=True)

# 27 topics were chosen after trying various permutations: the model could
# find up to 27 topics when a minimum cluster size of 50 was used.
# NOTE(review): no random seed is fixed anywhere in this cell, so topic ids
# and sizes may differ between runs — confirm whether reproducibility matters.
model = BERTopic(vectorizer_model=vectorizer_model,
                 ctfidf_model=ctfidf_model,
                 hdbscan_model=hdbscan_model,
                 nr_topics=27)
# Fit on the review corpus; `topics` is later stored as one label per review.
topics, probs = model.fit_transform(text)

In [127]:
# Tally how many documents were assigned to each topic id.
# NOTE(review): in BERTopic the id -1 conventionally collects outlier
# documents that fit no cluster — confirm against the BERTopic docs.
Counter(topics)

Counter({19: 70,
         22: 63,
         1: 319,
         7: 141,
         4: 160,
         8: 124,
         -1: 2102,
         6: 142,
         17: 77,
         3: 195,
         9: 123,
         5: 159,
         10: 120,
         0: 353,
         14: 89,
         12: 102,
         2: 273,
         26: 57,
         13: 102,
         15: 86,
         24: 61,
         23: 63,
         25: 60,
         16: 85,
         11: 115,
         21: 64,
         18: 75,
         20: 64})

In [123]:
# Store the predicted topic id next to each review so that reviews can be
# filtered per topic (visualise_top_words queries this column).
data['pred_topic'] = topics

In [244]:
# In this notebook `text` is bound to the list of review strings, so the
# scikit-learn stop-word set must be imported by name; reaching for it as
# `text.ENGLISH_STOP_WORDS` fails with an AttributeError on the list.
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS

# Generic review words excluded from the top-word ranking in addition to the
# standard English stop words.
custom_sw = ['great', 'taste', 'good', 'like', 'product', 'flavor',
             'love', 'really', 'buy', 'tastes', 'better', 'best',
             'tried', 'use', 'eat', 'food', 'make']
# TfidfVectorizer accepts a list for `stop_words`, hence the conversion from
# the frozenset returned by `union`.
my_stop_words = list(ENGLISH_STOP_WORDS.union(custom_sw))

In [245]:
def get_top_words(corpus, n_words=7):
    """Rank the terms of *corpus* by their summed TF-IDF weight.

    A ``TfidfVectorizer`` (sublinear term frequency, ``my_stop_words``
    removed) is fitted on the corpus, the weight of every vocabulary term is
    summed over all documents, and the highest-scoring terms are returned.

    Args:
        corpus: Iterable of document strings, e.g. a pandas Series of text.
        n_words: How many of the top-ranked terms to return. Defaults to 7,
            the value that was previously hard-coded.

    Returns:
        A DataFrame with the columns ``"top words"`` and ``"count"``, sorted
        from the highest to the lowest score. Despite its name, ``"count"``
        holds summed TF-IDF weights (floats), not raw occurrence counts; the
        column name is kept so existing callers keep working.
    """
    vec = TfidfVectorizer(stop_words=my_stop_words, sublinear_tf=True)
    # fit_transform learns the vocabulary and weights the corpus in a single
    # pass; it is equivalent to calling fit() and then transform().
    weights = vec.fit_transform(corpus)
    # Column sums: one total weight per vocabulary term (a 1 x n_terms matrix).
    totals = weights.sum(axis=0)
    words_freq = [(word, totals[0, idx])
                  for word, idx in vec.vocabulary_.items()]
    words_freq.sort(key=lambda pair: pair[1], reverse=True)
    return pd.DataFrame(words_freq[:n_words],
                        columns=["top words", "count"])

In [240]:
# Top-weighted words across every review in the "cleaned_text" column,
# regardless of predicted topic.
get_top_words(data["cleaned_text"])

Unnamed: 0,top words,count
0,price,107.4919
1,coffee,105.827711
2,tea,87.658198
3,amazon,72.196191
4,little,69.018259
5,time,68.237516
6,tasty,67.418378


In [246]:
def visualise_top_words(df, n_topics):
    """Draw a grid of horizontal bar charts, one per topic.

    A subplot is created for every topic id in ``range(n_topics)``. The
    documents of a topic are the rows of *df* whose ``pred_topic`` equals
    that id; their ``cleaned_text`` is passed to ``get_top_words`` and the
    resulting words and scores become the bars.

    Args:
        df: DataFrame with ``pred_topic`` and ``cleaned_text`` columns.
        n_topics: Number of topics to plot, starting from topic 0.

    Returns:
        The plotly figure holding all subplots.
    """
    # Bar colours, reused in order once every colour has been taken.
    palette = ["#D55E00",
               "#0072B2",
               "#CC79A7",
               "#E69F00",
               "#56B4E9",
               "#009E73",
               "#F0E442"]

    n_cols = 4
    n_rows = int(np.ceil(n_topics / n_cols))
    multi_row = n_rows > 1

    fig = make_subplots(
        rows=n_rows,
        cols=n_cols,
        shared_xaxes=False,
        horizontal_spacing=.1,
        vertical_spacing=.4 / n_rows if multi_row else 0,
        subplot_titles=[f"Topic {topic_id}" for topic_id in range(n_topics)],
    )

    for topic_id in range(n_topics):
        # Subplots are filled left to right, top to bottom; plotly grid
        # positions are 1-based.
        grid_row, grid_col = divmod(topic_id, n_cols)

        topic_docs = df.query(f"pred_topic == {topic_id}")
        top_words = get_top_words(topic_docs["cleaned_text"])

        bars = go.Bar(x=top_words["count"],
                      y=top_words["top words"],
                      orientation='h',
                      marker_color=palette[topic_id % len(palette)])
        fig.add_trace(bars, row=grid_row + 1, col=grid_col + 1)

    title_font = dict(size=22, color="Black")
    hover_style = dict(bgcolor="white",
                       font_size=16,
                       font_family="Rockwell")
    fig.update_layout(
        template="plotly_white",
        showlegend=False,
        title={
            'text': "Top Words",
            'x': .5,
            'xanchor': 'center',
            'yanchor': 'top',
            'font': title_font,
        },
        width=1000,
        # A single-row figure gets extra height so the title does not crowd it.
        height=250 * n_rows if multi_row else 250 * 1.3,
        hoverlabel=hover_style,
    )

    return fig

In [242]:
# Plot topics 0-26, matching the nr_topics=27 used when fitting the model.
visualise_top_words(data, 27)
# BERTopic performs quite well compared to the other models.
# This visualisation is produced by a function written by our team.

In [134]:
# Produced using BERTopic's built-in bar-chart visualisation.
# More interpretable than the chart produced by our own function.
# NOTE(review): the original note attributed this to "word vectors";
# BERTopic's documentation describes the bars as c-TF-IDF word scores —
# confirm the intended explanation.
model.visualize_barchart(top_n_topics=27)