In [1]:
import random

In [2]:
random_seed = 10
random.seed(random_seed)

In [3]:
%pip install swifter
%pip install python-dotenv
%pip install bertopic
%pip install tensorflow
%pip install sentence-transformers
%pip install gensim



In [4]:
from google.colab import drive
drive.mount('/content/gdrive')

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [5]:
import numpy as np
np.random.seed(10)
import pandas as pd
import nltk
nltk.download('stopwords', quiet=True)
from nltk.corpus import stopwords
import string
import plotly.express as px
import matplotlib.pyplot as plt

from datetime import datetime
import logging
import os
import sys
import time
import re

from dotenv import load_dotenv
import swifter
from sentence_transformers import SentenceTransformer


from bertopic import BERTopic
import joblib
from sklearn.preprocessing import normalize
from wordcloud import WordCloud

In [6]:
#shared drive version at everyone can set up
df = pd.read_csv('/content/gdrive/MyDrive/Group 3: palm oil topic classifier/Data/palm_oil_grievance_logs.csv')
#df.head(5)

## Preprocessing

In [7]:
#stop words, lower case, and remove punctuation
stop_words = set(stopwords.words('english'))
punctuation = set(string.punctuation)

# preprocessing function
def clean_text(text):
    if not isinstance(text, str):
        text = str(text)

    text = text.lower()
    # Use regular expression to replace all punctuation with a space
    text = re.sub(f'[{re.escape("".join(punctuation))}]', ' ', text)
    tokens = text.split()
    # Removed the strip() call as regex handles punctuation removal
    # tokens = [word.strip("".join(punctuation)) for word in tokens]
    tokens = [word for word in tokens if word and word not in stop_words]

    return tokens

# Add cleaned tokens to the df
df["tokens"] = df["summary"].apply(clean_text)

# Define the list of words to filter out
words_to_filter = {'nan', 'pt','report','rspo','alleged', 'palm', 'oil', 'company', 'community', 'complainant',
                   'companies', 'also', 'without', 'group', 'allegedly', 'period', 'respondent', 'reported',
                   'mentioned', '2019', '19', '2020', 'subsidary', 'alleges'}

# filter out words that we think are not helpful
df['flt_tokens'] = df['tokens'].apply(lambda tokens: [word for word in tokens if word not in words_to_filter])


df['flt_tokens'] = df['flt_tokens'].apply(lambda x: ' '.join(x))

## Embedding

In [8]:
logging.basicConfig(format='%(asctime)s: %(levelname)s: %(message)s', level=logging.INFO)

BATCH_SIZE = 100
output_dir = "../data/intermediate"
os.makedirs(output_dir, exist_ok=True)

# load model once
model = SentenceTransformer('all-MiniLM-L6-v2')

# define embedding function
def get_embedding(text) -> list:
    if not text:
        return None
    try:
        return model.encode(text)
        #adding this to see the shape of the embedding
        logging.info(f"Embedding shape for input: {emb.shape}")
        return emb
    except Exception as e:
        logging.error(f"Error getting embedding for text: {str(text)[:50]}... Error: {e}")
        return None

# main batch embedding function
def run_local_embeddings():
    formatted_datetime = datetime.now().strftime("%d_%b_%Y_%H_%M_%S")
    n = len(df)

    if 'embedding' not in df.columns:
        df['embedding'] = None

    df_start = 0
    while df_start < n:
        df_intermediate = df[df_start:df_start + BATCH_SIZE].copy()

        # Only process rows that do NOT have numpy array embeddings yet
        df_intermediate_unprocessed = df_intermediate.loc[
            df_intermediate['embedding'].apply(lambda x: not isinstance(x, np.ndarray))
        ]
        unprocessed_rows = len(df_intermediate_unprocessed)

        if unprocessed_rows == 0:
            logging.info(f"No unprocessed rows in batch starting at {df_start}")
        else:
            logging.info(f"Running embeddings on {unprocessed_rows} rows in batch starting at {df_start}")
            try:
                df_intermediate_unprocessed["embedding"] = df_intermediate_unprocessed["flt_tokens"].swifter.apply(
                    get_embedding
                )

                successful = df_intermediate_unprocessed.loc[
                    df_intermediate_unprocessed['embedding'].apply(lambda x: isinstance(x, np.ndarray))
                ]
                df.loc[successful.index, 'embedding'] = successful['embedding']

            except Exception as exc:
                logging.exception(f"Exception during embedding: {exc}")

        # save partial batch
        df.to_pickle(f"{output_dir}/embeddings_partial_{df_start}_{formatted_datetime}.pkl")
        df_start += BATCH_SIZE

    # save full dataset
    df.to_pickle(f"{output_dir}/embeddings_full_{formatted_datetime}.pkl")
    logging.info("Finished all batches and saved full dataset.")

run_local_embeddings()

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Pandas Apply:   0%|          | 0/100 [00:00<?, ?it/s]

Pandas Apply:   0%|          | 0/100 [00:00<?, ?it/s]

Pandas Apply:   0%|          | 0/100 [00:00<?, ?it/s]

Pandas Apply:   0%|          | 0/100 [00:00<?, ?it/s]

Pandas Apply:   0%|          | 0/30 [00:00<?, ?it/s]

In [9]:
# embeddings are computed and saved in df
embedding_example = df.loc[df['embedding'].notnull(), 'embedding'].iloc[0]

print(type(embedding_example))  # we want a numpy array here
print(embedding_example.shape)  # This will tell us the embedding dimensions :)

<class 'numpy.ndarray'>
(384,)


## Model Creation

In [10]:
from umap import UMAP
from hdbscan import HDBSCAN
from bertopic import BERTopic
from sklearn.cluster import KMeans
from sklearn.mixture import GaussianMixture

In [11]:
%pip install scikit-fuzzy

Collecting scikit-fuzzy
  Downloading scikit_fuzzy-0.5.0-py2.py3-none-any.whl.metadata (2.6 kB)
Downloading scikit_fuzzy-0.5.0-py2.py3-none-any.whl (920 kB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/920.8 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m920.8/920.8 kB[0m [31m38.7 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: scikit-fuzzy
Successfully installed scikit-fuzzy-0.5.0


In [12]:
from skfuzzy.cluster import cmeans
from sentence_transformers import SentenceTransformer
import umap
import numpy as np

In [13]:
# # Define Clustering Models
# soft_kmeans_model = GaussianMixture(n_components=20, random_state=42)
class FuzzyCMeansModel:
    def __init__(self, n_clusters=6, m=2, error=0.005, maxiter=1000):
        self.n_clusters = n_clusters
        self.m = m
        self.error = error
        self.maxiter = maxiter
        self.labels_ = None
        self.centers = None
        self.u = None  # Membership matrix

    def fit(self, X):
        self.centers, self.u, _, _, _, _, _ = cmeans(
            X.T, self.n_clusters, self.m, error=self.error, maxiter=self.maxiter
        )
        self.labels_ = np.argmax(self.u, axis=0)
        return self

    def predict(self, X):
        return self.labels_

In [14]:
## bertopic.py ##
from umap import UMAP
from hdbscan import HDBSCAN
from bertopic import BERTopic

# Filter out rows where the 'embedding' is None
df_embeddings = df.dropna(subset=['embedding']).copy()

# Normalize embeddings and store them in a new column
df_embeddings['embedding_normalized'] = df_embeddings['embedding'].apply(
  lambda x: normalize([x], norm='l2')[0]
)
embeddings_array = np.array(df_embeddings['embedding_normalized'].tolist())

# Use the cleaned text for docs
docs = df_embeddings['flt_tokens'].tolist()

# Initialize UMAP model with random_state for reproducibility
umap_model = UMAP(random_state=random_seed)

# Initialize Clustering Model (Fuzzy C-Means)
cluster_model = FuzzyCMeansModel(n_clusters= 6)

# Initialize BERTopic model with the UMAP model and the custom clustering model
# random_state is removed as it's not supported in this BERTopic version
bertopic_model = BERTopic(umap_model=umap_model, hdbscan_model=cluster_model)

# Fit model
topics, probs = bertopic_model.fit_transform(docs, embeddings_array)

#fix for indexing, store the actual topic ids assigned by bert
df_embeddings["bertopic_topic"] = topics
df_embeddings["bertopic_prob"] = probs

# Get actual fuzzy topic probabilities from the Fuzzy C-Means model
membership_probs = cluster_model.u.T  # shape = (n_docs, n_topics)

# Turn it into a DataFrame
membership_df = pd.DataFrame(membership_probs, columns=[f"Topic_{i}" for i in range(cluster_model.n_clusters)])

# Join with df_embeddings
df_embeddings = df_embeddings.reset_index(drop=True)
df_embeddings = pd.concat([df_embeddings, membership_df], axis=1)

# Now you can access per-topic probabilities per document
df_embeddings.head()

df_embeddings["topic"] = membership_df.idxmax(axis=1).apply(lambda x: int(x.split("_")[1]))
df_embeddings["probs"] = membership_df.max(axis=1)


# Print the topics and their top words after fitting the model
print("BERTopic Topics and Top Words:")
print(bertopic_model.get_topics())

# Get the current datetime for the filename
formatted_datetime = datetime.now().strftime("%d_%b_%Y_%H_%M_%S")

# Create the output directory for embeddings if it doesn't exist
output_embeddings_dir = "../data/embeddings"
os.makedirs(output_embeddings_dir, exist_ok=True)

# Save dataframe with topics and probabilities
df_embeddings.to_csv(
  f"{output_embeddings_dir}/feedback_embeddings_bertopic_{formatted_datetime}.csv",
  index=False
)

# Create the output directory for models if it doesn't exist
output_models_dir = "../models"
os.makedirs(output_models_dir, exist_ok=True)

# save the model for future use on unseen data
joblib.dump(
  bertopic_model,
  f"{output_models_dir}/bertopic_model_{formatted_datetime}.joblib"
)

print("BERTopic model worked.")

BERTopic Topics and Top Words:
{0: [('land', 0.1138992473541997), ('allegations', 0.03209267730437707), ('plasma', 0.03137472926157074), ('local', 0.030499809427582795), ('rights', 0.029896645794520164), ('communities', 0.029537509541049174), ('compensation', 0.024889792445876295), ('village', 0.024202411551660162), ('dispute', 0.023980681156422847), ('process', 0.02256684367515618)], 1: [('earth', 0.10378008177385555), ('mighty', 0.10293457816266245), ('ha', 0.09896097778253168), ('rapid', 0.0987394326743677), ('response', 0.09155289963379781), ('concession', 0.08751044412672791), ('deforestation', 0.08276517529126184), ('indications', 0.07053024119075019), ('forest', 0.06168771382390343), ('cleared', 0.056141253604897474)], 2: [('eof', 0.06004731469294873), ('forest', 0.04719103427031122), ('mill', 0.04199647901990493), ('gar', 0.03969680694345143), ('illegal', 0.03892162337720949), ('plantation', 0.03843452668755482), ('plantations', 0.035102374720553255), ('bukit', 0.03330659255519

In [15]:
print(bertopic_model.get_topic_info())

   Topic  Count                               Name  \
0      0     77    0_land_allegations_plasma_local   
1      1     75            1_earth_mighty_ha_rapid   
2      2     69              2_eof_forest_mill_gar   
3      3     60               3_hcv_land_river_npp   
4      4     53  4_workers_labour_union_employment   
5      5     53    5_gar_deforestation_linked_jaya   

                                      Representation  \
0  [land, allegations, plasma, local, rights, com...   
1  [earth, mighty, ha, rapid, response, concessio...   
2  [eof, forest, mill, gar, illegal, plantation, ...   
3  [hcv, land, river, npp, forest, new, communiti...   
4  [workers, labour, union, employment, casual, d...   
5  [gar, deforestation, linked, jaya, chain, ran,...   

                                 Representative_Docs  
0  [conducted land clearing outside boundaries ha...  
1  [mighty earth rapid response 17 indications de...  
2  [may june 2021 eyes forest eof published title...  
3  [syno

## Raw Output

In [16]:
# Get the topic information DataFrame
topic_info_df = bertopic_model.get_topic_info()

# Iterate through the 'Representation' column and print each list of words with their weights
for index, representation in topic_info_df['Representation'].items():
    topic_id = topic_info_df.loc[index, 'Topic'] # Get the corresponding Topic ID
    # Get topic words with weights using get_topic
    topic_words_with_weights = bertopic_model.get_topic(topic_id)
    # Round the weights to 2 decimal places
    rounded_topic_words = [(word, round(weight, 2)) for word, weight in topic_words_with_weights]
    print(f"Topic {topic_id} Representation: {rounded_topic_words}")

Topic 0 Representation: [('land', 0.11), ('allegations', 0.03), ('plasma', 0.03), ('local', 0.03), ('rights', 0.03), ('communities', 0.03), ('compensation', 0.02), ('village', 0.02), ('dispute', 0.02), ('process', 0.02)]
Topic 1 Representation: [('earth', 0.1), ('mighty', 0.1), ('ha', 0.1), ('rapid', 0.1), ('response', 0.09), ('concession', 0.09), ('deforestation', 0.08), ('indications', 0.07), ('forest', 0.06), ('cleared', 0.06)]
Topic 2 Representation: [('eof', 0.06), ('forest', 0.05), ('mill', 0.04), ('gar', 0.04), ('illegal', 0.04), ('plantation', 0.04), ('plantations', 0.04), ('bukit', 0.03), ('ffb', 0.03), ('supplier', 0.03)]
Topic 3 Representation: [('hcv', 0.03), ('land', 0.03), ('river', 0.02), ('npp', 0.02), ('forest', 0.02), ('new', 0.02), ('communities', 0.02), ('planting', 0.02), ('principle', 0.02), ('complaint', 0.02)]
Topic 4 Representation: [('workers', 0.17), ('labour', 0.08), ('union', 0.07), ('employment', 0.07), ('casual', 0.06), ('daily', 0.06), ('rights', 0.05), 

In [17]:
## bertopic.py ##

# Returns { topic_number: List[str] , ...}
rep_docs = bertopic_model.get_representative_docs()

rep_docs_df = pd.DataFrame.from_dict(rep_docs)
rep_docs_df.to_csv(
  f"../data/embeddings/representative_docs_{formatted_datetime}.csv",
  index=False
)

# Visualizations

In [18]:
model_name = "LLM-BERTopic"
import seaborn as sns

sns.set_palette('magma')

## Create Dataframe with Results

In [24]:
import re
import pandas as pd

def format_bertopic_sentences(model, embeddings, n_topics):
    rows = []

    for index, row in embeddings.iterrows():
        doc_num = index
        dom_topic_num = row['topic']
        dom_topic_prob = round(row['probs'], 2)
        text = row['flt_tokens']

        # Topic contribution breakdown
        topic_dist = {}
        for n in range(n_topics):
            topic_dist[f'Topic_{n}'] = row[f'Topic_{n}']

        topic_binary = {f"Topic_{i}": 0 for i in range(n_topics)}

        shared_perc = 0.0
        sorted_doc = sorted(topic_dist.items(), key=lambda x: abs(x[1]), reverse=True)

        is_outlier = False
        itr = 0

        # Use the membership probabilities from the row if available
        for i in range(n_topics):
            topic_dist[f"Topic_{i}_Perc"] = round(row[f'Topic_{i}'], 2)

            while (shared_perc < 0.7):
                # Get Topic Number and Topic Percentage
                topic_num, topic_perc = sorted_doc[itr]
                topic_num = int(re.search(r'\d+', topic_num).group()) # Use REGEX to grab num
                shared_perc += topic_perc
                topic_binary[f"Topic_{topic_num}"] = 1  # Multi-Label Binary Classifcation
                itr += 1

            # Check if Outlier
            if (itr) > (number_of_topics/2):
                is_outlier = True

        # Keywords
        keywords = ", ".join([word for word, _ in model.get_topic(row['bertopic_topic'])])


        # Construct the document row
        row_data = {
            'ID': doc_num,
            'Dominant_Topic': dom_topic_num,
            'Summary': text,
        }
        row_data.update(topic_dist)
        row_data.update(topic_binary)
        rows.append(row_data)

    # Create DataFrame
    sent_topics_df = pd.DataFrame(rows)
    return sent_topics_df

In [25]:
# Assuming membership_df is available from the previous cell
# Get the number of topics from the membership_df columns (excluding original columns)
number_of_topics = len(membership_df.columns)


dominant_topic_df = format_bertopic_sentences(model=bertopic_model, embeddings=df_embeddings, n_topics=number_of_topics)
dominant_topic_df

Unnamed: 0,ID,Dominant_Topic,Summary,Topic_0,Topic_1,Topic_2,Topic_3,Topic_4,Topic_5,Topic_0_Perc,Topic_1_Perc,Topic_2_Perc,Topic_3_Perc,Topic_4_Perc,Topic_5_Perc
0,0,4,transferred different department purportedly p...,0,0,0,0,1,0,0.02,0.00,0.01,0.01,0.95,0.01
1,1,4,unjustifiably dismissed workers deceiving resi...,0,0,0,0,1,0,0.00,0.00,0.00,0.00,0.99,0.00
2,2,0,1 owns operates plantations conflicted contest...,1,0,1,0,0,0,0.58,0.01,0.28,0.03,0.04,0.06
3,3,2,corporate social responsibility csr program co...,0,0,1,0,0,1,0.09,0.02,0.54,0.05,0.02,0.28
4,4,2,breached obligations ignoring high conservatio...,1,0,1,0,0,0,0.26,0.02,0.55,0.04,0.03,0.10
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
382,382,0,rights violations land conflict sandabi indah ...,1,0,0,0,1,0,0.69,0.02,0.09,0.03,0.14,0.03
383,383,2,rimba harapan saksti rhs sarana titian permata...,0,0,1,0,0,1,0.06,0.02,0.66,0.08,0.01,0.16
384,384,1,construction canal prepare deep forested peatl...,0,1,0,1,0,0,0.03,0.60,0.05,0.26,0.01,0.06
385,385,1,clearance potential high carbon stock hcs fore...,0,1,0,1,0,0,0.04,0.43,0.08,0.32,0.02,0.11


### Saving to CSV

In [26]:
# Convert to CSV and save
dominant_topic_df.to_csv("presentation.csv",index = False)
from google.colab import files
files.download("presentation.csv")

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

## Data Exploration

In [22]:
dominant_topic_df[dominant_topic_df['Document_Num'] == 85]

KeyError: 'Document_Num'

## Wordcloud

In [None]:
def display_wordcloud(df, title, n_topics):
    n_cols = 3
    n_rows = int(np.ceil(n_topics / n_cols))
    fig = plt.figure(figsize=(12, 4 * n_rows), dpi=300, facecolor='white')  # Set background to white

    topics = ['Failed Compensation\nand Land Conflicts',
              'Environmental\nImpact',
              'Administrative\nViolations',
              'Deforestation\n(Supply Chains)',
              'Labor and Human\nRights Violations',
              'Illegal Fruit Bunches']

    for topic in range(n_topics):
        topic_df = dominant_topic_df[dominant_topic_df['Dominant_Topic'] == topic]
        text = ' '.join(topic_df['Text'].astype(str).tolist())
        text = re.sub(r'[^A-Za-z\s]', '', text)
        text = text.lower()
        en_stop = set(stopwords.words('english') + list(words_to_filter))
        text = ' '.join(word for word in text.split() if word not in en_stop)

        wordcloud = WordCloud(width=800, height=500, background_color='white', colormap='magma').generate(text)

        ax = fig.add_subplot(n_rows, n_cols, topic + 1)
        ax.imshow(wordcloud, interpolation='bilinear')
        ax.set_title(topics[topic], pad=20)
        ax.axis("off")

    plt.tight_layout(rect=[0, 0, 0.9, 0.95])
    plt.show()

# Call the function without a title
display_wordcloud(df=dominant_topic_df, title='', n_topics=6)

In [None]:
from wordcloud import WordCloud
import matplotlib.pyplot as plt
import re

# Input: Receives dataframe containing all rows with classification, table title, and number of topics
# Output: Displays n word clouds
def display_wordcloud(df, title, n_topics):

  n_cols = 2
  n_rows = int(np.ceil(n_topics / n_cols))
  fig = plt.figure(figsize=(8, 4 * n_rows))

  for topic in range(n_topics):
    topic_df = dominant_topic_df[dominant_topic_df['Dominant_Topic'] == topic]
    text = ' '.join(topic_df['Text'].astype(str).tolist()) # Joining the words of each text of each row into a list separated by ' '
    text = re.sub(r'[^A-Za-z\s]', '', text) # Substituting/deleting anything that IS NOT a letter Aa-Zz with ''
    text = text.lower()
    en_stop = set(stopwords.words('english') + list(words_to_filter))
    text = ' '.join(word for word in text.split() if word not in en_stop)

    wordcloud = WordCloud(width=800, height=400, background_color='white').generate(text)

    ax = fig.add_subplot(n_rows, n_cols, topic + 1)
    ax.imshow(wordcloud, interpolation='bilinear')
    ax.set_title(f"Topic #{topic}")
    ax.axis("off")

  fig.suptitle(title, fontsize=32, y=0.9)
  plt.tight_layout(rect=[0, 0, 1, 0.95])
  plt.show()


display_wordcloud(df=dominant_topic_df, title=f'{model_name} - Wordcloud', n_topics=6)

## Dominant Topic Frequency

In [None]:
import matplotlib.pyplot as plt
import matplotlib.cm as cm
import matplotlib.colors as mcolors
import seaborn as sns

plt.style.use('fivethirtyeight')

# Prepare data
topics_df = dominant_topic_df.groupby(['Dominant_Topic'])['Document_Num'].count().reset_index()
indexes = topics_df['Dominant_Topic']
values = topics_df['Document_Num']

# Normalize values and get flare colormap from seaborn
norm = plt.Normalize(min(values), max(values))
flare = sns.color_palette("flare", as_cmap=True)
colors = flare(norm(values))

# Plot
fig, ax = plt.subplots(figsize=(10, 6))
bars = ax.bar(indexes, values, color=colors)
ax.bar_label(bars)

# Labels and title
fig.suptitle(f'Dominant Topic Frequency', fontsize=35) # Changed fontsize here
ax.set_xlabel('Topic')
ax.set_ylabel('Frequency')
plt.show()

## Topic Distribution per Doc

In [None]:
def topic_dist_doc(df, n_topics, doc_num):
    import matplotlib.pyplot as plt

    topics = []
    values = []

    doc_row = df[df['Document_Num'] == doc_num]

    for i in range(n_topics):
        col_name = f"Topic_{i}"
        val = doc_row[col_name].values[0] if not doc_row.empty and pd.notnull(doc_row[col_name].values[0]) else 0.0
        topics.append(col_name)
        values.append(float(val))  # Ensure it's a float

    # Plotting
    plt.figure(figsize=(10, 6))
    plt.bar(topics, values, color='red')
    plt.xlabel('Topics')
    plt.ylabel('Topic Contribution')
    plt.title(f'{model_name} - Topic Distribution for Document {doc_num}')
    plt.xticks(rotation=45)
    plt.tight_layout()
    plt.show()

In [None]:
doc_number = 431    # Change This
topic_dist_doc(dominant_topic_df, number_of_topics, doc_num=doc_number)

## Overall Dominant Topic % Share

In [None]:
# Ensure dominant_topic_df is available
if 'dominant_topic_df' not in locals():
    print("Error: 'dominant_topic_df' not found. Please run the cell that creates dominant_topic_df first.")
else:
    plt.figure(figsize=(12, 6))  # Make the figure wider
    # Plot a histogram of the 'Topic_%_Contrib' column from dominant_topic_df
    dominant_topic_df['Topic_%_Contrib'].plot(kind='hist', bins=20, title='Distribution of Dominant Topic\nContribution Across Documents', color="#371a3d", edgecolor='white', linewidth=2)
    plt.gca().spines[['top', 'right',]].set_visible(False)

    # Add Mean line
    mean_value = dominant_topic_df['Dom. Topic Contribution'].mean()
    plt.axvline(mean_value, color='red', linestyle='dashed', linewidth=2, label=f'Mean: {mean_value:.2f}')

    plt.title(label='Distribution of Dominant Topic\nContribution Across Documents', fontsize=35)
    plt.xlabel('Dom. Topic Contribution', fontsize=20)
    plt.ylabel('Frequency', fontsize=20)
    plt.legend() # Add legend to display the label
    plt.show() # Add plt.show() to display the plot

# Scoring Methods

## Topic Coherence Score

In [None]:
from gensim.models.coherencemodel import CoherenceModel
from gensim.corpora import Dictionary

texts = [doc.split() for doc in df_embeddings['flt_tokens']]
dictionary = Dictionary(texts)

topic_words = [[word for word, _ in bertopic_model.get_topic(i)] for i in range(len(bertopic_model.get_topics())) if i != -1]

coherence_model = CoherenceModel(
    topics=topic_words,
    texts=texts,
    dictionary=dictionary,
    coherence='c_v'
)

coherence_score = coherence_model.get_coherence()
print("Topic Coherence:", coherence_score)


## Topic Diversity

In [None]:
topic_words = [bertopic_model.get_topic(i) for i in range(len(bertopic_model.get_topics())) if i != -1]
top_words = [word for topic in topic_words for word, _ in topic[:10]]
unique_words = len(set(top_words))
total_words = len(top_words)
diversity = unique_words / total_words
print("Topic Diversity:", diversity)

## Silhouette Score

In [None]:
from sklearn.metrics import silhouette_score

# Use embeddings and predicted topics (excluding -1)
mask = df_embeddings['topic'] != -1
score = silhouette_score(
    np.vstack(df_embeddings.loc[mask, 'embedding']),
    df_embeddings.loc[mask, 'topic']
)
print("Silhouette Score:", score)

## Davies-Bouldin Index

In [None]:
from sklearn.metrics import davies_bouldin_score

score = davies_bouldin_score(
    np.vstack(df_embeddings.loc[mask, 'embedding']),
    df_embeddings.loc[mask, 'topic']
)
print("Davies-Bouldin Index:", score)

In [None]:
dominant_topic_df = format_bertopic_sentences(model=bertopic_model, embeddings=df_embeddings, n_topics=number_of_topics)
display_wordcloud(df=dominant_topic_df, title='Topic Classification Wordclouds (Updated)', n_topics=number_of_topics)

In [None]:
bertopic_model.visualize_hierarchy()

In [None]:
hierarchical_topics = bertopic_model.hierarchical_topics(docs)
bertopic_model.visualize_hierarchy(hierarchical_topics=hierarchical_topics)

In [None]:
bertopic_model.visualize_barchart(top_n_topics=20, n_words=8, height=400, width=600)