<a href="https://colab.research.google.com/github/rupaut98/LLM_Oreilly/blob/main/ag_news_topic_modeling.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install datasets
from datasets import load_dataset
# Load only the first 20% of the "ag_news" training split; a smaller subset
# means fewer documents to embed and cluster in the cells below.
dataset = load_dataset(
    "ag_news",
    split='train[:20%]',
)

In [None]:
# Display the dataset's metadata object as the cell output.
dataset.info

In [None]:
# Pull out the two columns used in the rest of the notebook:
#   texts  - the 'text' column, which is embedded and topic-modelled below
#   labels - the 'label' column, used later only for plotting / hover text
texts = dataset['text']
labels = dataset['label']

In [None]:
# Show a short sample of the label column rather than printing one entry per
# document, which floods the cell output and bloats the saved notebook.
print(labels[:20])

In [None]:
!pip install sentence-transformers
from sentence_transformers import SentenceTransformer

# Sentence-embedding model. It is used directly to encode the documents and is
# also handed to BERTopic as its embedding model later in the notebook.
model = SentenceTransformer('sentence-transformers/paraphrase-MiniLM-L6-v2')

In [None]:
# Encode every document into an embedding vector (with a progress bar).
# These embeddings are the input to both UMAP reductions further down.
embeddings = model.encode(texts, show_progress_bar = True)

In [None]:
!pip install umap-learn
from umap import UMAP
# UMAP reducer used ahead of clustering: 5 output components, cosine metric,
# and a fixed random_state so repeated runs use the same seed.
umap_model = UMAP(
    n_components=5,
    min_dist=0.0,
    metric='cosine',
    random_state=42,
)

In [None]:
# Fit the UMAP reducer on the sentence embeddings and keep the projection.
reduced_embeddings = umap_model.fit_transform(embeddings)

In [None]:
!pip install hdbscan

In [None]:
from hdbscan import HDBSCAN
# Cluster the UMAP-reduced embeddings with HDBSCAN. The model has to be fitted
# here: `labels_` and `outlier_scores_` only exist on a fitted estimator, and
# the cells below read both.
hdb_model = HDBSCAN(
    min_cluster_size=50,
    metric='euclidean',
    cluster_selection_method='eom',
).fit(reduced_embeddings)

# One cluster id per document; the plotting cells below treat -1 as "outlier".
clusters = hdb_model.labels_

In [None]:
# Outlier scores exposed by the clustering model.
# NOTE(review): `outlier_scores_` is a fitted attribute, so `hdb_model` must
# have been fit before this cell runs.
cluster_outlier = hdb_model.outlier_scores_

In [None]:
!pip install pandas
import pandas as pd
# Wrap the outlier scores in a DataFrame. This rebinds `cluster_outlier`, so
# from here on the name refers to the DataFrame, not the raw scores.
cluster_outlier = pd.DataFrame(cluster_outlier)

In [None]:
import numpy as np

# Preview cluster 1: positions of its first five documents, then print the
# leading 200 characters of each.
sample_positions = np.where(clusters == 1)[0][:5]
for doc_pos in sample_positions:
  snippet = texts[doc_pos][:200]
  print(snippet + "...\n")

In [None]:
# Second, 2-component UMAP projection of the same embeddings, used for the
# scatter plots below. Note that this rebinds `reduced_embeddings`.
reduced_embeddings = UMAP(
    n_components=2,
    min_dist=0.0,
    metric='cosine',
    random_state=42,
).fit_transform(embeddings)

In [None]:
# Plotting frame: one row per document, with the two projection coordinates
# named x and y.
df = pd.DataFrame(reduced_embeddings, columns = ['x', 'y'])

In [None]:
# Attach the dataset labels and the cluster assignment (as strings) to each
# plotted point.
df['labels'] = labels
df['cluster'] = list(map(str, clusters))

# Split the frame into clustered points and outliers (cluster id '-1').
is_outlier = df['cluster'] == '-1'
clusters_df = df.loc[~is_outlier, :]
outliers_df = df.loc[is_outlier, :]


In [None]:
import matplotlib.pyplot as plt

# Outliers are drawn first as faint grey points so they sit in the background.
plt.scatter(
    outliers_df.x,
    outliers_df.y,
    alpha=0.05,
    s=2,
    c='grey',
)
# Clustered points go on top, coloured by their integer cluster id.
plt.scatter(
    clusters_df.x,
    clusters_df.y,
    c=clusters_df.cluster.astype(int),
    alpha=0.6,
    s=2,
    cmap="tab20b",
)
plt.axis("off")

In [None]:
!pip install bertopic

from bertopic import BERTopic
# Assemble BERTopic from the components configured above (embedding model,
# UMAP reducer, HDBSCAN clusterer), then fit it on the raw documents.
unfitted_topic_model = BERTopic(
    embedding_model=model,
    umap_model=umap_model,
    hdbscan_model=hdb_model,
    verbose=True,
)
topic_model = unfitted_topic_model.fit(texts)

In [None]:
# Display the topic overview produced by the fitted topic model.
topic_model.get_topic_info()

In [None]:
# Convert the labels to strings; they are passed as the first argument of
# visualize_documents in the next cell. This rebinds `labels` in place.
labels = [str(c) for c in labels]

In [None]:
# Interactive document map, drawn on the precomputed projection held in
# `reduced_embeddings`, with annotations hidden.
fig = topic_model.visualize_documents(
    labels,
    reduced_embeddings=reduced_embeddings,
    width=1200,
    hide_annotations=True,
)

# Larger font for readability; the returned figure is the cell's output.
fig.update_layout(font={"size": 16})

In [None]:
from copy import deepcopy
# Snapshot the current topic representations before any update_topics call,
# so later cells can compare against them. A deep copy keeps the snapshot
# independent of the live model object.
original_topics = deepcopy(topic_model.topic_representations_)

In [None]:
# Peek at the saved representation for topic 0.
result = original_topics[0]
print(result)

In [None]:
def topic_differences(model, original_topics, nr_topics = 5):
  """Compare each topic's top keywords before and after a representation update.

  Args:
    model: Object exposing ``get_topic(topic)`` that returns a sequence of
      ``(word, score)`` pairs for the topic, or a falsy value when the topic
      does not exist.
    original_topics: Topic id -> sequence of ``(word, score)`` pairs captured
      before the update.
    nr_topics: How many topic ids, counting up from 0, to compare.

  Returns:
    A DataFrame with columns ``Topic``, ``Original`` and ``Updated``. The two
    keyword columns hold up to the first five words joined by " | ". Topic ids
    that are missing or empty on either side are skipped instead of raising.
  """
  def top_words(pairs):
    # Keep only the word from each (word, score) pair, first five entries.
    return " | ".join([pair[0] for pair in pairs][:5])

  rows = []
  for topic in range(nr_topics):
    try:
      og_pairs = original_topics[topic]
    except (KeyError, IndexError):
      # Fewer original topics than requested.
      continue
    new_pairs = model.get_topic(topic)
    if not og_pairs or not new_pairs:
      # Nothing to compare for this topic id.
      continue
    rows.append([topic, top_words(og_pairs), top_words(new_pairs)])

  # Build the frame once rather than appending one row at a time.
  return pd.DataFrame(rows, columns=['Topic', 'Original', 'Updated'])

# The module is `bertopic.representation`; the previous spelling
# (`represenation`) does not exist and made this cell fail on import.
from bertopic.representation import KeyBERTInspired

# Re-rank each topic's keywords with the KeyBERT-inspired representation.
representation_model = KeyBERTInspired()
topic_model.update_topics(texts, representation_model = representation_model)

In [None]:
!pip install bertopic

In [None]:
from bertopic.representation import KeyBERTInspired

# Swap the topic keywords for the KeyBERT-inspired representation.
keybert_representation = KeyBERTInspired()
topic_model.update_topics(
    texts,
    representation_model=keybert_representation,
)

In [None]:
# Side-by-side table of the saved original keywords and the model's current
# keywords (nr_topics is left at its default of 5).
topic_differences(topic_model, original_topics)

In [None]:
from bertopic.representation import MaximalMarginalRelevance

# Re-derive the topic keywords with Maximal Marginal Relevance (diversity 0.2).
mmr_representation = MaximalMarginalRelevance(diversity=0.2)
topic_model.update_topics(
    texts,
    representation_model=mmr_representation,
)

# Compare the new keywords against the saved originals.
topic_differences(topic_model, original_topics)

In [None]:
!pip install cohere
!pip install tiktoken
!pip uninstall typing-extensions -y quiet
!pip install typing-extensions==4.5.0 --quiet
!pip install --upgrade tensorflow-probability
!pip install --upgrade --quiet openai

import openai

In [None]:
import openai
from bertopic.representation import OpenAI

import os

# Fine-tune topic representations with GPT
prompt = """
I have a topic that contains the following documents:
[DOCUMENTS]

The topic is described by the following keywords: [KEYWORDS]

Based on the information above, extract a short topic label in the following format:
topic: <short topic label>
"""
# Read the API key from the environment instead of hardcoding it in the
# notebook. Raises KeyError if OPENAI_API_KEY is not set.
client = openai.OpenAI(api_key=os.environ["OPENAI_API_KEY"])
representation_model = OpenAI(client, model="gpt-3.5-turbo", chat=True, exponential_backoff=True, prompt=prompt)

# Apply the GPT-based representation to the topic model that was already
# fitted above. Constructing a new BERTopic here would replace it with an
# unfitted model, which has no topics to update or to compare.
topic_model.update_topics(texts, representation_model=representation_model)

topic_differences(topic_model, original_topics)