# In [None]:
import pandas as pd
import random
import logging
import openai
from bertopic import BERTopic
from bertopic.representation import OpenAI
from bertopic.vectorizers import ClassTfidfTransformer
from sklearn.decomposition import PCA
from bertopic.dimensionality import BaseDimensionalityReduction
from sklearn.feature_extraction.text import CountVectorizer

# Pipeline: fit BERTopic on the "Text" column of training-data.csv, reassign
# outlier documents, regenerate the topic representations with an OpenAI
# chat model, and write the per-document topic info (plus the original
# "Sentiment" column) to topic-data.csv.
df = pd.read_csv("training-data.csv")
docs = list(df["Text"])

# Prompt handed to the OpenAI representation model. [KEYWORDS] and
# [DOCUMENTS] are placeholder tags inside the prompt text.
summarization_prompt = """
I have a topic that is described by the following keywords: [KEYWORDS]
In this topic, the following documents are a small but representative subset of all documents in the topic:
[DOCUMENTS]

Based on the information above, please give a sentence description of this topic in the following format:
<description>
"""

# Custom bag-of-words and c-TF-IDF models: 1- to 3-grams with English stop
# words removed, and BM25 weighting enabled.
vectorizer_model = CountVectorizer(ngram_range=(1, 3), stop_words="english")
ctfidf_model = ClassTfidfTransformer(bm25_weighting=True)

# OpenAI-backed representation model.
# NOTE(review): "API_KEY" is a hard-coded placeholder; a real key should not
# be committed here — consider loading it from the environment instead.
client = openai.OpenAI(api_key="API_KEY")
representation_model = OpenAI(
    client,
    model="gpt-4-0125-preview",
    delay_in_seconds=2,
    chat=True,
    nr_docs=5,
    prompt=summarization_prompt,
)

# The representation model is deliberately not passed to the constructor; it
# is only applied in update_topics below, after outliers have been reassigned.
topic_model = BERTopic(
    verbose=True,
    ctfidf_model=ctfidf_model,
    embedding_model="all-mpnet-base-v2",
    vectorizer_model=vectorizer_model,
)
topics, probs = topic_model.fit_transform(docs)

# Reassign outlier documents using the "embeddings" strategy.
new_topics = topic_model.reduce_outliers(docs, topics, strategy="embeddings")

# Recompute the topic representations for the new assignments. The custom
# vectorizer and c-TF-IDF models are passed again because update_topics falls
# back to default CountVectorizer / ClassTfidfTransformer instances when they
# are omitted, which would drop the n-gram range, stop-word removal and BM25
# weighting configured above.
topic_model.update_topics(
    docs,
    topics=new_topics,
    vectorizer_model=vectorizer_model,
    ctfidf_model=ctfidf_model,
    representation_model=representation_model,
)

# Collect per-document topic info, attach the source "Sentiment" column
# (assigned by index alignment with df), and save to CSV.
df_topics = topic_model.get_document_info(docs)
df_topics["Sentiment"] = df["Sentiment"]
df_topics.to_csv("topic-data.csv", index=False)