## Topic Modeling

In [25]:
from bertopic import BERTopic
from bertopic.representation import KeyBERTInspired
from bertopic.vectorizers import ClassTfidfTransformer
from sentence_transformers import SentenceTransformer
from hdbscan import HDBSCAN
from sklearn.feature_extraction.text import CountVectorizer
import openai
from bertopic.representation import OpenAI
import tiktoken
import polars.selectors as cs
import polars as pl


In [26]:
def extract_list_of_company_news(df, company):
    """Return the ``text`` of every row whose ``companies`` list contains ``company``.

    Args:
        df: polars DataFrame with a list-typed ``companies`` column and a
            ``text`` column.
        company: Value to look for inside each row's ``companies`` list.

    Returns:
        A Python list with the ``text`` values of the matching rows, in the
        order they appear in ``df``.
    """
    mentions_company = pl.col("companies").list.contains(company)
    matching_rows = df.filter(mentions_company)
    return matching_rows.get_column("text").to_list()

In [27]:
import os

def extract_topics(documents):
    """Fit a BERTopic model on ``documents`` and label its topics via OpenAI.

    The model combines a SentenceTransformer embedding model, HDBSCAN
    clustering, a CountVectorizer and two topic representations: a
    KeyBERT-inspired one ("Main") and an OpenAI chat one ("Aspect1"). The
    "Aspect1" labels are installed as the model's custom topic labels.

    Args:
        documents: List of document strings to model.

    Returns:
        ``(topic_model, topic_distr)`` where ``topic_distr`` is the first
        value returned by ``topic_model.approximate_distribution(documents)``.
        ``(None, None)`` if fitting or the distribution step raised; the
        exception is printed so the cause is not lost.
    """
    # The API key is read from the environment, never hard-coded.
    client = openai.OpenAI(api_key=os.getenv("OPENAI_API_KEY"))
    prompt = """
    I have a topic that contains the following documents: 
    [DOCUMENTS]
    The topic is described by the following keywords: [KEYWORDS]

    Based on the information above, extract a short topic label in the following format:
    topic: <topic label>
    """

    # Tokenizer handed to the OpenAI representation together with doc_length.
    tokenizer = tiktoken.encoding_for_model("gpt-3.5-turbo")

    # Topic representations
    main_representation = KeyBERTInspired()
    aspect_model1 = OpenAI(
        client,
        model="gpt-3.5-turbo",
        delay_in_seconds=10,
        chat=True,
        prompt=prompt,
        diversity=0.1,
        doc_length=100,
        tokenizer=tokenizer,
    )

    # Sub-models
    embedding_model = SentenceTransformer("all-MiniLM-L6-v2")
    hdbscan_model = HDBSCAN(
        min_cluster_size=3,
        metric='euclidean',
        cluster_selection_method='eom',
        prediction_data=True,
    )
    vectorizer_model = CountVectorizer(stop_words="english", ngram_range=(1, 2))

    # Add all models together to be run in a single `fit`
    representation_model = {
        "Main": main_representation,
        "Aspect1": aspect_model1,
    }

    topic_model = BERTopic(
        embedding_model=embedding_model,
        representation_model=representation_model,
        hdbscan_model=hdbscan_model,
        vectorizer_model=vectorizer_model,
        nr_topics=20,
    )

    try:
        topic_model.fit_transform(documents)
        topic_distr, _ = topic_model.approximate_distribution(documents)
    except Exception as e:
        # Best-effort by design: the caller skips this input on (None, None).
        # Report the real cause instead of silently discarding it.
        print(f"Topic extraction failed: {type(e).__name__}: {e}")
        return None, None

    # Use the "Aspect1" (OpenAI) representation as the custom topic labels.
    topic_labels = topic_model.generate_topic_labels(
        topic_prefix=False,
        aspect="Aspect1",
    )
    topic_model.set_topic_labels(topic_labels)

    return topic_model, topic_distr
 

In [28]:
def df_with_topics(df, company, topic_model, topic_distr, min_probability=0.20):
    """Attach per-document topic assignments and topic metadata to a company's news.

    Args:
        df: polars DataFrame of news with a list-typed ``companies`` column.
            The grouping step below also requires ``title``, ``link``,
            ``text`` and ``date_published`` columns.
        company: Company name used to filter ``df``; the ``companies`` column
            is overwritten with this literal value in the result.
        topic_model: Fitted model exposing ``get_topic_info()`` with the
            columns ``Topic``, ``Count``, ``Representation`` and ``CustomName``.
        topic_distr: Document-by-topic probability matrix. Row ``i`` is joined
            to the ``i``-th row of the filtered news, so it must have been
            computed on the same documents in the same order.
        min_probability: A topic is kept for a document only when its
            probability is strictly greater than this value. Defaults to 0.20.

    Returns:
        A polars DataFrame with one row per news item. The topic columns
        (``topic_probability_distribution``, ``topics``, ``topics_count``,
        ``topics_representation``, ``topics_custom_name``) are lists holding
        one entry per retained topic.
    """
    company_news = (
        df
        .filter(pl.col("companies").list.contains(company))
        .with_row_index(name="index", offset=0)
        .with_columns(companies=pl.lit(company))
    )

    # Long format: one row per (document index, topic) pair above the threshold.
    document_topics = (
        pl.DataFrame(topic_distr)
        .with_row_index(name="index", offset=0)
        .melt(
            id_vars="index",
            value_vars=cs.starts_with("column"),
        )
        .sort("index")
        .filter(pl.col("value") > min_probability)
        .with_columns(
            # Capture ALL digits of the column suffix. A single `\d` would
            # truncate e.g. "column_12" to topic 1 once there are 10+ topics.
            topics=pl.col("variable").str.extract(r"(\d+)").cast(pl.Int64)
        )
        .drop("variable")
    )

    # Topic metadata, prefixed with "topics_" so it cannot clash with news columns.
    topic_info = (
        pl.DataFrame(topic_model.get_topic_info())
        .select(pl.col(["Topic", "Count", "Representation", "CustomName"]))
        .select(pl.all().name.prefix("topics_"))
    )

    return (
        company_news
        .join(
            document_topics,
            on="index",
            how="left",
            validate='1:m',
            coalesce=True,
        )
        .join(
            topic_info,
            how="left",
            left_on="topics",
            right_on="topics_Topic",
            coalesce=True,
        )
        # Collapse back to one row per news item; every other column becomes a list.
        .group_by(
            ["index", "title", "link", "text", "date_published", "companies"],
            maintain_order=True,
        )
        .all()
        .drop("index")
        .rename(lambda column_name: column_name.lower())
        .rename({"value": "topic_probability_distribution", "topics_customname": "topics_custom_name"})
    )

In [29]:
def save_model(topic_model, company):
    """Save ``topic_model`` under ``./Models/BERTopic_Models/<company>``.

    Args:
        topic_model: Object exposing a BERTopic-style ``save`` method.
        company: Company name, appended to the models directory to form the
            save path.
    """
    models_root = "./Models/BERTopic_Models/"
    save_options = {
        "serialization": "safetensors",
        "save_ctfidf": True,
        # Stored as a model identifier string rather than the model object.
        "save_embedding_model": "sentence-transformers/all-MiniLM-L6-v2",
    }
    topic_model.save(models_root + company, **save_options)

In [30]:
# Location of the cleaned news parquet file.
path = "./Data/News/news_cleaned.parquet"

# Companies we're interested in
companies = [
    "Berkshire Hathaway",
    "JPMorgan",
    "Bank of America",
    "Wells Fargo",
    "CVS Health",
    "UnitedHealth",
    "McKesson",
    "AmerisourceBergen",
    "Walmart",
    "Costco",
    "Kroger",
    "Home Depot",
    "General Motors",
    "Boeing",
    "Caterpillar",
    "Ford",
]

# Load the news once; the per-company steps filter this frame.
df_news = pl.read_parquet(path)


In [31]:
# Column layout every per-company frame must match for the vertical concat.
schema = {
    'title': pl.Utf8,
    'link': pl.Utf8,
    'text': pl.Utf8,
    'date_published': pl.Date,
    'companies': pl.Utf8,
    'topic_probability_distribution': pl.List(pl.Float64),
    'topics': pl.List(pl.Int64),
    'topics_count': pl.List(pl.Int64),
    'topics_representation': pl.List(pl.List(pl.Utf8)),
    'topics_custom_name': pl.List(pl.Utf8),
}

# Collect the per-company frames and concatenate once after the loop,
# instead of re-copying the growing result on every iteration.
partial_frames = []

for company in companies:
    news = extract_list_of_company_news(df_news, company)
    print(company)
    topic_model, topic_distr = extract_topics(news)
    if topic_model is None:
        # extract_topics signals failure with (None, None); skip this company.
        print(f"Error while processing {company}. The number of news may be too small ({len(news)}).")
        continue
    partial_df_with_topics = df_with_topics(df_news, company, topic_model, topic_distr)
    partial_frames.append(partial_df_with_topics)
    save_model(topic_model, company)

# The empty, schema-typed frame goes first so the result has the expected
# schema even when every company was skipped.
global_df_with_topics = pl.concat(
    [pl.DataFrame(schema=schema), *partial_frames],
    how="vertical",
)

Berkshire Hathaway
JPMorgan
Bank of America
Wells Fargo
CVS Health
UnitedHealth
McKesson
Error while processing McKesson. The number of news may be too small (9).
AmerisourceBergen


  eigenvalues, eigenvectors = scipy.sparse.linalg.eigsh(
  eigenvalues, eigenvectors = scipy.sparse.linalg.eigsh(


Error while processing AmerisourceBergen. The number of news may be too small (4).
Walmart
Costco
Kroger


  eigenvalues, eigenvectors = scipy.sparse.linalg.eigsh(
  eigenvalues, eigenvectors = scipy.sparse.linalg.eigsh(


Error while processing Kroger. The number of news may be too small (6).
Home Depot
General Motors
Boeing
Caterpillar
Ford


In [32]:
# Persist the combined per-company topic table next to the input news file.
global_df_with_topics.write_parquet("./Data/News/news_with_topics.parquet")

In [None]:
# Reload the model that save_model wrote for "Walmart" (same directory + company name).
# NOTE(review): `loaded_model` is not referenced by the cells visible below — confirm it is needed.
loaded_model = BERTopic.load("./Models/BERTopic_Models/Walmart")

In [220]:
# NOTE(review): `topic_model` is the module-level name last bound inside the
# per-company loop (it is None if the final company failed), not `loaded_model`.
# This cell's execution count is also out of sequence — confirm which model is
# meant and that the cell survives Restart & Run All.
topic_model.visualize_barchart(top_n_topics=50)

In [217]:
# Inspect the model's custom labels; presumably those installed by
# set_topic_labels in extract_topics — TODO confirm.
# NOTE(review): relies on `topic_model` left bound by the per-company loop.
topic_model.custom_labels_