In [None]:

import os
import backoff

from bertopic import BERTopic

import openai

import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords

import numpy as np
import pandas as pd

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity

import matplotlib.pyplot as plt
import seaborn as sns

from tqdm import tqdm
tqdm.pandas()

## Load BERTopic Model
- `fit_bertopic_model` in the `models` directory

In [None]:
# Name of the fitted BERTopic model and its location relative to this notebook.
model = 'fit_bertopic_model'
model_path = os.path.join('..', 'data', 'models', model)

# Deserialize the fitted model from disk.
topic_model = BERTopic.load(model_path)

In [None]:
# Load the per-document embeddings (JSON Lines: one record per line).
text_embedding_data = os.path.join(
    '..',
    'data',
    'embeddings',
    'full_id_to_embeddings.jsonl'
)

# Read the file in chunks to bound peak parser memory, but concatenate only
# once at the end: calling pd.concat inside the loop re-copies every row read
# so far on each iteration, which is quadratic in the number of chunks.
chunks = []
for chunk in tqdm(pd.read_json(text_embedding_data, lines=True, chunksize=5000)):
    chunks.append(chunk)

df = pd.concat(chunks) if chunks else pd.DataFrame()
del chunks

# Total number of records read (kept for parity with the original counter).
read_lines = len(df)

# Row order matters: a later cell attaches `topic_model.topics_` to the frame
# positionally.
# NOTE(review): presumably the model was fit on documents sorted by `full_id`
# -- confirm against the fitting notebook.
df.sort_values('full_id', inplace=True)

In [None]:
# Re-sort by document id. The loading cell already sorts on the same key, so
# this is a no-op safeguard for out-of-order execution.
df = df.sort_values('full_id')

In [None]:
# Preview the first rows of the embeddings frame.
df.head(n=5)

In [None]:
# Read the text representation of every document used for training.
text_reps_path = os.path.join('..', 'data', 'training', 'text_representations.csv')
text_reps_frame = pd.read_csv(text_reps_path)

# Lookup table: full_id -> text_representation.
text_reps = dict(
    zip(
        text_reps_frame['full_id'],
        text_reps_frame['text_representation']
    )
)

# The CSV must cover exactly the ids in the (sorted) embeddings frame.
embedding_ids = df['full_id'].tolist()
assert sorted(text_reps) == embedding_ids

# Attach the text to each embedding row by id.
df['text_representation'] = df['full_id'].map(text_reps)


In [None]:
# Display the frame after the `text_representation` column has been attached.
df

## Look at Topics

In [None]:
# Per-document topic assignments stored on the model; -1 marks noise.
topics = topic_model.topics_

# Share of documents the model left unassigned.
noise = sum(1 for assigned in topics if assigned == -1)
noise_prop = noise / len(topics)

print(f'Noise %: {round(noise_prop*100, 2)}')

In [None]:
# Fetch the topic summary table once and reuse it for both columns.
topic_info = topic_model.get_topic_info()

# Map each topic id to its representation words joined into one string.
topic_reps = {
    topic_id: ','.join(rep_words)
    for topic_id, rep_words in zip(topic_info['Topic'], topic_info['Representation'])
}

# Print every topic except the noise bucket (-1).
for topic, words in topic_reps.items():
    if topic == -1:
        continue
    print(f'Topic {topic}: {words}')

## Map Topics to Reddit Posts
- Pull original topic labels from topic model
- Assign to a column in dataframe

In [None]:
# Attach the model's original per-document topic labels as a new column.
# The assignment is positional (no id join), so it relies on `df` rows being
# in the same order as the documents behind `topic_model.topics_`.
# NOTE(review): presumably that is the `full_id`-sorted order -- confirm.
df['og_topic'] = topic_model.topics_

In [None]:
# Preview the first 20 rows with the original topic labels attached.
df.head(n=20)

## Dealing With Outliers
- Create a topic vector for each topic (the mean embedding of the documents assigned to it).
- Provisionally assign each noise document (`-1`) to its most similar topic vector.
- Reassign **any** document whose cosine similarity to its assigned topic's vector is below `0` to noise (`-1`).
- For documents that were originally assigned `-1`:
  - If their cosine similarity to their most similar topic vector is >= `0.50`, reassign them to that topic.
  - Otherwise, keep them as noise.


In [None]:
# Topic ids known to the model, excluding the noise bucket. `discard` is used
# so a model without a -1 topic does not raise.
topics = set(topic_model.get_topic_info()['Topic'])
topics.discard(-1)

# Fix an explicit, deterministic topic order. Column j of the similarity
# matrix built below corresponds to topic_order[j]; a column position is not
# a topic id and must be mapped back through this list.
topic_order = sorted(topics)

# Topic vector = mean embedding of the documents originally assigned to the
# topic, kept as a (1, dim) row so it can be passed to cosine_similarity.
topic_vectors = {}
for topic in tqdm(topic_order):
    member_embeddings = df.loc[df['og_topic'] == topic, 'embeddings'].tolist()
    topic_vectors[topic] = np.vstack(member_embeddings).mean(axis=0).reshape(1, -1)

# Similarity of every assigned document to its own topic vector.
df['topic_sim'] = np.nan
for topic in tqdm(topic_order):
    idxs = df.index[df['og_topic'] == topic]
    topic_sims = cosine_similarity(
        np.vstack(df.loc[idxs, 'embeddings'].tolist()),
        topic_vectors[topic]
    )
    # cosine_similarity returns shape (n_docs, 1); flatten for column assignment.
    df.loc[idxs, 'topic_sim'] = topic_sims.ravel()

# get noise most sim topic:
df['new_topic'] = df['og_topic']
idxs = df.index[df['og_topic'] == -1]

# Guard: np.vstack raises on an empty sequence, so skip when there is no
# noise to reassign or no topics to compare against.
if len(idxs) > 0 and topic_order:
    topic_matrix = np.vstack([topic_vectors[topic] for topic in topic_order])
    topic_sims = cosine_similarity(
        np.vstack(df.loc[idxs, 'embeddings'].tolist()),
        topic_matrix
    )

    # Best-matching column per noise document, translated back to a topic id.
    best_cols = topic_sims.argmax(axis=1)
    topic_ids = np.asarray(topic_order)[best_cols]
    cosine_sims = topic_sims[np.arange(len(best_cols)), best_cols]

    df.loc[idxs, 'new_topic'] = topic_ids
    df.loc[idxs, 'topic_sim'] = cosine_sims

In [None]:
# Box plot of each document's cosine similarity to its assigned topic vector.
title_style = dict(size=11, weight='bold', fontfamily='Arial', pad=10)
flierprops = dict(marker='x', markersize=1, alpha=0.1, markeredgecolor='#1c1c1c')
box_style = {"facecolor": (.4, .6, .8, .5)}

# Ticks every 0.05 from -0.10 up to (but excluding) 1.0, plus an explicit 1.0.
tick_positions = list(np.arange(-.10, 1.0, .05)) + [1.0]

plt.figure(figsize=(10, 5))
sns.set_style('whitegrid')
plt.title(
    'Distribution of Document Embedding-Topic Vector Cosine Similarities',
    **title_style
)
sns.boxplot(data=df, x='topic_sim', flierprops=flierprops, boxprops=box_style)
plt.xticks(tick_positions, size=9)
plt.xlabel('Cosine Similarity', size=10, fontfamily='Arial', labelpad=10)
plt.show()

In [None]:
# Summary statistics of the document-to-topic cosine similarities.
df.loc[:, 'topic_sim'].describe()

In [None]:
# Minimum similarity a document originally labelled as noise must reach to
# keep its reassigned topic. Tune here; the filter below reads this value.
sim_threshold = 0.50

# any sim below 0, assign as noise:
negative_sim = df['topic_sim'] < 0
df.loc[negative_sim, 'new_topic'] = -1

# only keep noise reassigned topics at or above the similarity threshold:
weak_reassignment = (df['og_topic'] == -1) & (df['topic_sim'] < sim_threshold)
df.loc[weak_reassignment, 'new_topic'] = -1

# Share of documents labelled as noise after both rules are applied.
final_noise = (df['new_topic'] == -1).mean()
print(f'Final noise: {round(final_noise * 100, 2)}%')

In [None]:
# Share of documents per final topic (proportions).
df.loc[:, 'new_topic'].value_counts(normalize=True)

In [None]:
# Number of documents per final topic (raw counts).
df.loc[:, 'new_topic'].value_counts(normalize=False)

## Update BERTopic Model
- ***WARNING:*** Running the update below **overwrites** the corresponding attribute values stored in the topic model object.

In [18]:
# Recompute topic representations using the adjusted topic assignments.
new_topics = df['new_topic'].tolist()
documents = df['text_representation'].tolist()

# Unigram bag-of-words vectorizer with English stop words removed; terms
# appearing in more than 95% of the inputs are dropped.
english_stopwords = list(stopwords.words('english'))
cv_model = CountVectorizer(
    stop_words=english_stopwords,
    ngram_range=(1,1),
    min_df=1,
    max_df=0.95
)

topic_model.update_topics(
    documents,
    topics=new_topics,
    top_n_words=20,
    vectorizer_model=cv_model
)

In [None]:
# Print the refreshed representation of every topic except noise (-1).
updated_info = topic_model.get_topic_info()
for topic, words in zip(updated_info['Topic'], updated_info['Representation']):
    if topic == -1:
        continue
    print(f'Topic {topic}: {", ".join(words)}')

In [None]:
# Fraction of documents labelled as noise in the assignments passed to the model.
noise_count = new_topics.count(-1)
final_noise = noise_count / len(new_topics)
print(f'Final noise proportion: {round(final_noise*100, 4)}%')

## Save Updated Model
- Save to the `models` directory
  - **Name:** `updated_bertopic_model`
- Save updated topic data.

In [None]:
# Persist the updated model, including its c-TF-IDF data, as safetensors.
# NOTE(review): the markdown above says the model goes to the `models`
# directory, but this path points at `topic_data` -- confirm which is intended.
NEW_MODEL_OUT = 'updated_bertopic_model'
NEW_MODEL_OUT_PATH = os.path.join(
    '..',
    'data',
    'topic_data',
    NEW_MODEL_OUT
)
topic_model.save(
    NEW_MODEL_OUT_PATH,
    save_ctfidf=True,
    serialization='safetensors'
)

**Save updated data:**


In [None]:
# Export the per-document topic data, keyed by `full_id`, without the
# embeddings column. The export is built from a derived frame instead of
# dropping the column from `df` in place, so this cell can be re-run (an
# in-place drop raises KeyError the second time) and earlier cells that read
# `df['embeddings']` keep working.
topic_data_path = os.path.join(
    '..',
    'data',
    'topic_data',
    'topic_data.csv'
)

topic_data = df.drop(columns=['embeddings']).set_index('full_id')
topic_data.to_csv(topic_data_path)

`---Complete---`