# Installation and Import
--------------------------------------------------------------------

In [None]:
!pip install bertopic
!pip install --upgrade pandas==1.3.4
!pip install ipywidgets
!pip install nltk

In [None]:
# !pip install gensim
# #!pip install scikit-learn==0.22.1
# #!pip install matplotlib
# !pip install torchvision 
# !pip install bertopic
# !pip torch
#!pip install torch==1.7.1+cu110 torchvision==0.8.2+cu110 torchaudio===0.7.2 -f https://download.pytorch.org/whl/torch_stable.html

In [None]:
import torch
torch.cuda.is_available()

In [None]:
import pandas as pd
import pickle
import random
import numpy as np
import matplotlib.pyplot as plt
from tqdm.notebook import tqdm
from tqdm import tqdm as tqdm
from itertools import product
from bertopic import BERTopic

# Helper functions for converting csv to pkl

In [None]:
def convert_csv_to_pkl(subreddit_csv, subreddit_name=None):
    """Read a CSV file and re-save its contents as a pickle file.

    Parameters
    ----------
    subreddit_csv : str
        Path of the CSV file to read.
    subreddit_name : str, optional
        Name used to build the output path
        ``/home/roikreme/BertTopic/<name>/<name>_main_data.pkl``.
        When omitted, the notebook-level variable ``subreddit`` is used
        (the original behaviour), so it must already be defined.

    Raises
    ------
    NameError
        If ``subreddit_name`` is omitted and no global ``subreddit`` exists.
    """
    if subreddit_name is None:
        # Backward-compatible fallback to the notebook-level variable.
        subreddit_name = subreddit
    path_to_pkl='/home/roikreme/BertTopic/{}/{}_main_data.pkl'.format(subreddit_name,subreddit_name)
    print("start reading csv file")
    data=pd.read_csv(subreddit_csv)
    print('finish to read csv file and start to convert to pkl')
    # protocol=4 is passed explicitly, as in the rest of this notebook's pickles.
    data.to_pickle(path_to_pkl,protocol=4)

# subreddit='wallstreetbets'
# path='/home/roikreme/BertTopic/{}/{}_main_data.csv'.format(subreddit,subreddit)
# convert_csv_to_pkl(path)

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
from umap import UMAP
import nltk
import string
from nltk.stem import WordNetLemmatizer
nltk.download('punkt')
nltk.download('wordnet')

def get_topic_model(documents, n_neighbors, min_topic_size,calculate_probabilities = True, random_state=None):
    """Build a BERTopic model with a custom UMAP and CountVectorizer and fit it.

    Parameters
    ----------
    documents : list of str
        Documents the model is fitted on.
    n_neighbors : int
        ``n_neighbors`` passed to UMAP.
    min_topic_size : int
        ``min_topic_size`` passed to BERTopic.
    calculate_probabilities : bool, default True
        Forwarded to BERTopic.
    random_state : int, optional
        Seed forwarded to UMAP so that repeated runs can be reproduced.
        The default ``None`` leaves UMAP unseeded, as before.

    Returns
    -------
    BERTopic
        The fitted topic model.
    """
    # Dimensionality reduction: 10 components, cosine metric, min_dist of 0.
    umap_model = UMAP(n_neighbors=n_neighbors, n_components=10, min_dist=0.0, metric='cosine', random_state=random_state)

    # Topic representation: English stop words removed, terms must appear in at least 20 documents.
    vectorizer_model = CountVectorizer(stop_words="english", min_df=20)

    topic_model = BERTopic(
        umap_model=umap_model,
        vectorizer_model=vectorizer_model,
        calculate_probabilities=calculate_probabilities,
        verbose=True,
        min_topic_size=min_topic_size,
        nr_topics="auto",
    )
    topic_model.fit(documents)

    return topic_model

In [None]:
from bertopic import BERTopic
import gensim.corpora as corpora
from gensim.models.coherencemodel import CoherenceModel


def get_coherence(df, topic_model):
    """Compute the ``c_v`` topic coherence of a BERTopic model on a labelled dataframe.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a ``Topic`` column (topic id of each document) and a
        ``title_selftext`` column (the document text).
    topic_model : BERTopic
        Fitted model whose topic words are evaluated.

    Returns
    -------
    float
        Coherence value returned by gensim's ``CoherenceModel``.
    """
    # Concatenate all documents of a topic into one long document per topic.
    documents_per_topic = df.groupby(['Topic'], as_index=False).agg({'title_selftext': ' '.join})
    cleaned_docs = topic_model._preprocess_text(documents_per_topic.title_selftext.values)

    # Tokenise with the same analyzer the model's vectorizer uses.
    analyzer = topic_model.vectorizer_model.build_analyzer()

    # Extract features for Topic Coherence evaluation
    tokens = [analyzer(doc) for doc in cleaned_docs]
    dictionary = corpora.Dictionary(tokens)
    corpus = [dictionary.doc2bow(token) for token in tokens]

    # Use the topics stored in `df` itself instead of a notebook-level `topics` variable.
    n_topics = df['Topic'].nunique()

    topic_words=[]
    # NOTE(review): the "- 2" is kept from the original code; if the ids are -1..K-1
    # this leaves out the last topic - confirm whether "- 1" was intended.
    for topic_id in range(n_topics - 2):
        topic = topic_model.get_topic(topic_id)
        if isinstance(topic, bool):
            # A bool is returned instead of a word list; skip such topic ids.
            continue
        # Keep only words known to the dictionary; the original compared against
        # the tokens of the first topic document only.
        known_words = [entry[0] for entry in topic if entry[0] in dictionary.token2id]
        if known_words:
            # An empty topic carries no information for the coherence measure.
            topic_words.append(known_words)

    # Evaluate
    coherence_model = CoherenceModel(topics=topic_words,
                                     texts=tokens,
                                     corpus=corpus,
                                     dictionary=dictionary,
                                     coherence='c_v')
    coherence = coherence_model.get_coherence()

    return coherence



Load the cleaned data

In [None]:
subreddit="antiwork"
# Location of the cleaned dataframe pickle for the selected subreddit.
clean_df_path = '/home/roikreme/BertTopic/{}/model/{}_clean_df.pickle'.format(subreddit,subreddit)
# NOTE: pickle.load executes code stored in the file - only load files you trust.
with open(clean_df_path, "rb") as pickle_file:
    df = pickle.load(pickle_file)

df

In [None]:
# Empty strings count as missing text: turn them into NaN, then drop every
# row whose `title_selftext` is missing.
df = df.replace("", float("NaN")).dropna(subset=["title_selftext"])
df

In [None]:
# Seeded so that re-running the cell draws the same documents.
random.seed(1)
content=df['title_selftext'].to_list()
# random.sample raises ValueError when asked for more items than the list holds,
# so the requested size is capped at the number of available documents.
docs_for_transform = random.sample(content, min(141130, len(content)))


# If the specific subreddit contains only title
-------------------------------------------------------------------

In [None]:
# Seeded so that re-running the cell draws the same training documents.
random.seed(1)
# Cap the sample size: random.sample raises ValueError if it exceeds len(content).
docs_for_train = random.sample(content, min(141130, len(content)))

# Train on a random sample of 80,000 docs, repeated 5 times
------------------------------------------------------------

In [None]:
# Train on 5 random samples of 80,000 documents and record the coherence of each run.
random_list=[]
for i in tqdm(range(1,6)):
    # Seed with the run number: a constant seed inside the loop would draw
    # exactly the same sample on every iteration.
    random.seed(i)
    # `content` is the list of documents built from df['title_selftext'].
    docs = random.sample(content,80000)
    model=get_topic_model(docs, 20, 50)
    topics, probas = model.transform(list(df['title_selftext']))
    df["Topic"] = topics
    # Work on a copy so the added columns are not written into a slice of df.
    tmp = df[['post_id',"Topic",'title_selftext']].copy()
    tmp["ID"] =  range(len(df))
    # NOTE(review): insert_topic_word is defined in a later cell of this notebook;
    # that cell has to be executed before this loop.
    insert_topic_word(tmp)
    coh = get_coherence(tmp, model)
    random_list.append({"random_num":i,"coherenc": coh})
    # Checkpoint the results collected so far after every run.
    pd.DataFrame(random_list).to_csv("/home/roikreme/BertTopic/random/random{}_coh.csv".format(i))

In [None]:
pd.DataFrame(random_list).to_csv("/home/roikreme/BertTopic/random/final_random.csv")

# optimization of the model
------------------------------------------------------

In [None]:
#new_topics, new_probs = topic_model.reduce_topics(docs, topics, probabilities=probas)
n_neighbors = [15,20,25, 30, 35,40]
min_topic_sizes = [50, 100 ,150, 200, 250,300]



Create an empty results table for the optimization runs


In [None]:
# List of per-run result dicts; filled by the optimization loop.
res = []
# Column layout of the results table; every column starts empty.
d = {'Number Of Negihbor':[],'Min Topic Size':[],'Num of Topic':[],'coherence':[],'Quantity of topic -1':[],'topic -1 %':[],'Total Amount':[]}
# Empty dataframe with the columns above and no rows.
res_tabel = pd.DataFrame(data=d,index=[])
res_tabel


Read the optimization table

In [None]:
with open("/home/roikreme/BertTopic/{}/random optimization/all/new_data2/final_tabel_df.csv".format(subreddit), "rb") as f:
    res_tabel = pd.read_csv(f)

res_tabel

In [None]:
temp2=[50,100,150,200,250,300]
temp1=[15,20]


In [None]:
import os
torch.cuda.empty_cache()
os.environ['CUDA_LAUNCH_BLOCKING'] = '1'
model = get_topic_model(docs_for_train, 20, 300)



Optimization of BERTopic: grid search over `n_neighbors` and `min_topic_size`


In [None]:
torch.cuda.empty_cache()
from collections import Counter

# `status` names the output sub-folder but is never assigned in this notebook.
# Reuse a value already present in the kernel, otherwise fall back to "all",
# the folder used by the other "random optimization" paths in this notebook.
# NOTE(review): confirm that "all" is the intended folder.
status = globals().get("status", "all")

index_save=0
# Every (n_neighbors, min_topic_size) combination; the progress-bar total follows the grid size.
grid = list(product(n_neighbors, min_topic_sizes))
for n_neighbor, min_topic_size in tqdm(grid, total=len(grid)):

    # Train and save the model of this iteration
    model = get_topic_model(docs_for_train, n_neighbor, min_topic_size)
    model.save("/home/roikreme/BertTopic/{}/random optimization/all/models/my-model_{}_{}_{}".format(subreddit,n_neighbor,min_topic_size,subreddit))
    print("finish model")

    # Assign a topic to every document of the full dataframe
    topics, probas = model.transform(list(df['title_selftext']))
    df['Topic']=topics
    print("finish transform")

    # Save the data for this division into topics
    path_to_save='/home/roikreme/BertTopic/{}/random optimization/{}/new_data2/df_80k_trainer/{}_{}.pickle'.format(subreddit,status,n_neighbor,min_topic_size)
    df.to_pickle(path_to_save,protocol=4)

    # Count how many documents transform() assigned to each topic
    get_topic=model.get_topic_info()
    c = Counter(topics)

    # Update the counts by topic id (not by row position), so the values stay
    # correct whatever the row order of get_topic_info() is and even when a
    # topic received no document.
    get_topic['Count']=get_topic['Topic'].map(c).fillna(0).astype(int)

    sum_ = sum(c.values())
    get_topic['percentage']=get_topic['Count'].apply(lambda x: str(round((x/sum_)*100,2))+'%')

    # Size of topic -1, looked up explicitly by id
    outlier_count = c.get(-1, 0)
    outlier_percentage = str(round((outlier_count/sum_)*100,2))+'%'

    print("start coh")
    coh = get_coherence(df, model)

    # Add this run to the results table
    res_tabel.loc[len(res_tabel.index)]=[str(n_neighbor),str(min_topic_size),str(len(get_topic)),coh,str(outlier_count),outlier_percentage,str(sum_)]

    # Checkpoint the results table every second iteration.
    # It gets its own .csv path: previously it was written to `path_to_save`
    # and overwrote the dataframe pickle saved above.
    if index_save % 2==0:
        print("save")
        res_tabel.to_csv('/home/roikreme/BertTopic/{}/random optimization/{}/new_data2/df_80k_trainer/{}_{}_res_tabel.csv'.format(subreddit,status,n_neighbor,min_topic_size))
    index_save+=1

    print("coh is:{}, min_topic_size:{}, n_neighbor:{} ".format(coh,min_topic_size,n_neighbor))
    res.append({"coherenc": coh, "min_topic_size": min_topic_size, "n_neighbor":n_neighbor})
    



In [None]:
# save the last version of table - sorted
res_tabel.sort_values(by='Num of Topic',inplace=True)
res_tabel.to_csv("/home/roikreme/BertTopic/{}/random optimization/new_data/final_tabel_df.csv".format(subreddit))


save the results of coherence in csv file

In [None]:
pd.DataFrame(res).to_csv("/home/roikreme/BertTopic/random/{}/final_coh.csv".format(subreddit))

In [None]:
res_tabel

# create a plot for optimization - coherence
--------------------------------------------------------------------------------

In [None]:
# Coherence as a function of the number of topics of each run.
plot_data = res_tabel[['Num of Topic','coherence']].copy()
# The topic counts can be stored as strings: convert them to numbers and sort,
# otherwise matplotlib uses a categorical x-axis in row order.
plot_data['Num of Topic'] = pd.to_numeric(plot_data['Num of Topic'])
plot_data = plot_data.sort_values(by='Num of Topic')
x=plot_data['Num of Topic'].to_list()
y=plot_data['coherence'].to_list()
fig, ax = plt.subplots()
ax.plot(x,y)
ax.set_xlabel('Num of Topic')
ax.set_ylabel('coherence')
ax.set_title('coherence graph');

In [None]:
# import matplotlib.pyplot as plt
# n=[15,20,25]
# m_topic=[50,100,150,200,250,300]
# neg_topic=[(neg,topic) for neg,topic in product(n, m_topic)]

# neg_topic=[str(r) for r in neg_topic]
# score=[round(r['coherenc'],3) for r in res]

# fig, axs = plt.subplots(1, figsize=(25, 10), sharey=True)

# axs.set_xlabel("(n_neighbors, min_topic_sizes)")
# axs.set_ylabel("coherence")

# ymax=max(score)
# xpos=score.index(ymax)
# xmax=neg_topic[xpos]
# axs.annotate("Max = {}".format(ymax),xy=(xmax,ymax),xytext=(xmax,ymax),arrowprops=dict(facecolor='black'))
# axs.bar(neg_topic, score,width=0.5,align='center')
# plt.show()

get the maximum coherence

In [None]:
# Row of the run with the highest coherence.
coh = pd.DataFrame(res)
# idxmax returns an index label, which is what .loc expects
# (argmax returns a position and only matches .loc on a default index).
coh.loc[coh["coherenc"].idxmax()]

# Build the optimized model
---------------------------------------------------------------------

In [None]:
# parameters of the optimizing model
n_neighbor=15
min_topic_size=150 

# create the model
model = get_topic_model(docs_for_train, n_neighbor, min_topic_size)


In [None]:
torch.cuda.empty_cache()
len(df)

save the model

In [None]:
#model=BERTopic.load("/home/roikreme/BertTopic/{}/model/all/my-model_{}_{}_{}".format(subreddit,n_neighbor,min_topic_size,subreddit))

In [None]:
model.save("/home/roikreme/BertTopic/{}/model/all/my-model_{}_{}_{}".format(subreddit,n_neighbor,min_topic_size,subreddit))

In [None]:

topics, probas = model.transform(df['title_selftext'].to_list())


Add topic and probability columns

In [None]:
# Topic assigned to each document by model.transform.
df["Topic"] = topics
# Highest value of each row of `probas`, rounded to 4 decimals.
prob=[round(np.max(p),4) for p in probas]
df['probas']=prob
df
# tmp = df[['post_id','status',"title","Topic",'title_selftext']]
# tmp["ID"] =  range(len(df))


Merge the BERTopic output (`get_topic_info`) into the main df


In [None]:
# Attach the columns of get_topic_info() to every document via its topic id.
gettopic=model.get_topic_info()
df=df.merge(gettopic,left_on='Topic',right_on='Topic')
# Running row number per record; `len(df)` alone would give every row the same value.
df['ID']=range(len(df))
#df['cont/len']=df['Count'].div(len(df))
df

Insert the topic words into each record of the dataframe


In [None]:
def insert_topic_word(dff, topic_model=None):
    """Add a ``topic_words`` column holding the words of each row's topic.

    Parameters
    ----------
    dff : pandas.DataFrame
        Dataframe with a ``Topic`` column; it is modified in place.
    topic_model : BERTopic, optional
        Model queried with ``get_topic``. When omitted, the notebook-level
        variable ``model`` is used (the original behaviour).

    Returns
    -------
    None
    """
    if topic_model is None:
        # Backward-compatible fallback to the notebook-level model.
        topic_model = model

    # Iterate over the topic ids actually present in the dataframe instead of
    # assuming they form a contiguous range starting at -1.
    for topic_id in tqdm(sorted(set(dff['Topic'].to_list()))):
        topic = topic_model.get_topic(topic_id)
        if topic is None or isinstance(topic, bool):
            # No word list available for this id.
            continue
        # dict.fromkeys removes duplicates but keeps the order returned by the
        # model, so the joined string is the same on every run (a set is not ordered).
        unique_words = dict.fromkeys(entry[0] for entry in topic)
        dff.loc[dff.Topic == topic_id, "topic_words"] = ', '.join(unique_words)
            
# insert_topic_word(df)
# df

In [None]:

insert_topic_word(df)
df

save the data as a pkl file

In [None]:
path_to_save='/home/roikreme/BertTopic/{}/model/all/{}_model_data.pickle'.format(subreddit,subreddit)
df.to_pickle(path_to_save,protocol=4)

In [None]:
import sklearn
subreddit="politics"
model = BERTopic.load("/home/roikreme/BertTopic/{}/model/all/my-model_{}_15_300".format(subreddit,subreddit))

In [None]:
model.get_topic_info()

----------------------------------------------------------------------

In [None]:
new_df.iloc[19459].title
#model.get_topic(-1)

In [None]:
coh = get_coherence(new_df, model)
coh

In [None]:
m=model.get_topic_info()


In [None]:
m.merge(df,left_on='Topic',right_on='Topic')