In [9]:
!module load cuda11.8
import gensim
from top2vec import Top2Vec
from gensim.models.coherencemodel import CoherenceModel
import os
import pandas as pd


In [12]:
# Project root. Defaults to the original location on the cluster; set the
# TOP2VEC_PROJECT_DIR environment variable to run this notebook from another
# checkout or machine without editing the cell.
PATH = os.environ.get(
    "TOP2VEC_PROJECT_DIR",
    "/home/doosti@chapman.edu/projects/Facebook/top2vec/",
)
DATA_PATH = os.path.join(PATH, 'data')

# Saved Top2Vec model files (inside DATA_PATH) to inspect.
models = [
    "top2vec_learn_doc2vec_2024-06-20.model",
    "top2vec_deeplearn_universal_notoken_2024-06-25.model",
    "top2vec_deeplearn_universal_2024-06-25.model",
    "top2vec_deeplearn_universal_2024-06-20.model",
    "top2vec_deeplearn_doc2vec_notoken_2024-06-21.model",
]

# Load every candidate and print the shape of its document-vector matrix.
# A model object without a `document_vectors` attribute is reported instead
# of aborting the loop. `model` is rebound on every iteration, so after the
# loop it refers to the last entry of `models`.
for model_name in models:
    model_path = os.path.join(DATA_PATH, model_name)
    model = Top2Vec.load(model_path)
    print(model_name)
    try:
        print(model.document_vectors.shape)
    except AttributeError:
        print("No document vectors")

top2vec_learn_doc2vec_2024-06-20.model
No document vectors
top2vec_deeplearn_universal_notoken_2024-06-25.model
(820099, 512)
top2vec_deeplearn_universal_2024-06-25.model
(820099, 512)
top2vec_deeplearn_universal_2024-06-20.model
(820099, 512)
top2vec_deeplearn_doc2vec_notoken_2024-06-21.model
(820099, 300)


In [13]:
# Model files used for the remainder of the notebook (three of the
# candidates listed in `models`). Order matters: later cells pick an entry
# by position.
final_models = [
    "top2vec_deeplearn_universal_notoken_2024-06-25.model",
    "top2vec_deeplearn_universal_2024-06-25.model",
    "top2vec_deeplearn_doc2vec_notoken_2024-06-21.model",
]

In [66]:
# Load the model used by the cells below: the entry at index 2 of
# `final_models`, resolved relative to DATA_PATH.
model = Top2Vec.load(os.path.join(DATA_PATH,final_models[2]))

In [36]:
# get_topic_sizes() yields two values, unpacked as sizes and topic numbers.
# Only the length of `topic_sizes` is used here, as the topic count.
topic_sizes, topic_nums = model.get_topic_sizes()
print(f"Number of topics: {len(topic_sizes)}")

Number of topics: 5390


In [37]:
# Reduce the model's topics down to 5091 with interval=10.
# Super slow: the recorded run of this cell ended in a KeyboardInterrupt.
model.hierarchical_topic_reduction(num_topics=5091, interval=10)

KeyboardInterrupt: 

In [67]:
# Search for 10 documents using document id 810944 as the seed.
# The call returns three values; only the document texts are used here.
# The two unused results get descriptive underscore names instead of
# `_1`/`_2`, because IPython uses `_<n>` for its cached cell outputs
# (`_1` is Out[1]) and assigning to them shadows that history.
# NOTE(review): presumably the 2nd/3rd values are scores and ids, matching
# the other search cells in this notebook -- confirm.
docs, _scores, _ids = model.search_documents_by_documents(doc_ids=[810944], num_docs=10)
print(docs)

['iphone x review everything you need to know about apple new ive been using the iphone x for a week and its definitely worth the  price tag topics are apple inc iphone'
 'apple is having a bad  it going to be a bad year for the iphone here why topics are apple inc apple iphone iphone s'
 'live from the iphone x and iphone  event check out everything apple just announced topics are iphone apple inc'
 'superchange iphone how to supercharge your iphone in  minutes topics are ipad apple inc iphone'
 'best features of apple new iphone x apple unveiled the iphone x here are the best features of the  phone topics are iphone apple inc'
 'the iphone  will be boring sorry apple fans the iphone  is going to be boring topics are apple inc samsung group iphone iphone '
 'what it like to use apple new iphone x handson with the new  apple iphone x topics are apple inc iphone'
 'first look at apple new iphone facial unlock feature apple got rid of the home button on the iphone x but you can use your 

In [68]:
# Search for 10 documents using the single keyword "espresso".
# The middle return value is unused; it is named `_scores` rather than `_1`
# because IPython uses `_<n>` for its cached cell outputs (`_1` is Out[1])
# and assigning to it shadows that history.
# NOTE(review): presumably the middle value holds the similarity scores --
# confirm.
docs, _scores, ids = model.search_documents_by_keywords(keywords=["espresso"], num_docs=10)
print(docs)

['eat drink perth the aviary espresso martinis what better than a traditional espresso martini salted caramel and popcorn espresso martinis for only  make the most of eatdrinkperth and visit the aviary perth for their letsthursday special super happy hour with drink specials from pm pm and thursday tucker meals for just  the aviary is your perfect thursday destination'
 'kahlúa espresso martini the ultimate way to impress your guests at your next cocktail party see more recipes here khluausdrinks kahlúa espresso martini the ultimate way to impress your guests at your next cocktail party see more recipes here khluausdrinks topics are kahlúa espresso martini buzzfeed'
 'espresso martini layer cake what better than a sip of a cocktail a bite of a dessert that tastes like one get our espresso martini cake recipe topics are cake dessert espresso martini classic cocktail'
 'how to make a classic espresso martini for internationalcoffeeday jamieolcomespmartini drinkstube topics are espresso e

In [61]:
# Free-text query against the model; show the first 100 characters of each
# returned document next to its score.
docs, scores, ids = model.query_documents(query="clean espresso machine", num_docs=10)
# Iterate documents and scores in lockstep instead of indexing `scores` by
# position; the printed text is the same.
for doc, score in zip(docs, scores):
    print(f"Document: {doc[:100]}... Score: {score}")

Document: portable espresso true coffee snobs would love this portable espresso machine topics are espresso co... Score: 0.5969740748405457
Document: portable espresso coffee machine we all know someone that always needs a coffee topics are coffee es... Score: 0.5849196910858154
Document: bripe the coffee brew pipe simple espresso everywhere... Score: 0.5835559964179993
Document:  coffee maker this goldplated  machine just makes coffee... Score: 0.5628326535224915
Document:  coffee maker this goldplated  machine just makes coffee... Score: 0.5628326535224915
Document: fresh espresso and luscious crema watch the barista express in action topics are breville espresso c... Score: 0.5587366223335266
Document: how to clean a coffee maker how to clean a coffee maker because it shouldnt be that color inside top... Score: 0.5566979050636292
Document: make espresso anywhere you can make espresso anytime anywhere amzntoaaloi topics are coffee espresso... Score: 0.5547324419021606
Document: lever

In [63]:
# Print the full text of the entry at index 5 of `docs`.
# NOTE(review): `docs` is reassigned by several cells, so what this shows
# depends on which search cell ran last -- confirm it is the intended one.
print(docs[5])

fresh espresso and luscious crema watch the barista express in action topics are breville espresso coffee


In [None]:
def clustering(model, n_neighbors=15, min_dist=0.1, min_cluster_size=15,
               topic_merge_delta=0.1, num_topics=None):
    """Re-cluster a trained Top2Vec model using UMAP and HDBSCAN settings.

    The earlier draft of this helper called ``umap_topic_reduction`` and
    ``hdbscan_topic_reduction``, which are not part of the documented
    Top2Vec API, and passed ``0.1`` to ``hierarchical_topic_reduction``,
    whose argument is a number of topics. Top2Vec exposes UMAP/HDBSCAN
    tuning through ``compute_topics``, so that is used here.
    NOTE(review): confirm ``compute_topics`` exists in the installed
    Top2Vec version.

    Parameters
    ----------
    model : Top2Vec
        A trained/loaded model. It is modified in place.
    n_neighbors : int, default 15
        Forwarded to UMAP as ``n_neighbors``.
    min_dist : float, default 0.1
        Forwarded to UMAP as ``min_dist``.
    min_cluster_size : int, default 15
        Forwarded to HDBSCAN as ``min_cluster_size``.
    topic_merge_delta : float, default 0.1
        Forwarded to ``compute_topics``. NOTE(review): the original draft's
        bare ``0.1`` is assumed to have been meant as this threshold --
        confirm intent.
    num_topics : int or None, default None
        When given, ``hierarchical_topic_reduction(num_topics)`` is run
        after re-clustering. Skipped by default.

    Returns
    -------
    The same ``model`` object, to allow chaining.
    """
    # Passing explicit dicts replaces Top2Vec's own defaults wholesale, so
    # the remaining keys are spelled out as well.
    # NOTE(review): n_components/metric/cluster_selection_method values are
    # intended to mirror Top2Vec's defaults -- confirm against its docs.
    umap_args = {
        "n_neighbors": n_neighbors,
        "n_components": 5,
        "metric": "cosine",
        "min_dist": min_dist,
    }
    hdbscan_args = {
        "min_cluster_size": min_cluster_size,
        "metric": "euclidean",
        "cluster_selection_method": "eom",
    }
    model.compute_topics(
        umap_args=umap_args,
        hdbscan_args=hdbscan_args,
        topic_merge_delta=topic_merge_delta,
    )
    # Optional follow-up reduction to a fixed number of topics.
    if num_topics is not None:
        model.hierarchical_topic_reduction(num_topics)
    return model