In [4]:
import os
import pandas as pd
import numpy as np
from top2vec import Top2Vec
import sys
import re
from pprint import pprint

  from .autonotebook import tqdm as notebook_tqdm


In [5]:
# Project root that contains the `data/` folder.
# The location can be overridden with the TOP2VEC_PROJECT_DIR environment
# variable so the notebook is not tied to one user's home directory; when the
# variable is unset the original location is used.
PATH = os.environ.get(
    "TOP2VEC_PROJECT_DIR",
    "/home/doosti@chapman.edu/projects/Facebook/top2vec/",
)
DATA_PATH = os.path.join(PATH, 'data')

# Text data table. low_memory=False makes pandas read the file in one pass
# instead of in chunks, which avoids mixed-dtype column inference.
data = pd.read_csv(os.path.join(DATA_PATH, "data_all_text.csv"), low_memory=False)

# Saved Top2Vec model, stored in the same data folder as the CSV.
model_name = "top2vec_learn_doc2vec_2024-06-20.model"
model = Top2Vec.load(os.path.join(DATA_PATH, model_name))

## Topics

In [9]:
# Look up the best-matching topic for every document id stored in the model,
# then preview the result: its shape followed by the first ten topic numbers.
all_doc_ids = model.document_ids
topic_nums, topic_scores, topic_words, word_scores = model.get_documents_topics(
    doc_ids=all_doc_ids
)
print(topic_nums.shape, topic_nums[:10], sep="\n")

(820099,)
[1343   45 8915 4429 1041 3890 2536  494 5323 2764]


In [13]:
# Report how many topics the loaded model contains.
n_topics = model.get_num_topics()
print(f"Number of topics: {n_topics}")

Number of topics: 9699


In [16]:
# Print the word lists of 30 topics returned by the model.
#
# The results are bound to their own names so that this cell does not
# overwrite `topic_nums`, `topic_words` and `word_scores` from the
# `get_documents_topics` cell above; those hold one entry per document and
# would otherwise have to be recomputed.
overview_topic_words, overview_word_scores, overview_topic_nums = model.get_topics(num_topics=30)

# Label each word list with the topic number reported by the model rather
# than with a loop counter, so the label stays correct by construction.
for overview_num, overview_words in zip(overview_topic_nums, overview_topic_words):
    print(overview_num)
    print(overview_words)

0
['rodham' 'hillary' 'clinton' 'democratic' 'bernie' 'republican' 'sanders'
 'presidential' 'delegate' 'election' 'nominating' 'dnc' 'mtpdaily'
 'donald' 'trump' 'gop' 'republicans' 'candidate' 'elect' 'democrats'
 'political' 'democrat' 'voter' 'convention' 'campaign' 'barack'
 'presidency' 'president' 'democracy' 'obama' 'rncincle' 'committee' 'rnc'
 'caucus' 'conservatism' 'electoral' 'turks' 'clintons' 'debate' 'endorse'
 'primary' 'supporter' 'kaine' 'rhetoric' 'politic' 'establishment' 'tyt'
 'msnbc' 'span' 'maddow']
1
['motogp' 'prix' 'marquez' 'rossi' 'motorcycle' 'valentino' 'aragongp'
 'qatargp' 'vinales' 'catalangp' 'videopass' 'spanishgp' 'austriangp'
 'japanesegp' 'dutchgp' 'motogpbuzz' 'argentinagp' 'malaysiangp' 'pedrosa'
 'germangp' 'iannone' 'czechgp' 'sanmarinogp' 'valenciagp' 'dovizioso'
 'overtake' 'americasgp' 'frenchgp' 'ducati' 'britishgp' 'italiangp'
 'sepang' 'motul' 'racing' 'crutchlow' 'lap' 'espargaro' 'australiangp'
 'zarco' 'yamaha' 'lorenzo' 'michelin' '

In [17]:
# Find the 10 topics that best match the keywords "stephen" and "curry" and
# print each topic's number, its similarity score and its words.
#
# The search results are bound to their own names so that this cell does not
# overwrite `topic_nums`, `topic_scores`, `topic_words` and `word_scores`
# from the `get_documents_topics` cell above, which describe every document
# rather than this keyword search.
search_keywords = ["stephen", "curry"]
(
    matched_topic_words,
    matched_word_scores,
    matched_topic_scores,
    matched_topic_nums,
) = model.search_topics(keywords=search_keywords, num_topics=10)

for matched_words, matched_score, matched_num in zip(
    matched_topic_words, matched_topic_scores, matched_topic_nums
):
    print(f"Topic #{matched_num} with score {matched_score}")
    print(matched_words)

Topic #50 with score 0.684190152354502
['curry' 'steph' 'warriors' 'stephen' 'nbafinals' 'klay' 'stephgonnasteph'
 'dubnation' 'nba' 'draymond' 'csnba' 'three' 'dubs' 'durant' 'cavaliers'
 'golden' 'lebron' 'nbaonabc' 'cavs' 'kyrie' 'nbaplayoffs' 'iguodala'
 'thisiswhyweplay' 'irving' 'warriorsground' 'kd' 'layup' 'nbaallstar'
 'basketball' 'tnt' 'jblxnba' 'blazers' 'pelicans' 'harden' 'westbrook'
 'nuggets' 'bogut' 'clippers' 'nbavote' 'kerr' 'pointer' 'grizzlies' 'okc'
 'thompson' 'ginobili' 'lakers' 'cleveland' 'warrior' 'celtics' 'buzzer']
Topic #4851 with score 0.5456308964355479
['stephgonnasteph' 'csnba' 'curry' 'warriors' 'klay' 'dubnation' 'steph'
 'nbafinals' 'bogut' 'nba' 'stephen' 'cavaliers' 'kyrie' 'draymond'
 'three' 'lebron' 'thompson' 'cavs' 'golden' 'dubs' 'irving'
 'warriorsground' 'nbaplayoffs' 'tnt' 'nbaonabc' 'iguodala' 'nbavote'
 'pointer' 'durant' 'layup' 'timberwolves' 'celtics' 'jblxnba' 'blazers'
 'grizzlies' 'defendtheland' 'nbaallstar' 'thisiswhyweplay' 'wa

In [19]:
# Retrieve the 10 documents closest to the query "sponsor" and print each
# one's id and score on one line, followed by the document text on the next.
sponsor_hits = model.query_documents("sponsor", num_docs=10)
documents, doc_scores, doc_ids = sponsor_hits
for hit_text, hit_score, hit_id in zip(documents, doc_scores, doc_ids):
    print(f"Document #{hit_id} with score {hit_score}", hit_text, sep="\n")

Document #486629 with score 0.658627450466156
message sponsor tonight
Document #13246 with score 0.5395929217338562
sunday bath like not want bath
Document #181289 with score 0.5391886830329895
question not kid sleep sponsor dymadon
Document #750225 with score 0.5303393602371216
sponsor want know sponsor well ask pro
Document #486628 with score 0.5291780233383179
message sponsor wednesday
Document #786908 with score 0.5236440896987915
bath time
Document #553085 with score 0.5197162628173828
topic bath
Document #688422 with score 0.5189884901046753
know number know number learn visit knowthenumber sponsor
Document #180984 with score 0.5176442861557007
bath time feel good topic bath
Document #171849 with score 0.5152885317802429
parakeet enjoy bath parakeet enjoy bath topic bath


In [8]:
# Show one `text` entry whose `creator_name` is "NBA".
# A fixed `random_state` makes the displayed row reproducible after a kernel
# restart; change the seed to look at a different entry.
nba_texts = data.loc[data["creator_name"] == "NBA", "text"]
nba_texts.sample(1, random_state=0)

91962    LeBron James... from deeeeeppp!!
Name: text, dtype: object

In [26]:
# List the 10 words the model considers most similar to "uw", with their
# similarity scores.
#
# The scores get their own name so that this cell does not overwrite the
# `word_scores` array produced by the topic cells above.
similar_terms, similarity_scores = model.similar_words(keywords=["uw"], num_words=10)
for term, similarity in zip(similar_terms, similarity_scores):
    print(f"{term} with score {similarity}")

stanford with score 0.42870877493677667
usc with score 0.387541898749018
huskies with score 0.3739953362741964
freshman with score 0.3671576293245967
ucla with score 0.3540520634050389
university with score 0.35380505298247933
campus with score 0.34902228412206987
fighton with score 0.3443369310729574
collegiate with score 0.34260423556193315
goducks with score 0.3290790011485485
