# Description:
In this notebook we compute the topic coherence of the topics obtained by BERTopic on the newsapi corpus.

In [1]:
from requests import request
from gensim.corpora.dictionary import Dictionary
from gensim.models.coherencemodel import CoherenceModel
import json

In [3]:
# Get documents
req = {"filters": [], "batch_size": 10000}
response_generator = request('get', url='http://0.0.0.0:8000/all-docs-generator', json=req, stream=True).\
    iter_lines(delimiter=b"#SEP#")

docs = []
i = 0
for line in response_generator:
    if i % req['batch_size'] == 0:
        if i == 0:
            print("Begin generator.")
        else:
            print(f"Iteration done: {i}.")
    # Filter out keep-alive new lines
    if line:
        doc = json.loads(line.decode('utf-8'))
        answer = doc["answer"]
        if answer:
            docs.append(answer.replace('#SEPTAG#', ' '))
            i += 1

Begin generator.
Iteration done: 10000.
Iteration done: 10000.
Iteration done: 20000.
Iteration done: 20000.
Iteration done: 30000.
Iteration done: 30000.
Iteration done: 40000.
Iteration done: 40000.
Iteration done: 50000.
Iteration done: 50000.
Iteration done: 60000.
Iteration done: 60000.
Iteration done: 70000.
Iteration done: 70000.
Iteration done: 80000.
Iteration done: 80000.
Iteration done: 90000.
Iteration done: 90000.
Iteration done: 100000.
Iteration done: 100000.
Iteration done: 110000.
Iteration done: 110000.
Iteration done: 120000.
Iteration done: 120000.
Iteration done: 130000.
Iteration done: 130000.
Iteration done: 140000.
Iteration done: 140000.
Iteration done: 150000.
Iteration done: 150000.
Iteration done: 160000.
Iteration done: 160000.
Iteration done: 170000.
Iteration done: 170000.
Iteration done: 180000.
Iteration done: 180000.
Iteration done: 190000.
Iteration done: 190000.
Iteration done: 200000.
Iteration done: 200000.
Iteration done: 210000.
Iteration done: 2

In [5]:
# Get texts and dictionary
texts = list(map(lambda x: x.split(' '), docs))
dictionary = Dictionary(texts)

In [6]:
# Get tokenized topics
r = request('get', url='http://0.0.0.0:8000/topic-names').json()
topics = [i.split('_')[1:] for i in r['topic_names']][1:]

In [29]:
# Compute the coherence for each topic
cm = CoherenceModel(
    topics=topics, 
    texts=texts, 
    dictionary=dictionary, 
    coherence='c_v',
    topn=5
)

coherences = dict(zip([" ".join(t) for t in topics], cm.get_coherence_per_topic()))

In [30]:
coherences

{'covid covid 19 vaccine coronavirus': 0.6439541664091119,
 'netflix old actor movie': 0.22302934121499937,
 'nfl football team coach': 0.3589343094716429,
 'trump election republican elect': 0.12039803721881434,
 'nba lakers james warriors': 1.0,
 'bitcoin cyberpunk 2077 cyberpunk 2077': 0.3361138688779574,
 'stocks china stock market': 0.41290678380036705,
 'space nasa spacex planet': 0.3137676472816739,
 'apple iphone iphone 12 oneplus': 1.0,
 'nintendo xbox game ps5': 1.0,
 'prince johnson harry meghan': 1.0,
 'tesla electric elon musk vehicles': 0.9431743025877409,
 'gamestop fortnite intel stock': 0.21201869514778282,
 'league manchester liverpool premier league': 1.0,
 'mlb mets baseball dodgers': 0.3555204218364344,
 'india england cricket zealand': 1.0,
 'wwe ufc wrestlemania wrestling': 1.0,
 'juventus milan roma napoli': 1.0,
 'samsung galaxy s21 android samsung galaxy': 1.0,
 'startup funding stimulus raises': 0.6165156598748913}