In [1]:
import json

In [2]:
data = None
with open("./sampleData.json", "r") as rfile:
    data = json.load(rfile)

In [3]:
data.keys()

dict_keys(['query', 'urls', 'dateTo', 'dateFrom', 'queryData', 'topk', 'isTest', 'success'])

In [4]:
import spacy

In [5]:
from gensim import corpora

In [6]:
nlp = spacy.load("en_core_web_md")

In [7]:
data = data['queryData']

In [8]:
data.keys()

dict_keys(['raw', 'filtered', 'metadata'])

In [9]:
urls = list(data['raw'].keys())

In [24]:
def is_not_sword_or_punc(word):
    return word not in nlp.Defaults.stop_words and not re.match(r"\W+", word) and len(word) > 1

In [25]:
import re

def get_lemmatized_tokens(data, nlp):
    urls = list(data['raw'].keys())
    
    token_lists = []
    
    for url in urls:
        
        intermediate_list = []
        
        for paragraph in data['raw'][url]:
            intermediate_list = intermediate_list + re.split(r"\W+", paragraph)
        
        intermediate_string = " ".join(intermediate_list)
        
        spacy_doc = nlp(intermediate_string)
        
        token_lists.append([token.lemma_ for token in spacy_doc if is_not_sword_or_punc(token.lemma_)])
        
    return token_lists
    
    
        

In [26]:
token_lists = get_lemmatized_tokens(data,nlp)

In [27]:
len(token_lists)

20

In [28]:
from gensim.models import LdaModel

vocab = corpora.Dictionary(token_lists)

corpus = [vocab.doc2bow(doc) for doc in token_lists]

model = LdaModel(corpus, num_topics = 4)

In [29]:
topics = model.top_topics(corpus = corpus, dictionary = vocab)

In [30]:
for i,topic in enumerate(topics):
    print("\n\nTOPIC {}".format(i+1))
    for term in topic[0]:
        print(term[0], vocab[int(term[1])])



TOPIC 1
0.00892274 covid
0.006846373 people
0.0057428735 sabrina
0.005556836 new
0.0054823095 china
0.0043376754 virus
0.00432429 tavernise
0.0041669314 amy
0.0040354496 qin
0.0038937747 like
0.0037636587 long
0.0037039728 pandemic
0.0035963221 year
0.0033968552 2022
0.0033165014 vaccine
0.0032803721 case
0.003170891 think
0.003117806 time
0.0030668995 zero
0.0030541862 day


TOPIC 2
0.008144169 covid
0.006951075 vaccine
0.0061557344 tavernise
0.0053656534 china
0.00505541 people
0.00487408 qin
0.0048184455 time
0.004171755 amy
0.004142185 site
0.0041364045 new
0.0040115616 sabrina
0.0040069656 like
0.0038761094 child
0.00385332 government
0.003575336 virus
0.0034550661 policy
0.0033168977 test
0.0032680149 find
0.0032244485 19
0.0032035809 zero


TOPIC 3
0.018754967 covid
0.0062659415 vaccine
0.0058520725 china
0.005053513 new
0.0049455096 people
0.004650744 19
0.004556988 long
0.0043863747 health
0.0041361237 pandemic
0.004103204 case
0.0040363106 like
0.0038686811 amy
0.0035866143

In [31]:
model[corpus[13]]

[(0, 0.99810725)]

In [34]:
def lda_k_topics(token_lists, k = 4):
    
    vocab = corpora.Dictionary(token_lists)

    corpus = [vocab.doc2bow(doc) for doc in token_lists]

    model = LdaModel(corpus, num_topics = k)
    
    topics = model.top_topics(corpus = corpus, dictionary = vocab)
    
    topic_terms = []
    
    for i,topic in enumerate(topics):
            
        topic_terms.append(", ".join([vocab[int(term[1])] for term in topic[0][:5]])) 
        
        print("\nTOPIC\n {}".format(i+1), topic_terms[-1])
       
    doc_topics = []
    
    for doc in corpus:
        
        best_topics = sorted(model[doc], key=lambda a:a[1], reverse=True)
        
        doc_topics.append(best_topics[0][0])
    
    for i,topic in enumerate(doc_topics):
        print("Topics for Doc {}: ".format(i), topic)

    return topic_terms, doc_topics

In [35]:
_ = lda_k_topics(token_lists, k=4)


TOPIC
 1 covid, vaccine, people, health, new

TOPIC
 2 covid, new, vaccine, people, long

TOPIC
 3 covid, people, china, sabrina, tavernise

TOPIC
 4 covid, china, vaccine, people, 19
Topics for Doc 0:  3
Topics for Doc 1:  1
Topics for Doc 2:  3
Topics for Doc 3:  0
Topics for Doc 4:  1
Topics for Doc 5:  2
Topics for Doc 6:  2
Topics for Doc 7:  2
Topics for Doc 8:  2
Topics for Doc 9:  1
Topics for Doc 10:  2
Topics for Doc 11:  2
Topics for Doc 12:  1
Topics for Doc 13:  2
Topics for Doc 14:  1
Topics for Doc 15:  2
Topics for Doc 16:  2
Topics for Doc 17:  2
Topics for Doc 18:  2
Topics for Doc 19:  2


In [152]:
data['raw'][urls[0]]

['skip to contentskip to site index',
 '',
 'the coronavirus',
 'pandemic',
 'covid-19 updates',
 'coronavirus map and cases',
 'mask questions, answered',
 'long covid’s toll',
 'advertisement',
 '',
 'continue reading the main story',
 '',
 'the ethicist',
 '',
 'can i reveal a colleague’s covid diagnosis?',
 'the magazine’s ethicist columnist on whether to keep a coworker’s illness confidential, outing an unvaccinated colleague — and more.',
 '',
 '',
 '453',
 '',
 '',
 'credit...illustration by tomi um',
 'by kwame anthony appiah',
 'jan. 25, 2022',
 'i work in an office with cubicles six feet apart, but we are all up and about throughout the day. we are supposed to wear a mask every time we step outside our cubicles, but for short interactions that often does not happen. the c.d.c. says the risk is higher if people are within six feet of one another for a total of 15 minutes in a day. that is probably the case for all of us. my closest cubicle mate is unvaccinated and came down wi