## Using spaCy lemmatization - results may be more human-readable

In [None]:
import json
import string
import spacy
import math
import os

def sorted_frequencies(doc):
    """Count how often each word occurs in a document, most frequent first.

    Args:
        doc: Iterable of hashable tokens (e.g. a list of lemma strings).

    Returns:
        A list of ``{'word': ..., 'count': ...}`` dicts sorted by descending
        count. Words with equal counts keep the order in which they first
        appeared in ``doc``. An empty ``doc`` yields an empty list.
    """
    # Count with a dict so each token costs one O(1) lookup instead of a
    # linear scan over the words seen so far. Dicts keep insertion order,
    # which preserves first-appearance order for the tie-breaking below.
    counts = {}
    for word in doc:
        counts[word] = counts.get(word, 0) + 1

    frequencies = [{'word': word, 'count': count} for word, count in counts.items()]
    # list.sort is stable, so words with equal counts stay in first-seen order.
    frequencies.sort(key=lambda entry: -entry['count'])
    return frequencies

def term_frequency(term, sorted_frequencies):
    """Return the count of ``term`` normalized by the most frequent word's count.

    Args:
        term: Word to look up.
        sorted_frequencies: List of ``{'word': ..., 'count': ...}`` dicts
            ordered by descending count, so index 0 holds the maximum count.

    Returns:
        ``count(term) / max_count`` as a float, or ``0.0`` when ``term`` does
        not appear in ``sorted_frequencies`` (which includes the empty list).
    """
    entry = next((e for e in sorted_frequencies if e['word'] == term), None)
    if entry is None:
        # A term that never occurs has frequency zero; without this guard the
        # lookup below would fail on ``None['count']``.
        return 0.0
    # The list is sorted by descending count, so the first entry is the max.
    max_freq = sorted_frequencies[0]['count']
    return entry['count'] / max_freq

def inverse_document_frequency(term, corpus):
    """Return log2(N / n), with N documents in total and n containing ``term``.

    Args:
        term: Word to look for.
        corpus: Sized collection of documents; each document must support
            the ``in`` operator.

    Returns:
        The base-2 logarithm of ``len(corpus)`` divided by the number of
        documents that contain ``term``.

    Raises:
        ZeroDivisionError: If no document in ``corpus`` contains ``term``.
    """
    containing = sum(1 for document in corpus if term in document)
    ratio = len(corpus) / containing
    return math.log2(ratio)

def tfidf(term, corpus, sorted_frequencies, word_document_frequencies):
    """Return the tf-idf weight of ``term`` for one document.

    Args:
        term: Word to score.
        corpus: Sized collection of documents; only its length is used.
        sorted_frequencies: Per-document frequency list passed through to
            ``term_frequency``.
        word_document_frequencies: Mapping from word to the number of
            documents containing it; must have an entry for ``term``.

    Returns:
        ``term_frequency(term, sorted_frequencies)`` multiplied by
        ``log2(len(corpus) / word_document_frequencies[term])``.
    """
    # Use the precomputed document count rather than rescanning the corpus.
    docs_with_term = word_document_frequencies[term]
    idf = math.log2(len(corpus) / docs_with_term)
    tf = term_frequency(term, sorted_frequencies)
    return tf * idf

##############################
# THIS IS THE ONLY NEW THING #
##############################
def prep_doc_spacy(filepath, nlp_spacy):
    """Load a parsed paper and reduce its body text to a list of lemmas.

    Args:
        filepath: Path to a JSON file holding a ``body_text`` list whose
            items each carry a ``text`` string.
        nlp_spacy: Callable (e.g. a loaded spaCy pipeline) that maps a string
            to an iterable of tokens exposing ``is_stop`` and ``lemma_``.

    Returns:
        The ``lemma_`` of every token that is not a stopword, in the order
        the tokens are produced by ``nlp_spacy``.
    """
    # Read with an explicit encoding so the result does not depend on the
    # platform's default, and close the file before the slow NLP work starts.
    with open(filepath, encoding='utf-8') as file:
        doc = json.load(file)

    # Lowercase every body chunk and join them into one string.
    text = ' '.join(chunk['text'].lower() for chunk in doc['body_text'])
    # Delete ASCII punctuation in a single pass. Characters are removed, not
    # replaced by spaces, so hyphenated words are fused ("a-b" -> "ab").
    text = text.translate(str.maketrans('', '', string.punctuation))

    # Tokenize, drop stopwords, and keep each remaining token's lemma.
    doc_spacy = nlp_spacy(text)
    return [word.lemma_ for word in doc_spacy if not word.is_stop]
##############################

##############################

def gen_corpus_frequencies(corpus):
    """Count, for every word in the corpus, how many documents contain it.

    Args:
        corpus: Iterable of documents, each an iterable of hashable words.

    Returns:
        Dict mapping each distinct word to the number of documents in which
        it appears at least once. An empty corpus yields an empty dict.
    """
    wordcounts = {}
    # Iterate the ``corpus`` argument itself (not a module-level variable) so
    # the function works for whatever collection the caller passes in.
    for doc in corpus:
        # set() ensures a word is counted at most once per document.
        for word in set(doc):
            wordcounts[word] = wordcounts.get(word, 0) + 1
    return wordcounts

# Cap on how many parsed papers are loaded for this experiment.
limit = 10
i = 0
filepaths = []
dirpath = '../input/CORD-19-research-challenge/document_parses/pdf_json'
# Keep the first `limit` directory entries, in the order os.listdir yields them.
for i, path in enumerate(os.listdir(dirpath), start=1):
    if i > limit:
        break
    filepaths.append(os.path.join(dirpath, path))

#print(filepaths)
nlp_spacy = spacy.load('en_core_web_sm')
docs = [prep_doc_spacy(path, nlp_spacy) for path in filepaths]

# Per-word document counts across the whole loaded corpus.
word_document_frequencies = gen_corpus_frequencies(docs)

# Score the words of the first document only.
doc = docs[0]
doc_stats = sorted_frequencies(doc)

# Attach a tf-idf score to every frequency entry.
# Open question: should this step live inside the frequency helper instead?
for entry in doc_stats:
    entry['tfidf'] = tfidf(entry['word'], docs, doc_stats, word_document_frequencies)

# Highest tf-idf first; ties keep their frequency-sorted order.
doc_stats.sort(key=lambda entry: entry['tfidf'], reverse=True)

# Show the ten highest-scoring words.
for entry in doc_stats[:10]:
    print(entry['word'], entry['tfidf'])

Output from cell above:  
brazilian 3.321928094887362  
pardo 2.9264604645436285  
north 2.6100863602686415  
ethnicity 2.3728057820624016  
centralsouth 1.977338151718668  
ethnic 1.740057573512428  
region 1.654252946824958  
brazil 1.5818705213749342  
rio 1.2654964170999474  
likely 1.1864028910312008  