In [1]:
# TF-IDF example -- can be run independently

# imports and set up logging
import gensim 
import logging
import glob, os
import nltk
from nltk import word_tokenize
from pprint import pprint
from gensim.corpora import Dictionary
from gensim.models import TfidfModel
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

# directory containing all source texts for training the model 
data_dir="../corpus"
os.chdir(data_dir)

lemmatized_corpus = []   
original_corpus = []     

tfdocuments = []
for filename in glob.glob("*.txt"):
    filedata = open(filename, 'r').read()
    tfdocuments.append(filedata)

print("Number of documents: " + str(len(tfdocuments)))
    
for xdoc in tfdocuments:               
    tokens = word_tokenize(xdoc)     
    lemmas = [nltk.stem.WordNetLemmatizer().lemmatize(token.lower()) for token in tokens]
    lemmatized_corpus.append(lemmas)    
    original_corpus.append(tokens)    

dictionary = Dictionary(lemmatized_corpus)   # Build the dictionary

# Convert to vector corpus
vectors = [dictionary.doc2bow(text) for text in lemmatized_corpus]

# Build TF-IDF model
tfidf = TfidfModel(vectors)

Number of documents: 42


2019-05-06 17:02:46,505 : INFO : adding document #0 to Dictionary(0 unique tokens: [])
2019-05-06 17:02:47,008 : INFO : built Dictionary(23341 unique tokens: ['!', '(', ')', ',', '.']...) from 42 documents (total 1087136 corpus positions)
2019-05-06 17:02:47,412 : INFO : collecting document frequencies
2019-05-06 17:02:47,413 : INFO : PROGRESS: processing document #0
2019-05-06 17:02:47,455 : INFO : calculating IDF weights for 42 documents and 23340 features (131075 matrix non-zeros)


In [2]:
# Get TF-IDF weights
weights = tfidf[vectors[0]]

# Get terms from the dictionary and pair with weights
weights = [(dictionary[pair[0]], pair[1]) for pair in weights]

pprint(sorted(weights, key=lambda weights: weights[1], reverse=True)[1:50])

[('percy', 0.3778585558451681),
 ('douglas', 0.2658940827268987),
 ('harry', 0.2259219125965405),
 ('mortimer', 0.22178654364825082),
 ('falstaff', 0.17250064505975066),
 ('glendower', 0.13964337933408386),
 ('francis', 0.13770568627722468),
 ('scot', 0.13142906290266715),
 ('john', 0.12589087534249327),
 ('wale', 0.12518698752474972),
 ('poins', 0.11063169225788345),
 ('westmoreland', 0.11063169225788345),
 ('jack', 0.100638582605262),
 ('worcester', 0.09857179717700036),
 ('ostler', 0.09140047130714636),
 ('hotspur', 0.08508610647260757),
 ('owen', 0.08508610647260757),
 ('sack', 0.08193583970552355),
 ('buckram', 0.07445034316353162),
 ('peto', 0.07445034316353162),
 ('bardolph', 0.07392884788275027),
 ('welsh', 0.07392884788275027),
 ('kate', 0.06885284313861234),
 ('zounds', 0.06797860810497708),
 ('holmedon', 0.06528605093367597),
 ('sheriff', 0.06453515381709868),
 ('ned', 0.06118074729447937),
 ('i', 0.06044311631758438),
 ('sblood', 0.05947827720976723),
 ('instinct', 0.056334

In [3]:
# Check the answer Part 1: 
# See in which document (=file) the word occurs, and return the raw number of tokens

def word_freq(search_term):
    lemmacounts = []
    for filename in glob.glob("*.txt"):
        filedata = open(filename, 'r').read()
        tokens = word_tokenize(filedata)     
        lemmas = [nltk.stem.WordNetLemmatizer().lemmatize(token.lower()) for token in tokens]
        lemmacounts.append([filename, lemmas.count(search_term)])
    return lemmacounts
        
word_freq("percy")  #douglas #king

[['1H4_h.txt', 46],
 ['1H6_h.txt', 1],
 ['2H4_h.txt', 8],
 ['2H6_h.txt', 0],
 ['3H6_h.txt', 0],
 ['Ado_c.txt', 0],
 ['Ant_t.txt', 0],
 ['AWW_c.txt', 0],
 ['AYL_c.txt', 0],
 ['Cor_t.txt', 0],
 ['Cym_t.txt', 0],
 ['Err_c.txt', 0],
 ['H5_h.txt', 0],
 ['H8_h.txt', 0],
 ['Ham_t.txt', 0],
 ['JC_t.txt', 0],
 ['John_t.txt', 0],
 ['Lear_t.txt', 0],
 ['LLL_c.txt', 0],
 ['Lucrece_x.txt', 0],
 ['M4M_c.txt', 0],
 ['Mac_t.txt', 0],
 ['MerchV_c.txt', 0],
 ['MND_c.txt', 0],
 ['Oth_t.txt', 0],
 ['Pericles_x.txt', 0],
 ['PhxTur_x.txt', 0],
 ['R2_h.txt', 4],
 ['R3_h.txt', 0],
 ['Rom_t.txt', 0],
 ['Shr_c.txt', 0],
 ['Sonnets_x.txt', 0],
 ['TGV_c.txt', 0],
 ['Tim_t.txt', 0],
 ['Tit_t.txt', 0],
 ['Tmp_c.txt', 0],
 ['TN_c.txt', 0],
 ['TNK_x.txt', 0],
 ['Tro_c.txt', 0],
 ['VenusAdonis_x.txt', 0],
 ['Wiv_c.txt', 0],
 ['WT_c.txt', 0]]

In [8]:
# Check the answer Part 2: 
#   Open the document (=file) and print the number of times the word appears,
#   the total number of tokens, and the term frequency relative to the total.

# directory containing all source texts for training the model 
data_dir="../corpus"
os.chdir(data_dir)

def freq_check(filename, search_term):
    filedata = open(filename, 'r').read()
    tokens = word_tokenize(filedata)     
    lemmas = [nltk.stem.WordNetLemmatizer().lemmatize(token.lower()) for token in tokens]
    resultstr = "In file " + filename + ", the term '" + search_term + "' exists " + str(lemmas.count(search_term)) + " times" + \
    " compared to " + str(len(lemmas)) + " total lemmatized tokens for a raw frequency of " + str((lemmas.count(search_term) / len(lemmas)))
    return resultstr

freq_check('1H4_h.txt','percy')  #percy

"In file 1H4_h.txt, the term 'percy' exists 46 times compared to 29288 total lemmatized tokens for a raw frequency of 0.0015706091231903851"

In [10]:
# As a final check, provide the most frequent lemmas for that document (=file)
from pprint import pprint
from nltk import word_tokenize
from nltk.corpus import stopwords
import string

# directory containing all source texts for training the model 
data_dir="../corpus"
filename="1H4_h.txt"
os.chdir(data_dir)

filedata = open(filename, 'r').read()
xtokens = word_tokenize(filedata)     
xlemmas = [nltk.stem.WordNetLemmatizer().lemmatize(token.lower()) for token in xtokens]

# Stopwords and punctuation removal
stop = stopwords.words('english') + list(string.punctuation)

tokens_nsp = [i.lower() for i in xtokens if i.lower() not in stop]
lemmas_nsp = [i.lower() for i in xlemmas if i.lower() not in stop]

##

print ("\nTokens:")

wfreq = []
for w in set(tokens_nsp):
    wfreq.append([w, tokens_nsp.count(w)])
wfreq_sorted = sorted(wfreq, key=lambda wfreq: wfreq[1], reverse=True)
pprint(wfreq_sorted[0:50])

print ("\nLemmas:")

lfreq = []
for w in set(lemmas_nsp):
    lfreq.append([w, lemmas_nsp.count(w)])
lfreq_sorted = sorted(lfreq, key=lambda lfreq: lfreq[1], reverse=True)
pprint(lfreq_sorted[0:50])




Tokens:
[['’', 406],
 ['thou', 253],
 ['shall', 134],
 ['lord', 123],
 ['—', 118],
 ['thy', 111],
 ['thee', 102],
 ['good', 82],
 ['come', 80],
 ['king', 75],
 ['well', 74],
 ['us', 66],
 ['upon', 64],
 ['man', 61],
 ['like', 60],
 ['let', 59],
 ['art', 59],
 ['would', 58],
 ['sir', 51],
 ['hath', 51],
 ['tell', 49],
 ['god', 48],
 ['percy', 46],
 ['time', 45],
 ['yet', 44],
 ['make', 42],
 ['never', 42],
 ['know', 41],
 ['hear', 40],
 ['say', 40],
 ['give', 39],
 ['harry', 39],
 ['go', 37],
 ['hal', 37],
 ['men', 37],
 ['may', 37],
 ['hast', 36],
 ['john', 35],
 ['see', 34],
 ['jack', 34],
 ['think', 33],
 ['faith', 33],
 ['one', 33],
 ['father', 33],
 ['horse', 32],
 ['prince', 32],
 ['true', 31],
 ['must', 31],
 ['life', 31],
 ['day', 31]]

Lemmas:
[['’', 406],
 ['thou', 253],
 ['shall', 134],
 ['lord', 129],
 ['—', 118],
 ['thy', 111],
 ['thee', 102],
 ['come', 92],
 ['good', 82],
 ['king', 76],
 ['well', 74],
 ['u', 66],
 ['wa', 65],
 ['upon', 64],
 ['man', 61],
 ['like', 60],
 [