# First look at first model

In [None]:
from gensim.models import FastText

In [None]:
# Load the FastText model saved at this path.
# The load() method must be called on the FastText class, rather than on an instance
model = FastText.load('models/last-15-years-25-epochs.ftmodel')

In [None]:
# List the 50 vocabulary items closest to 'romantic' in the embedding space.
# NOTE: most_similar returns (word, cosine similarity) pairs, so the number
# printed under the 'dist' label is a similarity (higher = closer), not a
# distance.
sims = model.wv.most_similar(positive=['romantic'], topn=50)
# enumerate(start=1) gives the 1-based rank without a hand-maintained counter.
for i, (word, score) in enumerate(sims, start=1):
    print(f'{i}. {word}, dist = {score}')

In [None]:
# List the 50 vocabulary items closest to the combination of 'self' and
# 'control' (both words contribute positively to the query).
# NOTE: most_similar returns (word, cosine similarity) pairs, so the number
# printed under the 'dist' label is a similarity (higher = closer), not a
# distance.
sims = model.wv.most_similar(positive=['self','control'], topn=50)
# enumerate(start=1) gives the 1-based rank without a hand-maintained counter.
for i, (word, score) in enumerate(sims, start=1):
    print(f'{i}. {word}, dist = {score}')

In [None]:
from nltk.tokenize import wordpunct_tokenize
# Check how a hyphenated compound is tokenised: wordpunct_tokenize separates
# runs of punctuation from runs of word characters, so the hyphen becomes a
# token of its own and 'self' / 'development' are split apart.
# NOTE(review): presumably this is why 'self' and 'control' are queried as
# separate words above - confirm the corpus was tokenised the same way.
wordpunct_tokenize('self-development')

# Hmmm...

It doesn't seem to be throwing up anything very interesting. We could try comparing two corpora: one where 'romanticism' is a key term, and one where it isn't...

In [None]:
from gensim.corpora import Dictionary
from gensim.models import TfidfModel
from utils import JSTORCorpus

In [None]:
# Build the corpus object from the OCR text directory and the metadata
# directory. NOTE(review): JSTORCorpus lives in the project's utils module;
# what it does at construction time is not visible from this notebook.
corpus = JSTORCorpus(data_dir='data/ocr', meta_dir='data/metadata')

In [None]:
# Two-way lookup between a document's position in the corpus and its entry
# in corpus.corpus_meta (iterating corpus_meta yields one item per document).
# enumerate() supplies the running position; the previous hand-rolled counter
# was never incremented, so every document overwrote key 0 and both maps
# ended up with a single entry.
idx_doc = dict(enumerate(corpus.corpus_meta))
# Inverse map: document -> position.
doc_idx = {doc: idx for idx, doc in idx_doc.items()}

In [None]:
# Build the token <-> id dictionary from the corpus and save it to disk.
# NOTE(review): iter_lower is passed uncalled here, but the tf-idf section
# further down calls it as corpus.iter_lower() - confirm which form matches
# the current JSTORCorpus API.
dct = Dictionary(corpus.iter_lower)
dct.save('models/tfidf-lower-dct')

In [None]:
# Convert each document to a bag-of-words vector lazily, fit a tf-idf model on
# the stream, and save the fitted model.
# NOTE: this rebinds `model`, replacing the FastText model loaded earlier in
# the notebook.
bow_corpus = (dct.doc2bow(text) for text in corpus.iter_lower) # Single-use generator
model = TfidfModel(bow_corpus)
model.save('models/tfidf-lower-model')

In [None]:
# Look up the id of a token in the dictionary.
# NOTE(review): the dictionary was built from corpus.iter_lower, so a
# capitalised key is presumably absent and this raises KeyError - verify, and
# query 'enlightenment' instead if so.
dct.token2id['Enlightenment']

In [None]:
# Scratch check of str.lower(): 'FOO' -> 'foo'.
'FOO'.lower()

## Have a look at topic model

In [2]:
from gensim.models import LdaModel
# Load the saved LDA topic model. NOTE: `model` is reused here, so from this
# point on it refers to the LDA model, not the FastText or tf-idf models above.
model = LdaModel.load('models/corpus-lower-lda')

In [3]:
from gensim.corpora import Dictionary
# Load the saved token <-> id dictionary used by the cells below.
# NOTE(review): presumably the same dictionary the LDA model was trained
# with - confirm.
corp_dict = Dictionary.load('models/corpus-lower-dct')

In [4]:
from utils import JSTORCorpus
# Restore the previously saved corpus object instead of rebuilding it from the
# raw data directories. NOTE(review): the '.p' suffix suggests a pickle file -
# only load it from a trusted source.
corpus = JSTORCorpus.load('data/whole-corpus.p')

Corpus loaded from data/whole-corpus.p


In [46]:
def get_word_topics(model, dct, word, min_prob=0.001):
    """Return the topics most relevant to ``word``.

    Parameters
    ----------
    model
        Object exposing ``get_term_topics(word_id, minimum_probability=...)``,
        e.g. a gensim ``LdaModel``.
    dct
        Object whose ``token2id`` mapping translates tokens to integer ids.
    word : str
        Token to look up; it must be a key of ``dct.token2id``.
    min_prob : float, optional
        Passed through as ``minimum_probability``; topics scoring below it
        are discarded by the model.

    Returns
    -------
    Whatever ``model.get_term_topics`` returns - for ``LdaModel`` a list of
    ``(topic_id, probability)`` pairs.

    Raises
    ------
    KeyError
        If ``word`` is not present in ``dct.token2id``.
    """
    return model.get_term_topics(dct.token2id[word], minimum_probability=min_prob)

In [47]:
# Topics in which 'self' has probability of at least the default 0.001.
# NOTE(review): the variable keeps a 'rom_' name although the word queried
# here is 'self', not 'romantic'.
rom_tpx = get_word_topics(model, corp_dict, 'self')
print(rom_tpx)

[(4, 0.0010069308), (17, 0.0010391251), (21, 0.0014041486), (25, 0.0016450812), (28, 0.0023311207), (30, 0.0017865637), (48, 0.0014180879), (52, 0.0037318799), (63, 0.0030525278), (67, 0.029673528), (101, 0.0014650323), (118, 0.0056282943), (134, 0.002270054), (147, 0.00525864)]


In [7]:
# Invert the dictionary's token -> id mapping so that the term ids returned by
# the topic model can be turned back into readable tokens.
id2token = {idx:token for token,idx in corp_dict.token2id.items()}

In [49]:
# Show up to 150 of the highest-weighted terms of topic 147 (one of the topics
# reported for 'self' above) as (token, probability) pairs.
[(id2token[idx],prob) for idx,prob in model.get_topic_terms(147, topn=150)]

[(',', 0.11318597),
 ('.', 0.07170001),
 ('man', 0.012612466),
 ('world', 0.012299876),
 ('nature', 0.010386563),
 ('human', 0.009175223),
 ('p', 0.008870589),
 ('life', 0.008419744),
 ('thought', 0.0059845215),
 ('one', 0.0056345034),
 ('philosophy', 0.005592574),
 (';', 0.005579338),
 ('self', 0.005257316),
 ('mind', 0.005059469),
 ('truth', 0.0046069096),
 ('must', 0.0042758794),
 ('reality', 0.004164495),
 ('idea', 0.0037454562),
 ('ideas', 0.0037338466),
 ('-', 0.0034659472),
 ('reason', 0.0034314634),
 ('(', 0.0033529557),
 ('spirit', 0.0033460583),
 ('new', 0.0032227344),
 ('individual', 0.0031694076),
 ('things', 0.0031117226),
 ('time', 0.0030565516),
 ('knowledge', 0.003028696),
 ('god', 0.0028574807),
 ('moral', 0.0028538518),
 ('consciousness', 0.0028404212),
 ('soul', 0.0027736663),
 ('natural', 0.0026886442),
 ('existence', 0.0026755948),
 ('view', 0.0026393402),
 ('philosophical', 0.0026009579),
 ('us', 0.0025245624),
 ('spiritual', 0.0024910187),
 ('men', 0.0024868092),

In [17]:
# Print the documentation of get_term_topics to check its parameters and
# return format.
help(model.get_term_topics)

Help on method get_term_topics in module gensim.models.ldamodel:

get_term_topics(word_id, minimum_probability=None) method of gensim.models.ldamodel.LdaModel instance
    Get the most relevant topics to the given word.
    
    Parameters
    ----------
    word_id : int
        The word for which the topic distribution will be computed.
    minimum_probability : float, optional
        Topics with an assigned probability below this threshold will be discarded.
    
    Returns
    -------
    list of (int, float)
        The relevant topics represented as pairs of their ID and their assigned probability, sorted
        by relevance to the given word.



## What about the tf-idf model

In [50]:
from gensim.models import TfidfModel
# Load the saved tf-idf model under its own name, so the LDA model bound to
# `model` stays available.
tfidf = TfidfModel.load('models/corpus-lower-tfidf')

In [51]:
from tqdm.notebook import tqdm
# Score every document by the tf-idf weight of the word 'romantic'.
# Stream the corpus as bag-of-words vectors (single-use generator):
bow_corpus = (corp_dict.doc2bow(text) for text in corpus.iter_lower())
romantic_scores = []
# Take the term id from the dictionary rather than hard-coding it, so the cell
# keeps working if the dictionary is rebuilt and the ids change.
rom_idx = corp_dict.token2id['romantic']
for text in tqdm(bow_corpus, total=len(corpus)):
    # text is a list of (word_idx, freq) tuples.
    # Some documents may not have the word 'romantic', in which case give a
    # score of zero without running the model on them.
    if not any(word_idx == rom_idx for word_idx, _ in text):
        romantic_scores.append(0)
        continue
    # tfidf[text] is a list of (word_idx, tfidf_score) tuples, but the model
    # leaves out terms whose weight is effectively zero, so positions in its
    # output do not necessarily line up with positions in the BoW vector.
    # Look the score up by term id instead of by position; a term that was
    # dropped scores zero.
    doc_scores = dict(tfidf[text])
    romantic_scores.append(doc_scores.get(rom_idx, 0))

HBox(children=(IntProgress(value=0, max=44802), HTML(value='')))




In [None]:
import numpy as np
from matplotlib import pyplot as plt
%matplotlib inline
# Plot the distribution of per-document tf-idf scores for 'romantic' in 25
# bins. Documents that do not contain the word were given a score of 0 above,
# so they all fall in the lowest bin.
rom_score_arr = np.array(romantic_scores)
plt.hist(rom_score_arr, bins=25)
plt.show()

In [None]:
import pandas as pd
# One row per metadata record in corpus.corpus_meta, plus that document's
# 'romantic' tf-idf score.
# NOTE(review): this assumes corpus_meta.values() and corpus.iter_lower()
# visit the documents in the same order, so that row i matches score i -
# confirm against JSTORCorpus.
df = pd.DataFrame.from_records((val for val in corpus.corpus_meta.values()))
df['rom_score'] = rom_score_arr

In [None]:
# Numeric version of the histogram: bin counts and bin edges, using numpy's
# default of 10 equal-width bins.
np.histogram(rom_score_arr)

In [None]:
# Inspect a random sample of 30 research articles whose 'romantic' score is
# below 0.016, i.e. at the low end of the distribution.
# NOTE(review): the 0.016 cut-off was presumably read off the histogram bin
# edges above - confirm before reusing it.
df[(df['rom_score'] < 0.016) & (df['type'] == 'research-article')].sample(30)

We can use this tf-idf model to identify a subset of the corpus where the articles *really* talk about Romanticism. It would be better to search for more terms than 'romantic', however, as I notice that many documents at the lower end of the distribution are actually about Romantic authors like Wordsworth, Byron, Blake and so on.

Perhaps we could actually analyse the correlation of 'romantic' and its cognates with other words. In an article where 'romantic' is distinctive, what other words are also distinctive?