# `pyLDAvis.sklearn`

Modified from the pyLDAvis scikit-learn example notebook.

In [1]:
import pyLDAvis
import pyLDAvis.sklearn
pyLDAvis.enable_notebook()
import re, io, os
import pickle
import numpy as np
import pandas as pd
from collections import defaultdict
import urllib.request

In [2]:
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.decomposition import LatentDirichletAllocation

## Load Voynich Data

Load the pickle files produced by `vms_vectorize.py`, and tokenize the Takahashi transcription into one document per folio.

In [3]:
def tokenize(data):
    """Build one space-separated word string per page from a transcription file.

    The resource at ``data`` is fetched and decoded as latin-1. Only lines
    whose locus tag ends in ``;H>`` (the Takahashi transcription) are kept.
    Inline ``{...}`` markup and the filler characters ``!`` and ``%`` are
    stripped, the remainder is split on ``.`` and the resulting words are
    accumulated under the page id, i.e. the text between ``<`` and the first
    ``.`` of the locus tag.

    Parameters
    ----------
    data : str
        URL (or any object accepted by ``urllib.request.urlopen``) of the
        transcription file.

    Returns
    -------
    collections.defaultdict
        Maps page id -> the words of every retained line of that page, joined
        by single spaces. Keys appear in first-seen order; looking up a
        missing key yields ``''``.
    """
    index = defaultdict(str)

    with urllib.request.urlopen(data) as file:
        for line in file.read().decode('latin-1').splitlines():
            # pull out takahashi lines
            m = re.match(r'^<(f.*?)\..*;H> +(\S.*)$', line)
            if not m:
                continue

            transcription = m.group(2)
            pg = str(m.group(1))

            # ignore entire line if it has a {&NNN} or {&.} code
            if re.search(r'\{&(\d|\.)+\}', transcription):
                continue

            # remove extraneous characters ! and %
            s = transcription.replace("!", "").replace("%", "")

            # delete all end-of-line {comments} (between one and three observed)
            # ...with optional line terminator;
            # allowing 0 occurrences also removes bare end-of-line markers (- or =)
            s = re.sub(r'([-=]?\{[^\{\}]+?\}){0,3}[-=]?\s*$', "", s)

            # delete start-of-line {comments} (single or double)
            s = re.sub(r'^(\{[^\{\}]+?\}){1,2}', "", s)

            # simplification: tags preceded by - or = are word breaks
            s = re.sub(r'[-=]\{[^\{\}]+?\}', '.', s)

            # simplification: every remaining tag in curly brackets is a null
            s = re.sub(r'\{[^\{\}]+?\}', '', s)

            # special case .{\} is still a word break
            s = re.sub(r'\.\{\\\}', ".", s)

            # split on word boundaries, excluding null words ('')
            words = [str(w) for w in s.split(".") if w]
            paragraph = ' '.join(words).lstrip()

            # Reading index[pg] registers the page even when this line
            # contributed no words.
            existing = index[pg]
            if paragraph:
                # Separate consecutive lines with a space; plain concatenation
                # would fuse the last word of one line with the first word of
                # the next.
                index[pg] = (existing + ' ' + paragraph) if existing else paragraph

    return index


In [4]:
models_path = "./models"


def _load_pickle(filename):
    """Unpickle and return the object stored in ``models_path/filename``."""
    # NOTE(security): pickle.load can execute arbitrary code while loading;
    # only use this on files you produced yourself.
    with open("{}/{}".format(models_path, filename), "rb") as f:
        return pickle.load(f)


# load in the pickle files of stored models
tfidf_vectorizer = _load_pickle("tfidf_vectorizer.pk")
vms_tf = _load_pickle("vms_tf.pk")
vms_mapping = _load_pickle("vms_mapping.pk")
tf_vectorizer = _load_pickle("tf_vectorizer.pk")
vms_tfidf = _load_pickle("vms_tfidf.pk")

num_topics = 6

# Tokenize the transcription fetched from GitHub: one text string per page,
# in the order the pages first appear in the file.
index = tokenize("https://raw.githubusercontent.com/rachelbari/voynich-topic-modeling/master/data/text16e6.evt")
documents = list(index.values())



In [5]:
# Wrap the loaded folio mapping in a one-column DataFrame ("folio"), then echo
# the frame followed by the raw mapping as plain text for inspection.
map_df = pd.DataFrame(data=vms_mapping, columns=['folio'])
print(map_df, vms_mapping, sep='\n')


     folio
0      f1r
1      f1v
2      f2r
3      f2v
4      f3r
..     ...
220  f114r
221  f114v
222  f115r
223  f115v
224  f116r

[225 rows x 1 columns]
['f1r', 'f1v', 'f2r', 'f2v', 'f3r', 'f3v', 'f4r', 'f4v', 'f5r', 'f5v', 'f6r', 'f6v', 'f7r', 'f7v', 'f8r', 'f8v', 'f9r', 'f9v', 'f10r', 'f10v', 'f11r', 'f11v', 'f13r', 'f13v', 'f14r', 'f14v', 'f15r', 'f15v', 'f16r', 'f16v', 'f17r', 'f17v', 'f18r', 'f18v', 'f19r', 'f19v', 'f20r', 'f20v', 'f21r', 'f21v', 'f22r', 'f22v', 'f23r', 'f23v', 'f24r', 'f24v', 'f25r', 'f25v', 'f26r', 'f26v', 'f27r', 'f27v', 'f28r', 'f28v', 'f29r', 'f29v', 'f30r', 'f30v', 'f31r', 'f31v', 'f32r', 'f32v', 'f33r', 'f33v', 'f34r', 'f34v', 'f35r', 'f35v', 'f36r', 'f36v', 'f37r', 'f37v', 'f38r', 'f38v', 'f39r', 'f39v', 'f40r', 'f40v', 'f41r', 'f41v', 'f42r', 'f42v', 'f43r', 'f43v', 'f44r', 'f44v', 'f45r', 'f45v', 'f46r', 'f46v', 'f47r', 'f47v', 'f48r', 'f48v', 'f49r', 'f49v', 'f50r', 'f50v', 'f51r', 'f51v', 'f52r', 'f52v', 'f53r', 'f53v', 'f54r', 'f54v', 'f55r', 'f55v

## Fit Latent Dirichlet Allocation models

Finally, the LDA model is fitted on the term-frequency (TF) matrix; the TF-IDF variant is left commented out.

In [6]:
# Term-frequency (TF) model: `num_topics` components, fixed seed, 8 iterations.
# Options tried earlier and left disabled:
#   learning_method="online", learning_offset=20.
lda_tf = LatentDirichletAllocation(
    n_components=num_topics,
    max_iter=8,
    random_state=0,
)
# Kept as the last expression so the fitted estimator is echoed by the cell.
lda_tf.fit(vms_tf)
# TF-IDF model (disabled):
#lda_tfidf = LatentDirichletAllocation(n_components=num_topics, random_state=0, max_iter=8)#, learning_method="online", learning_offset=20.)
#lda_tfidf.fit(vms_tfidf)

LatentDirichletAllocation(batch_size=128, doc_topic_prior=None,
                          evaluate_every=-1, learning_decay=0.7,
                          learning_method='batch', learning_offset=10.0,
                          max_doc_update_iter=100, max_iter=8,
                          mean_change_tol=0.001, n_components=6, n_jobs=None,
                          perp_tol=0.1, random_state=0, topic_word_prior=None,
                          total_samples=1000000.0, verbose=0)

## Visualizing the models with pyLDAvis

In [7]:
# Prepare the pyLDAvis view of the TF model from the fitted LDA, the TF matrix
# and its vectorizer. As the cell's last expression, the result is what the
# notebook displays.
pyLDAvis.sklearn.prepare(lda_tf, vms_tf, tf_vectorizer)

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  return pd.concat([default_term_info] + list(topic_dfs))


In [8]:
#pyLDAvis.sklearn.prepare(lda_tfidf, vms_tfidf, tfidf_vectorizer)

### Using different MDS functions

With `sklearn` installed, other MDS functions, such as MMDS and TSNE, can be used for plotting if the default PCoA is not satisfactory.

In [9]:
# Same TF-model view as before, but with the MDS function switched from the
# default to 'mmds'.
pyLDAvis.sklearn.prepare(lda_tf, vms_tf, tf_vectorizer, mds='mmds')

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  return pd.concat([default_term_info] + list(topic_dfs))


In [10]:
# Same TF-model view as before, but with the MDS function switched from the
# default to 'tsne'.
pyLDAvis.sklearn.prepare(lda_tf, vms_tf, tf_vectorizer, mds='tsne')

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  return pd.concat([default_term_info] + list(topic_dfs))
