# `pyLDAvis.sklearn`

Modified from the pyLDAvis scikit-learn example notebook.

In [1]:
import pyLDAvis
import pyLDAvis.sklearn
pyLDAvis.enable_notebook()
import re, io, os
import pickle
import numpy as np
import pandas as pd
from collections import defaultdict
import urllib.request

ModuleNotFoundError: No module named 'pyLDAvis'

In [None]:
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.decomposition import LatentDirichletAllocation

## Load Voynich Data

Load the pickle files produced by `vms_vectorize.py`.

In [None]:
def tokenize(data):
    """Build a per-folio word index from an EVT-style transcription file.

    Only lines tagged with transcriber code ``H`` (the Takahashi
    transcription) are used.  Each such line is stripped of its inline
    ``{...}`` markup, split into words on ``.``, and appended to the text
    of the folio it belongs to.

    Args:
        data: URL of the transcription file, passed straight to
            ``urllib.request.urlopen``.  The content is decoded as latin-1.

    Returns:
        A ``defaultdict(str)`` mapping each folio id (the text between
        ``<`` and the first ``.`` of the locator, e.g. ``"f1r"``) to the
        space-separated words of all its lines, in file order.
    """
    # Compile every pattern once instead of on each loop iteration.
    line_re = re.compile(r'^<(f.*?)\..*;H> +(\S.*)$')
    skip_re = re.compile(r'\{&(\d|\.)+\}')
    trailing_re = re.compile(r'([-=]?\{[^\{\}]+?\}){0,3}[-=]?\s*$')
    leading_re = re.compile(r'^(\{[^\{\}]+?\}){1,2}')
    break_tag_re = re.compile(r'[-=]\{[^\{\}]+?\}')
    backslash_break_re = re.compile(r'\.\{\\\}')
    tag_re = re.compile(r'\{[^\{\}]+?\}')

    with urllib.request.urlopen(data) as file:
        text = file.read().decode('latin-1')

    # Collect one text fragment per transcription line and join them with a
    # space at the end.  Plain string concatenation would glue the last word
    # of a line onto the first word of the next line of the same folio.
    fragments = defaultdict(list)

    for line in text.splitlines():
        # pull out takahashi lines
        m = line_re.match(line)
        if not m:
            continue

        pg = m.group(1)
        transcription = m.group(2)

        # ignore entire line if it has a {&NNN} or {&.} code
        if skip_re.search(transcription):
            continue

        # remove extraneous characters ! and %
        s = transcription.replace("!", "").replace("%", "")

        # delete all end of line {comments} (between one and three observed)
        # ...with optional line terminator
        # allow 0 occurrences to remove end-of-line markers (- or =)
        s = trailing_re.sub("", s)

        # delete start of line {comments} (single or double)
        s = leading_re.sub("", s)

        # simplification: tags preceded by - or = are word breaks
        s = break_tag_re.sub('.', s)

        # special case .{\} is still a word break; handled before the
        # generic tag removal below, which would otherwise consume the tag
        # first (the resulting text is the same either way)
        s = backslash_break_re.sub(".", s)

        # simplification: all remaining tags in curly brackets are nulls
        s = tag_re.sub('', s)

        # split on word boundaries, excluding null words ('')
        words = [w for w in s.split(".") if w]
        paragraph = ' '.join(words).lstrip()

        # Register the folio even when the line turned out to be empty, so
        # the set of keys does not depend on which lines carried words.
        page_fragments = fragments[pg]
        if paragraph:
            page_fragments.append(paragraph)

    index = defaultdict(str)
    for pg, parts in fragments.items():
        index[pg] = ' '.join(parts)

    return index


In [None]:
models_path = "./models"
num_topics = 4


def _unpickle(path_template):
    """Load the pickle at ``path_template`` formatted with ``models_path``."""
    # NOTE: pickle can execute arbitrary code while loading; only point
    # this at files you produced yourself.
    with open(path_template.format(models_path), "rb") as handle:
        return pickle.load(handle)


# Restore the stored model artifacts from the models directory.
tfidf_vectorizer = _unpickle("{}/tfidf_vectorizer.pk")
vms_tf = _unpickle("{}/vms_tf.pk")
vms_mapping = _unpickle("{}/vms_mapping.pk")
tf_vectorizer = _unpickle("{}/tf_vectorizer.pk")
vms_tfidf = _unpickle("{}/vms_tfidf.pk")

# Tokenize the transcription and keep one document string per index entry,
# in the order the entries were created.
index = tokenize("https://raw.githubusercontent.com/rachelbari/voynich-topic-modeling/master/data/text16e6.evt")
documents = list(index.values())

# Data loading left over from the 20 newsgroups example, kept disabled:
#newsgroups = fetch_20newsgroups(remove=('headers', 'footers', 'quotes'))
#docs_raw = newsgroups.data
#print(len(docs_raw))

In [None]:
# Wrap the loaded mapping in a single-column ('folio') dataframe, then
# print the dataframe followed by the raw mapping.
map_df = pd.DataFrame(data=vms_mapping, columns=['folio'])
for shown in (map_df, vms_mapping):
    print(shown)


## Fit Latent Dirichlet Allocation models

Finally, the LDA models are fitted.

In [None]:
# Fit one LDA model per document-term matrix, both with num_topics topics,
# a fixed random_state for repeatable runs, and 8 iterations.
#
# scikit-learn renamed the ``n_topics`` argument to ``n_components``
# (deprecated in 0.19 and later removed), so ``n_components`` is used here;
# passing ``n_topics`` raises a TypeError on current releases.
# Online learning (learning_method="online", learning_offset=20.) is left
# disabled, as in the original cell.

# for TF
lda_tf = LatentDirichletAllocation(n_components=num_topics, random_state=0, max_iter=8)
lda_tf.fit(vms_tf)
# for TFIDF
lda_tfidf = LatentDirichletAllocation(n_components=num_topics, random_state=0, max_iter=8)
lda_tfidf.fit(vms_tfidf)

## Visualizing the models with pyLDAvis

In [None]:
# Prepare the pyLDAvis view of the LDA model fitted on vms_tf, passing the
# matching tf_vectorizer; the call is the cell's final expression.
pyLDAvis.sklearn.prepare(lda_tf, vms_tf, tf_vectorizer)

In [None]:
# Same as for the TF model, but for the LDA model fitted on vms_tfidf and
# its tfidf_vectorizer.
pyLDAvis.sklearn.prepare(lda_tfidf, vms_tfidf, tfidf_vectorizer)

### Using different MDS functions

With `sklearn` installed, other MDS functions, such as MMDS and t-SNE, can be used for plotting if the default PCoA is not satisfactory.

In [None]:
# TF model again, selecting the 'mmds' MDS function instead of the default.
pyLDAvis.sklearn.prepare(lda_tf, vms_tf, tf_vectorizer, mds='mmds')

In [None]:
# TF model again, selecting the 'tsne' MDS function instead of the default.
pyLDAvis.sklearn.prepare(lda_tf, vms_tf, tf_vectorizer, mds='tsne')