# `word2vec`

Using gensim to train a word2vec model on Voynichese.

This is the same model that brought you ("King" - "Man") + "Woman" = "Queen"

In [1]:
import re, io, os
import pickle
import numpy as np
import pandas as pd
from collections import defaultdict
import urllib.request
import random
random.seed(13)

In [2]:
from sklearn.decomposition import NMF, PCA
from sklearn.manifold import TSNE

from gensim.models import Word2Vec

import matplotlib.pyplot as plt
import matplotlib.patches as mpatches
import matplotlib
%matplotlib inline
import seaborn as sns

## Helper Functions

In [3]:
def tokenize(data):
    index = defaultdict(str)
    
    with urllib.request.urlopen(data) as file:
        for line in file.read().decode('latin-1').splitlines():
            # pull out takahashi lines
            m = re.match(r'^<(f.*?)\..*;H> +(\S.*)$', line)
            if not m:
                continue

            transcription = m.group(2)
            pg = str(m.group(1))

            # ignore entire line if it has a {&NNN} or {&.} code
            if re.search(r'\{&(\d|\.)+\}', transcription):
                continue

            # remove extraneous chracters ! and %
            s = transcription.replace("!", "").replace("%", "")
            
            # delete all end of line {comments} (between one and three observed)
            # ...with optional line terminator
            # allow 0 occurences to remove end-of-line markers (- or =)
            s = re.sub(r'([-=]?\{[^\{\}]+?\}){0,3}[-=]?\s*$', "", s)

            # delete start of line {comments} (single or double)
            s = re.sub(r'^(\{[^\{\}]+?\}){1,2}', "", s)

            # simplification: tags preceeded by -= are word breaks
            s = re.sub(r'[-=]\{[^\{\}]+?\}', '.', s)

            # these tags are nulls
            # plant is a null in one case where it is just {plant}
            # otherwise (above) it is a word break
            # s = re.sub(r'\{(fold|crease|blot|&\w.?|plant)\}', "", s)
            # simplification: remaining tags in curly brackets
            s = re.sub(r'\{[^\{\}]+?\}', '', s)

            # special case .{\} is still a word break
            s = re.sub(r'\.\{\\\}', ".", s)

            # split on word boundaries
            # exclude null words ('')
            words = [str(w) for w in s.split(".") if w]
            paragraph = ' '.join(words).lstrip()
            
            index[pg] += (paragraph)

    return index


## Load Voynich Data

Load the Pickle files produced by vms_vectorize.py

In [10]:
models_path = "./models"

# load in the pickle files of stored models
with open("{}/tfidf_vectorizer.pk".format(models_path), "rb") as f:
    tfidf_vectorizer = pickle.load(f)
with open("{}/vms_tf.pk".format(models_path), "rb") as f:
    vms_tf = pickle.load(f)
with open("{}/vms_mapping.pk".format(models_path), "rb") as f:
    vms_mapping = pickle.load(f)
with open("{}/tf_vectorizer.pk".format(models_path), "rb") as f:
    tf_vectorizer = pickle.load(f)
with open("{}/vms_tfidf.pk".format(models_path), "rb") as f:
    vms_tfidf = pickle.load(f)

num_topics = 4

index = tokenize("https://raw.githubusercontent.com/rachelbari/voynich-topic-modeling/master/data/text16e6.evt")
documents = [index[key] for key in index.keys()]

print(documents)

#newsgroups = fetch_20newsgroups(remove=('headers', 'footers', 'quotes'))
#docs_raw = newsgroups.data
#print(len(docs_raw))

['fachys ykal ar ataiin shol shory cthres y kor sholdysory ckhar or y kair chtaiin shar are cthar cthar dansyaiir sheky or ykaiin shod cthoary cthes daraiin saooiin oteey oteos roloty cth*ar daiin otaiin or okandair y chear cthaiin cphar cfhaiinydaraishyyshey shody okchoy otchol chocthy oschy dain chor kosdaiin shos cfhol shodydain os teodyoksho kshoy otairin oteol okan shodain sckhey daiinshoy ckhey kodaiin cphy cphodaiils cthey she oldain ddain oiin chol odaiin chodain chdy okain dan cthy koddaiin shckhey ckeor chor shey kol chol chol kor chalsho chol shodan kshy kchy dor chodaiin sho kchomycho tchey chokain sheo pshol dydyd cthy daicthyyto shol she kodshey cphealy dasain dain ckhydsdchar shcthaiin okaiir chey rchy potol cthols dloctashok chor chey dain ckheyotol daiiincpho shaiin shokcheey chol tshodeesy shey pydeey chy ro d**doin chol dain cthal dar shear kaiin dar shey ctharcho*o kaiin shoaiin okol daiin far cthol daiin ctholdarycheey okay oky daiin okchey kokaiin **chol k**chy da

In [5]:
# make a dataframe of mapping material
map_df = pd.DataFrame(vms_mapping, columns=['folio'])
print(map_df)
print(vms_mapping)


     folio
0      f1r
1      f1v
2      f2r
3      f2v
4      f3r
..     ...
220  f114r
221  f114v
222  f115r
223  f115v
224  f116r

[225 rows x 1 columns]
['f1r', 'f1v', 'f2r', 'f2v', 'f3r', 'f3v', 'f4r', 'f4v', 'f5r', 'f5v', 'f6r', 'f6v', 'f7r', 'f7v', 'f8r', 'f8v', 'f9r', 'f9v', 'f10r', 'f10v', 'f11r', 'f11v', 'f13r', 'f13v', 'f14r', 'f14v', 'f15r', 'f15v', 'f16r', 'f16v', 'f17r', 'f17v', 'f18r', 'f18v', 'f19r', 'f19v', 'f20r', 'f20v', 'f21r', 'f21v', 'f22r', 'f22v', 'f23r', 'f23v', 'f24r', 'f24v', 'f25r', 'f25v', 'f26r', 'f26v', 'f27r', 'f27v', 'f28r', 'f28v', 'f29r', 'f29v', 'f30r', 'f30v', 'f31r', 'f31v', 'f32r', 'f32v', 'f33r', 'f33v', 'f34r', 'f34v', 'f35r', 'f35v', 'f36r', 'f36v', 'f37r', 'f37v', 'f38r', 'f38v', 'f39r', 'f39v', 'f40r', 'f40v', 'f41r', 'f41v', 'f42r', 'f42v', 'f43r', 'f43v', 'f44r', 'f44v', 'f45r', 'f45v', 'f46r', 'f46v', 'f47r', 'f47v', 'f48r', 'f48v', 'f49r', 'f49v', 'f50r', 'f50v', 'f51r', 'f51v', 'f52r', 'f52v', 'f53r', 'f53v', 'f54r', 'f54v', 'f55r', 'f55v

## Train the Word2Vec neural net

In [11]:
# turn each document into an array of words, rather than a long string
print(documents[0])
for i in range(len(documents)):
    doc = documents[i]
    doc = doc.split(" ")
    documents[i] = doc
print(documents[0])

fachys ykal ar ataiin shol shory cthres y kor sholdysory ckhar or y kair chtaiin shar are cthar cthar dansyaiir sheky or ykaiin shod cthoary cthes daraiin saooiin oteey oteos roloty cth*ar daiin otaiin or okandair y chear cthaiin cphar cfhaiinydaraishyyshey shody okchoy otchol chocthy oschy dain chor kosdaiin shos cfhol shodydain os teodyoksho kshoy otairin oteol okan shodain sckhey daiinshoy ckhey kodaiin cphy cphodaiils cthey she oldain ddain oiin chol odaiin chodain chdy okain dan cthy koddaiin shckhey ckeor chor shey kol chol chol kor chalsho chol shodan kshy kchy dor chodaiin sho kchomycho tchey chokain sheo pshol dydyd cthy daicthyyto shol she kodshey cphealy dasain dain ckhydsdchar shcthaiin okaiir chey rchy potol cthols dloctashok chor chey dain ckheyotol daiiincpho shaiin shokcheey chol tshodeesy shey pydeey chy ro d**doin chol dain cthal dar shear kaiin dar shey ctharcho*o kaiin shoaiin okol daiin far cthol daiin ctholdarycheey okay oky daiin okchey kokaiin **chol k**chy dald

In [12]:
model = Word2Vec(documents, min_count=1)

## Inter-word relationships

Now use the vector nature of the words to try to understand parts of their meaning

As a reference: top 10 words per topic from 4-topic NMF


Topic #0:
shedy chedy qokeedy qokain qokedy ol qokeey qol qokal shey

Topic #1:
daiin chol chor chy shol cthy sho shor dain cthor

Topic #2:
aiin ar al or okaiin okar chdy otaiin chedy qokaiin

Topic #3:
okeol cheol ol qokeol or cheor daiin chol okeey ckhey



In [27]:
model.wv.most_similar(positive="okeol")

[('dal', 0.9992519021034241),
 ('chol', 0.9992234110832214),
 ('air', 0.9992092251777649),
 ('chedy', 0.9992061257362366),
 ('qoty', 0.9992053508758545),
 ('shedy', 0.9992044568061829),
 ('sheedy', 0.9992016553878784),
 ('shol', 0.9992011785507202),
 ('cthy', 0.9991951584815979),
 ('chey', 0.9991881847381592)]