In [1]:
# Setup cell for the word2vec demo: imports, NLTK resource downloads, and TensorBoard wiring

# Silence Future/User/Deprecation warnings so the cell outputs stay readable
import warnings
warnings.filterwarnings("ignore", category=FutureWarning)
warnings.filterwarnings("ignore", category=UserWarning)
warnings.filterwarnings("ignore", category=DeprecationWarning)

from string import punctuation
import numpy as np

# Fetch the NLTK resources: the Gutenberg corpus (source of the Bible text),
# the pruned word2vec sample (pre-trained vectors loaded later), and punkt
# (presumably needed by gutenberg.sents() for sentence splitting -- confirm).
import nltk
from nltk.corpus import gutenberg
from nltk.data import find
nltk.download('gutenberg')
nltk.download('punkt')
nltk.download('word2vec_sample')

import gensim
from gensim.models import word2vec

import tensorflow as tf
import tensorboard as tb
from tensorboard.plugins import projector
from torch.utils.tensorboard import SummaryWriter
# Point tf.io.gfile at TensorBoard's TensorFlow stub.
# NOTE(review): presumably the usual workaround that lets SummaryWriter.add_embedding
# run when TensorFlow is installed next to PyTorch -- confirm it is still required.
tf.io.gfile = tb.compat.tensorflow_stub.io.gfile
# Load the TensorBoard notebook extension (provides the %tensorboard magic)
%load_ext tensorboard

[nltk_data] Downloading package gutenberg to /root/nltk_data...
[nltk_data]   Unzipping corpora/gutenberg.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package word2vec_sample to /root/nltk_data...
[nltk_data]   Unzipping models/word2vec_sample.zip.


In [2]:
# List the file ids of the texts available in the NLTK Gutenberg corpus
gutenberg.fileids()

['austen-emma.txt',
 'austen-persuasion.txt',
 'austen-sense.txt',
 'bible-kjv.txt',
 'blake-poems.txt',
 'bryant-stories.txt',
 'burgess-busterbrown.txt',
 'carroll-alice.txt',
 'chesterton-ball.txt',
 'chesterton-brown.txt',
 'chesterton-thursday.txt',
 'edgeworth-parents.txt',
 'melville-moby_dick.txt',
 'milton-paradise.txt',
 'shakespeare-caesar.txt',
 'shakespeare-hamlet.txt',
 'shakespeare-macbeth.txt',
 'whitman-leaves.txt']

In [3]:
# Load the King James Bible from the Gutenberg corpus as tokenized sentences
bible_kjv_sents = gutenberg.sents('bible-kjv.txt')

# Peek at the first ten sentences, then report how many there are in total
for sent_idx in range(10):
    print(bible_kjv_sents[sent_idx])
print("\n No. of sentences: ", len(bible_kjv_sents))

['[', 'The', 'King', 'James', 'Bible', ']']
['The', 'Old', 'Testament', 'of', 'the', 'King', 'James', 'Bible']
['The', 'First', 'Book', 'of', 'Moses', ':', 'Called', 'Genesis']
['1', ':', '1', 'In', 'the', 'beginning', 'God', 'created', 'the', 'heaven', 'and', 'the', 'earth', '.']
['1', ':', '2', 'And', 'the', 'earth', 'was', 'without', 'form', ',', 'and', 'void', ';', 'and', 'darkness', 'was', 'upon', 'the', 'face', 'of', 'the', 'deep', '.']
['And', 'the', 'Spirit', 'of', 'God', 'moved', 'upon', 'the', 'face', 'of', 'the', 'waters', '.']
['1', ':', '3', 'And', 'God', 'said', ',', 'Let', 'there', 'be', 'light', ':', 'and', 'there', 'was', 'light', '.']
['1', ':', '4', 'And', 'God', 'saw', 'the', 'light', ',', 'that', 'it', 'was', 'good', ':', 'and', 'God', 'divided', 'the', 'light', 'from', 'the', 'darkness', '.']
['1', ':', '5', 'And', 'God', 'called', 'the', 'light', 'Day', ',', 'and', 'the', 'darkness', 'he', 'called', 'Night', '.']
['And', 'the', 'evening', 'and', 'the', 'morning',

In [5]:
# Preprocess the dataset: lowercase every token and drop punctuation tokens.
#
# `punctuation` is one string, so testing `word not in punctuation` is a
# substring check: it only drops tokens that happen to appear contiguously in
# that string and keeps multi-character punctuation tokens such as '--' or '.)'.
# Checking character by character drops every token made up solely of
# punctuation (and still drops empty tokens, as before).
punct_chars = frozenset(punctuation)
preprocessed = [
    [word.lower() for word in sent if not all(ch in punct_chars for ch in word)]
    for sent in bible_kjv_sents
]

# Show the first ten cleaned sentences and the total sentence count
for sent_idx in range(10):
    print(preprocessed[sent_idx])
print("\n No. of sentences: ", len(preprocessed))

['the', 'king', 'james', 'bible']
['the', 'old', 'testament', 'of', 'the', 'king', 'james', 'bible']
['the', 'first', 'book', 'of', 'moses', 'called', 'genesis']
['1', '1', 'in', 'the', 'beginning', 'god', 'created', 'the', 'heaven', 'and', 'the', 'earth']
['1', '2', 'and', 'the', 'earth', 'was', 'without', 'form', 'and', 'void', 'and', 'darkness', 'was', 'upon', 'the', 'face', 'of', 'the', 'deep']
['and', 'the', 'spirit', 'of', 'god', 'moved', 'upon', 'the', 'face', 'of', 'the', 'waters']
['1', '3', 'and', 'god', 'said', 'let', 'there', 'be', 'light', 'and', 'there', 'was', 'light']
['1', '4', 'and', 'god', 'saw', 'the', 'light', 'that', 'it', 'was', 'good', 'and', 'god', 'divided', 'the', 'light', 'from', 'the', 'darkness']
['1', '5', 'and', 'god', 'called', 'the', 'light', 'day', 'and', 'the', 'darkness', 'he', 'called', 'night']
['and', 'the', 'evening', 'and', 'the', 'morning', 'were', 'the', 'first', 'day']

 No. of sentences:  30103


In [6]:
# Train a word2vec model on the preprocessed Bible sentences with gensim:
# 64-dimensional vectors, a context window of 5, and a minimum word count of 5.
bibleModel = word2vec.Word2Vec(
    sentences=preprocessed,
    size=64,
    window=5,
    min_count=5,
)
# Keep a handle on the trained word vectors for the similarity queries
bibleW2V = bibleModel.wv

print("No.of words:", len(bibleW2V.vocab))
print("Model dimensions:", bibleW2V.vector_size)

No.of words: 5429
Model dimensions: 64


What is God?

In [7]:
# Nearest neighbours of "god" in the Bible embedding space
bibleW2V.most_similar(positive=["god"])

[('christ', 0.7863929271697998),
 ('lord', 0.7653032541275024),
 ('truth', 0.7368271946907043),
 ('hosts', 0.7231798768043518),
 ('salvation', 0.7206310033798218),
 ('spirit', 0.7202099561691284),
 ('glory', 0.7174791097640991),
 ('righteousness', 0.6971009373664856),
 ('faith', 0.6935108304023743),
 ('grace', 0.6796157360076904)]

What is Jesus?

In [8]:
# Nearest neighbours of "jesus" in the Bible embedding space
bibleW2V.most_similar(positive=["jesus"])

[('moses', 0.7741672992706299),
 ('david', 0.7562395930290222),
 ('joshua', 0.6813719868659973),
 ('saul', 0.6667389273643494),
 ('jeremiah', 0.6530030965805054),
 ('daniel', 0.6524481773376465),
 ('john', 0.6487957835197449),
 ('elijah', 0.6406660676002502),
 ('paul', 0.635783314704895),
 ('prophet', 0.6346136331558228)]

In [9]:
# Export the Bible word2vec embeddings to the 'bibleW2V' log directory
# so they can be inspected with the TensorBoard embedding projector.
vectors = np.array(bibleW2V.vectors)
metadata = bibleW2V.index2word  # word labels passed alongside the vectors

# Use the writer as a context manager so it is closed (and its event file
# flushed) even when add_embedding raises; the explicit close() was skipped
# in that case.
with SummaryWriter('bibleW2V') as writer:
    writer.add_embedding(vectors, metadata)

In [10]:
# show the embedding projector

# TODO: Need to fix the bug
#projector.visualize_embeddings('bibleW2V', projector.ProjectorConfig())
#%tensorboard --logdir=bibleW2V

In [11]:
# Load the pre-trained (pruned) Google News word2vec sample downloaded via NLTK
word2vec_sample = str(find('models/word2vec_sample/pruned.word2vec.txt'))

# load_word2vec_format already returns a KeyedVectors object. The trailing
# `.wv` was only a deprecated alias for the object itself (its warning is
# hidden by the DeprecationWarning filter) and no longer exists in gensim 4,
# so it is dropped here.
newsW2V = gensim.models.KeyedVectors.load_word2vec_format(word2vec_sample, binary=False)

print("No.of words:", len(newsW2V.vocab))

# Each word is represented as a vector with this many dimensions
print("Model dimensions:", newsW2V.vector_size)

No.of words: 43981
Model dimensions: 300


In [12]:
# Ten nearest neighbours of "university" in the Google News vectors
newsW2V.most_similar(positive=["university"], topn=10)

[('universities', 0.7003918886184692),
 ('faculty', 0.6780906915664673),
 ('undergraduate', 0.6587096452713013),
 ('campus', 0.6434987783432007),
 ('college', 0.638526976108551),
 ('academic', 0.6317198276519775),
 ('professors', 0.6298646926879883),
 ('undergraduates', 0.6149812936782837),
 ('University', 0.6139305233955383),
 ('student', 0.600540041923523)]

In [13]:
# Ask the model which of these words fits the group least
newsW2V.doesnt_match(['breakfast', 'cereal', 'dinner', 'lunch'])

'cereal'

woman + king - man = ?

In [14]:
# Vector arithmetic: add "woman" and "king", subtract "man", keep the top match
newsW2V.most_similar(
    positive=['woman', 'king'],
    negative=['man'],
    topn=1,
)

[('queen', 0.7118192911148071)]

Paris + Germany - Berlin = ?

In [15]:
# Vector arithmetic: add "Paris" and "Germany", subtract "Berlin", keep the top match
newsW2V.most_similar(
    positive=['Paris', 'Germany'],
    negative=['Berlin'],
    topn=1,
)

[('France', 0.7884092330932617)]

In [16]:
# Show the raw vector for a single word and its length.
# newsW2V is already a KeyedVectors object, so it is indexed directly: the old
# `newsW2V.wv[...]` relied on a deprecated self-alias that gensim 4 removed.
# The vector is looked up once and reused for both prints.
word = 'woman'
word_vector = newsW2V[word]
print(word_vector)
print("len:", len(word_vector))

[ 9.15657e-02 -2.90510e-02 -3.87959e-02 -4.04507e-02  4.44958e-02
 -4.04507e-02 -4.30248e-02  9.65301e-03  4.21055e-02  1.82948e-02
 -3.65895e-02 -1.29442e-01 -2.37188e-02 -4.70699e-02 -1.02046e-02
  3.54863e-02 -7.06049e-02 -2.01334e-02  2.35350e-02 -1.14917e-02
 -1.09401e-02 -1.81109e-02 -2.07770e-02 -1.53713e-01  3.81524e-03
 -8.75206e-02 -6.43534e-02  9.92881e-02  1.31649e-01 -7.97982e-02
  5.40569e-02 -2.34430e-03 -8.49465e-02 -3.97152e-02 -4.37603e-02
  4.63344e-02  1.15468e-01 -1.83867e-02 -3.60379e-02  7.50177e-02
 -5.92051e-02 -1.05723e-02  5.95729e-02 -9.14738e-03  4.89086e-02
 -3.38315e-02 -2.86832e-02  1.33303e-02 -1.15376e-02  5.73665e-02
  1.97657e-02  6.06186e-04  2.09608e-02  1.48932e-02 -2.90510e-02
 -2.68446e-02 -3.47508e-02 -2.64768e-02  7.64886e-02  5.76882e-03
  1.12527e-01  6.61921e-02 -1.70996e-02  3.58540e-02  1.55368e-02
  2.92348e-02  3.56702e-02  6.32502e-02  7.57532e-02 -2.72123e-02
  6.91339e-02  8.12692e-02 -8.97270e-02  3.95314e-02 -6.25147e-03
  9.04625e