In [1]:
import nltk
from nltk import word_tokenize, sent_tokenize

In [2]:
import gensim
from gensim.models.word2vec import Word2Vec
from sklearn.manifold import TSNE
import pandas as pd
from bokeh.io import output_notebook
from bokeh.plotting import show, figure
%matplotlib inline



In [3]:
# sent_tokenize / word_tokenize need the Punkt sentence-boundary models.
# NLTK >= 3.9 loads them from the 'punkt_tab' package, older releases from
# 'punkt', so fetch both to keep the notebook runnable on either.
nltk.download('punkt_tab')
nltk.download('punkt')

[nltk_data] Downloading package punkt to /home/sanket/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [4]:
# Fetch the Gutenberg corpus package used by nltk.corpus.gutenberg below.
nltk.download('gutenberg')

[nltk_data] Downloading package gutenberg to /home/sanket/nltk_data...
[nltk_data]   Package gutenberg is already up-to-date!


True

In [5]:
from nltk.corpus import gutenberg

In [6]:
# List the file identifiers of the texts bundled in the corpus.
gutenberg.fileids()

['austen-emma.txt',
 'austen-persuasion.txt',
 'austen-sense.txt',
 'bible-kjv.txt',
 'blake-poems.txt',
 'bryant-stories.txt',
 'burgess-busterbrown.txt',
 'carroll-alice.txt',
 'chesterton-ball.txt',
 'chesterton-brown.txt',
 'chesterton-thursday.txt',
 'edgeworth-parents.txt',
 'melville-moby_dick.txt',
 'milton-paradise.txt',
 'shakespeare-caesar.txt',
 'shakespeare-hamlet.txt',
 'shakespeare-macbeth.txt',
 'whitman-leaves.txt']

In [7]:
# Number of text files in the corpus.
len(gutenberg.fileids())

18

In [8]:
# Split the raw text of the whole corpus into a list of sentence strings.
gberg_sent_tokens = sent_tokenize(gutenberg.raw())

In [9]:
# Peek at the first five sentence strings (newlines from the raw text are kept).
gberg_sent_tokens[:5]

['[Emma by Jane Austen 1816]\n\nVOLUME I\n\nCHAPTER I\n\n\nEmma Woodhouse, handsome, clever, and rich, with a comfortable home\nand happy disposition, seemed to unite some of the best blessings\nof existence; and had lived nearly twenty-one years in the world\nwith very little to distress or vex her.',
 "She was the youngest of the two daughters of a most affectionate,\nindulgent father; and had, in consequence of her sister's marriage,\nbeen mistress of his house from a very early period.",
 'Her mother\nhad died too long ago for her to have more than an indistinct\nremembrance of her caresses; and her place had been supplied\nby an excellent woman as governess, who had fallen little short\nof a mother in affection.',
 "Sixteen years had Miss Taylor been in Mr. Woodhouse's family,\nless as a governess than a friend, very fond of both daughters,\nbut particularly of Emma.",
 'Between _them_ it was more the intimacy\nof sisters.']

In [10]:
# Break the second sentence string into a list of word/punctuation tokens.
word_tokenize(gberg_sent_tokens[1])

['She',
 'was',
 'the',
 'youngest',
 'of',
 'the',
 'two',
 'daughters',
 'of',
 'a',
 'most',
 'affectionate',
 ',',
 'indulgent',
 'father',
 ';',
 'and',
 'had',
 ',',
 'in',
 'consequence',
 'of',
 'her',
 'sister',
 "'s",
 'marriage',
 ',',
 'been',
 'mistress',
 'of',
 'his',
 'house',
 'from',
 'a',
 'very',
 'early',
 'period',
 '.']

In [11]:
# Token at position 14 of that tokenized sentence.
word_tokenize(gberg_sent_tokens[1])[14]

'father'

In [12]:
# Corpus-provided sentences, each already split into a list of tokens.
# This is the input later passed to Word2Vec.
gberg_sents = gutenberg.sents()

In [13]:
# Peek at the first five tokenized sentences.
gberg_sents[:5]

[['[', 'Emma', 'by', 'Jane', 'Austen', '1816', ']'],
 ['VOLUME', 'I'],
 ['CHAPTER', 'I'],
 ['Emma',
  'Woodhouse',
  ',',
  'handsome',
  ',',
  'clever',
  ',',
  'and',
  'rich',
  ',',
  'with',
  'a',
  'comfortable',
  'home',
  'and',
  'happy',
  'disposition',
  ',',
  'seemed',
  'to',
  'unite',
  'some',
  'of',
  'the',
  'best',
  'blessings',
  'of',
  'existence',
  ';',
  'and',
  'had',
  'lived',
  'nearly',
  'twenty',
  '-',
  'one',
  'years',
  'in',
  'the',
  'world',
  'with',
  'very',
  'little',
  'to',
  'distress',
  'or',
  'vex',
  'her',
  '.'],
 ['She',
  'was',
  'the',
  'youngest',
  'of',
  'the',
  'two',
  'daughters',
  'of',
  'a',
  'most',
  'affectionate',
  ',',
  'indulgent',
  'father',
  ';',
  'and',
  'had',
  ',',
  'in',
  'consequence',
  'of',
  'her',
  'sister',
  "'",
  's',
  'marriage',
  ',',
  'been',
  'mistress',
  'of',
  'his',
  'house',
  'from',
  'a',
  'very',
  'early',
  'period',
  '.']]

In [14]:
# Token at position 14 of the fifth corpus sentence.
gberg_sents[4][14]

'father'

In [15]:
# Flat sequence of every token in the corpus.
gutenberg.words()

['[', 'Emma', 'by', 'Jane', 'Austen', '1816', ']', ...]

In [16]:
# Total number of tokens in the corpus.
len(gutenberg.words())

2621613

## Word2Vec

In [17]:
# Train skip-gram (sg=1) Word2Vec embeddings on the tokenized corpus sentences:
# 64-dimensional vectors, a 10-token context window, and tokens seen fewer
# than 5 times dropped from the vocabulary.
# workers=1: gensim documents that a fixed seed only yields a reproducible run
# when training is single-threaded (thread scheduling reorders the updates),
# so the previous workers=8 defeated seed=42. Reproducibility across
# interpreter launches additionally needs PYTHONHASHSEED to be set.
model = Word2Vec(sentences=gberg_sents, vector_size=64, sg=1, window=10, min_count=5, seed=42, workers=1)

In [18]:
# Persist the trained model to a file in the working directory.
model.save('raw_gutenberg_model.bin')

In [19]:
# Reload the saved model from disk into `model2`.
# NOTE(review): the later cells visible here keep querying `model`, not `model2`,
# so this reload is not exercised by them.
model2 = Word2Vec.load('raw_gutenberg_model.bin')

In [20]:
# Look up the learned vector for the token 'dog'.
model.wv["dog"]

array([ 0.04308291, -0.88332427,  0.30545172,  0.01344383, -0.15514964,
       -0.2414906 , -0.10370884, -0.49422938, -0.38026848, -0.00675001,
       -0.09035981,  0.13612452,  0.4108681 ,  0.22404552, -0.19078241,
       -0.10534835, -0.4151423 ,  0.16586553, -0.4165762 ,  0.42116627,
       -0.03468144,  0.14763544, -0.17388079, -0.08702958,  0.2659286 ,
       -0.20281482, -0.32162288, -0.27749795, -0.0338315 ,  0.54274124,
        0.3857084 ,  0.0426717 , -0.03189872,  0.209965  ,  0.40158767,
        0.04172629,  0.6121177 , -0.03893439, -0.31028312, -0.07631712,
       -0.18862633, -0.03252225, -0.1630061 , -0.11277225, -0.08722199,
        0.40430596, -0.15216011, -0.39983755, -0.44553995,  0.0601737 ,
        0.29931095, -0.35798052, -0.06812852,  0.01255601, -0.321568  ,
        0.01929178,  0.05318621, -0.5120343 ,  0.00130603, -0.38049582,
        0.44086882, -0.3901628 , -0.21835679,  0.6013346 ], dtype=float32)

In [21]:
# Length of a single token vector; matches the vector_size=64 used at training.
len(model.wv["god"])

64

In [22]:
# Tokens whose vectors are closest to 'dog', with their similarity scores.
model.wv.most_similar('dog')

[('puppy', 0.8003432750701904),
 ('sweeper', 0.7644944190979004),
 ('thief', 0.7588803768157959),
 ('cage', 0.755757749080658),
 ('pig', 0.7504035234451294),
 ('paw', 0.7481651306152344),
 ('fox', 0.7464107871055603),
 ('lazy', 0.7455595135688782),
 ('Truck', 0.7262076139450073),
 ('chimney', 0.7260715961456299)]

In [23]:
# Tokens whose vectors are closest to 'think', with their similarity scores.
model.wv.most_similar('think')

[('suppose', 0.8685281872749329),
 ('contradict', 0.8530166149139404),
 ('manage', 0.8437581062316895),
 ('downright', 0.8405541181564331),
 ('know', 0.8235164880752563),
 ('_that_', 0.8165451884269714),
 ('hesitate', 0.816486120223999),
 ('_you_', 0.8151754140853882),
 ('really', 0.8122444152832031),
 ('Dreyfus', 0.8080966472625732)]

In [24]:
# Tokens whose vectors are closest to 'mother', with their similarity scores.
model.wv.most_similar('mother')

[('father', 0.8569563031196594),
 ('sister', 0.8548628687858582),
 ('daughter', 0.8015209436416626),
 ('husband', 0.7756421566009521),
 ('child', 0.7708503007888794),
 ('Mary', 0.7552725076675415),
 ('wife', 0.7458688616752625),
 ('brother', 0.7419754862785339),
 ('Amnon', 0.7308648228645325),
 ('nurse', 0.7199915051460266)]

In [25]:
# Ask the model which of the four tokens fits least with the others.
model.wv.doesnt_match("mother father daughter dog".split())

'dog'

In [26]:
# Similarity score between the vectors for 'father' and 'dog'.
model.wv.similarity('father', 'dog')

0.46662968

In [27]:
# Analogy query: father - man + woman.
# `negative` must be a list of tokens. gensim releases that do not wrap a bare
# string iterate over it, so negative='man' was treated as the three tokens
# 'm', 'a', 'n' and the query did not subtract 'man' at all.
model.wv.most_similar(positive=['father', 'woman'], negative=['man'])

[('sister', 0.3589209318161011),
 ('husband', 0.34776216745376587),
 ('Rachel', 0.32448312640190125),
 ('brother', 0.32285967469215393),
 ('wife', 0.3154764771461487),
 ('Rebekah', 0.2971642017364502),
 ('younger', 0.2962048351764679),
 ('Lot', 0.2902721166610718),
 ('separation', 0.28255248069763184),
 ('uncleanness', 0.28077250719070435)]

In [28]:
# Analogy query: son - man + woman.
# `negative` must be a list of tokens. gensim releases that do not wrap a bare
# string iterate over it, so negative='man' was treated as the three tokens
# 'm', 'a', 'n' and the query did not subtract 'man' at all.
model.wv.most_similar(positive=['son', 'woman'], negative=['man'])

[('Rachel', 0.32602670788764954),
 ('Lot', 0.31861168146133423),
 ('Rebekah', 0.317126601934433),
 ('Abram', 0.30485013127326965),
 ('wife', 0.30201753973960876),
 ('brother', 0.2922000586986542),
 ('conceived', 0.28536325693130493),
 ('Cain', 0.28108659386634827),
 ('Ephron', 0.2809517979621887),
 ('Laban', 0.2781221270561218)]

In [29]:
# Analogy query: king - man + woman, returning the 50 best candidates.
# `negative` must be a list of tokens. gensim releases that do not wrap a bare
# string iterate over it, so negative='man' was treated as the three tokens
# 'm', 'a', 'n' and the query did not subtract 'man' at all.
model.wv.most_similar(positive=['king', 'woman'], negative=['man'], topn=50)

[('Lot', 0.30454468727111816),
 ('Rachel', 0.29852616786956787),
 ('Abel', 0.2962762415409088),
 ('Cain', 0.29079726338386536),
 ('strengthened', 0.2875920534133911),
 ('Mamre', 0.27353915572166443),
 ('multitude', 0.2732914388179779),
 ('Ephron', 0.2715042233467102),
 ('Rebekah', 0.26868724822998047),
 ('Laban', 0.26385077834129333),
 ('Gerar', 0.2598629593849182),
 ('David', 0.25865644216537476),
 ('servants', 0.25752905011177063),
 ('Abimelech', 0.25572967529296875),
 ('Abram', 0.2551155090332031),
 ('tent', 0.2538321018218994),
 ('city', 0.24838575720787048),
 ('Nebuchadnezzar', 0.24480150640010834),
 ('manner', 0.2444269210100174),
 ('uncleanness', 0.24351245164871216),
 ('princes', 0.24313604831695557),
 ('separation', 0.24038681387901306),
 ('prevailed', 0.23809851706027985),
 ('Samaria', 0.23650099337100983),
 ('blessings', 0.23574760556221008),
 ('Sarah', 0.23556561768054962),
 ('Jerusalem', 0.22712036967277527),
 ('Sodom', 0.22697138786315918),
 ('restored', 0.226054877042770

In [30]:
## REDUCE WORD VECTOR DIMENSIONALITY WITH t-SNE

# Number of tokens that have a vector in the trained model.
len(model.wv)

17011

In [31]:
# Stack every token vector into one matrix, one row per token.
# index_to_key is the token list in vector-index order, so the row order is
# explicit rather than depending on the iteration order of the key_to_index dict.
X = model.wv[model.wv.index_to_key]

In [32]:
# 2-D t-SNE projection. random_state pins the random seed so the layout is
# repeatable between runs. The iteration count is left at the library default
# of 1000: that is the value previously passed as n_iter, a keyword that
# scikit-learn renamed to max_iter in 1.5 and later removed.
tsne = TSNE(n_components=2, random_state=42)

In [None]:
# Fit t-SNE on the vector matrix and get its 2-component projection.
X_2d = tsne.fit_transform(X)

In [None]:
# One row per token: its two t-SNE coordinates plus the token text.
coords_df = pd.DataFrame(X_2d, columns=['x', 'y'])
# Assign the tokens as a list (index_to_key, in vector-index order).
# Assigning the key_to_index dict instead makes current pandas build a Series
# keyed by token and reindex it to the integer row index, which leaves the
# column all-NaN.
# Assumes X_2d rows are in vocabulary index order - verify against how X is built.
coords_df['token'] = model.wv.index_to_key

In [None]:
# Inspect the first 30 rows of the coordinates table.
coords_df.head(30)

In [None]:
# Save the coordinates table to CSV, omitting the row index.
coords_df.to_csv('raw_gutenberg_tsne.csv', index=False)

In [None]:
# Static scatter plot of every row's x/y coordinates; small, mostly transparent
# markers. The result is bound to `_` so the cell shows only the figure.
_ = coords_df.plot.scatter('x', 'y', figsize=(12, 12), marker='.', s=10, alpha=0.2)

In [None]:
# Configure Bokeh to render its output inside the notebook.
output_notebook()

In [None]:
# Draw a random subset of rows for the interactive plot.
# random_state makes the subset the same on every run; min() keeps the call
# valid when the table has fewer than 5000 rows.
subset_df = coords_df.sample(n=min(5000, len(coords_df)), random_state=42)

In [None]:
# 600x600 Bokeh figure with each sampled token drawn as text at its x/y position.
# width/height replace plot_width/plot_height, which Bokeh 3 no longer accepts.
p = figure(width=600, height=600)
_ = p.text(x=subset_df.x, y=subset_df.y, text=subset_df.token)

In [None]:
# Display the Bokeh figure.
show(p)