In [1]:
import gensim
from gensim.models import Word2Vec
import codecs
import numpy as np
import re
import string
import nltk
import random

from sklearn.manifold import TSNE
from sklearn.cluster import KMeans

import plotly
import plotly.plotly as py
import plotly.graph_objs as go
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
import seaborn as sns

## Text Import/Cleaning/Training

In [2]:
def preprocess(text):
    """Normalize a piece of text before word tokenization.

    Every run of characters that are not ASCII letters or digits is collapsed
    into a single space; the result is lower-cased and stripped of leading and
    trailing whitespace.

    The character class covers ``0-9`` (it used to be ``1-9``), so the digit
    ``0`` is kept instead of being treated as a separator.

    Parameters
    ----------
    text : str
        Raw text, typically a single sentence.

    Returns
    -------
    str
        The cleaned, lower-cased text.
    """
    text = re.sub(r'[^a-zA-Z0-9]+', ' ', text)
    return text.lower().strip()


def tokenize(text):
    """Turn raw text into a list of token lists, one per sentence.

    The text is split into sentences with ``nltk.sent_tokenize``; each
    sentence is cleaned by ``preprocess`` and then split into words with
    ``nltk.word_tokenize``.
    """
    sentences = nltk.sent_tokenize(text)
    return [nltk.word_tokenize(preprocess(sentence)) for sentence in sentences]


def train_w2v(text, min_count=10, window=3, size=200):
    """Train a Word2Vec model on tokenized sentences.

    The hyperparameters are exposed as keyword arguments; their defaults are
    the values that used to be hard-coded, so ``train_w2v(text)`` behaves as
    before.

    Parameters
    ----------
    text : list of list of str
        Tokenized sentences, e.g. the output of ``tokenize``.
    min_count : int, default 10
        Forwarded to ``Word2Vec`` as ``min_count``.
    window : int, default 3
        Forwarded to ``Word2Vec`` as ``window``.
    size : int, default 200
        Forwarded to ``Word2Vec`` as ``size``.
        NOTE(review): ``size`` is the gensim 3.x keyword (gensim 4 renamed it
        to ``vector_size``) -- confirm against the installed version.

    Returns
    -------
    Word2Vec
        The trained model.
    """
    return Word2Vec(text, min_count=min_count, window=window, size=size)

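**Note on encoding:** reading the file as UTF-8 raises a `UnicodeDecodeError`, so it is opened below with an explicit `windows-1251` encoding. See https://stackoverflow.com/questions/22216076/unicodedecodeerror-utf8-codec-cant-decode-byte-0xa5-in-position-0-invalid-s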

In [3]:
# Read the whole book with an explicit windows-1251 encoding. The context
# manager closes the file handle as soon as the contents are in memory.
with codecs.open('TheHobbit.txt', "r", encoding='windows-1251') as book_file:
    hobbit = book_file.read()

# Split the text into tokenized sentences, then fit Word2Vec on them.
text = tokenize(hobbit)
model = train_w2v(text)

## Save the model

In [6]:
# Write the trained model to the file 'hobbit_model' so it can be loaded again later.
model.save('hobbit_model')


This function is deprecated, use smart_open.open instead. See the migration notes for details: https://github.com/RaRe-Technologies/smart_open/blob/master/README.rst#migrating-to-the-new-open-function



In [7]:
# Load the model back from the 'hobbit_model' file; the exploration cells below query this copy.
hobbit_model = gensim.models.Word2Vec.load('hobbit_model')


This function is deprecated, use smart_open.open instead. See the migration notes for details: https://github.com/RaRe-Technologies/smart_open/blob/master/README.rst#migrating-to-the-new-open-function



## Exploring the Model

In [8]:
# One embedding vector per vocabulary word, in the vocabulary's iteration order.
embeddings = [hobbit_model.wv[token] for token in hobbit_model.wv.vocab]

In [9]:
# Number of entries in the model's vocabulary.
len(hobbit_model.wv.vocab)

3571

In [10]:
# Top 3 words most similar to 'hobbit', with their similarity scores.
hobbit_model.wv.most_similar('hobbit', topn=3)

[('dwarf', 0.9185002446174622),
 ('wizard', 0.8965802788734436),
 ('gaffer', 0.8510836362838745)]

In [11]:
# Top 3 words most similar to 'mordor', with their similarity scores.
hobbit_model.wv.most_similar('mordor', topn=3)

[('isengard', 0.9292618632316589),
 ('moria', 0.9270439147949219),
 ('rivendell', 0.9078439474105835)]

In [12]:
# Top 3 words most similar to 'bilbo', with their similarity scores.
hobbit_model.wv.most_similar('bilbo', topn=3)

[('gollum', 0.9539322853088379),
 ('strider', 0.9411383271217346),
 ('gandalf', 0.9367592334747314)]

In [13]:
# Top 3 words most similar to 'legolas', with their similarity scores.
hobbit_model.wv.most_similar('legolas', topn=3)

[('gimli', 0.9775985479354858),
 ('aragorn', 0.9602985978126526),
 ('jomer', 0.9599728584289551)]

## Plotting
- Dimensionality reduction using t-SNE

### 2D

In [None]:
# Project the word vectors down to 2 dimensions with t-SNE.
# random_state is fixed (same seed as the sample-data projection further down)
# so that re-running the notebook reproduces the same layout.
tsne_2d = TSNE(n_components=2, init='pca', n_iter=3000, random_state=12)
embeddings_2d = tsne_2d.fit_transform(embeddings)

In [None]:
# Build the scatter trace once: the coordinate arrays already contain every
# point, so looping over the points only rebuilt the same trace repeatedly
# (and left trace0 undefined for empty input).
# Hover labels are taken from hobbit_model, the model the embeddings were
# computed from, so labels and points stay aligned.
trace0 = go.Scatter(
    x=embeddings_2d[:, 0],
    y=embeddings_2d[:, 1],
    mode='markers',
    marker=dict(color='green',
                size=5),
    text=list(hobbit_model.wv.vocab))

data = [trace0]
layout = go.Layout(
    title='2D Representation of Word Vectors')


fig = go.Figure(data=data, layout=layout)
#py.iplot(fig, filename='2d-scatter')
# Render the figure offline into an HTML file.
plot(fig, filename='hobbit-scatter-2d.html')

### 3D

In [None]:
# Project the word vectors down to 3 dimensions with t-SNE.
# random_state is fixed (same seed as the sample-data projection further down)
# so that re-running the notebook reproduces the same layout.
tsne_3d = TSNE(n_components=3, init='pca', n_iter=3000, random_state=12)
embeddings_3d = tsne_3d.fit_transform(embeddings)

In [None]:
#colors = np.array([x for x in sns.color_palette("hls", 10)])

# Build the 3D scatter trace once: the coordinate arrays already contain every
# point, so looping over the points only rebuilt the same trace repeatedly.
# Hover labels are taken from hobbit_model, the model the embeddings were
# computed from, so labels and points stay aligned.
trace0 = go.Scatter3d(
    x=embeddings_3d[:, 0],
    y=embeddings_3d[:, 1],
    z=embeddings_3d[:, 2],
    mode='markers',
    marker=dict(color='green',
                size=5),
    text=list(hobbit_model.wv.vocab))

In [None]:
# Wrap the 3D trace in a titled figure and render it offline to an HTML file.
layout = go.Layout(title='3D Representation of Word Vectors & Clusters')
data = [trace0]

fig = go.Figure(layout=layout, data=data)
plot(fig, filename='hobbit-scatter-3d.html')

## Sample data

In [14]:
# Hand-picked words to project and plot on their own.
sample = [
    'bilbo', 'gandalf', 'gollum', 'precious', 'thorin', 'smaug', 'bard', 'beorn',
    'elrond', 'sauron', 'dwarf', 'elf', 'troll',
    'shire', 'rivendell', 'mountains', 'mirkwood', 'esgaroth',
]

In [24]:
# Look up the embedding of every hand-picked word.
embeddings_sample = [hobbit_model.wv[word] for word in sample]

# t-SNE expects perplexity to be smaller than the number of samples. The
# previous fixed value of 30 exceeds the size of this small word list, so it
# is clamped to at most (number of samples - 1).
sample_perplexity = min(30, len(embeddings_sample) - 1)
tsne_2d_sample = TSNE(n_components=2, init='pca', n_iter=3000,
                      perplexity=sample_perplexity, random_state=12)

embeddings_sample_2d = tsne_2d_sample.fit_transform(embeddings_sample)

In [25]:
# Build the scatter trace for the sample words once: the coordinate arrays
# already contain every point, so looping over the points only rebuilt the
# same trace repeatedly. Each marker is labelled with its word on hover.
trace0 = go.Scatter(
    x=embeddings_sample_2d[:, 0],
    y=embeddings_sample_2d[:, 1],
    mode='markers',
    marker=dict(color='green',
                size=5),
    text=sample)

In [27]:
# Wrap the sample trace in a titled figure and display it through plotly's
# online iplot, under the plot name '2d-scatter'.
layout = go.Layout(title='2D Representation of Word Vectors')
data = [trace0]

fig = go.Figure(layout=layout, data=data)
py.iplot(fig, filename='2d-scatter')


Consider using IPython.display.IFrame instead



________

# Bonus material
#### (that I'd like to dig further into)

## Text Generation
- `textgenrnn` is a Python module for creating Character-Level RNNs
- https://github.com/minimaxir/textgenrnn

### Results from running 5 epochs on a GPU

In [4]:
from textgenrnn import textgenrnn

Using TensorFlow backend.


In [5]:
# Create a textgenrnn instance using its default constructor arguments.
textgen = textgenrnn()
# Train on the book's text file for 5 epochs.
textgen.train_from_file('TheHobbit.txt', num_epochs=5)
# Generate text from the trained model.
textgen.generate()

W0621 04:59:28.494442 139667747902848 deprecation_wrapper.py:119] From /opt/anaconda3/lib/python3.7/site-packages/keras/backend/tensorflow_backend.py:517: The name tf.placeholder is deprecated. Please use tf.compat.v1.placeholder instead.

W0621 04:59:28.532910 139667747902848 deprecation_wrapper.py:119] From /opt/anaconda3/lib/python3.7/site-packages/keras/backend/tensorflow_backend.py:4138: The name tf.random_uniform is deprecated. Please use tf.random.uniform instead.

W0621 04:59:28.542708 139667747902848 deprecation_wrapper.py:119] From /opt/anaconda3/lib/python3.7/site-packages/keras/backend/tensorflow_backend.py:174: The name tf.get_default_session is deprecated. Please use tf.compat.v1.get_default_session instead.

W0621 04:59:28.543475 139667747902848 deprecation_wrapper.py:119] From /opt/anaconda3/lib/python3.7/site-packages/keras/backend/tensorflow_backend.py:181: The name tf.ConfigProto is deprecated. Please use tf.compat.v1.ConfigProto instead.

W0621 04:59:28.544178 13966

38,474 texts collected.


W0621 04:59:41.711222 139667747902848 deprecation.py:323] From /opt/anaconda3/lib/python3.7/site-packages/tensorflow/python/ops/math_grad.py:1250: add_dispatch_support.<locals>.wrapper (from tensorflow.python.ops.array_ops) is deprecated and will be removed in a future version.
Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where


Training on 3,305,502 character sequences.
Epoch 1/5
####################
Temperature: 0.2
####################






####################
Temperature: 0.5
####################






####################
Temperature: 1.0
####################






Epoch 2/5
####################
Temperature: 0.2
####################






####################
Temperature: 0.5
####################






####################
Temperature: 1.0
####################
   'Men be scrove was   furier. The merop  straight! All forgethernising in Degath of the  swiftly sworsen had  master? Turning the mispering that  him in time ways and 





Epoch 3/5
####################
Temperature: 0.2
####################






####################
Temperature: 0.5
####################






####################
Temperature: 1.0
####################


West. Dear,' said Thjoden    a screat stoned  of bears come of the seldorn in his weonty. The      grees!'    The forest blasse were hole of the shot about the ponies: had clear