In [1]:
import gensim
from gensim.models import Word2Vec
import codecs
import nltk
import numpy as np
import re
import string

from sklearn.manifold import TSNE

import plotly
import plotly.plotly as py
import plotly.graph_objs as go
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot

## Text Import/Cleaning/Training

In [2]:
def preprocess(text):
    """Normalize a piece of text before word tokenization.

    Every run of characters other than ASCII letters and digits is replaced
    by a single space; the result is then lower-cased and stripped of
    leading/trailing whitespace.

    Args:
        text: Raw input string.

    Returns:
        The cleaned, lower-case string.
    """
    # The class must be 0-9 (not 1-9); otherwise the digit zero is treated as
    # punctuation and numbers such as "2010" are split into "2 1".
    text = re.sub('[^a-zA-Z0-9]+', ' ', text)
    return text.lower().strip()


def tokenize(text):
    """Turn raw text into a list of tokenized sentences.

    The text is split into sentences with NLTK, each sentence is cleaned with
    `preprocess`, and the cleaned sentence is split into word tokens.

    Args:
        text: Raw document string.

    Returns:
        A list with one entry per sentence, each entry being a list of tokens.
    """
    sentences = nltk.sent_tokenize(text)
    return [nltk.word_tokenize(preprocess(sentence)) for sentence in sentences]


def train_w2v(text, min_count=10, window=3, size=200):
    """Train a Word2Vec model on tokenized sentences.

    Args:
        text: Iterable of sentences, each a list of string tokens.
        min_count: Passed to Word2Vec as `min_count`.
        window: Passed to Word2Vec as `window`.
        size: Passed to Word2Vec as `size`.

    Returns:
        The trained `Word2Vec` model.
    """
    # NOTE(review): `size` is the keyword this notebook already uses; newer
    # gensim releases are reported to call it `vector_size` -- confirm against
    # the installed version before upgrading.
    return Word2Vec(text, min_count=min_count, window=window, size=size)

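**Encoding note:** reading the file as UTF-8 raised a `UnicodeDecodeError` (see https://stackoverflow.com/questions/22216076/unicodedecodeerror-utf8-codec-cant-decode-byte-0xa5-in-position-0-invalid-s), so the file is opened with the `windows-1251` encoding instead.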

In [3]:
# Read the whole book; the context manager guarantees the file handle is
# closed even if reading fails (the bare codecs.open(...).read() leaked it).
with codecs.open('TheHobbit.txt', "r", encoding='windows-1251') as book_file:
    hobbit = book_file.read()

# Sentence-split and tokenize the raw text, then train the embedding model.
text = tokenize(hobbit)
model = train_w2v(text)

## Save the model

In [None]:
# Write the trained model to the file 'hobbit_model' in the working directory.
model.save('hobbit_model')

In [None]:
# Load the model back from 'hobbit_model'; the exploration cells below query
# this `hobbit_model` object.
hobbit_model = gensim.models.Word2Vec.load('hobbit_model')

## Exploring the Model

In [None]:
# One vector per vocabulary word, in the vocabulary's iteration order.
embeddings = [hobbit_model.wv[token] for token in hobbit_model.wv.vocab]

In [None]:
# Number of entries in the loaded model's vocabulary.
len(hobbit_model.wv.vocab)

In [None]:
# Top 3 words the model ranks as most similar to 'hobbit'.
hobbit_model.wv.most_similar('hobbit', topn=3)

In [None]:
# Top 3 words the model ranks as most similar to 'mordor'.
# NOTE(review): the model is trained with min_count=10, so this lookup
# presumably fails if 'mordor' is rare or absent in the corpus -- confirm.
hobbit_model.wv.most_similar('mordor', topn=3)

In [None]:
# Top 3 words the model ranks as most similar to 'sauron'.
# NOTE(review): the model is trained with min_count=10, so this lookup
# presumably fails if 'sauron' is rare or absent in the corpus -- confirm.
hobbit_model.wv.most_similar('sauron', topn=3)

In [None]:
# Top 3 words the model ranks as most similar to 'legolas'.
# NOTE(review): the model is trained with min_count=10, so this lookup
# presumably fails if 'legolas' is rare or absent in the corpus -- confirm.
hobbit_model.wv.most_similar('legolas', topn=3)

## Plotting
- Dimensionality reduction using t-SNE

### 2D

In [None]:
# Project the word vectors down to 2 dimensions with t-SNE.
# A fixed random_state makes the layout reproducible across runs; without it
# every execution of this cell yields a different projection.
tsne_2d = TSNE(n_components=2, init='pca', n_iter=3000, random_state=42)
# Stack the list of per-word vectors into a single array before fitting.
embeddings_2d = tsne_2d.fit_transform(np.asarray(embeddings))

In [None]:
# Build a single scatter trace holding every word. The trace already takes
# the full x/y columns, so it only needs to be constructed once (the previous
# per-point loop rebuilt the identical trace len(embeddings_2d) times and left
# `trace0` undefined for an empty embedding matrix).
# Hover labels come from `hobbit_model`, the same model the embeddings were
# extracted from, so labels and points stay aligned.
trace0 = go.Scatter(
    x=embeddings_2d[:, 0],
    y=embeddings_2d[:, 1],
    mode='markers',
    marker=dict(color='green',
                size=5),
    text=list(hobbit_model.wv.vocab))

data = [trace0]
layout = go.Layout(
    title='2D Representation of Word Vectors')

fig = go.Figure(data=data, layout=layout)
# Render offline to a standalone HTML file.
# (Online alternative: py.iplot(fig, filename='2d-scatter'))
plot(fig, filename='hobbit-scatter-2d.html')

### 3D

In [None]:
# Project the word vectors down to 3 dimensions with t-SNE.
# A fixed random_state makes the layout reproducible across runs; without it
# every execution of this cell yields a different projection.
tsne_3d = TSNE(n_components=3, init='pca', n_iter=3000, random_state=42)
# Stack the list of per-word vectors into a single array before fitting.
embeddings_3d = tsne_3d.fit_transform(np.asarray(embeddings))

In [None]:
# Build a single 3D scatter trace holding every word. The trace already takes
# the full x/y/z columns, so it only needs to be constructed once (the previous
# per-point loop rebuilt the identical trace len(embeddings_3d) times and left
# `trace0` undefined for an empty embedding matrix).
# Hover labels come from `hobbit_model`, the same model the embeddings were
# extracted from, so labels and points stay aligned.
trace0 = go.Scatter3d(
    x=embeddings_3d[:, 0],
    y=embeddings_3d[:, 1],
    z=embeddings_3d[:, 2],
    mode='markers',
    marker=dict(color='green',
                size=5),
    text=list(hobbit_model.wv.vocab))

data = [trace0]
layout = go.Layout(
    title='3D Representation of Word Vectors')

fig = go.Figure(data=data, layout=layout)
# Render offline to a standalone HTML file.
# (Online alternative: py.iplot(fig, filename='3d-scatter'))
plot(fig, filename='hobbit-scatter-3d.html')

## Topic Modeling

________

# Bonus material that I'd like to dig further into

## Text Generation
- `textgenrnn` is a Python module for creating Character-Level RNNs
- https://github.com/minimaxir/textgenrnn

In [2]:
from textgenrnn import textgenrnn

In [4]:
# Create a textgenrnn instance with its default constructor arguments.
textgen = textgenrnn()
# Train on the book's text file for a single epoch.
textgen.train_from_file('TheHobbit.txt', num_epochs=1)
# Produce generated text from the trained model.
textgen.generate()

38,474 texts collected.
Training on 3,305,502 character sequences.
Epoch 1/1
####################
Temperature: 0.2
####################
     "Now                                  |                                                                                                                                                                                                                                                               





####################
Temperature: 0.5
####################






####################
Temperature: 1.0
####################


    'I  move the Orcs to them behold dinger would go to the talk  back himself, so far row and blew of the stone did not find in which tumnling as memory watcher-twine   and  wonder. Their cutturys tloads.    Bilbo was no mind to look quuting       to  the poor escape by the fact conceintily, at th

     "On water tales telling through by one of the Lord of the trees? White is we could spept again in stern way would    make my ba