In [4]:
import gensim
from gensim.models import Word2Vec
import codecs
import nltk
import numpy as np
import re
import string

from sklearn.manifold import TSNE

import plotly
import plotly.plotly as py
import plotly.graph_objs as go
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot

## Text Import/Cleaning/Training

In [7]:
def preprocess(text):
    """Normalise a piece of raw text for tokenisation.

    Every run of characters that is not an ASCII letter or digit is
    collapsed into a single space; the result is lower-cased and stripped
    of leading/trailing whitespace.

    Parameters
    ----------
    text : str
        Raw input text (typically a single sentence).

    Returns
    -------
    str
        The cleaned, lower-cased text.
    """
    # The character class must be 0-9: the previous 1-9 range treated the
    # digit '0' as punctuation and split numbers such as "2010" into "2 1".
    text = re.sub(r'[^a-zA-Z0-9]+', ' ', text)
    return text.lower().strip()


def tokenize(text):
    """Split raw text into a list of tokenised sentences.

    The text is first split into sentences with ``nltk.sent_tokenize``;
    each sentence is cleaned with ``preprocess`` and then split into word
    tokens with ``nltk.word_tokenize``.

    Parameters
    ----------
    text : str
        The full raw document.

    Returns
    -------
    list of list of str
        One inner list of word tokens per detected sentence.
    """
    tokenized = []
    for sentence in nltk.sent_tokenize(text):
        cleaned = preprocess(sentence)
        tokenized.append(nltk.word_tokenize(cleaned))
    return tokenized


def train_w2v(text, min_count=10, window=3, size=200):
    """Train a gensim Word2Vec model on tokenised sentences.

    The hyperparameters are exposed as keyword arguments so they can be
    tuned without editing this function; the defaults reproduce the
    original hard-coded values.

    Parameters
    ----------
    text : iterable of list of str
        Tokenised sentences, e.g. the output of ``tokenize``.
    min_count : int, optional
        Passed to ``Word2Vec`` as ``min_count`` (default 10).
    window : int, optional
        Passed to ``Word2Vec`` as ``window`` (default 3).
    size : int, optional
        Passed to ``Word2Vec`` as ``size`` (default 200).
        NOTE(review): ``size`` is the gensim 3.x keyword, matching the
        ``wv.vocab`` usage elsewhere in this notebook.

    Returns
    -------
    gensim.models.Word2Vec
        The trained model.
    """
    return Word2Vec(text, min_count=min_count, window=window, size=size)

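Note on encoding: reading the file with the default UTF-8 codec raises a `UnicodeDecodeError`, so an explicit encoding is passed when opening it below. See: https://stackoverflow.com/questions/22216076/unicodedecodeerror-utf8-codec-cant-decode-byte-0xa5-in-position-0-invalid-s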

In [8]:
# Read the whole corpus into memory. The context manager guarantees the file
# handle is closed once the text has been read (the bare
# ``codecs.open(...).read()`` left it open until garbage collection).
# NOTE(review): 'windows-1251' is kept as-is from the original cell -- confirm
# it is the right code page for this file.
with codecs.open('TheHobbit.txt', "r", encoding='windows-1251') as corpus_file:
    hobbit = corpus_file.read()

# Sentence-split and tokenise the corpus, then train the embedding model.
text = tokenize(hobbit)
model = train_w2v(text)

## Save the model

In [11]:
# Persist the trained model under the name 'hobbit_model' so it can be
# reloaded later without retraining.
model.save('hobbit_model')

In [10]:
# Reload the model saved as 'hobbit_model'; the rest of the notebook works
# with this loaded copy.
hobbit_model = Word2Vec.load('hobbit_model')

## Exploring the Model

In [13]:
# Number of entries in the loaded model's vocabulary.
len(hobbit_model.wv.vocab)

3571

In [14]:
# One embedding vector per vocabulary entry, in the vocabulary's iteration
# order (the same order is later used for the plot hover labels).
embeddings = [hobbit_model.wv[token] for token in hobbit_model.wv.vocab]

In [15]:
# Top three (word, similarity score) neighbours of 'hobbit' in the embedding space.
hobbit_model.wv.most_similar('hobbit', topn=3)

[('dwarf', 0.9155319333076477),
 ('wizard', 0.8913840651512146),
 ('gaffer', 0.845954179763794)]

In [21]:
# Top three (word, similarity score) neighbours of 'mordor' in the embedding space.
hobbit_model.wv.most_similar('mordor', topn=3)

[('moria', 0.9295381903648376),
 ('isengard', 0.9287574887275696),
 ('rivendell', 0.9090243577957153)]

In [19]:
# Top three (word, similarity score) neighbours of 'sauron' in the embedding space.
hobbit_model.wv.most_similar('sauron', topn=3)

[('smaug', 0.9011658430099487),
 ('danger', 0.8894429206848145),
 ('isildur', 0.8842775225639343)]

## Plotting
- Dimensionality reduction using t-SNE

### 2D

In [22]:
# Project the word vectors down to 2 dimensions with t-SNE.
# random_state is fixed so that Restart & Run All reproduces the same layout;
# without it every run yields a different projection.
tsne_2d = TSNE(n_components=2, init='pca', n_iter=3000, random_state=42)
# Pass an explicit 2-D array (one row per word) rather than a list of vectors.
embeddings_2d = tsne_2d.fit_transform(np.asarray(embeddings))

In [23]:
# Build a single scatter trace containing every word. The previous version
# wrapped this in `for i in range(len(embeddings_2d))`, which rebuilt the
# identical trace once per word (the loop index was never used) and left
# `trace0` undefined when there were no embeddings.
trace0 = go.Scatter(
    x=embeddings_2d[:, 0],
    y=embeddings_2d[:, 1],
    mode='markers',
    marker=dict(color='green',
                size=5),
    # Hover labels must come from the same model (and therefore the same
    # vocabulary order) that produced `embeddings`, i.e. `hobbit_model`.
    text=list(hobbit_model.wv.vocab))

data = [trace0]
layout = go.Layout(
    title='2D Representation of Word Vectors')


fig = go.Figure(data=data, layout=layout)
py.iplot(fig, filename='2d-scatter')


Consider using IPython.display.IFrame instead



### 3D

In [24]:
# Project the word vectors down to 3 dimensions with t-SNE.
# random_state is fixed so that Restart & Run All reproduces the same layout;
# without it every run yields a different projection.
tsne_3d = TSNE(n_components=3, init='pca', n_iter=3000, random_state=42)
# Pass an explicit 2-D array (one row per word) rather than a list of vectors.
embeddings_3d = tsne_3d.fit_transform(np.asarray(embeddings))

In [25]:
# Build a single 3D scatter trace containing every word. The previous version
# wrapped this in `for i in range(len(embeddings_3d))`, which rebuilt the
# identical trace once per word (the loop index was never used) and left
# `trace0` undefined when there were no embeddings.
trace0 = go.Scatter3d(
    x=embeddings_3d[:, 0],
    y=embeddings_3d[:, 1],
    z=embeddings_3d[:, 2],
    mode='markers',
    marker=dict(color='green',
                size=5),
    # Hover labels must come from the same model (and therefore the same
    # vocabulary order) that produced `embeddings`, i.e. `hobbit_model`.
    text=list(hobbit_model.wv.vocab))

data = [trace0]
layout = go.Layout(
    title='3D Representation of Word Vectors')


fig = go.Figure(data=data, layout=layout)
py.iplot(fig, filename='3d-scatter')


Consider using IPython.display.IFrame instead

