In [1]:
from __future__ import absolute_import, division, print_function

In [2]:
import codecs

In [3]:
import glob

In [4]:
import multiprocessing

In [5]:
import os

In [6]:
import pprint

In [7]:
import re

In [8]:
import nltk

In [9]:
import gensim.models.word2vec as w2v



In [10]:
import sklearn.manifold

In [11]:
import numpy as np

In [12]:
import matplotlib.pyplot as plt

In [13]:
import pandas as pd

In [14]:
import seaborn as sns

In [15]:
#pre-process data
nltk.download("punkt")
nltk.download("stopwords")

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\prakh\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\prakh\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.


True

In [16]:
book_filenames = sorted(glob.glob("*.txt"))

In [17]:
book_filenames

['got1.txt', 'got2.txt', 'got3.txt', 'got4.txt', 'got5.txt']

In [18]:
corpus_raw = u""
for book_filename in book_filenames:
    with codecs.open(book_filename, "r", "utf-8") as book:
        corpus_raw += book.read()
print(len(corpus_raw))

9719485


In [19]:
tokenizer = nltk.data.load("tokenizers/punkt/english.pickle")

In [20]:
raw_sent = tokenizer.tokenize(corpus_raw)

In [21]:
def sent_to_word_list(corpus):
    cleaned_corpus = re.sub("[^a-zA-Z]", " ", corpus)
    words = cleaned_corpus.split(" ")
    return words

In [22]:
print(sent_to_word_list(raw_sent[1001]))

['Why', 'not', '', '', '', 'He', 'had', 'been', 'the', 'champion', 'in', 'her', 'father', 's', 'tourney', '', 'Sansa', 'remembered', '']


In [23]:
corpus = []
for sent in raw_sent:
    corpus.append(sent_to_word_list(sent))
print(len(corpus))

128868


In [24]:
num_tokens = 0
for sent in corpus:
    num_tokens += len(sent)
print(num_tokens)

2257390


In [25]:
#3 main tasks vectors help with
#Distance, Similarity and Ranking
num_feat = 300
min_word_count = 3 #not n-gram
num_processors = multiprocessing.cpu_count()
context_window = 7
downsampling = 1e-3
seed = 1 #for random number generator 

In [26]:
corpus2vec = w2v.Word2Vec(sg = 1, seed = seed, workers = num_processors, size = num_feat, min_count = min_word_count, window = context_window, sample = downsampling)

In [27]:
corpus2vec.build_vocab(corpus)
len(corpus2vec.wv.vocab)

17278

In [28]:
corpus2vec.train(corpus, total_examples=corpus2vec.corpus_count, epochs=corpus2vec.iter)

7411331

In [29]:
if not os.path.exists("trained"):
    os.makedirs("trained")

In [30]:
corpus2vec.save(os.path.join("trained", "corpus2vec.w2v"))
corpus2vec = w2v.Word2Vec.load(os.path.join("trained", "corpus2vec.w2v"))

In [31]:
tsne = sklearn.manifold.TSNE(n_components = 2, random_state = 0)

In [32]:
full_matrix = corpus2vec.wv.syn0

In [33]:
twoD_matrix = tsne.fit_transform(full_matrix)

In [34]:
#plot point in 2d space
points = pd.DataFrame(
    [
        (word, coords[0], coords[1])
        for word, coords in [
            (word, twoD_matrix[corpus2vec.wv.vocab[word].index])
            for word in corpus2vec.wv.vocab
        ]
    ],
    columns=["word", "x", "y"]
)

In [35]:
points.head(10)

Unnamed: 0,word,x,y
0,,3.2404,2.943407
1,Lommy,4.374411,3.070439
2,mushroom,-1.387197,0.258667
3,deepest,0.669777,0.212152
4,kingsroad,1.702896,-2.462437
5,Blackmont,-4.693298,0.53267
6,swelling,0.473838,-3.745332
7,hills,-2.767375,-1.177293
8,carcass,-1.775001,-4.888127
9,CLEFTJAW,1.165294,-3.912796


In [36]:
sns.set_context("poster")

In [37]:
points.plot.scatter("x", "y", s = 10, figsize = (20, 12))

<matplotlib.axes._subplots.AxesSubplot at 0x1fadf3ce080>

In [38]:
def plot_region(x_bounds, y_bounds):
    slice = points[
        (x_bounds[0] <= points.x) &
        (points.x <= x_bounds[1]) & 
        (y_bounds[0] <= points.y) &
        (points.y <= y_bounds[1])
    ]
    
    ax = slice.plot.scatter("x", "y", s=35, figsize=(10, 8))
    for i, point in slice.iterrows():
        ax.text(point.x + 0.005, point.y + 0.005, point.word, fontsize=11)

In [39]:
plot_region(x_bounds=(4.0, 4.2), y_bounds=(-0.5, -0.1))

In [43]:
print(corpus2vec.most_similar("Stark"))
#distance, similarity, and ranking
def nearest_similarity_cosmul(start1, end1, end2):
    similarities = corpus2vec.most_similar_cosmul(positive=[end2, start1], negative=[end1])
    start2 = similarities[0][0]
    print("{start1} is related to {end1}, as {start2} is related to {end2}".format(**locals()))
    return start2
nearest_similarity_cosmul("Stark", "Winterfell", "Riverrun")

[('Eddard', 0.7285761833190918), ('Winterfell', 0.6624001264572144), ('Lyanna', 0.6306412220001221), ('Robb', 0.6157351732254028), ('executed', 0.6148867607116699), ('beheaded', 0.6059194803237915), ('Benjen', 0.6058331727981567), ('direwolf', 0.5974401235580444), ('Arryn', 0.5928331613540649), ('Karstark', 0.5907615423202515)]
Stark is related to Winterfell, as Tully is related to Riverrun


'Tully'