In [15]:
from spacy.lang.en.stop_words import STOP_WORDS
import gensim, re
import numpy as np
from scipy.spatial.distance import cosine

In [4]:
w2v = gensim.models.KeyedVectors.load_word2vec_format("bin\\GoogleNews-vectors-negative300-SLIM.bin", binary=True)

In [1]:
def clean(text):
    text = text.lower()
    text = text.replace("\n", " ")
    text = re.sub(r"[\W\d]", " ", text)
    text = re.sub(r"\s+", " ", text)
    for word in STOP_WORDS:
        text = text.replace(" " + word + " ", " ")
    return text

In [95]:
def getAverageVector(text, w2v=w2v):
    words = clean(text).split()
    vectors = []
    for word in words:
        try:
            vectors.append(w2v[word])
        except:
            continue
    return np.mean(vectors, axis=0)

In [63]:
v1 = getAverageVector("Nicole was a nursing aide at a nearby nursing home and a student.")

In [68]:
v2 = getAverageVector("She worked full time at the home, but still managed do well in school. Halfway through the semester, her car broke down on the way to school. It became clear she wouldn't have another way to get around.")

In [69]:
v3 = getAverageVector("For a semester she had been studying for a CMA in English. One day she dreamed of writing an article about CMA in her language. Nicole spent a few weeks studying the subject.")

In [70]:
1 - cosine(v1, v2)

0.6113440990447998

In [71]:
1 - cosine(v1, v3)

0.45953622460365295

## GOT Word2Vec

In [72]:
from __future__ import absolute_import, division, print_function

In [73]:
import codecs
import glob
import logging
import multiprocessing
import os
import pprint
import re

In [74]:

import nltk
import gensim.models.word2vec as w2v
import sklearn.manifold
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns

In [77]:
book_filenames = sorted(glob.glob("data/got/*.txt"))

In [79]:
corpus_raw = u""
for book_filename in book_filenames:
    print("Reading '{0}'...".format(book_filename))
    with codecs.open(book_filename, "r", "utf-8") as book_file:
        corpus_raw += book_file.read()
    print("Corpus is now {0} characters long".format(len(corpus_raw)))
    print()

Reading 'data/got\got1.txt'...
Corpus is now 1588987 characters long

Reading 'data/got\got2.txt'...
Corpus is now 3325343 characters long

Reading 'data/got\got3.txt'...
Corpus is now 5575001 characters long

Reading 'data/got\got4.txt'...
Corpus is now 7287349 characters long

Reading 'data/got\got5.txt'...
Corpus is now 9542994 characters long



In [80]:
tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')

In [81]:
raw_sentences = tokenizer.tokenize(corpus_raw)

In [82]:
#convert into a list of words
#rtemove unnnecessary,, split into words, no hyphens
#list of words
def sentence_to_wordlist(raw):
    clean = re.sub("[^a-zA-Z]"," ", raw)
    words = clean.split()
    return words

In [83]:
#sentence where each word is tokenized
sentences = []
for raw_sentence in raw_sentences:
    if len(raw_sentence) > 0:
        sentences.append(sentence_to_wordlist(raw_sentence))

In [84]:

print(raw_sentences[5])
print(sentence_to_wordlist(raw_sentences[5]))

“We have no business with the dead.”

“Are they dead?” Royce asked softly.
['We', 'have', 'no', 'business', 'with', 'the', 'dead', 'Are', 'they', 'dead', 'Royce', 'asked', 'softly']


In [85]:
token_count = sum([len(sentence) for sentence in sentences])
print("The book corpus contains {0:,} tokens".format(token_count))

The book corpus contains 1,789,627 tokens


In [86]:
#ONCE we have vectors
#step 3 - build model
#3 main tasks that vectors help with
#DISTANCE, SIMILARITY, RANKING

# Dimensionality of the resulting word vectors.
#more dimensions, more computationally expensive to train
#but also more accurate
#more dimensions = more generalized
num_features = 300
# Minimum word count threshold.
min_word_count = 3

# Number of threads to run in parallel.
#more workers, faster we train
num_workers = multiprocessing.cpu_count()

# Context window length.
context_size = 7

# Downsample setting for frequent words.
#0 - 1e-5 is good for this
downsampling = 1e-3

# Seed for the RNG, to make the results reproducible.
#random number generator
#deterministic, good for debugging
seed = 1

In [93]:
thrones2vec = w2v.Word2Vec(sentences, size=num_features, window=5)

In [94]:
if not os.path.exists("trained"):
    os.makedirs("trained")
thrones2vec.save(os.path.join("trained", "thrones2vec.w2v"))

In [111]:
v1 = getAverageVector("When Daenerys had taken the city, they had broken through that same gate with the huge battering ram called Joso’s Cock, made from the mast of a ship. The Great Masters and their slave soldiers had met the attackers here, and the fighting had raged through the surrounding streets for hours. By the time the city finally fell, hundreds of dead and dying had littered the square. Now once again the market was a scene of carnage, though these dead came riding the pale mare.", w2v=thrones2vec)

  


In [112]:
v2 = getAverageVector("Power does not always sit here, no longer, Barristan the Bold, Lord of the Trident as well as the Small Hall? Perhaps our King will ask Joffrey to undo some of these misadventures, and for that treason. Loras Tyrell had been a well rounded man, if true, as fierce as his brother himself, but he was well worth noting for his sisters. His sisters had been one sister never once learned how to govern or a second not so dearly loved.", w2v=thrones2vec)

  


In [113]:
1 - cosine(v1, v2)

0.42041879892349243