# Thrones2Vec

## Imports

In [1]:
from __future__ import absolute_import, division, print_function

In [2]:
import codecs
import glob
import logging
import multiprocessing
import os
import pprint
import re

In [3]:
import nltk
import gensim.models.word2vec as w2v
import sklearn.manifold
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns

In [4]:
%pylab inline

Populating the interactive namespace from numpy and matplotlib


**Set up logging**

In [5]:
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

**Download NLTK tokenizer models (only the first time)**

In [6]:
nltk.download("punkt")
nltk.download("stopwords")

[nltk_data] Downloading package punkt to /home/sanju/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /home/sanju/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

## Prepare Corpus

**Load books from files**

In [7]:
book_filenames = sorted(glob.glob("data/*.txt"))

In [8]:
os.listdir('data')

['got2.txt', 'got4.txt', 'got5.txt', 'got1.txt', 'got3.txt']

In [9]:
print("Found books:")
book_filenames

Found books:


['data/got1.txt',
 'data/got2.txt',
 'data/got3.txt',
 'data/got4.txt',
 'data/got5.txt']

**Combine the books into one string**

In [10]:
# Read every book in order, reporting the running corpus size after each file,
# then concatenate all the text into a single unicode string.
book_texts = []
corpus_length = 0
for book_filename in book_filenames:
    print("Reading '{0}'...".format(book_filename))
    with codecs.open(book_filename, "r", "utf-8") as book_file:
        book_texts.append(book_file.read())
    corpus_length += len(book_texts[-1])
    print("Corpus is now {0} characters long".format(corpus_length))
    print()
corpus_raw = u"".join(book_texts)

Reading 'data/got1.txt'...
Corpus is now 1770659 characters long

Reading 'data/got2.txt'...
Corpus is now 4071041 characters long

Reading 'data/got3.txt'...
Corpus is now 6391405 characters long

Reading 'data/got4.txt'...
Corpus is now 8107945 characters long

Reading 'data/got5.txt'...
Corpus is now 9719485 characters long



**Split the corpus into sentences**

In [11]:
tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')

In [12]:
raw_sentences = tokenizer.tokenize(corpus_raw)

In [13]:
len(raw_sentences)

128868

In [14]:
type(tokenizer)

nltk.tokenize.punkt.PunktSentenceTokenizer

In [15]:
def sentence_to_wordlist(raw):
    """Split one raw sentence into a list of purely alphabetic words.

    Every character outside a-z / A-Z (digits, punctuation, hyphens,
    apostrophes, accented letters) is replaced by a space before the
    whitespace split, so "half-open" yields ['half', 'open'].

    Args:
        raw: the sentence text.

    Returns:
        The words in their original order and case; an empty list when the
        sentence contains no ASCII letters.
    """
    letters_only = re.compile("[^a-zA-Z]").sub(" ", raw)
    return letters_only.split()

In [16]:
# Tokenize every non-empty raw sentence into its list of words.
sentences = [
    sentence_to_wordlist(raw_sentence)
    for raw_sentence in raw_sentences
    if len(raw_sentence) > 0
]

In [17]:
print(raw_sentences[5])
print(sentence_to_wordlist(raw_sentences[5]))

Heraldic crest by Virginia Norey.
['Heraldic', 'crest', 'by', 'Virginia', 'Norey']


In [18]:
# Count the words of each tokenized sentence, then total them.
words_per_sentence = [len(tokens) for tokens in sentences]
token_count = sum(words_per_sentence)
print("The book corpus contains {0:,} tokens".format(token_count))

The book corpus contains 1,818,103 tokens


## Train Word2Vec

In [19]:
# Step 3 - build the model.
# Hyperparameters for Word2Vec; each value is passed to the Word2Vec
# constructor in the following cell.
# Once we have word vectors, they mainly help with three tasks:
# distance, similarity and ranking.

# Dimensionality of the resulting word vectors (passed as `size`).
# More dimensions are more computationally expensive to train.
num_features = 300
# Minimum word count threshold (passed as `min_count`).
min_word_count = 3

# Number of threads to run in parallel (passed as `workers`): one per CPU
# core reported by multiprocessing.
num_workers = multiprocessing.cpu_count()

# Context window length (passed as `window`).
context_size = 7

# Downsample setting for frequent words (passed as `sample`).
# NOTE(review): the gensim docs quote a useful range of (0, 1e-5), while the
# value used here is 1e-3 -- confirm this is intended.
downsampling = 1e-3

# Seed for the random number generator (passed as `seed`).
# NOTE(review): per the gensim docs a fixed seed alone does not make training
# deterministic when more than one worker thread is used -- confirm whether
# exact reproducibility is required.
seed = 1

In [20]:
# Create the Word2Vec model from the hyperparameters defined above. No
# sentences are passed here; the vocabulary is built and the model trained in
# the cells that follow.
# NOTE(review): gensim 4 renamed the `size` keyword to `vector_size` --
# confirm against the installed gensim version before upgrading.
thrones2vec = w2v.Word2Vec(
    sg=1,  # skip-gram training, per the gensim Word2Vec docs
    seed=seed,
    workers=num_workers,
    size=num_features,
    min_count=min_word_count,
    window=context_size,
    sample=downsampling
)

In [21]:
thrones2vec.build_vocab(sentences)

2017-11-19 00:04:40,522 : INFO : collecting all words and their counts
2017-11-19 00:04:40,524 : INFO : PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
2017-11-19 00:04:40,587 : INFO : PROGRESS: at sentence #10000, processed 140984 words, keeping 10280 word types
2017-11-19 00:04:40,646 : INFO : PROGRESS: at sentence #20000, processed 279730 words, keeping 13558 word types
2017-11-19 00:04:40,701 : INFO : PROGRESS: at sentence #30000, processed 420336 words, keeping 16598 word types
2017-11-19 00:04:40,766 : INFO : PROGRESS: at sentence #40000, processed 556581 words, keeping 18324 word types
2017-11-19 00:04:40,826 : INFO : PROGRESS: at sentence #50000, processed 686247 words, keeping 19714 word types
2017-11-19 00:04:40,877 : INFO : PROGRESS: at sentence #60000, processed 828497 words, keeping 21672 word types
2017-11-19 00:04:40,935 : INFO : PROGRESS: at sentence #70000, processed 973830 words, keeping 23093 word types
2017-11-19 00:04:40,994 : INFO : PROGRESS: at 

**Start training, this might take a minute or two...**

In [24]:
thrones2vec.train(sentences, total_examples = thrones2vec.corpus_count, epochs = thrones2vec.iter)

2017-11-19 00:06:04,447 : INFO : training model with 4 workers on 17277 vocabulary and 300 features, using sg=1 hs=0 sample=0.001 negative=5 window=7
2017-11-19 00:06:05,548 : INFO : PROGRESS: at 1.85% examples, 120598 words/s, in_qsize 7, out_qsize 0
2017-11-19 00:06:06,570 : INFO : PROGRESS: at 3.95% examples, 131516 words/s, in_qsize 8, out_qsize 0
2017-11-19 00:06:07,653 : INFO : PROGRESS: at 5.90% examples, 128164 words/s, in_qsize 7, out_qsize 0
2017-11-19 00:06:08,714 : INFO : PROGRESS: at 8.29% examples, 132602 words/s, in_qsize 7, out_qsize 0
2017-11-19 00:06:09,810 : INFO : PROGRESS: at 10.39% examples, 134293 words/s, in_qsize 7, out_qsize 0
2017-11-19 00:06:10,815 : INFO : PROGRESS: at 12.34% examples, 134999 words/s, in_qsize 7, out_qsize 0
2017-11-19 00:06:11,884 : INFO : PROGRESS: at 14.42% examples, 135235 words/s, in_qsize 7, out_qsize 0
2017-11-19 00:06:13,041 : INFO : PROGRESS: at 16.70% examples, 134115 words/s, in_qsize 6, out_qsize 1
2017-11-19 00:06:14,076 : INFO

7022569

**Save to file, can be useful later**

In [25]:
if not os.path.exists("trained"):
    os.makedirs("trained")

In [26]:
thrones2vec.save(os.path.join("trained", "thrones2vec.w2v"))

2017-11-19 00:07:00,933 : INFO : saving Word2Vec object under trained/thrones2vec.w2v, separately None
2017-11-19 00:07:00,935 : INFO : not storing attribute syn0norm
2017-11-19 00:07:00,937 : INFO : not storing attribute cum_table
2017-11-19 00:07:01,833 : INFO : saved trained/thrones2vec.w2v


### Loading the saved model

In [27]:
thrones2vec = w2v.Word2Vec.load(os.path.join("trained", "thrones2vec.w2v"))

2017-11-19 00:07:13,882 : INFO : loading Word2Vec object from trained/thrones2vec.w2v
2017-11-19 00:07:14,226 : INFO : loading wv recursively from trained/thrones2vec.w2v.wv.* with mmap=None
2017-11-19 00:07:14,227 : INFO : setting ignored attribute syn0norm to None
2017-11-19 00:07:14,228 : INFO : setting ignored attribute cum_table to None
2017-11-19 00:07:14,229 : INFO : loaded trained/thrones2vec.w2v


### Explore semantic similarities between book characters

**Words closest to the given word**

In [28]:
# Query the word vectors (`.wv`) directly; the model-level shortcut is deprecated in gensim.
thrones2vec.wv.most_similar("Bolton")

2017-11-19 00:07:16,822 : INFO : precomputing L2-norms of word weight vectors


[('Roose', 0.931117057800293),
 ('Ramsay', 0.7579537034034729),
 ('Wyman', 0.7575839757919312),
 ('Karstark', 0.7542030811309814),
 ('Rickard', 0.7527644038200378),
 ('Jonos', 0.7400764226913452),
 ('Walder', 0.7288681268692017),
 ('Blackwood', 0.7237808704376221),
 ('Hornwood', 0.7065834403038025),
 ('Arnolf', 0.6951899528503418)]

In [29]:
# Query the word vectors (`.wv`) directly; the model-level shortcut is deprecated in gensim.
thrones2vec.wv.most_similar("Arya")

[('Sansa', 0.7465738654136658),
 ('Brienne', 0.6997377872467041),
 ('Gendry', 0.6868095397949219),
 ('Alayne', 0.6788970828056335),
 ('Marillion', 0.6593056917190552),
 ('Meera', 0.6563273072242737),
 ('Denyo', 0.6470311880111694),
 ('Catelyn', 0.645492672920227),
 ('Squirrel', 0.6442588567733765),
 ('Horseface', 0.6431102156639099)]

In [95]:
# Query the word vectors (`.wv`) directly; the model-level shortcut is deprecated in gensim.
thrones2vec.wv.most_similar("Jaime")

[('Kevan', 0.7068551778793335),
 ('Lancel', 0.6879087686538696),
 ('Tyrion', 0.682525634765625),
 ('Cersei', 0.6738083362579346),
 ('Bronn', 0.6667170524597168),
 ('Brienne', 0.6663480997085571),
 ('Kingslayer', 0.6657470464706421),
 ('Joff', 0.6590257287025452),
 ('Loras', 0.6484676599502563),
 ('Peck', 0.6484065055847168)]

**Linear relationships between word pairs** 

In [33]:
def nearest_similarity_cosmul(start1, end1, end2):
    """Complete the analogy "start1 is to end1 as ? is to end2".

    Queries the trained word vectors with `most_similar_cosmul`, using `end2`
    and `start1` as positive terms and `end1` as the negative term, and takes
    the top-ranked word as the answer.

    Args:
        start1: first word of the known pair (e.g. "Nymeria").
        end1: second word of the known pair (e.g. "Arya").
        end2: word whose counterpart is wanted (e.g. "Rickon").

    Returns:
        The best-matching word (`start2`). The completed analogy sentence is
        also printed.
    """
    # Go through the word vectors (`.wv`) rather than the model object: the
    # model-level `most_similar_cosmul` shortcut is deprecated in gensim.
    similarities = thrones2vec.wv.most_similar_cosmul(
        positive=[end2, start1],
        negative=[end1]
    )
    start2 = similarities[0][0]
    # Explicit keywords instead of **locals(), so the message cannot silently
    # depend on whatever local names happen to exist.
    print("{start1} is related to {end1}, as {start2} is related to {end2}".format(
        start1=start1, end1=end1, start2=start2, end2=end2))
    return start2

## **Computes A - B + C**

### Capturing the Pet Relation

In [48]:
nearest_similarity_cosmul("Nymeria", "Arya", "Rickon")

Nymeria is related to Arya, as Shaggydog is related to Rickon


'Shaggydog'

### Capturing the sigils and houses relation

In [63]:
nearest_similarity_cosmul("lion", "Lannister", "Stark")

lion is related to Lannister, as direwolf is related to Stark


'direwolf'

### Capturing the Places and Houses relation

In [42]:
nearest_similarity_cosmul("Winterfell", "Stark", "Lannister")

Winterfell is related to Stark, as Rock is related to Lannister


'Rock'

### Capturing Mother and Son relation

In [105]:
nearest_similarity_cosmul("Cersei", "Joffrey", "Robb")

Cersei is related to Joffrey, as Catelyn is related to Robb


'Catelyn'

### Capturing father and daughter relation

In [83]:
nearest_similarity_cosmul("Sansa", "Eddard", "Tywin")

Sansa is related to Eddard, as Cersei is related to Tywin


'Cersei'

### Capturing Sister and Brother relation

In [92]:
nearest_similarity_cosmul("Sansa", "Rickon", "Tyrion")

Sansa is related to Rickon, as Cersei is related to Tyrion


'Cersei'

### Capturing Brother and Brother relation

In [94]:
nearest_similarity_cosmul("Bran", "Rickon", "Jaime")

Bran is related to Rickon, as Tyrion is related to Jaime


'Tyrion'

## Though some relations are captured, some are really weird.

In [122]:
nearest_similarity_cosmul("Jon", "Ramsay", "Dany")
## meaning Ramsay friendzoned Jon !!

Jon is related to Ramsay, as Jorah is related to Dany


'Jorah'