In [1]:
import numpy as np
import pandas as pd
import gensim
import os
from nltk import sent_tokenize
from gensim.utils import simple_preprocess
import plotly.express as px

In [2]:
story = []
for filename in os.listdir("Datasets/Game Of Thrones books"): 
    f = open(os.path.join("Datasets/Game Of Thrones books",filename))
    corpus = f.read()
    raw_sent = sent_tokenize(corpus)
    for sent in raw_sent:
        story.append(simple_preprocess(sent))
    

In [3]:
story

[['game',
  'of',
  'thrones',
  'book',
  'one',
  'of',
  'song',
  'of',
  'ice',
  'and',
  'fire',
  'by',
  'george',
  'martin',
  'prologue',
  'we',
  'should',
  'start',
  'back',
  'gared',
  'urged',
  'as',
  'the',
  'woods',
  'began',
  'to',
  'grow',
  'dark',
  'around',
  'them'],
 ['the', 'wildlings', 'are', 'dead'],
 ['do', 'the', 'dead', 'frighten', 'you'],
 ['ser',
  'waymar',
  'royce',
  'asked',
  'with',
  'just',
  'the',
  'hint',
  'of',
  'smile'],
 ['gared', 'did', 'not', 'rise', 'to', 'the', 'bait'],
 ['he',
  'was',
  'an',
  'old',
  'man',
  'past',
  'fifty',
  'and',
  'he',
  'had',
  'seen',
  'the',
  'lordlings',
  'come',
  'and',
  'go'],
 ['dead', 'is', 'dead', 'he', 'said'],
 ['we', 'have', 'no', 'business', 'with', 'the', 'dead'],
 ['are', 'they', 'dead'],
 ['royce', 'asked', 'softly'],
 ['what', 'proof', 'have', 'we'],
 ['will', 'saw', 'them', 'gared', 'said'],
 ['if',
  'he',
  'says',
  'they',
  'are',
  'dead',
  'that',
  'proof',


In [4]:
model = gensim.models.Word2Vec(
    window=10,
    min_count=2
)

In [5]:
model.build_vocab(story)

In [6]:
model.train(story, total_examples=model.corpus_count, epochs=model.epochs)

(6569082, 8628190)

In [7]:
model.wv.most_similar('daenerys')

[('stormborn', 0.7998933792114258),
 ('princess', 0.7744178771972656),
 ('targaryen', 0.7708641290664673),
 ('myrcella', 0.7568118572235107),
 ('viserys', 0.7109680771827698),
 ('unburnt', 0.7051169276237488),
 ('queen', 0.6854814887046814),
 ('elia', 0.6821612119674683),
 ('aegon', 0.6748898029327393),
 ('margaery', 0.6689891219139099)]

In [8]:
model.wv.doesnt_match(['jon','rikon','robb','arya','sansa','bran'])

'jon'

In [9]:
model.wv.doesnt_match(['cersei', 'jaime', 'bronn', 'tyrion'])

'bronn'

In [10]:
model.wv['king']

array([ 0.75957435, -0.5176172 ,  1.138561  ,  3.0363498 , -1.2611032 ,
       -1.0307747 ,  0.53009844,  1.2129436 , -2.6335092 , -0.9850311 ,
        0.27925426, -0.36904165,  0.62802863,  2.1681445 , -2.0196147 ,
       -3.1661859 , -0.20905295,  1.9171811 , -0.2865097 , -0.6362083 ,
        3.252213  , -0.42798415,  1.9832706 , -3.6155283 , -0.77009076,
        1.7763425 , -1.5853542 , -0.96920973,  0.42264602,  1.632325  ,
       -2.5222635 , -0.1789489 , -0.21275647, -0.542991  ,  3.2288945 ,
       -4.6560555 , -2.7495098 , -0.564501  , -0.44251123, -2.001969  ,
       -0.26024747,  2.1508741 ,  2.6113162 , -0.34503993, -0.48200548,
       -2.2723498 ,  0.54728   , -2.829907  ,  2.712541  , -2.842463  ,
       -2.3553054 , -0.3957262 , -2.4367614 , -3.522782  ,  2.0972955 ,
       -2.2791965 ,  1.8315951 ,  1.2174991 , -0.23502612,  1.6134427 ,
        2.0582633 ,  1.0629565 ,  0.05517822,  0.70292634,  1.3846177 ,
        2.5392306 , -1.9156349 , -1.1403139 ,  0.68454796, -1.98

In [11]:
model.wv.similarity('arya','sansa')

0.8523856

In [12]:
model.wv.similarity('cersei','sansa')

0.7226273

In [13]:
model.wv.similarity('tywin','sansa')

0.26372063

In [14]:
model.wv.get_normed_vectors()

array([[-0.18284383, -0.18147424,  0.04577145, ..., -0.07307348,
         0.06457549,  0.16062191],
       [-0.13115463, -0.19165511,  0.09406695, ..., -0.14812641,
        -0.03274959,  0.04024003],
       [ 0.09557062, -0.02111869, -0.09383608, ..., -0.03525607,
         0.08897435, -0.13980857],
       ...,
       [ 0.02405878,  0.01423461,  0.01277425, ..., -0.07216775,
         0.10923573, -0.0902752 ],
       [-0.0244996 ,  0.11821623,  0.18029748, ...,  0.01117416,
         0.06126912, -0.11111845],
       [-0.05363638,  0.05611462,  0.1011913 , ..., -0.00834552,
         0.08851409, -0.06706765]], dtype=float32)

In [15]:
y = model.wv.index_to_key

In [16]:
y

['the',
 'and',
 'to',
 'of',
 'he',
 'his',
 'was',
 'you',
 'her',
 'in',
 'it',
 'had',
 'that',
 'she',
 'as',
 'with',
 'him',
 'not',
 'but',
 'for',
 'they',
 'is',
 'at',
 'on',
 'said',
 'my',
 'have',
 'be',
 'lord',
 'them',
 'no',
 'from',
 'would',
 'were',
 'me',
 'your',
 'one',
 'all',
 'when',
 'will',
 'ser',
 'if',
 'so',
 'their',
 'we',
 'could',
 'are',
 'man',
 'there',
 'this',
 'up',
 'been',
 'what',
 'did',
 'by',
 'king',
 'do',
 'men',
 'back',
 'out',
 'more',
 'or',
 'who',
 'down',
 'well',
 'than',
 'only',
 'like',
 'jon',
 'some',
 'father',
 'old',
 'hand',
 'even',
 'too',
 'tyrion',
 'before',
 'never',
 'an',
 'off',
 'see',
 'know',
 'into',
 'made',
 'now',
 'eyes',
 'black',
 'told',
 'lady',
 'thought',
 'time',
 'then',
 'how',
 'long',
 'has',
 'can',
 'might',
 'us',
 'come',
 'where',
 'here',
 'through',
 'still',
 'face',
 'head',
 'red',
 'll',
 'way',
 'boy',
 'page',
 'must',
 'once',
 'queen',
 'good',
 'two',
 'brother',
 'night',
 

In [17]:
from sklearn.decomposition import PCA

In [18]:
pca = PCA(n_components=3)

In [19]:
X = pca.fit_transform(model.wv.get_normed_vectors())

In [20]:
X.shape

(17453, 3)

In [21]:
fig = px.scatter_3d(X[:100],x=0,y=1,z=2, color=y[:100])
fig.show()