In [1]:
import numpy as np
import pandas as pd

In [8]:
!pip install gensim

Collecting gensim
  Downloading gensim-4.1.2-cp38-cp38-win_amd64.whl (24.0 MB)
Collecting smart-open>=1.8.1
  Downloading smart_open-5.2.1-py3-none-any.whl (58 kB)
Installing collected packages: smart-open, gensim
Successfully installed gensim-4.1.2 smart-open-5.2.1


In [2]:
import gensim
import os

**Building the corpus of sentences**

In [4]:
from nltk import sent_tokenize, word_tokenize
from nltk.corpus import stopwords
from gensim.utils import simple_preprocess

story = []
stop_words = set(stopwords.words('english'))

for filename in os.listdir('data'):
    
    f = open(os.path.join('data',filename))
    corpus = f.read()
    raw_sent = sent_tokenize(corpus)
    for sent in raw_sent:
        filtered_vocab = [word for word in word_tokenize(sent) if word.lower() not in stop_words]
        filtered_sent = ' '.join(filtered_vocab)
        story.append(simple_preprocess(filtered_sent))  #creates a list of filtered_sentence elements
    

In [8]:
story[:20]

[['game',
  'thrones',
  'book',
  'one',
  'song',
  'ice',
  'fire',
  'george',
  'martin',
  'prologue',
  'start',
  'back',
  'gared',
  'urged',
  'woods',
  'began',
  'grow',
  'dark',
  'around'],
 ['wildlings', 'dead'],
 ['dead', 'frighten'],
 ['ser', 'waymar', 'royce', 'asked', 'hint', 'smile'],
 ['gared', 'rise', 'bait'],
 ['old', 'man', 'past', 'fifty', 'seen', 'lordlings', 'come', 'go'],
 ['dead', 'dead', 'said'],
 ['business', 'dead'],
 ['dead'],
 ['royce', 'asked', 'softly'],
 ['proof'],
 ['saw', 'gared', 'said'],
 ['says', 'dead', 'proof', 'enough'],
 ['known', 'would', 'drag', 'quarrel', 'sooner', 'later'],
 ['wished', 'later', 'rather', 'sooner'],
 ['mother', 'told', 'dead', 'men', 'sing', 'songs', 'put'],
 ['wet', 'nurse', 'said', 'thing', 'royce', 'replied'],
 ['never', 'believe', 'anything', 'hear', 'woman', 'tit'],
 ['things', 'learned', 'even', 'dead'],
 ['voice', 'echoed', 'loud', 'twilit', 'forest']]

In [10]:
#Vectorization into a 100d space
model = gensim.models.Word2Vec(
    window=10,      #10-gram window
    min_count=2     #sentences with at lest 2 words
)

**Extracting unique words**

In [14]:
model.build_vocab(story)

**Training**

In [15]:
model.train(story, total_examples=model.corpus_count, epochs=model.epochs)

(4434515, 4616340)

In [16]:
model.wv.most_similar('daenerys')

[('stormborn', 0.8911494016647339),
 ('unburnt', 0.8871327042579651),
 ('rhaella', 0.8391223549842834),
 ('viserys', 0.8156229257583618),
 ('targaryen', 0.8031334280967712),
 ('regent', 0.8017124533653259),
 ('myrcella', 0.7973771095275879),
 ('court', 0.795520544052124),
 ('consort', 0.7900921106338501),
 ('elia', 0.7897936105728149)]

In [11]:
model.wv.doesnt_match(['jon','rikon','robb','arya','sansa','bran'])

'jon'

In [18]:
model.wv['jon']

array([ 0.06087454,  0.22472289, -1.5428451 , -0.09798234,  0.34599316,
       -1.1967156 ,  1.0271225 , -1.2181706 ,  0.28867447,  0.6482783 ,
       -1.5667027 , -1.0073705 , -1.1044483 ,  0.08613513, -0.25862864,
       -0.0931436 ,  1.0841956 , -0.41967374,  0.36977082,  0.98510957,
       -0.47680908,  0.4307621 , -0.01803811, -0.56367236, -1.7331791 ,
       -1.4686623 , -0.27115566, -1.8720524 ,  0.5501677 ,  1.7635877 ,
        1.1755773 ,  0.09396222, -0.54074734,  0.66806984, -1.3330129 ,
       -1.000873  ,  0.30844724, -0.09754684,  0.1600971 , -0.1819655 ,
        1.21667   ,  1.7139126 ,  0.47280842,  0.4531746 ,  1.1852827 ,
       -1.33135   ,  1.5246102 ,  0.54882437, -0.7836798 , -0.68735236,
        0.7709872 , -0.11376972, -1.7854091 ,  0.6651417 , -0.35651174,
       -0.9539841 ,  1.6397679 ,  2.0584826 , -1.3922762 , -0.8055247 ,
        1.4299219 ,  0.44587424, -0.75625277, -1.2232834 ,  1.2494426 ,
       -2.0433164 ,  1.0256153 ,  1.6725074 ,  0.57744056,  0.66

In [14]:
model.wv.similarity('arya','sansa')

0.8534194

In [19]:
model.wv.similarity('winterfell','stark')

0.84129274

In [23]:
model.wv.similarity('lannister','rob')

0.33805794

In [88]:
model.wv.get_normed_vectors()   #shape-> (17418, 100)

array([[-0.11974421, -0.18864349,  0.20628448, ..., -0.11863696,
         0.00497017,  0.02195935],
       [-0.25077537, -0.03254947,  0.22424528, ..., -0.10946808,
         0.02671707,  0.08537447],
       [ 0.06232982, -0.02960947, -0.04660568, ..., -0.09887758,
         0.09963721, -0.08160619],
       ...,
       [-0.03013285,  0.04750119, -0.01497496, ...,  0.00407487,
         0.01391101,  0.05480921],
       [-0.00955141,  0.07727367,  0.11235627, ..., -0.03055063,
        -0.00149143, -0.13749485],
       [-0.04895716,  0.05263472,  0.12851088, ..., -0.05369092,
         0.09577035, -0.07121768]], dtype=float32)

In [24]:
y = model.wv.index_to_key

In [26]:
print(len(y))
y   #listing unique words

17418


['said',
 'lord',
 'would',
 'one',
 'ser',
 'could',
 'man',
 'king',
 'men',
 'back',
 'well',
 'like',
 'jon',
 'father',
 'old',
 'hand',
 'even',
 'tyrion',
 'never',
 'see',
 'know',
 'made',
 'eyes',
 'black',
 'told',
 'lady',
 'thought',
 'time',
 'long',
 'might',
 'us',
 'come',
 'still',
 'face',
 'head',
 'red',
 'way',
 'boy',
 'page',
 'must',
 'queen',
 'good',
 'two',
 'brother',
 'night',
 'little',
 'took',
 'came',
 'though',
 'say',
 'three',
 'away',
 'dead',
 'son',
 'blood',
 'take',
 'go',
 'half',
 'make',
 'arya',
 'saw',
 'day',
 'white',
 'jaime',
 'first',
 'look',
 'want',
 'much',
 'enough',
 'sword',
 'tell',
 'girl',
 'bran',
 'great',
 'looked',
 'll',
 'left',
 'knew',
 'asked',
 'gave',
 'maester',
 'called',
 'wall',
 'every',
 'heard',
 'sansa',
 'let',
 'yet',
 'went',
 'turned',
 'dany',
 'need',
 'behind',
 'around',
 'woman',
 'another',
 'snow',
 'beneath',
 'across',
 'knight',
 'keep',
 'grace',
 'found',
 'gold',
 'last',
 'cersei',
 'cast

**Scatter Plot for vector similarity** 

In [27]:
from sklearn.decomposition import PCA

In [28]:
pca = PCA(n_components=3)

In [29]:
X = pca.fit_transform(model.wv.get_normed_vectors())

In [30]:
X.shape

(17418, 3)

In [33]:
import plotly.express as px
fig = px.scatter_3d(X[:300],x=0,y=1,z=2, color=y[:300])
fig.show()