# Importing Libraries

In [57]:
import numpy as np
import pandas as pd
import gensim
import os
from nltk import sent_tokenize
from gensim.utils import simple_preprocess
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

# Function to remove stopwords

In [63]:
def remove_stopwords(text):
    tokens = word_tokenize(text.lower())
    english_stopwords = set(stopwords.words('english'))
    tokens_withour_stopwords = [t for t in tokens if t not in english_stopwords]
    filtered_text = ' '.join(tokens_withour_stopwords)
    return filtered_text

# Tokanizing the content of the file along with removing stopwords

In [64]:
story = []

for filename in os.listdir('data'):
    
    f = open(os.path.join('data', filename),'r',encoding='utf-8')
    corpus_raw = f.read()
    corpus_final = remove_stopwords(corpus_raw)
    raw_sent = sent_tokenize(corpus_final)
    for sent in raw_sent:
        story.append(simple_preprocess(sent))

In [65]:
len(story)

163103

In [66]:
story

[['edition',
  'contains',
  'complete',
  'text',
  'original',
  'hardcover',
  'edition'],
 ['one', 'word', 'omitted'],
 ['clash',
  'kings',
  'bantam',
  'spectra',
  'book',
  'publishing',
  'history',
  'bantam',
  'spectra',
  'hardcover',
  'edition',
  'published',
  'february',
  'bantam',
  'spectra',
  'paperback',
  'edition',
  'september',
  'spectra',
  'portrayal',
  'boxed',
  'trademarks',
  'bantam',
  'books',
  'division',
  'random',
  'house',
  'inc',
  'rights',
  'reserved'],
 ['copyright', 'george', 'martin'],
 ['maps', 'james', 'sinclair'],
 ['heraldic', 'crest', 'virginia', 'norey'],
 ['library',
  'congress',
  'catalog',
  'card',
  'number',
  'part',
  'book',
  'may',
  'reproduced',
  'transmitted',
  'form',
  'means',
  'electronic',
  'mechanical',
  'including',
  'photocopying',
  'recording',
  'information',
  'storage',
  'retrieval',
  'system',
  'without',
  'permission',
  'writing',
  'publisher'],
 ['visit',
  'website',
  'www',
  'b

# Model Generation

In [67]:
model = gensim.models.Word2Vec(
    window=10,
    min_count=2,
    workers=8,
)

In [68]:
model.build_vocab(story)

In [69]:
model.train(story, total_examples=model.corpus_count, epochs=model.epochs)

(4441586, 4621335)

# Model Testing

In [70]:
model.wv.most_similar('jon')

[('surname', 0.7319146394729614),
 ('sam', 0.6791117191314697),
 ('qhorin', 0.6589221954345703),
 ('grenn', 0.649080753326416),
 ('ghost', 0.6475927233695984),
 ('dolorous', 0.6471182703971863),
 ('theon', 0.6432061195373535),
 ('ygritte', 0.6369132399559021),
 ('edd', 0.6347445249557495),
 ('asha', 0.6287349462509155)]

In [71]:
model.wv.doesnt_match(['jon', 'rikon', 'robb', 'arya', 'sansa', 'bran'])

'jon'

In [72]:
model.wv.doesnt_match(['cersei', 'jaime', 'bronn', 'tyrion'])

'bronn'

In [73]:
model.wv['jon']

array([ 1.2325127 ,  1.9372822 ,  0.22342661, -0.3756587 ,  1.8213208 ,
       -0.9114719 , -0.16468963,  2.0526843 ,  0.27278563,  0.24097817,
        0.93436235, -1.067016  , -0.09273927,  0.21175374,  1.3054348 ,
        0.27105612,  0.21076109, -0.88997555, -1.8251039 , -1.7673988 ,
        0.22768411, -0.40976453, -0.24395129, -1.4708114 , -1.5821983 ,
       -2.5363455 , -0.09525846,  0.15870453,  0.64542675, -0.60069484,
       -1.0614601 , -2.6947753 ,  1.1666776 ,  0.42161208,  0.10234049,
        0.11625234, -0.4028122 , -0.98606646, -0.7959067 , -0.8056132 ,
        0.15731828, -0.47523162, -0.7846116 ,  1.3641535 ,  0.26233393,
       -0.25001985,  1.8542655 , -0.41020182, -0.07323302,  2.1042426 ,
        1.1265914 , -0.9679709 ,  0.47266087,  1.8382362 , -0.99616325,
        1.2450552 , -0.16445419, -0.24796426, -0.3373431 , -0.5740928 ,
       -0.5170271 ,  0.85053676, -0.29279426, -0.8072891 , -0.95520633,
        1.909959  ,  1.2459009 , -0.51089644,  0.1917265 ,  0.93

In [74]:
model.wv['king'].shape

(100,)

In [75]:
model.wv.similarity('arya', 'sansa')

0.7996057

In [76]:
model.wv.similarity('cersei', 'sansa')

0.74655277

In [77]:
model.wv.similarity('tywin', 'sansa')

0.35562402

In [78]:
model.wv.get_normed_vectors()

array([[ 0.01373875,  0.10173237, -0.04203181, ...,  0.02210657,
        -0.0126978 ,  0.05009589],
       [-0.10500804,  0.12222076,  0.04466277, ..., -0.01072783,
        -0.21978587,  0.01839383],
       [-0.1744469 ,  0.09951742,  0.01357904, ..., -0.14727664,
         0.16898805, -0.14879382],
       ...,
       [-0.07491867,  0.15795586,  0.01631927, ..., -0.10990151,
         0.01453382,  0.03492467],
       [-0.14169778,  0.09457946,  0.06852347, ..., -0.18298796,
        -0.01368346,  0.02725869],
       [-0.26830375,  0.1962293 ,  0.03897475, ..., -0.04099725,
        -0.07019613,  0.05533511]], dtype=float32)

In [79]:
model.wv.get_normed_vectors().shape

(17338, 100)

In [80]:
y = model.wv.index_to_key

In [81]:
y

['said',
 'lord',
 'would',
 'one',
 'ser',
 'could',
 'man',
 'king',
 'men',
 'back',
 'well',
 'jon',
 'like',
 'father',
 'old',
 'hand',
 'tyrion',
 'even',
 'never',
 'see',
 'know',
 'made',
 'eyes',
 'black',
 'told',
 'lady',
 'thought',
 'time',
 'long',
 'might',
 'us',
 'come',
 'still',
 'face',
 'head',
 'red',
 'way',
 'boy',
 'must',
 'queen',
 'good',
 'two',
 'brother',
 'night',
 'little',
 'took',
 'came',
 'though',
 'say',
 'three',
 'away',
 'son',
 'dead',
 'blood',
 'half',
 'take',
 'go',
 'arya',
 'make',
 'saw',
 'jaime',
 'day',
 'white',
 'first',
 'look',
 'want',
 'enough',
 'much',
 'bran',
 'girl',
 'sword',
 'tell',
 'great',
 'looked',
 'called',
 'left',
 'knew',
 'maester',
 'asked',
 'gave',
 'sansa',
 'wall',
 'every',
 'heard',
 'let',
 'yet',
 'went',
 'turned',
 'dany',
 'need',
 'behind',
 'around',
 'woman',
 'another',
 'beneath',
 'snow',
 'across',
 'knight',
 'keep',
 'gold',
 'grace',
 'found',
 'cersei',
 'last',
 'castle',
 'stark',
 

# Reducing Dimensions of the vector for Visualization

In [82]:
from sklearn.decomposition import PCA

In [83]:
pca = PCA(n_components=3)

In [84]:
X = pca.fit_transform(model.wv.get_normed_vectors())

In [85]:
X[:5]

array([[-0.43545353,  0.00755186, -0.24982703],
       [-0.3384117 ,  0.5741135 , -0.39475682],
       [-0.47613046,  0.02344516,  0.3356842 ],
       [ 0.06884101,  0.162747  ,  0.03372049],
       [-0.23347433,  0.5064767 , -0.6255392 ]], dtype=float32)

In [86]:
X.shape

(17338, 3)

# Visualizing the vectors

In [87]:
import plotly.express as px
fig = px.scatter_3d(X[:500], x=0,y=1,z=2, color=y[:500])
fig.show()



