In [1]:
import numpy as np
import pandas as pd

In [2]:
!pip install gensim

Defaulting to user installation because normal site-packages is not writeable


In [7]:
import gensim
import os

In [8]:
import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\sneha\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt.zip.


True

In [9]:
from nltk import sent_tokenize
from gensim.utils import simple_preprocess

story = []
for filename in os.listdir('data'):
# total 5 files, loop will run 5 times

    f = open(os.path.join('data',filename))
    corpus = f.read()                # saving book content in corpus
    raw_sent = sent_tokenize(corpus) # using nltk library, break down into sentence, all sentences are separate now.
    for sent in raw_sent:
        story.append(simple_preprocess(sent)) # each sentence is preprocess, 

# all the basic level preprocessing is done using simple_preprocess 
# preprocess sentence is store in story list

In [10]:
len(story) # this number of sentences are present in story list.

141218

In [11]:
story 
# all sentences present in 5 books are separated and also tokenized into words

[['game',
  'of',
  'thrones',
  'book',
  'one',
  'of',
  'song',
  'of',
  'ice',
  'and',
  'fire',
  'by',
  'george',
  'martin',
  'prologue',
  'we',
  'should',
  'start',
  'back',
  'gared',
  'urged',
  'as',
  'the',
  'woods',
  'began',
  'to',
  'grow',
  'dark',
  'around',
  'them'],
 ['the', 'wildlings', 'are', 'dead'],
 ['do', 'the', 'dead', 'frighten', 'you'],
 ['ser',
  'waymar',
  'royce',
  'asked',
  'with',
  'just',
  'the',
  'hint',
  'of',
  'smile'],
 ['gared', 'did', 'not', 'rise', 'to', 'the', 'bait'],
 ['he',
  'was',
  'an',
  'old',
  'man',
  'past',
  'fifty',
  'and',
  'he',
  'had',
  'seen',
  'the',
  'lordlings',
  'come',
  'and',
  'go'],
 ['dead', 'is', 'dead', 'he', 'said'],
 ['we', 'have', 'no', 'business', 'with', 'the', 'dead'],
 ['are', 'they', 'dead'],
 ['royce', 'asked', 'softly'],
 ['what', 'proof', 'have', 'we'],
 ['will', 'saw', 'them', 'gared', 'said'],
 ['if',
  'he',
  'says',
  'they',
  'are',
  'dead',
  'that',
  'proof',


In [12]:
# build model using class word2vec of gensim

model = gensim.models.Word2Vec(window=10, min_count=2)

# window=10 : when you predict a center word, it'll have 10-10 words on each side of center word(sliding)
# min_count=2 : we will only consider those sentences which atleast have 2 words in it
# more parameters : shift+tab

In [13]:
# building vocab. Corpus has many words. we will select unique words from corpus

model.build_vocab(story)

In [14]:
# train deep learning model
# total_examples : to find how much sentences are there in your corpus
# epochs : during training for how many times your model will iterate over your data
# by default model.epochs value is 5

model.train(story, total_examples=model.corpus_count, epochs=model.epochs)

(6569070, 8628190)

In [15]:
# model is ready. Access using model.wv.
# all fuctions you can access. first I use most_similar function.
# here it shows most similar words to daenerys

model.wv.most_similar('daenerys')

[('stormborn', 0.8144954442977905),
 ('targaryen', 0.7614099979400635),
 ('unburnt', 0.7271187901496887),
 ('myrcella', 0.693175733089447),
 ('queen', 0.686672031879425),
 ('princess', 0.681389331817627),
 ('elia', 0.6789562702178955),
 ('viserys', 0.6634717583656311),
 ('aegon', 0.6633847951889038),
 ('dorne', 0.661825954914093)]

In [16]:
model.wv.doesnt_match(['jon','rikon','robb','arya','sansa','bran'])

'jon'

In [17]:
model.wv.doesnt_match(['cersei', 'jaime', 'bronn', 'tyrion'])

'bronn'

In [18]:
# To find vector representation of particular word

model.wv['king']

array([ 2.022627  , -0.03708749,  1.145997  ,  2.1004078 ,  0.84360707,
       -0.92127836,  0.16641675, -0.01347464, -2.518865  , -0.6986408 ,
       -2.77678   , -1.4911073 , -0.39890406,  1.7825683 , -2.3638637 ,
       -1.1565276 ,  0.11370137,  3.5972626 ,  0.36474398, -1.1245729 ,
        1.5514662 , -1.0234716 ,  2.6543298 , -4.0621905 , -1.3367987 ,
        0.9916325 , -1.3360562 , -1.2725242 , -0.2821383 ,  0.786778  ,
       -1.2658575 ,  0.412625  , -0.23377796, -0.5059438 ,  2.5729864 ,
       -3.0197918 , -2.8407254 , -1.1023012 ,  0.08244011, -1.9786863 ,
        0.01910241,  3.2192888 ,  3.7488947 ,  0.20269243, -1.3476076 ,
       -2.5791118 , -1.6357596 , -2.8446178 ,  3.0902612 , -3.5562282 ,
       -2.49992   , -1.8137956 , -0.9144871 , -4.183671  ,  1.4912323 ,
       -1.519677  , -0.9494918 ,  0.6004158 ,  0.50459677,  0.5012629 ,
        0.9846995 ,  0.6455312 ,  0.3701213 ,  1.9983258 ,  1.870055  ,
        2.2927952 , -0.87019694,  0.13080724,  0.34864926, -1.52

In [19]:
model.wv['jon']

array([-0.26247275, -0.46608543,  0.8524512 , -0.8167858 ,  0.48249748,
       -0.68307287,  1.4590575 ,  0.07224257, -1.4874347 ,  0.8249291 ,
       -1.7628851 ,  0.32333663, -0.36647925, -2.768301  ,  1.1550682 ,
       -0.76398015,  0.46253008,  3.8293843 ,  0.9246001 , -1.1513034 ,
        2.0343883 , -2.470776  ,  0.2232914 ,  1.2069292 ,  0.48083198,
       -1.3290635 ,  1.3355424 ,  0.23238714,  0.9089875 ,  1.091144  ,
       -1.3345406 , -0.7733807 ,  0.5208633 , -0.9631541 ,  0.21517256,
        0.30935213,  1.2438072 , -0.33726856,  0.77777785, -1.2862674 ,
       -0.8699673 , -0.24611042,  0.4077397 ,  2.1415527 ,  0.85346156,
       -2.7655485 , -0.6891992 ,  1.7164605 , -1.3621253 ,  0.50444233,
        0.2894098 , -0.11982711,  1.364425  , -0.6415811 , -0.08748776,
        0.15416932, -1.0478072 , -1.5345087 ,  0.25921282, -0.10215252,
        0.7576959 ,  2.4126766 ,  0.6636537 , -0.47829044,  1.3369703 ,
       -0.2886209 ,  0.7930967 ,  2.95161   , -0.33884725, -0.08

In [20]:
model.wv['king'].shape
# we set it for 100 dimension

(100,)

In [21]:
model.wv.similarity('arya','sansa')
# they are sisters, hence close to 1.

0.8549458

In [22]:
model.wv.similarity('cersei','sansa')

0.7532697

In [23]:
model.wv.similarity('tywin','sansa')

0.27307904

In [24]:
model.wv.get_normed_vectors()

# vector representation of all words

array([[-0.1044388 ,  0.00257331,  0.06193685, ..., -0.09552976,
        -0.00624166,  0.15371986],
       [-0.18213543, -0.09231878,  0.0806964 , ...,  0.03754071,
         0.0501276 ,  0.07547738],
       [ 0.1463084 , -0.04639117, -0.16165575, ..., -0.07639765,
         0.09656918, -0.10635211],
       ...,
       [ 0.05390099,  0.09762055, -0.01368601, ..., -0.12176894,
         0.1313503 , -0.173039  ],
       [-0.0466268 ,  0.10975143,  0.11980242, ..., -0.00377262,
         0.15533133, -0.11057276],
       [-0.04904807,  0.07648379,  0.11504773, ...,  0.02561006,
         0.04642635, -0.06773908]], dtype=float32)

In [25]:
model.wv.get_normed_vectors().shape

# total words in 100 dimension

(17453, 100)

In [27]:
y = model.wv.index_to_key
y
# I have not removed stop words

['the',
 'and',
 'to',
 'of',
 'he',
 'his',
 'was',
 'you',
 'her',
 'in',
 'it',
 'had',
 'that',
 'she',
 'as',
 'with',
 'him',
 'not',
 'but',
 'for',
 'they',
 'is',
 'at',
 'on',
 'said',
 'my',
 'have',
 'be',
 'lord',
 'them',
 'no',
 'from',
 'would',
 'were',
 'me',
 'your',
 'one',
 'all',
 'when',
 'will',
 'ser',
 'if',
 'so',
 'their',
 'we',
 'could',
 'are',
 'man',
 'there',
 'this',
 'up',
 'been',
 'what',
 'did',
 'by',
 'king',
 'do',
 'men',
 'back',
 'out',
 'more',
 'or',
 'who',
 'down',
 'well',
 'than',
 'only',
 'like',
 'jon',
 'some',
 'father',
 'old',
 'hand',
 'even',
 'too',
 'tyrion',
 'before',
 'never',
 'an',
 'off',
 'see',
 'know',
 'into',
 'made',
 'now',
 'eyes',
 'black',
 'told',
 'lady',
 'thought',
 'time',
 'then',
 'how',
 'long',
 'has',
 'can',
 'might',
 'us',
 'come',
 'where',
 'here',
 'through',
 'still',
 'face',
 'head',
 'red',
 'll',
 'way',
 'boy',
 'page',
 'must',
 'once',
 'queen',
 'good',
 'two',
 'brother',
 'night',
 

In [28]:
from sklearn.decomposition import PCA

In [29]:
pca = PCA(n_components=3) # i want in 3-D

In [30]:
X = pca.fit_transform(model.wv.get_normed_vectors()) # pass 2-D array

In [32]:
X[:5]

array([[-0.14933008,  0.60817575, -0.08322281],
       [-0.1609564 ,  0.34718442,  0.03195129],
       [ 0.28784013,  0.5877403 ,  0.1828868 ],
       [-0.01476305,  0.35351676, -0.13024682],
       [ 0.10175917,  0.55430514,  0.24889937]], dtype=float32)

In [33]:
X.shape  # dimension changed from 300 to 3

(17453, 3)

In [35]:
# Plot 1st frequent 100 words bcoz machine will not plot all 17453 words 
import plotly.express as px
fig = px.scatter_3d(X[200:300],x=0,y=1,z=2, color=y[200:300])
fig.show()

In [None]:
# you can see similar word are closer to each other. It will be interesting after removing stopwords.