## Word2Vec

In [None]:
import gensim
import os
import kagglehub
from nltk.tokenize import word_tokenize, sent_tokenize
import nltk
from sklearn.decomposition import PCA
import plotly.express as px

In [None]:
# Fetch the NLTK 'punkt_tab' resource before any tokenizing is done.
# Presumably this is the data sent_tokenize needs when the corpus is built
# further down — confirm against the installed NLTK version.
nltk.download('punkt_tab')

In [None]:
!pip install --upgrade gensim --user



In [None]:
from gensim.utils import simple_preprocess
from gensim.models import Word2Vec

In [None]:
# Fetch the Game of Thrones books dataset through kagglehub (no version is
# pinned in the handle) and keep the local directory it returns; the corpus
# cell below reads every book file from `path`.
DATASET_HANDLE = "khulasasndh/game-of-thrones-books"
path = kagglehub.dataset_download(DATASET_HANDLE)

print("Path to dataset files:", path)

Downloading from https://www.kaggle.com/api/v1/datasets/download/khulasasndh/game-of-thrones-books?dataset_version_number=1...


100%|██████████| 3.71M/3.71M [00:00<00:00, 107MB/s]

Extracting files...
Path to dataset files: /root/.cache/kagglehub/datasets/khulasasndh/game-of-thrones-books/versions/1





In [None]:
# Build the training corpus: `story` ends up as a list of sentences, each
# sentence being the list of tokens produced by simple_preprocess.
story = []
for filename in os.listdir(path):
    file_path = os.path.join(path, filename)
    # Only regular files are books. Directories such as Jupyter's
    # '.ipynb_checkpoints' are skipped because open() would raise on them.
    if not os.path.isfile(file_path):
        continue
    # NOTE(review): latin-1 decodes any byte sequence without raising, so a
    # wrong encoding would surface as odd characters rather than an error.
    with open(file_path, encoding='latin-1') as f:
        corpus = f.read()
    # Tokenize after the `with` block so the file handle is released first.
    raw_sent = sent_tokenize(corpus)
    for line in raw_sent:
        # simple_preprocess lowercases and filters tokens; the sample output
        # shows single-letter words such as 'a' and 'I' are dropped.
        story.append(simple_preprocess(line))

In [None]:
# Peek at the first few tokenized sentences only; displaying the whole list
# would dump every sentence of every book into the notebook output.
story[:5]

[['version',
  'history',
  'reedited',
  'by',
  'maelstrom',
  'feast',
  'for',
  'crows',
  'book',
  'four',
  'song',
  'of',
  'ice',
  'and',
  'fire',
  'george'],
 ['martin', 'prologue', 'dragons', 'said', 'mollander'],
 ['he',
  'snatched',
  'withered',
  'apple',
  'off',
  'the',
  'ground',
  'and',
  'tossed',
  'it',
  'hand',
  'to',
  'hand'],
 ['throw', 'the', 'apple', 'urged', 'alleras', 'the', 'sphinx'],
 ['he',
  'slipped',
  'an',
  'arrow',
  'from',
  'his',
  'quiver',
  'and',
  'nocked',
  'it',
  'to',
  'his',
  'bowstring'],
 ['should',
  'like',
  'to',
  'see',
  'dragon',
  'roone',
  'was',
  'the',
  'youngest',
  'of',
  'them',
  'chunky',
  'boy',
  'still',
  'two',
  'years',
  'shy',
  'of',
  'manhood'],
 ['should',
  'like',
  'that',
  'very',
  'much',
  'and',
  'should',
  'like',
  'to',
  'sleep',
  'with',
  'rosey',
  'arms',
  'around',
  'me',
  'pate',
  'thought'],
 ['he', 'shifted', 'restlessly', 'on', 'the', 'bench'],
 ['by', '

In [None]:
# Create the (still untrained) Word2Vec model.
#   window=10   -> maximum distance between a target word and a context word
#   min_count=2 -> words seen fewer than twice are ignored
# Everything else stays at the library default; the vector size shows up as
# 100 in the shape check further down.
model = Word2Vec(window=10, min_count=2)

In [None]:
# Scan the tokenized sentences to build the model's vocabulary. This runs
# before train(), which reads model.corpus_count in the next cell.
model.build_vocab(story)

In [None]:
# Train on the same sentences used for the vocabulary. The example count and
# the number of epochs are both read back from the model itself, so nothing
# is hard-coded here. The call's return value (a pair of word counts) is
# displayed as the cell output.
model.train(story, total_examples=model.corpus_count, epochs=model.epochs)

(6569893, 8628190)

In [None]:
# Sanity check: the words whose vectors lie closest to 'jaime', returned as
# (word, similarity) pairs with the most similar first.
model.wv.most_similar('jaime')

[('tyrion', 0.8344854116439819),
 ('cersei', 0.7249394059181213),
 ('davos', 0.7080197930335999),
 ('brienne', 0.6993142366409302),
 ('kevan', 0.6727877259254456),
 ('ned', 0.6588168144226074),
 ('littlefinger', 0.6570857763290405),
 ('lancel', 0.6560123562812805),
 ('hotah', 0.636487603187561),
 ('joff', 0.6354568600654602)]

In [None]:
# Same nearest-neighbour lookup for 'daenerys'; the output surfaces her
# titles and house name alongside other royal characters.
model.wv.most_similar('daenerys')

[('targaryen', 0.7952351570129395),
 ('stormborn', 0.7914765477180481),
 ('queen', 0.758385956287384),
 ('myrcella', 0.7543718218803406),
 ('princess', 0.7293612360954285),
 ('margaery', 0.7139602303504944),
 ('elia', 0.7079334855079651),
 ('unburnt', 0.6859560608863831),
 ('dorne', 0.6625966429710388),
 ('aegon', 0.6552737355232239)]

In [None]:
# Similarity score between the vectors of two specific words; a single
# float is displayed.
model.wv.similarity('jaime', 'daenerys')

0.2759738

In [None]:
# Shape of one word's embedding: a 1-D vector whose length is the model's
# vector size ((100,) in the recorded output).
model.wv['deep'].shape

(100,)

In [None]:
# All word vectors as one 2-D array, normalised by the library; this is the
# matrix that PCA is fitted on below.
model.wv.get_normed_vectors()

array([[-4.2444933e-02, -5.8927905e-02, -5.6220372e-03, ...,
        -7.8752525e-02, -6.2503457e-02,  2.3943419e-02],
       [-1.3338433e-01, -2.2538904e-04,  1.7275307e-02, ...,
        -1.3406304e-02, -1.4161007e-01,  1.3964309e-01],
       [ 1.7813732e-01,  5.8481032e-03, -1.6089272e-01, ...,
        -1.6274465e-02,  1.3912563e-01, -9.5553622e-02],
       ...,
       [ 1.5181997e-02,  1.7535523e-01, -8.2103899e-03, ...,
        -6.9120556e-02, -6.3295126e-02, -2.3587103e-01],
       [-2.0941748e-01,  1.1585047e-01,  1.9259498e-01, ...,
        -1.2213653e-01,  6.2789358e-02,  3.7421089e-02],
       [-1.8236063e-01,  6.8711936e-02, -1.8290852e-01, ...,
        -7.6315582e-02, -1.0802627e-01,  2.9982409e-02]], dtype=float32)

### Applying PCA

In [None]:
# Keep three principal components so the embeddings can be drawn in the 3-D
# scatter plot at the end of the notebook.
pca = PCA(n_components=3)

In [None]:
# Project the normalised word vectors onto the three principal components;
# X has one row per vocabulary word and three columns.
normed_vectors = model.wv.get_normed_vectors()
X = pca.fit_transform(normed_vectors)

In [None]:
# Display the projected coordinates (numpy abbreviates the array repr).
X

array([[ 0.15633245,  0.58601135, -0.04911613],
       [ 0.16110326,  0.3416151 ,  0.03580821],
       [-0.30453014,  0.582225  ,  0.22109275],
       ...,
       [-0.26821637, -0.03675854,  0.07695605],
       [ 0.3542716 ,  0.14455372,  0.09101881],
       [ 0.23989727,  0.1108413 , -0.04011407]], dtype=float32)

In [None]:
# (number of vocabulary words, 3): one 3-D point per word.
X.shape

(17453, 3)

In [None]:
# Vocabulary words in the model's index order, used to label the points in
# the plot. Presumably y[i] names row X[i] — confirm against gensim's
# KeyedVectors ordering.
y = model.wv.index_to_key

In [None]:
# Plot a 100-word window of the vocabulary in PCA space. The same slice is
# applied to the coordinates and to the labels so that points and words stay
# aligned. Colouring by word gives every point its own legend entry, which
# doubles as its label.
# NOTE(review): the 200:300 window presumably skips the most frequent words
# — confirm the intent.
word_window = slice(200, 300)
fig = px.scatter_3d(X[word_window], x=0, y=1, z=2, color=y[word_window])
fig.show()