In [None]:
import gensim
import pandas as pd
import numpy as np
import os
import nltk
from nltk import sent_tokenize
from gensim.utils import simple_preprocess
from nltk.corpus import stopwords
# Fetch the NLTK resources this notebook relies on:
#   - 'punkt' / 'punkt_tab': sentence-tokenizer models used by sent_tokenize.
#     Recent NLTK releases (3.9+) load 'punkt_tab' instead of the pickled
#     'punkt' model, so both are downloaded to work on old and new versions.
#   - 'stopwords': word lists read by nltk.corpus.stopwords.
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('stopwords')


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [None]:
# NOTE(review): this repeats the 'punkt' download already issued in the first
# cell; it is harmless (NLTK reports the package as up-to-date) but redundant.
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [None]:
# Folder containing the raw book text files.
corpus_dir = '/content/game-of-thrones'

# `story` becomes a list of sentences, each sentence a list of lowercase tokens.
story = []

# os.listdir() returns entries in arbitrary order; sorting makes the sentence
# order (and therefore everything trained on it) repeatable across runs.
for filename in sorted(os.listdir(corpus_dir)):
    path = os.path.join(corpus_dir, filename)
    # Skip sub-directories (e.g. hidden checkpoint folders), which open()
    # cannot read and would abort the whole load.
    if not os.path.isfile(path):
        continue
    with open(path, encoding='latin-1') as f:
        corpus = f.read()
    # Tokenise after the file handle is closed: split the text into sentences,
    # then let simple_preprocess turn each sentence into a token list.
    story.extend(simple_preprocess(sent) for sent in sent_tokenize(corpus))


In [None]:
# English stopword list from NLTK (needs the 'stopwords' corpus downloaded in
# the first cell). Stored as a set because remove_stopwords() performs one
# membership test per token.
stop_words = set(stopwords.words('english'))

def remove_stopwords(texts, stopword_set=None):
    """Return a copy of ``texts`` with stopword tokens filtered out.

    Args:
        texts: Iterable of documents, each an iterable of string tokens.
        stopword_set: Optional container of tokens to drop. When omitted,
            the notebook-level ``stop_words`` set is used, which keeps the
            original one-argument call working unchanged.

    Returns:
        A new list with one token list per input document, in the same
        order. A document made up only of stopwords yields an empty list,
        so the number of documents never changes. The input is not modified.
    """
    if stopword_set is None:
        # Fall back to the global set defined alongside this function.
        stopword_set = stop_words
    return [[word for word in doc if word not in stopword_set] for doc in texts]

# Stopword-free version of the corpus that the model is trained on.
data = remove_stopwords(story)

# Sanity check: filtering drops tokens, not sentences, so the two counts
# printed below (story first, then data) are expected to be equal.
print(len(story), len(data), sep='\n')

145020
145020


In [None]:
# Create the Word2Vec model without a corpus: nothing is learned here, the
# vocabulary is built and the weights are trained in the following cells.
#   window=10   -> context window size passed to gensim
#   min_count=2 -> minimum frequency threshold passed to gensim
#                  (presumably drops tokens seen only once — confirm in docs)
model = gensim.models.Word2Vec(
    window=10,
    min_count=2
)

In [None]:
# Build the model's vocabulary from the stopword-filtered sentences; this has
# to run before train() in the next cell.
model.build_vocab(data)

In [None]:
# Train on the same sentences used for the vocabulary. The example count and
# number of epochs are read back from the model itself rather than hard-coded.
# The call returns a pair of counters, shown as the cell output.
model.train(data, total_examples=model.corpus_count, epochs=model.epochs)

(4396217, 4579390)

In [None]:
# Nearest neighbours of 'jaime' in the learned embedding space, returned as
# (word, similarity) pairs. The query is lowercase because the preprocessed
# tokens are lowercase.
model.wv.most_similar('jaime')

[('tyrion', 0.8472510576248169),
 ('cersei', 0.8014019131660461),
 ('kevan', 0.7467201948165894),
 ('hound', 0.6972604990005493),
 ('bronn', 0.6875075101852417),
 ('lancel', 0.6775570511817932),
 ('davos', 0.6748440861701965),
 ('pays', 0.6690428853034973),
 ('brienne', 0.6685553789138794),
 ('dwarf', 0.6606552600860596)]

In [None]:
# Ask the model which of the three names fits least well with the others.
model.wv.doesnt_match(['jaime', 'cersei', 'tyrion'])

'cersei'

In [None]:
# Show the embedding matrix with normalised rows (one row per vocabulary
# entry); the same matrix is fed to PCA further down.
model.wv.get_normed_vectors()

array([[-0.11062504,  0.05429151, -0.06310194, ..., -0.07929287,
        -0.01840436,  0.05514219],
       [-0.22617976,  0.08376346,  0.05367316, ..., -0.06716409,
        -0.09641515,  0.08596537],
       [-0.0609105 ,  0.07697473,  0.01392735, ..., -0.17078093,
         0.1214296 , -0.15063609],
       ...,
       [ 0.08726951,  0.07172834,  0.05069776, ..., -0.11678503,
         0.04626037, -0.00538488],
       [-0.04025996,  0.09059471,  0.12207762, ..., -0.1504996 ,
         0.01317632,  0.05398804],
       [ 0.03879052, -0.01281479,  0.09638458, ...,  0.02691613,
        -0.03982624,  0.04650163]], dtype=float32)

In [None]:
# Preview the start of the vocabulary instead of dumping every entry: the full
# list has thousands of tokens and floods the cell output. With gensim's
# default settings the list is ordered by descending corpus frequency, so this
# shows the most common tokens.
model.wv.index_to_key[:20]

['said',
 'lord',
 'would',
 'one',
 'ser',
 'could',
 'man',
 'king',
 'men',
 'back',
 'well',
 'like',
 'jon',
 'father',
 'old',
 'hand',
 'even',
 'tyrion',
 'never',
 'know',
 'see',
 'made',
 'eyes',
 'black',
 'told',
 'lady',
 'thought',
 'time',
 'long',
 'might',
 'us',
 'come',
 'still',
 'face',
 'head',
 'red',
 'way',
 'boy',
 'page',
 'must',
 'queen',
 'good',
 'two',
 'brother',
 'night',
 'little',
 'took',
 'came',
 'though',
 'say',
 'three',
 'away',
 'dead',
 'son',
 'blood',
 'take',
 'go',
 'half',
 'make',
 'arya',
 'saw',
 'day',
 'white',
 'jaime',
 'first',
 'look',
 'want',
 'much',
 'enough',
 'sword',
 'tell',
 'girl',
 'bran',
 'great',
 'looked',
 'left',
 'knew',
 'asked',
 'gave',
 'maester',
 'called',
 'wall',
 'every',
 'heard',
 'sansa',
 'let',
 'yet',
 'went',
 'turned',
 'dany',
 'need',
 'behind',
 'around',
 'woman',
 'another',
 'snow',
 'beneath',
 'across',
 'knight',
 'keep',
 'grace',
 'found',
 'gold',
 'last',
 'cersei',
 'castle',
 '

In [None]:
# First sentence before (story) and after (data) stopword removal, printed on
# separate lines for a side-by-side comparison.
print(story[:1], data[:1], sep='\n')

[['clash', 'of', 'kings', 'book', 'two', 'of', 'song', 'of', 'ice', 'and', 'fire', 'by', 'george', 'martin', 'prologue', 'the', 'comet', 'tail', 'spread', 'across', 'the', 'dawn', 'red', 'slash', 'that', 'bled', 'above', 'the', 'crags', 'of', 'dragonstone', 'like', 'wound', 'in', 'the', 'pink', 'and', 'purple', 'sky']]
[['clash', 'kings', 'book', 'two', 'song', 'ice', 'fire', 'george', 'martin', 'prologue', 'comet', 'tail', 'spread', 'across', 'dawn', 'red', 'slash', 'bled', 'crags', 'dragonstone', 'like', 'wound', 'pink', 'purple', 'sky']]


In [None]:
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt

In [None]:
# Project the normalised word vectors onto their first three principal
# components so they can be drawn in a 3-D scatter plot.
# random_state pins the randomness of scikit-learn's randomized/arpack SVD
# solvers (which the default solver selection may choose for large inputs),
# so re-running the cell yields the same coordinates.
pca = PCA(n_components=3, random_state=42)
result = pca.fit_transform(model.wv.get_normed_vectors())

In [None]:
# Expect (vocabulary size, 3): one 3-component point per word vector.
result.shape

(17310, 3)

In [None]:
# Wrap the PCA output in a DataFrame, naming the three components x, y and z
# so the plotting call can refer to them by column name.
result_df = pd.DataFrame({
    'x': result[:, 0],
    'y': result[:, 1],
    'z': result[:, 2],
})

In [None]:
import plotly.express as px

In [None]:
# Plot only the first rows so the figure stays readable; these correspond to
# the first entries of the model's vocabulary.
n_points = 100

# Rows of result_df follow the order of model.wv.index_to_key (the PCA input
# was the model's vector matrix), so the matching slice of the vocabulary
# labels each point with its word.
top_words = result_df.head(n_points).assign(
    word=model.wv.index_to_key[:n_points]
)

fig = px.scatter_3d(
    top_words,
    x='x',
    y='y',
    z='z',
    color=top_words.index,  # position in the vocabulary, as in the original plot
    hover_name='word',      # show the word when hovering over a point
    title='Word2Vec embeddings projected to 3 principal components',
)
fig.show()