In [3]:
# First, install the Kaggle library
!pip install kaggle

!mkdir -p ~/.kaggle
!mv kaggle.json ~/.kaggle/

# Set permissions
!chmod 600 ~/.kaggle/kaggle.json

# Now, download the dataset
!kaggle datasets download -d khulasasndh/game-of-thrones-books


Downloading game-of-thrones-books.zip to /content
  0% 0.00/3.71M [00:00<?, ?B/s]
100% 3.71M/3.71M [00:00<00:00, 175MB/s]


In [4]:
!unzip "/content/game-of-thrones-books.zip"

Archive:  /content/game-of-thrones-books.zip
  inflating: 001ssb.txt              
  inflating: 002ssb.txt              
  inflating: 003ssb.txt              
  inflating: 004ssb.txt              
  inflating: 005ssb.txt              


In [5]:
import numpy as np
import pandas as pd

In [6]:
!pip install gensim



In [7]:
import gensim
import os

In [23]:
import nltk
nltk.download('punkt')


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [13]:
import os
from nltk import sent_tokenize
from gensim.utils import simple_preprocess

story = []
for filename in os.listdir('data'):
    with open(os.path.join('data', filename), 'r', encoding='latin1') as f:
        corpus = f.read()
        raw_sent = sent_tokenize(corpus)
        for sent in raw_sent:
            story.append(simple_preprocess(sent))


In [16]:
len(story)

145020

In [19]:
story[3]

['ser',
 'waymar',
 'royce',
 'asked',
 'with',
 'just',
 'the',
 'hint',
 'of',
 'smile']

In [24]:
import nltk
nltk.download('stopwords')


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [27]:
import os
from nltk import sent_tokenize
from nltk.corpus import stopwords
from gensim.utils import simple_preprocess

# Load English stopwords
stop_words = set(stopwords.words('english'))

story = []
for filename in os.listdir('data'):
    with open(os.path.join('data', filename), 'rb') as f:  # Open in binary mode
        for encoding in ['utf-8', 'latin1', 'iso-8859-1', 'cp1252']:
            try:
                corpus = f.read().decode(encoding)
                raw_sent = sent_tokenize(corpus)
                for sent in raw_sent:
                    # Tokenize the sentence and remove stopwords
                    words = [word for word in simple_preprocess(sent) if word not in stop_words]
                    story.append(words)
                break  # Stop trying encodings once successful
            except UnicodeDecodeError:
                continue  # Try next encoding if decoding fails

# Process the rest of the code after successfully reading the file


In [28]:
story[3]

['ser', 'waymar', 'royce', 'asked', 'hint', 'smile']

In [29]:
model = gensim.models.Word2Vec(
    window=10,
    min_count=2
)

In [30]:
model.build_vocab(story)


In [31]:
model.train(story, total_examples=model.corpus_count, epochs=model.epochs)

(2543853, 2670660)

In [32]:
model.wv.most_similar('daenerys')


[('tyrek', 0.9839968085289001),
 ('sold', 0.9839838147163391),
 ('also', 0.9839157462120056),
 ('champion', 0.9839004874229431),
 ('attend', 0.9821378588676453),
 ('alliance', 0.9814395904541016),
 ('cousins', 0.9812595248222351),
 ('melee', 0.9809667468070984),
 ('debt', 0.9809210896492004),
 ('beggar', 0.9807302355766296)]

In [33]:
model.wv.doesnt_match(['jon','rikon','robb','arya','sansa','bran'])




'robb'

In [34]:
model.wv.doesnt_match(['cersei', 'jaime', 'bronn', 'tyrion'])


'cersei'

In [35]:
model.wv['king']


array([ 0.25602776,  0.26547092,  1.6206598 ,  1.6704789 , -0.2995112 ,
       -0.45968097, -0.4338229 ,  0.89212537, -0.7924397 , -1.1784204 ,
       -0.22979261,  0.26268005, -0.07696856,  0.24034145,  0.6662791 ,
        0.8730665 ,  0.41238302,  0.4826117 ,  0.04244886, -1.5235845 ,
        0.20945565, -0.09189375,  0.6761286 , -0.64334553, -0.68832016,
        1.5446944 , -1.1940342 ,  0.7550828 , -2.1699486 ,  0.6442482 ,
       -1.3578988 , -0.05010698,  0.4763776 , -0.4448845 , -0.27127004,
        0.1263027 , -2.0573678 , -1.8750234 , -0.90239096,  0.61964613,
        1.1696852 , -1.1973239 , -2.4275825 ,  1.537763  ,  1.2700374 ,
       -0.34342498, -2.0785484 , -0.67705274,  2.167857  ,  0.7949753 ,
        0.6295307 , -1.0405912 ,  0.71849114, -1.2974807 ,  0.24648994,
        0.61162096, -0.414091  ,  1.2053113 ,  0.15740009,  1.9063225 ,
       -0.03064514, -0.3285439 , -1.6778435 ,  1.1275381 , -0.26477534,
        0.4783327 , -1.214553  ,  2.5409386 , -0.70457375,  0.12

In [36]:
model.wv.similarity('arya','sansa')


0.82924247

In [37]:
model.wv.similarity('cersei','sansa')


0.68437564

In [38]:
model.wv.similarity('tywin','sansa')


0.351167

In [39]:
y = model.wv.index_to_key


In [40]:
y

['said',
 'lord',
 'would',
 'ser',
 'one',
 'could',
 'man',
 'king',
 'back',
 'jon',
 'men',
 'well',
 'like',
 'page',
 'tyrion',
 'father',
 'hand',
 'see',
 'even',
 'never',
 'know',
 'old',
 'told',
 'eyes',
 'black',
 'made',
 'thought',
 'lady',
 'arya',
 'long',
 'time',
 'come',
 'brother',
 'face',
 'head',
 'bran',
 'boy',
 'sansa',
 'might',
 'still',
 'us',
 'way',
 'red',
 'must',
 'took',
 'night',
 'good',
 'came',
 'away',
 'catelyn',
 'say',
 'two',
 'saw',
 'go',
 'robb',
 'little',
 'son',
 'looked',
 'take',
 'dead',
 'sword',
 'look',
 'yet',
 'make',
 'great',
 'maester',
 'ned',
 'tell',
 'blood',
 'three',
 'stark',
 'dany',
 'though',
 'around',
 'want',
 'first',
 'gave',
 'knew',
 'heard',
 'day',
 'enough',
 'left',
 'asked',
 'half',
 'wall',
 'much',
 'went',
 'white',
 'jaime',
 'turned',
 'behind',
 'let',
 'robert',
 'think',
 'lannister',
 'gods',
 'keep',
 'horse',
 'every',
 'queen',
 'across',
 'castle',
 'joffrey',
 'girl',
 'called',
 'found',

In [41]:
from sklearn.decomposition import PCA


In [42]:
pca = PCA(n_components=3)


In [43]:
X = pca.fit_transform(model.wv.get_normed_vectors())


In [44]:
X.shape


(13632, 3)

In [45]:
import plotly.express as px
fig = px.scatter_3d(X[200:300],x=0,y=1,z=2, color=y[200:300])
fig.show()