# Custom training for WordtoVector

Data
https://www.kaggle.com/datasets/khulasasndh/game-of-thrones-books

## Static word embedding

In [3]:
import pandas as pd
import numpy as np
import os
import gensim
import nltk

In [14]:
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [1]:
from nltk import sent_tokenize
from gensim.utils import simple_preprocess

In [5]:
for file in os.listdir("/content/data"):
  print(file)

005ssb.txt
004ssb.txt
001ssb.txt
002ssb.txt
003ssb.txt


In [18]:
f=open("/content/data/001ssb.txt",encoding='unicode_escape')

In [19]:
corpus=f.read()

In [20]:
# Convert the data into tokens

In [22]:
raw_sentences=sent_tokenize(corpus)

In [24]:
processed_sentences = []
for sentence in raw_sentences:
    processed_sentences.append(simple_preprocess(sentence))

In [None]:
processed_sentences

In [30]:
preprocessed_sentences = []
for filename in os.listdir("/content/data"):
  file_path=os.path.join("/content/data/",filename)
  with open(file_path,encoding='unicode_escape') as f:
    corpus = f.read()
  raw_sentences=sent_tokenize(corpus)
  for sentence in raw_sentences:
    preprocessed_sentences.append(simple_preprocess(sentence))

In [33]:
preprocessed_sentences[0]

['george',
 'martin',
 'dance',
 'with',
 'dragons',
 'book',
 'five',
 'of',
 'song',
 'of',
 'ice',
 'and',
 'fire',
 'dedication',
 'this',
 'one',
 'is',
 'for',
 'my',
 'fans',
 'for',
 'lodey',
 'trebla',
 'stego',
 'pod',
 'caress',
 'yags',
 'ray',
 'and',
 'mr',
 'kate',
 'chataya',
 'mormont',
 'mich',
 'jamie',
 'vanessa',
 'ro',
 'for',
 'stubby',
 'louise',
 'agravaine',
 'wert',
 'malt',
 'jo',
 'mouse',
 'telisiane',
 'blackfyre',
 'bronn',
 'stone',
 'coyote',
 'daughter',
 'and',
 'the',
 'rest',
 'of',
 'the',
 'madmen',
 'and',
 'wild',
 'women',
 'of',
 'the',
 'brotherhood',
 'without',
 'banners',
 'for',
 'my',
 'website',
 'wizards',
 'elio',
 'and',
 'linda',
 'lords',
 'of',
 'westeros',
 'winter',
 'and',
 'fabio',
 'of',
 'wic',
 'and',
 'gibbs',
 'of',
 'dragonstone',
 'who',
 'started',
 'it',
 'all',
 'for',
 'men',
 'and',
 'women',
 'of',
 'asshai',
 'in',
 'spain',
 'who',
 'sang',
 'to',
 'us',
 'of',
 'bear',
 'and',
 'maiden',
 'fair',
 'and',
 'the

In [32]:
len(preprocessed_sentences)

145020

In [42]:
model = gensim.models.Word2Vec(window=10,min_count=5,vector_size=150)

In [47]:
model.build_vocab(preprocessed_sentences)

In [48]:
model.epochs

5

In [49]:
model.corpus_total_words

1725638

In [50]:
model.min_count

5

In [51]:
model.train(preprocessed_sentences,total_examples=model.corpus_count,epochs=model.epochs)

(6484701, 8628190)

In [52]:
model.wv.most_similar("king")

[('prince', 0.6651734709739685),
 ('baratheon', 0.6596537828445435),
 ('realm', 0.6583049297332764),
 ('throne', 0.6287585496902466),
 ('victory', 0.5916072726249695),
 ('usurper', 0.5774223804473877),
 ('battle', 0.5767452716827393),
 ('council', 0.5684552788734436),
 ('targaryen', 0.5618156790733337),
 ('traitor', 0.5586431622505188)]

In [None]:
model.wv["king"]

In [53]:
model.wv.most_similar("daenerys")

[('stormborn', 0.8289720416069031),
 ('queen', 0.7312886714935303),
 ('targaryen', 0.7119380831718445),
 ('unburnt', 0.7085464596748352),
 ('princess', 0.705463171005249),
 ('myrcella', 0.6774429082870483),
 ('elia', 0.6578168869018555),
 ('margaery', 0.6295678615570068),
 ('viserys', 0.6247549057006836),
 ('khaleesi', 0.624172031879425)]

In [55]:
model.wv.most_similar("dragon")

[('valyria', 0.5933955907821655),
 ('star', 0.5817232728004456),
 ('fortress', 0.578070342540741),
 ('dragons', 0.5649496912956238),
 ('crone', 0.5619562268257141),
 ('comet', 0.5593140125274658),
 ('westeros', 0.5534182786941528),
 ('astapor', 0.5520880222320557),
 ('dynasty', 0.5505040287971497),
 ('veins', 0.5479201674461365)]

In [58]:
model.wv.most_similar("dog")

[('rat', 0.6972994208335876),
 ('rorge', 0.6596587896347046),
 ('duck', 0.6496390700340271),
 ('pig', 0.6433452367782593),
 ('boot', 0.6300278902053833),
 ('shagga', 0.6114288568496704),
 ('whip', 0.5999821424484253),
 ('beast', 0.5972452163696289),
 ('snap', 0.5923657417297363),
 ('knife', 0.5863524079322815)]

In [60]:
model.wv.doesnt_match(['jon','rikon'])



'jon'

In [63]:
model.wv.get_normed_vectors().shape

(11760, 150)

In [None]:
model.wv.index_to_key

In [66]:
len(model.wv.index_to_key)

11760

In [67]:
model.wv.index_to_key[0]

'the'

In [68]:
model.save("/content/data/word2vec.model")

In [73]:
model.save("word2vec.bin")

In [74]:
!gzip word2vec.bin > word2vec.bin.gz

gzip: word2vec.bin.gz already exists; do you wish to overwrite (y or n)? y


In [75]:
vector=model.wv.get_normed_vectors()

In [80]:
y=model.wv.index_to_key

In [78]:
import numpy as np
from sklearn.decomposition import PCA
pca = PCA(n_components=3)
X=pca.fit_transform(vector)

In [79]:
X.shape

(11760, 3)

In [81]:
import plotly.express as px
fig = px.scatter_3d(X[200:250],x=0,y=1,z=2, color=y[200:250])
fig.show()