In [1]:
# Import necessary libraries
import gensim
from gensim.utils import simple_preprocess

import os
import sys

import nltk
from nltk.tokenize import sent_tokenize

In [2]:
# Download the punkt tokenizer models, necessary for sentence tokenization
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [3]:
# Define the folder path containing text files
folder_path = "/content/data"
os.listdir(folder_path)

['001ssb.txt', '003ssb.txt', '004ssb.txt', '005ssb.txt', '002ssb.txt']

In [4]:
# Initialize an empty list to store the tokenized sentences
corpus = []

In [5]:
# Iterate over each file in the specified folder
for file_name in os.listdir(folder_path):

  # Open and read the file with latin-1 encoding
  with open(os.path.join(folder_path,file_name),"r",encoding="latin-1") as f:
    story = f.read()
    # Tokenize the story into sentences
    sent = sent_tokenize(story)
    # Process each sentence and append to the corpus list
    for each_sent in sent:
      corpus.append(simple_preprocess(each_sent))

In [6]:
# Print the processed corpus
print(corpus)

IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



In [7]:
# Initialize the Word2Vec model with specified parameters
model = gensim.models.Word2Vec(
    window = 7, # Context window size
    min_count = 2, # Ignores all words with total frequency lower than this
    workers = 8, # Number of worker threads to train the model
    vector_size=200, # Dimensionality of the word vectors
    epochs = 10 # Number of training epochs
)

In [8]:
# Build the vocabulary from the corpus
model.build_vocab(corpus)

In [9]:
# Train the Word2Vec model
model.train(corpus,total_examples = model.corpus_count, epochs = model.epochs)

(13141525, 17256380)

In [10]:
# Print the number of sentences in the corpus
model.corpus_count

145020

In [11]:
# Print the length of the corpus (number of sentences)
len(corpus)

145020

In [12]:
# Retrieve and print the word vector for the word 'throne'
model.wv["throne"]

array([ 1.0838825 ,  0.44056413, -0.41874883,  0.48939627, -2.5058315 ,
        0.06869879, -2.1402533 ,  1.5708557 ,  0.43150797, -0.48248255,
       -0.494238  ,  0.7415264 ,  0.12482452,  0.23526809, -0.59445155,
       -0.18206677, -0.05858791,  0.45093322,  1.1207201 ,  1.2146001 ,
       -0.39321497,  0.50783753,  0.01975966, -1.2631065 ,  0.31582224,
       -2.0704246 ,  0.42457506, -1.5545381 , -0.07198357, -0.43726268,
       -0.05650353, -0.41103035, -0.59951645,  1.103096  ,  2.388293  ,
        0.60482407,  0.99062324,  0.21562128, -0.7714702 , -0.18797886,
        0.5471999 ,  0.640209  ,  1.3643793 ,  1.4811015 ,  1.6434354 ,
        0.44364035, -0.46120715,  0.92413795, -1.2920166 , -0.6663538 ,
        0.42058116,  0.743507  , -0.16510111, -1.3036686 , -1.9961033 ,
        1.2525119 ,  2.1695375 ,  1.3706516 ,  0.06127944, -0.2947577 ,
        0.6877137 ,  0.45470208, -0.75404114, -0.9539877 , -0.20409752,
       -0.7925026 , -0.05935948,  0.22135358,  0.7138606 , -0.33

In [13]:
# Print the length of the word vector for 'throne'
len(model.wv["throne"])

200

In [14]:
# Calculate and print the similarity between 'arya' and 'sansa'
model.wv.similarity("arya","sansa")

0.8036768

In [15]:
# Calculate and print the similarity between 'jaime' and 'sansa'
model.wv.similarity("jaime","sansa")

0.48602727

In [16]:
# Calculate and print the similarity between 'dragon' and 'daenerys'
model.wv.similarity("dragon","daenerys")

0.52477515

In [17]:
# Print the most similar words to 'king'
model.wv.most_similar("king")

[('baratheon', 0.6260662078857422),
 ('realm', 0.5750989317893982),
 ('site', 0.5267443060874939),
 ('ninth', 0.518090009689331),
 ('royal', 0.5038226246833801),
 ('feinting', 0.46570757031440735),
 ('mng', 0.46245068311691284),
 ('usurper', 0.4613538682460785),
 ('tourney', 0.4612545967102051),
 ('conqueror', 0.4485936164855957)]

In [18]:
# Find and print the word that does not match in the list ['jaime', 'cersie', 'jon']
model.wv.doesnt_match(["jaime","cersie","jon"])



'jon'