In [1]:
import numpy as np
import pandas as pd

In [3]:
# !pip install gensim

In [4]:
# !pip install numpy==1.23.5  


In [5]:
# !pip uninstall numpy -y
# !pip install --upgrade numpy


In [6]:
# !pip install numpy==1.21.0  

In [7]:
# !pip uninstall setuptools -y

In [8]:
import numpy
print(numpy.__version__)  # Should show 1.23.5 or higher
from numpy import rec     # Should work without errors

1.26.4


In [2]:
import gensim
import os

In [10]:
# Import required libraries
from nltk import sent_tokenize       # For splitting text into sentences
from gensim.utils import simple_preprocess  # For basic text preprocessing
import os                           # For file system operations

# Initialize an empty list to store all processed sentences
story = []

# Define the path to the folder containing text files
folder_path = r'D:\CODING\PYTHON\NLP\3.CBOW and Skip-gram\game of thorons_text file'

# Loop through each file in the specified folder
for filename in os.listdir(folder_path):
    # Create full file path by joining folder path and filename
    filepath = os.path.join(folder_path, filename)
    
    # Try to read the file with UTF-8 encoding first (common for modern text files)
    try:
        with open(filepath, encoding='utf-8') as f:  # 'with' ensures proper file handling
            corpus = f.read()  # Read entire file content into a string
    
    # If UTF-8 fails (common with some Windows files), try Windows-1252 encoding
    except UnicodeDecodeError:
        with open(filepath, encoding='cp1252') as f:  # cp1252 is common Windows encoding
            corpus = f.read()
    
    # Split the text into sentences using NLTK's sentence tokenizer
    raw_sent = sent_tokenize(corpus)
    
    # Process each sentence
    for sent in raw_sent:
        # Apply simple preprocessing and add to our story list
        story.append(simple_preprocess(sent))
        # simple_preprocess does:
        # 1. Converts to lowercase
        # 2. Tokenizes (splits into words)
        # 3. Removes punctuation
        # 4. Removes very short/long tokens

In [11]:
story

[['game',
  'of',
  'thrones',
  'book',
  'one',
  'of',
  'song',
  'of',
  'ice',
  'and',
  'fire',
  'by',
  'george',
  'martin',
  'prologue',
  'we',
  'should',
  'start',
  'back',
  'gared',
  'urged',
  'as',
  'the',
  'woods',
  'began',
  'to',
  'grow',
  'dark',
  'around',
  'them'],
 ['the', 'wildlings', 'are', 'dead'],
 ['do', 'the', 'dead', 'frighten', 'you'],
 ['ser',
  'waymar',
  'royce',
  'asked',
  'with',
  'just',
  'the',
  'hint',
  'of',
  'smile'],
 ['gared', 'did', 'not', 'rise', 'to', 'the', 'bait'],
 ['he',
  'was',
  'an',
  'old',
  'man',
  'past',
  'fifty',
  'and',
  'he',
  'had',
  'seen',
  'the',
  'lordlings',
  'come',
  'and',
  'go'],
 ['dead', 'is', 'dead', 'he', 'said'],
 ['we', 'have', 'no', 'business', 'with', 'the', 'dead'],
 ['are', 'they', 'dead'],
 ['royce', 'asked', 'softly'],
 ['what', 'proof', 'have', 'we'],
 ['will', 'saw', 'them', 'gared', 'said'],
 ['if',
  'he',
  'says',
  'they',
  'are',
  'dead',
  'that',
  'proof',


In [15]:
# Creating the Word2Vec model
# The Word2Vec model is initialized with specific parameters:
# - window: The maximum distance between the current and predicted word within a sentence.
# - min_count: Ignores all words with total frequency lower than this.
# - workers: The number of worker threads to train the model (default is 3).
# - sg: If 1, the skip-gram model is used; if 0
# (default), CBOW is used.
# - vector_size: The dimensionality of the word vectors.
# window=10 means the model will consider a context window of 10 words around the target word.
# min_count=2 means that words that appear less than 2 times in the corpus will be ignored.
# workers=3 means that the model will use 3 threads for training, which
# can speed up the training process on multi-core machines.
# sg=1 means that the skip-gram model will be used, which is generally better
# for larger datasets and captures more semantic relationships between words.
# vector_size=100 means that the word vectors will have 100 dimensions, which is a

model = gensim.models.Word2Vec(
    window=10,
    min_count=2
)

In [16]:
# Building the vocabulary for the Word2Vec model
# The build_vocab method takes the list of sentences (story) and creates a vocabulary
# from the words in those sentences.
# This step is crucial as it prepares the model to learn word representations based on the context provided by the sentences.

model.build_vocab(story)

In [17]:


model.train(story, total_examples=model.corpus_count, epochs=model.epochs)

(6570674, 8628190)

In [18]:
# You can also perform vector arithmetic with the model
# For example, to find a word that is similar to 'king' in the context of


model.wv.most_similar('daenerys')

[('stormborn', 0.8104311227798462),
 ('unburnt', 0.7868256568908691),
 ('targaryen', 0.7594308853149414),
 ('queen', 0.7203812003135681),
 ('myrcella', 0.7130403518676758),
 ('princess', 0.7012914419174194),
 ('viserys', 0.6835329532623291),
 ('elia', 0.6820120215415955),
 ('margaery', 0.6606749296188354),
 ('joffrey', 0.6552296876907349)]

In [19]:
# You can also perform vector arithmetic with other words
# For example, to find a word that is similar to 'king' in the context of


model.wv.doesnt_match(['jon','rikon','robb','arya','sansa','bran'])

'jon'

In [20]:
model.wv.doesnt_match(['cersei', 'jaime', 'bronn', 'tyrion'])

'bronn'

In [21]:
# To get the vector representation of a specific word, you can use the following code:
# This will return the vector representation of the word 'king' in the model's vocabulary.
# 100 dimensional vector representation of the word 'king'

model.wv['king']

array([ 1.4489496 ,  0.8382674 ,  2.1292367 ,  0.52274156, -0.78328   ,
        0.85966194,  1.0747448 ,  0.97482985, -2.720247  ,  0.87120074,
       -2.2958708 , -1.2577258 , -0.45530212,  0.9764137 , -2.6230154 ,
       -2.4480906 ,  1.1474416 ,  2.085778  ,  0.8564607 ,  1.6173768 ,
        1.7561256 , -1.0151191 ,  1.3975065 , -4.16732   , -1.8679423 ,
        1.0470644 , -2.3382227 , -1.1197519 , -0.2670139 ,  0.64288616,
       -2.446803  ,  0.89465034,  1.3002933 , -1.3660592 ,  2.2353814 ,
       -4.2278433 , -3.588104  , -0.11431959,  0.8705074 , -2.7083113 ,
        0.03225828,  2.3652055 ,  1.6967033 , -0.531214  , -0.95086944,
       -0.75490314,  1.9409578 , -1.6352273 ,  2.043879  , -1.2215078 ,
       -3.4837582 , -1.7981517 , -1.8183593 , -2.9497056 ,  2.809812  ,
       -3.0828042 ,  0.65063876,  1.3232509 , -0.66221684,  1.0126193 ,
        1.1279368 ,  2.3267028 , -1.3957466 , -1.6746467 ,  0.7512887 ,
        2.2388637 , -1.0482602 , -0.31630906, -0.0724933 , -1.58

In [22]:
model.wv.similarity('arya','sansa')

0.84637785

In [23]:
model.wv.similarity('cersei','sansa')

0.73215216

In [24]:
model.wv.similarity('tywin','sansa')

0.24099329

In [25]:
# To get the normalized vectors for all words in the vocabulary, you can use the following code:
# This will return a dictionary where keys are words and values are their normalized vector representations.


model.wv.get_normed_vectors()

array([[-0.09262253, -0.16068663,  0.10508421, ..., -0.10980255,
        -0.00323108, -0.08298406],
       [-0.14406595, -0.17370121,  0.07170802, ..., -0.03595421,
        -0.02070964,  0.11747561],
       [ 0.19273481, -0.05109106, -0.02450764, ...,  0.00129584,
         0.03065296, -0.03754126],
       ...,
       [ 0.07505529,  0.10361873, -0.07235458, ..., -0.04875888,
         0.14545533, -0.11078382],
       [ 0.0215194 , -0.01695022,  0.12871556, ..., -0.03430941,
         0.10701032, -0.03549737],
       [-0.08517104,  0.10122213,  0.11249706, ..., -0.03627158,
         0.06868757, -0.11152586]], dtype=float32)

In [26]:
# To get the list of words in the vocabulary, you can use the following code:
# This will return a list of words that the model has learned from the training data.
# This is useful for understanding the vocabulary size and the words that the model can work with.
# This will return a list of words that the model has learned from the training data.
# This is useful for understanding the vocabulary size and the words that the model can work with.

y = model.wv.index_to_key

In [27]:
y

['the',
 'and',
 'to',
 'of',
 'he',
 'his',
 'was',
 'you',
 'her',
 'in',
 'it',
 'had',
 'that',
 'she',
 'as',
 'with',
 'him',
 'not',
 'but',
 'for',
 'they',
 'is',
 'at',
 'on',
 'said',
 'my',
 'have',
 'be',
 'lord',
 'them',
 'no',
 'from',
 'would',
 'were',
 'me',
 'your',
 'one',
 'all',
 'when',
 'will',
 'ser',
 'if',
 'so',
 'their',
 'we',
 'could',
 'are',
 'man',
 'there',
 'this',
 'up',
 'been',
 'what',
 'did',
 'by',
 'king',
 'do',
 'men',
 'back',
 'out',
 'more',
 'or',
 'who',
 'down',
 'well',
 'than',
 'only',
 'like',
 'jon',
 'some',
 'father',
 'old',
 'hand',
 'even',
 'too',
 'tyrion',
 'before',
 'never',
 'an',
 'off',
 'see',
 'know',
 'into',
 'made',
 'now',
 'eyes',
 'black',
 'told',
 'lady',
 'thought',
 'time',
 'then',
 'how',
 'long',
 'has',
 'can',
 'might',
 'us',
 'come',
 'where',
 'here',
 'through',
 'still',
 'face',
 'head',
 'red',
 'll',
 'way',
 'boy',
 'page',
 'must',
 'once',
 'queen',
 'good',
 'two',
 'brother',
 'night',
 

In [28]:
# To visualize the word vectors, you can use PCA (Principal Component Analysis) to reduce the dimensionality of the vectors to 2D or 3D for plotting.

from sklearn.decomposition import PCA

In [29]:
# Create a PCA object to reduce the dimensionality of the word vectors
# 

pca = PCA(n_components=3)

In [30]:
# Fit the PCA model to the normalized word vectors and transform them
# This will reduce the dimensionality of the word vectors to 3 dimensions for visualization purposes.


X = pca.fit_transform(model.wv.get_normed_vectors())

In [31]:
X.shape

(17453, 3)

In [32]:
# To visualize the word vectors in 3D, you can use libraries like Matplotlib or Plotly. Here, we'll use Plotly for an interactive 3D scatter plot.
# 

import plotly.express as px
fig = px.scatter_3d(X[:100],x=0,y=1,z=2, color=y[200:300])
fig.show()

In [None]:
# !pip install nbformat>=4.2.0