In [59]:
from tensorflow.keras.preprocessing import text
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.preprocessing import sequence
from nltk.corpus import gutenberg
from string import punctuation
import numpy as np
import re
import nltk
import pandas as pd

![title](./CBOW_Image.png)

### Python concpets learned:
### iterables, iterators, enumerators, generators
### yield function

## Step 1: Building the corpus vocabulary

In [2]:
wpt = nltk.WordPunctTokenizer()
stop_words = nltk.corpus.stopwords.words('english')
def normalize_document(doc):
    # lower case and remove special characters and numbers and other non whitespace characters
    doc = re.sub(r'[^a-zA-Z\s]', '', doc, re.I|re.A)
    doc = doc.lower()
    doc = doc.strip()
    # tokenize document
    tokens = wpt.tokenize(doc)
    # filter stopwords out of document
    filtered_tokens = [token for token in tokens if token not in stop_words]
    # re-create document from filtered tokens
    doc = ' '.join(filtered_tokens)
    return doc

normalize_corpus = np.vectorize(normalize_document)

In [3]:
# Loading the bible corpus
bible = gutenberg.sents('bible-kjv.txt')
remove_terms  = punctuation + '0123456789' # all the punctuations plus digits will be removed

norm_bible = [[word.lower() for word in sent if word not in remove_terms] for sent in bible]
# converting each sent from list of strings to strings
norm_bible = [' '.join(tok_sent) for tok_sent in norm_bible] 
# Filter empty sentences
norm_bible = filter(None,normalize_corpus(norm_bible))
# Keep only sentences with more than 2 words
norm_bible = [tok_sent for tok_sent in norm_bible if len(tok_sent.split()) > 2]

In [4]:
print('Total lines:', len(bible))
print('\nSample line:', bible[10])
print('\nProcessed line:', norm_bible[10])

Total lines: 30103

Sample line: ['1', ':', '6', 'And', 'God', 'said', ',', 'Let', 'there', 'be', 'a', 'firmament', 'in', 'the', 'midst', 'of', 'the', 'waters', ',', 'and', 'let', 'it', 'divide', 'the', 'waters', 'from', 'the', 'waters', '.']

Processed line: god said let firmament midst waters let divide waters waters


In [6]:
tokenizer = text.Tokenizer()
tokenizer.fit_on_texts(norm_bible)
word2id  = tokenizer.word_index

word2id['PAD'] = 0
id2word = {v:k for k,v in word2id.items()}

# this is a list of list containg the ids of each word in a sentence
wids = [[word2id[w] for w in text.text_to_word_sequence(doc)]for doc in norm_bible]

vocab_size = len(word2id)
embed_size = 100
window_size = 2 # Context window size

print(f'Vocabulary size: {vocab_size}')
print(f'Vocabulary sample: {list(word2id.items())[:10]}')

Vocabulary size: 12425
Vocabulary sample: [('shall', 1), ('unto', 2), ('lord', 3), ('thou', 4), ('thy', 5), ('god', 6), ('ye', 7), ('said', 8), ('thee', 9), ('upon', 10)]


In [7]:
wids

[[13, 1154, 5766],
 [154, 2450, 13, 1154, 5766],
 [132, 310, 63, 86, 8480],
 [582, 6, 1180, 94, 47],
 [47, 136, 1883, 1884, 396, 10, 144, 860],
 [111, 6, 759, 10, 144, 212],
 [6, 8, 27, 232, 232],
 [6, 101, 232, 75, 6, 826, 232, 396],
 [6, 86, 232, 23, 396, 86, 197],
 [926, 287, 132, 23],
 [6, 8, 27, 2351, 161, 212, 27, 1096, 212, 212],
 [6, 32, 2351, 826, 212, 2351, 212, 2351],
 [6, 86, 2351, 94],
 [926, 287, 363, 23],
 [6, 8, 27, 212, 94, 237, 117, 2, 20, 76, 27, 796, 24, 1011],
 [6, 86, 796, 24, 47, 3075, 117, 212, 86, 1827, 6, 101, 75],
 [6,
  8,
  27,
  47,
  74,
  56,
  891,
  2199,
  3942,
  223,
  308,
  318,
  3942,
  308,
  1181,
  193,
  223,
  10,
  47],
 [47,
  62,
  56,
  891,
  2199,
  3942,
  223,
  1181,
  318,
  3942,
  308,
  193,
  223,
  1181,
  6,
  101,
  75],
 [926, 287, 343, 23],
 [6,
  8,
  27,
  3235,
  2351,
  94,
  1096,
  23,
  197,
  27,
  1039,
  2930,
  61,
  106,
  27,
  3235,
  2351,
  94,
  57,
  232,
  10,
  47],
 [6, 32, 64, 49, 3235, 740, 232, 845

## Step 2: Build a CBOW (context_words, target_word) generator

In [8]:
def generate_context_word_pairs(corpus, window_size, vocab_size):
    context_length = window_size * 2  
    # we take window_size number of words from either side of target word

    for words in corpus:
        sentence_length = len(words)
        for index, word in enumerate(words):
            context_words = []
            label_word = []
            start = index - window_size
            end = index + window_size + 1

            context_words.append([words[i] for i in range(start, end) if 0 <= i < sentence_length and i != index])
            label_word.append(word)

            x = sequence.pad_sequences(context_words, maxlen = context_length)
            y = to_categorical(label_word, vocab_size) #one hot encoding of target word
            yield(x,y)

In [9]:
# Test this for some samples
i= 0
for x, y in generate_context_word_pairs(corpus = wids,window_size=window_size,
                                        vocab_size = vocab_size):
    if 0 not in x[0]:
        print('Context (X):', [id2word[w] for w in x[0]], '-> Target (Y):', id2word[np.argwhere(y[0])[0][0]])
    if i ==10:
        break
    i +=1

Context (X): ['old', 'testament', 'james', 'bible'] -> Target (Y): king
Context (X): ['first', 'book', 'called', 'genesis'] -> Target (Y): moses


In [10]:
x

array([[ 132,  310,   86, 8480]], dtype=int32)

## Build CBOW model architectur 

In [11]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Embedding, Lambda
import tensorflow.keras.backend as K

In [12]:
cbow = Sequential()

# Add Embedding layer
cbow.add(Embedding(input_dim=vocab_size, output_dim=embed_size)) #, input_length=window_size*2))

# Add Lambda layer to average the embeddings
cbow.add(Lambda(lambda x: K.mean(x, axis=1), output_shape=(embed_size,)))

# Add Dense layer with softmax activation for classification
cbow.add(Dense(vocab_size, activation='softmax'))

# Compile the model with categorical crossentropy loss and RMSprop optimizer
cbow.compile(loss='categorical_crossentropy', optimizer='rmsprop')

# view model summary
print(cbow.summary())

2025-08-01 18:30:09.751814: I metal_plugin/src/device/metal_device.cc:1154] Metal device set to: Apple M4 Pro
2025-08-01 18:30:09.751868: I metal_plugin/src/device/metal_device.cc:296] systemMemory: 24.00 GB
2025-08-01 18:30:09.751876: I metal_plugin/src/device/metal_device.cc:313] maxCacheSize: 8.00 GB
2025-08-01 18:30:09.752275: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:305] Could not identify NUMA node of platform GPU ID 0, defaulting to 0. Your kernel may not have been built with NUMA support.
2025-08-01 18:30:09.752301: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:271] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:0 with 0 MB memory) -> physical PluggableDevice (device: 0, name: METAL, pci bus id: <undefined>)


None


## Step 3 Training the  model

In [13]:
import time
start_time = time.time()
print("Start")
for epoch in range(1,6):
    loss  = 0.
    i=0
    for x, y in generate_context_word_pairs(corpus=wids, window_size=window_size, vocab_size=vocab_size):
        # This function produces the input-output pairs (context-target pairs) one by one, 
        # which are used for training the model
        i += 1 
        
        loss += cbow.train_on_batch(x,y)
        # this trains the model (cbow) on a single batch of data, 
        # represented by x (context words) and y (target words).
        
        if i % 100000 == 0:
            print('Processed {} (context, word) pairs'.format(i))
    
    print('Epoch:', epoch, '\tLoss:', loss)
    print()

print(f"total time taken for training: {time.time() - start_time:.2f} seconds")

Start


2025-08-01 18:30:11.036604: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:117] Plugin optimizer for device_type GPU is enabled.


Processed 100000 (context, word) pairs
Processed 200000 (context, word) pairs
Processed 300000 (context, word) pairs
Epoch: 1 	Loss: 3111962.471206665

Processed 100000 (context, word) pairs
Processed 200000 (context, word) pairs
Processed 300000 (context, word) pairs
Epoch: 2 	Loss: 3179999.665837288

Processed 100000 (context, word) pairs
Processed 200000 (context, word) pairs
Processed 300000 (context, word) pairs
Epoch: 3 	Loss: 3290313.5123615265

Processed 100000 (context, word) pairs
Processed 200000 (context, word) pairs
Processed 300000 (context, word) pairs
Epoch: 4 	Loss: 3411678.5043182373

Processed 100000 (context, word) pairs
Processed 200000 (context, word) pairs
Processed 300000 (context, word) pairs
Epoch: 5 	Loss: 3521050.139562607

total time taken for training: 45855.22 seconds


In [None]:
# The Embedding layer in Keras is used to learn dense vector representations for words (or tokens) in a model.
#  The Lambda layer in Keras allows you to define a custom operation that applies a function to the input data. 

## Get Word Embeddings 

In [66]:
# To get word embeddings for our entire vocabulary, 
# we can extract out the same from our embedding layer 

# we are only interested in the target word embedding layer, 
# hence we will extract the embeddings from our word_model embedding layer.
# We don’t take the embedding at position 0 since none of our words in the vocabulary have a numeric identifier of 0 and we ignore it.

In [16]:
cbow.save('cbow_model.h5')



In [10]:
from tensorflow.keras.models import load_model
loaded_model = load_model('cbow_model.h5')

2025-08-04 17:43:04.310277: I metal_plugin/src/device/metal_device.cc:1154] Metal device set to: Apple M4 Pro
2025-08-04 17:43:04.310342: I metal_plugin/src/device/metal_device.cc:296] systemMemory: 24.00 GB
2025-08-04 17:43:04.310350: I metal_plugin/src/device/metal_device.cc:313] maxCacheSize: 8.00 GB
2025-08-04 17:43:04.310384: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:305] Could not identify NUMA node of platform GPU ID 0, defaulting to 0. Your kernel may not have been built with NUMA support.
2025-08-04 17:43:04.310400: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:271] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:0 with 0 MB memory) -> physical PluggableDevice (device: 0, name: METAL, pci bus id: <undefined>)


In [68]:
# To get word embeddings for our entire vocabulary, we can extract out the same from our embedding layer.
# We don’t take the embedding at position 0 since it belongs to the padding (PAD) term which is not really a word of interest.
# 1st layer and 3rd layer (embedding layer and output layeris of 12425 (vocab_size)
# 2nd layer (Labmda Layer is of size embed_size)
weights = loaded_model.get_weights()[0]

In [69]:
weights = weights[1:]
print(weights.shape)

pd.DataFrame(weights, index=list(id2word.values())[1:]).head()

(12424, 100)


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
unto,0.268048,-0.273799,4.260375,1.059516,-2.74749,-0.66399,3.768331,-0.422325,0.724439,1.292332,...,-1.261524,0.051955,1.328049,0.384559,-1.47057,0.812601,-0.171075,-1.469633,1.1315,1.181283
lord,0.343055,0.570671,6.503984,0.27641,-1.987109,0.922307,4.997999,0.01298,-0.618147,-0.770794,...,1.458647,-1.599883,-0.0357,0.66213,-1.5405,-2.060878,-1.062591,-1.212371,-0.737205,3.14338
thou,-0.703797,-0.568758,6.493713,0.880852,-1.418226,0.734056,0.60428,-0.125397,-0.215498,0.547633,...,0.58086,-1.412126,1.837828,0.175498,-0.158398,-1.799214,0.039377,-1.222945,0.488197,1.080713
thy,-1.032889,1.136342,6.948893,-0.533972,-0.340737,0.222113,2.469668,-1.466266,-1.595461,-0.139459,...,-1.801109,-1.961103,1.126,0.400588,-1.623017,0.326285,1.921631,-0.089859,-1.390419,0.088724
god,-0.254096,0.007649,5.905819,-0.041232,-1.252255,-0.218915,0.602217,-0.153425,-0.106201,-0.257419,...,-0.255381,-1.107631,0.651743,-0.18463,0.041767,-1.122102,0.480924,-0.535106,0.722298,0.118097


In [80]:
# each word has a dense embedding of size (1x100) as depicted in the preceding output.
# Now lets find out some contextually similar words for specific words of interest based on these embeddings

# For this, we build out a pairwise distance matrix amongst all the words in our vocabulary based on the dense embedding vectors 
# and then find out the n-nearest neighbors of each word of interest based on the shortest (euclidean) distance.
    
from sklearn.metrics.pairwise import euclidean_distances

# compute pairwise distance matrix
distance_matrix = euclidean_distances(weights)
print(distance_matrix.shape)

# view contextually similar words
similar_words = {search_term: [id2word[idx] for idx in distance_matrix[word2id[search_term]-1].argsort()[1:6]+1] 
                   for search_term in ['god', 'jesus', 'noah', 'egypt', 'john', 'gospel', 'moses','famine']}

similar_words


(12424, 12424)


{'god': ['hath', 'let', 'behold', 'also', 'may'],
 'jesus': ['disciples', 'peter', 'john', 'paul', 'world'],
 'noah': ['sarah', 'abram', 'lived', 'shem', 'sarai'],
 'egypt': ['brought', 'bring', 'give', 'say', 'behold'],
 'john': ['peter', 'simon', 'james', 'pharisees', 'disciples'],
 'gospel': ['church', 'preached', 'baptism', 'pilate', 'baptized'],
 'moses': ['aaron', 'priest', 'congregation', 'tabernacle', 'commanded'],
 'famine': ['age', 'obeyed', 'pestilence', 'graves', 'breach']}

In [78]:
distance_matrix

array([[ 0.        , 17.416199  , 13.394743  , ..., 18.928429  ,
        18.902372  , 18.907211  ],
       [17.416199  ,  0.        , 14.667802  , ..., 23.526396  ,
        23.615795  , 23.604603  ],
       [13.394743  , 14.667802  ,  0.        , ..., 15.908736  ,
        15.960396  , 15.972894  ],
       ...,
       [18.928429  , 23.526396  , 15.908736  , ...,  0.        ,
         0.46212232,  0.45527917],
       [18.902372  , 23.615795  , 15.960396  , ...,  0.46212232,
         0.        ,  0.42937815],
       [18.907211  , 23.604603  , 15.972894  , ...,  0.45527917,
         0.42937815,  0.        ]], dtype=float32)

In [None]:
# Resource used
# https://www.tensorflow.org/text/tutorials/word2vec
# https://medium.com/@manansuri/a-dummys-guide-to-word2vec-456444f3c673
# https://www.analyticsvidhya.com/blog/2021/07/word2vec-for-word-embeddings-a-beginners-guide/
# https://www.kdnuggets.com/2018/04/implementing-deep-learning-methods-feature-engineering-text-data-cbow.html
# https://towardsdatascience.com/understanding-feature-engineering-part-4-deep-learning-methods-for-text-data-96c44370bbfa/

In [2]:
!pwd

/Volumes/Storage/Learning & Development/NLP_Basics


In [3]:
!ls

[31mCBOW_Image.png[m[m [31mcbow_model.h5[m[m  [31mREADME.md[m[m      [31mWord2Vec.ipynb[m[m
