In [1]:
import numpy as np
import re
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import nltk

# Fetch the NLTK resources this notebook relies on: the stopword lists read
# through `stopwords.words(...)`, plus the 'wordnet' and 'omw-1.4' packages
# (presumably the data WordNetLemmatizer needs -- confirm against NLTK docs).
# Packages that are already present locally are reported as up-to-date.
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')

[nltk_data] Downloading package stopwords to C:\Users\Shivani
[nltk_data]     Agarwal\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to C:\Users\Shivani
[nltk_data]     Agarwal\AppData\Roaming\nltk_data...
[nltk_data] Downloading package omw-1.4 to C:\Users\Shivani
[nltk_data]     Agarwal\AppData\Roaming\nltk_data...


True

In [2]:
# Shared NLP resources used by the preprocessing function
lemmatizer = WordNetLemmatizer()                # reduces tokens to their lemma
stop_words = set(stopwords.words("english"))    # set -> fast membership tests

## Text Preprocessing

In [3]:
def preprocess_manual(sentence):
    """Turn a raw sentence into a list of cleaned, lemmatized tokens.

    The sentence is lowercased, every character that is neither a word
    character nor whitespace is removed, and the result is split on
    whitespace. Tokens found in the module-level ``stop_words`` set are
    dropped; the remaining ones are passed through the module-level
    ``lemmatizer`` (called with its default arguments).

    Args:
        sentence: Raw text as a string.

    Returns:
        List of token strings, in their original order.
    """
    # Lowercase first, then strip punctuation in a single pass
    cleaned = re.sub(r'[^\w\s]', '', sentence.lower())

    lemmas = []
    for token in cleaned.split():
        if token in stop_words:
            continue  # skip stopwords
        lemmas.append(lemmatizer.lemmatize(token))
    return lemmas

## Corpus For Preprocessing 

In [4]:
# Toy corpus: four short English sentences
sentences = [
    "I love natural language processing",
    "Word2Vec creates vector representations of words",
    "Gensim makes working with word embeddings easy",
    "Python is great for NLP tasks",
]

# Run every sentence through the preprocessing function -> list of token lists
tokenized_sentences = list(map(preprocess_manual, sentences))
print("Preprocessed corpus:", tokenized_sentences)

Preprocessed corpus: [['love', 'natural', 'language', 'processing'], ['word2vec', 'creates', 'vector', 'representation', 'word'], ['gensim', 'make', 'working', 'word', 'embeddings', 'easy'], ['python', 'great', 'nlp', 'task']]


## Build Vocabulary

In [16]:
# Flatten the tokenized corpus into one list of tokens (duplicates kept)
words = [token for tokens in tokenized_sentences for token in tokens]

# Unique tokens in sorted order; a token's position in `vocab` is its integer id
vocab = sorted(set(words))
vocab_size = len(vocab)

# Lookup tables in both directions: token -> id and id -> token
word2idx = dict(zip(vocab, range(vocab_size)))
idx2word = dict(enumerate(vocab))

print("Vocabulary:", vocab)

Vocabulary: ['creates', 'easy', 'embeddings', 'gensim', 'great', 'language', 'love', 'make', 'natural', 'nlp', 'processing', 'python', 'representation', 'task', 'vector', 'word', 'word2vec', 'working']


## Creating Training Data For CBOW

In [6]:
def generate_cbow_data(corpus, window=2):
    """Build (context, target) training pairs for CBOW.

    For every token position in every sentence, the context is the list of
    tokens at most ``window`` positions to the left or right of it (clipped
    to the sentence bounds, the token itself excluded), in sentence order.
    Positions with no context at all (e.g. one-token sentences) yield no pair.

    Args:
        corpus: Iterable of tokenized sentences (each an indexable sequence).
        window: Maximum distance between target and a context token.

    Returns:
        List of ``(context_tokens, target_token)`` tuples, where
        ``context_tokens`` is a list.
    """
    pairs = []
    for tokens in corpus:
        for pos, target in enumerate(tokens):
            # Clip the window to the sentence boundaries
            lo = max(pos - window, 0)
            hi = min(pos + window + 1, len(tokens))
            context = [tokens[k] for k in range(lo, hi) if k != pos]
            if context:
                pairs.append((context, target))
    return pairs

# Build the (context words, target word) pairs from the preprocessed corpus
# with the default window size, then show the first pair as a sanity check.
training_data = generate_cbow_data(corpus=tokenized_sentences)
print("Example training pair:", training_data[0])

Example training pair: (['natural', 'language'], 'love')


## One Hot Encoding

In [7]:
def one_hot(word, word2idx):
    """Return the one-hot encoding of ``word``.

    Args:
        word: Key to encode; must be present in ``word2idx``.
        word2idx: Mapping from word to integer position.

    Returns:
        1-D float array of length ``len(word2idx)`` that is 0 everywhere
        except for a 1 at position ``word2idx[word]``.

    Raises:
        KeyError: If ``word2idx`` is a dict and ``word`` is not one of its keys.
    """
    encoded = np.zeros(len(word2idx))
    position = word2idx[word]
    encoded[position] = 1
    return encoded

## Initialize Model Parameters

In [8]:
embedding_dim = 10   # small for demo

# Seeded generator: without a fixed seed every kernel restart draws different
# initial weights, so the losses and embeddings printed below could not be
# reproduced with "Restart & Run All".
CBOW_SEED = 42
cbow_rng = np.random.default_rng(CBOW_SEED)

# Both matrices start with uniform values in [0, 1)
W1 = cbow_rng.random((vocab_size, embedding_dim))   # input -> hidden
W2 = cbow_rng.random((embedding_dim, vocab_size))   # hidden -> output

## Training CBOW Model

In [10]:
learning_rate = 0.01
epochs = 500

# Plain per-example gradient descent over the CBOW pairs. W1 and W2 are
# updated in place, so running this cell again continues training from the
# current weights instead of starting over.
for epoch in range(epochs):
    loss = 0   # summed cross-entropy over all pairs in this epoch
    for context_words, target_word in training_data:
        # 1. Input: element-wise mean of the context words' one-hot vectors
        x = np.mean([one_hot(w, word2idx) for w in context_words], axis=0)

        # 2. Hidden layer
        h = np.dot(W1.T, x)

        # 3. Output layer (softmax)
        # Shift the scores by their maximum before exponentiating: softmax is
        # unchanged by a constant shift, and the largest exponent becomes 0,
        # so np.exp cannot overflow to inf and turn y_pred into NaN.
        u = np.dot(W2.T, h)
        exp_u = np.exp(u - np.max(u))
        y_pred = exp_u / np.sum(exp_u)

        # 4. True vector
        y_true = one_hot(target_word, word2idx)

        # 5. Loss (cross-entropy); the 1e-9 keeps log() finite if a
        #    predicted probability underflows to 0
        loss += -np.sum(y_true * np.log(y_pred + 1e-9))

        # 6. Backpropagation
        e = y_pred - y_true                  # gradient w.r.t. the scores u
        dW2 = np.outer(h, e)
        dW1 = np.outer(x, np.dot(W2, e))     # uses W2 from before this step's update

        # 7. Update weights
        W1 -= learning_rate * dW1
        W2 -= learning_rate * dW2

    if (epoch + 1) % 100 == 0:
        print(f"Epoch {epoch+1}, Loss: {loss:.4f}")

Epoch 100, Loss: 40.4820
Epoch 200, Loss: 24.2535
Epoch 300, Loss: 14.6094
Epoch 400, Loss: 9.6549
Epoch 500, Loss: 7.1264


## Extract Word Embeddings

In [11]:
def get_vector(word):
    """Look up the CBOW embedding of ``word``.

    Args:
        word: A key of the module-level ``word2idx`` dict.

    Returns:
        The row of the module-level ``W1`` matrix at index ``word2idx[word]``.

    Raises:
        KeyError: If ``word`` is not in ``word2idx``.
    """
    row = word2idx[word]
    return W1[row]

# Show the learned CBOW embedding for one vocabulary word
print("Vector for 'python':", get_vector(word='python'))

Vector for 'python': [ 1.64131423  0.95906346  1.29693459  0.877179   -0.62526354  0.48917783
 -0.15221688  0.46896776  0.9260571   0.5868147 ]


## Generate Skip-Gram Training Data

In [12]:
# Training-pair generation for Skip-Gram
def generate_skipgram_data(corpus, window=2):
    """Build (target, context) training pairs for Skip-Gram.

    Every token is paired, one pair per neighbour, with each token that lies
    at most ``window`` positions to its left or right inside the same
    sentence (the token itself excluded). Pairs are emitted in sentence
    order, neighbours left to right.

    Args:
        corpus: Iterable of tokenized sentences (each an indexable sequence).
        window: Maximum distance between target and context token.

    Returns:
        List of ``(target_token, context_token)`` tuples.
    """
    pairs = []
    for tokens in corpus:
        for center, target in enumerate(tokens):
            # Clip the window to the sentence boundaries
            first = max(center - window, 0)
            stop = min(center + window + 1, len(tokens))
            pairs.extend(
                (target, tokens[k]) for k in range(first, stop) if k != center
            )
    return pairs

# Build the (target word, context word) pairs from the preprocessed corpus
# with the default window size, then show the first pair as a sanity check.
training_data_sg = generate_skipgram_data(corpus=tokenized_sentences)
print("Example Skip-Gram pair:", training_data_sg[0])

Example Skip-Gram pair: ('love', 'natural')


## Train Skip-Gram Model

In [13]:
# Fresh weights for Skip-Gram, drawn from a seeded generator: without a fixed
# seed every run starts from different weights, so the printed losses and
# embeddings could not be reproduced with "Restart & Run All".
embedding_dim = 10
SKIPGRAM_SEED = 42
sg_rng = np.random.default_rng(SKIPGRAM_SEED)
W1_sg = sg_rng.random((vocab_size, embedding_dim))   # input -> hidden, uniform in [0, 1)
W2_sg = sg_rng.random((embedding_dim, vocab_size))   # hidden -> output, uniform in [0, 1)

learning_rate = 0.01
epochs = 500

# Plain per-example gradient descent over the (target, context) pairs.
for epoch in range(epochs):
    loss = 0   # summed cross-entropy over all pairs in this epoch
    for target_word, context_word in training_data_sg:

        # 1. Input vector (target)
        x = one_hot(target_word, word2idx)

        # 2. Hidden layer
        h = np.dot(W1_sg.T, x)

        # 3. Output layer (softmax)
        # Shift the scores by their maximum before exponentiating: softmax is
        # unchanged by a constant shift, and the largest exponent becomes 0,
        # so np.exp cannot overflow to inf and turn y_pred into NaN.
        u = np.dot(W2_sg.T, h)
        exp_u = np.exp(u - np.max(u))
        y_pred = exp_u / np.sum(exp_u)

        # 4. True output vector (context)
        y_true = one_hot(context_word, word2idx)

        # 5. Loss (cross-entropy); the 1e-9 keeps log() finite if a
        #    predicted probability underflows to 0
        loss += -np.sum(y_true * np.log(y_pred + 1e-9))

        # 6. Backpropagation
        e = y_pred - y_true                     # gradient w.r.t. the scores u
        dW2 = np.outer(h, e)
        dW1 = np.outer(x, np.dot(W2_sg, e))     # uses W2_sg from before this step's update

        # 7. Update weights
        W1_sg -= learning_rate * dW1
        W2_sg -= learning_rate * dW2

    if (epoch+1) % 100 == 0:
        print(f"Epoch {epoch+1} — Loss: {loss:.4f}")

Epoch 100 — Loss: 82.4831
Epoch 200 — Loss: 64.5488
Epoch 300 — Loss: 62.2612
Epoch 400 — Loss: 61.6073
Epoch 500 — Loss: 61.3477


## Extract Skip-Gram Word Embeddings

In [14]:
def get_vector_sg(word):
    """Look up the Skip-Gram embedding of ``word``.

    Args:
        word: A key of the module-level ``word2idx`` dict.

    Returns:
        The row of the module-level ``W1_sg`` matrix at index
        ``word2idx[word]``.

    Raises:
        KeyError: If ``word`` is not in ``word2idx``.
    """
    row = word2idx[word]
    return W1_sg[row]

# Show the learned Skip-Gram embedding for one vocabulary word
print("\nSkip-Gram vector for 'python':")
print(get_vector_sg(word='python'))


Skip-Gram vector for 'python':
[ 0.8433671   1.06691425  0.21895965 -0.46480289  0.16026096 -0.42141261
  1.55643261  1.84405757 -0.42162315  0.3000474 ]


## Comparison: CBOW vs. Skip-Gram

In [15]:
# Print both models' embeddings of the same word one after the other
print("CBOW:", get_vector(word='python'))
print("Skip-Gram:", get_vector_sg(word='python'))

CBOW: [ 1.64131423  0.95906346  1.29693459  0.877179   -0.62526354  0.48917783
 -0.15221688  0.46896776  0.9260571   0.5868147 ]
Skip-Gram: [ 0.8433671   1.06691425  0.21895965 -0.46480289  0.16026096 -0.42141261
  1.55643261  1.84405757 -0.42162315  0.3000474 ]
