In [None]:
import re
from collections import Counter
from itertools import chain
import numpy as np
import PyPDF2

# Tokenize sentences into words
def tokenize(text):
    """Split raw text into lower-case, letters-only word tokens.

    The text is lower-cased, every character that is neither an ASCII
    letter nor whitespace is deleted (not replaced by a space, so
    "well-known" becomes "wellknown"), and the remainder is split on
    whitespace.

    Args:
        text: the input string.

    Returns:
        A list of token strings; empty if no letters remain.
    """
    lowered = text.lower()
    letters_only = re.sub(r"[^a-zA-Z\s]", "", lowered)
    return letters_only.split()

# Create (target, context) pairs from a sliding window around each token
def generate_pairs(tokens, window_size=2):
    """Build skip-gram style (target, context) pairs from a token list.

    Each token is paired, in order, with the tokens up to `window_size`
    positions to its left and then up to `window_size` positions to its
    right. The window is clipped at both ends of the sequence.

    Args:
        tokens: list of tokens.
        window_size: how many neighbours to take on each side.

    Returns:
        A list of (target, context) tuples.
    """
    n_tokens = len(tokens)
    pairs = []
    for center, target in enumerate(tokens):
        left = tokens[max(center - window_size, 0):center]
        right = tokens[center + 1:min(center + window_size + 1, n_tokens)]
        pairs.extend((target, neighbour) for neighbour in left + right)
    return pairs

# Reading the text data
file_name = "data.pdf"
pdf_reader = PyPDF2.PdfReader(file_name)

# Only the pages at zero-based positions 50..59 are read. Pages for which
# extract_text() returns nothing are skipped; each kept page is followed by
# a newline.
page_texts = (page.extract_text() for page in pdf_reader.pages[50:60])
text = "".join(page_text + "\n" for page_text in page_texts if page_text)

# Tokenization
tokens = tokenize(text)

# Function call to make the pairs
pairs = generate_pairs(tokens)

In [2]:
# Sanity check: how many tokens tokenize(text) produced
len(tokens)

3894

In [3]:
# Build vocabulary: count every token (keys stay in first-seen order)
vocab = Counter(tokens)
vocab_size = len(vocab)

# Two-way lookup between words and consecutive integer ids
id_to_word = dict(enumerate(vocab))
word_to_id = {word: idx for idx, word in id_to_word.items()}

# Convert pairs to numerical form: (target_id, context_id)
pairs_numeric = [
    (word_to_id[target_word], word_to_id[context_word])
    for target_word, context_word in pairs
]
print(f"Vocabulary Size: {vocab_size}")


Vocabulary Size: 1029


In [None]:
import torch
import torch.nn as nn

# Embedding model
class EmbeddingModel(nn.Module):
    """Embedding lookup followed by a linear layer over the vocabulary.

    Args:
        vocab_size: number of rows in the embedding table and number of
            output scores per input id.
        embedding_dim: length of each embedding vector.
    """

    def __init__(self, vocab_size, embedding_dim):
        super().__init__()
        self.embeddings = nn.Embedding(
            num_embeddings=vocab_size, embedding_dim=embedding_dim
        )
        self.out_layer = nn.Linear(
            in_features=embedding_dim, out_features=vocab_size
        )

    def forward(self, target):
        """Return one score per vocabulary entry for each id in `target`."""
        return self.out_layer(self.embeddings(target))


In [None]:
import torch.optim as optim

# Hyperparameters
embedding_dim = 50
learning_rate = 0.0001
epochs = 50
seed = 42

# Seed PyTorch's random number generator before the model is created so the
# randomly initialised weights are the same on every Restart & Run All.
torch.manual_seed(seed)

# Initialize model, loss, and optimizer
model = EmbeddingModel(vocab_size, embedding_dim)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=learning_rate)

# Training loop
def train_model(model, pairs, epochs, optimizer=None, criterion=None):
    """Train `model` on (target_id, context_id) pairs, one update per pair.

    Every pair is fed through the model as a batch of size one, the loss
    against the context id is back-propagated, and the optimizer takes a
    step. After each epoch the loss summed over all pairs is printed.

    Args:
        model: module called with a 1-element long tensor holding the
            target id; its output is passed to `criterion`.
        pairs: iterable of (target_id, context_id) integer pairs. It is
            materialised once, so a generator is also seen by every epoch.
        epochs: number of passes over `pairs`.
        optimizer: optimizer to zero and step. When omitted, the
            notebook-level `optimizer` is used, so existing three-argument
            calls behave as before. Pass one explicitly when training a
            model other than the one that optimizer was built for.
        criterion: loss called as ``criterion(output, context_tensor)``.
            When omitted, the notebook-level `criterion` is used.

    Returns:
        None.

    Raises:
        KeyError: if `optimizer` or `criterion` is omitted and no
            notebook-level object of that name exists.
    """
    # The parameters shadow the notebook-level names, so fetch the fallbacks
    # through globals().
    if optimizer is None:
        optimizer = globals()["optimizer"]
    if criterion is None:
        criterion = globals()["criterion"]

    # Build the per-pair tensors once instead of once per epoch.
    samples = [
        (
            torch.tensor([target], dtype=torch.long),
            torch.tensor([context], dtype=torch.long),
        )
        for target, context in pairs
    ]

    for epoch in range(epochs):
        total_loss = 0.0
        for target_tensor, context_tensor in samples:
            # Forward pass
            optimizer.zero_grad()
            output = model(target_tensor)

            # Calculate loss and backpropagate
            loss = criterion(output, context_tensor)
            loss.backward()
            optimizer.step()

            total_loss += loss.item()
        # Summed (not averaged) loss over all pairs in this epoch
        print(f"Epoch {epoch + 1}/{epochs}, Loss: {total_loss:.4f}")

# Train on the id pairs for `epochs` passes; one loss line is printed per epoch
train_model(model, pairs_numeric, epochs)


Epoch 1/50, Loss: 105365.0471
Epoch 2/50, Loss: 97278.7964
Epoch 3/50, Loss: 93784.7069
Epoch 4/50, Loss: 91877.7365
Epoch 5/50, Loss: 90779.3199
Epoch 6/50, Loss: 90173.7671
Epoch 7/50, Loss: 89885.8964
Epoch 8/50, Loss: 89807.4895
Epoch 9/50, Loss: 89872.7955
Epoch 10/50, Loss: 90040.5818
Epoch 11/50, Loss: 90278.9122
Epoch 12/50, Loss: 90558.7916
Epoch 13/50, Loss: 90851.3728
Epoch 14/50, Loss: 91129.6333
Epoch 15/50, Loss: 91370.4689
Epoch 16/50, Loss: 91558.8044
Epoch 17/50, Loss: 91689.3869
Epoch 18/50, Loss: 91765.4392
Epoch 19/50, Loss: 91795.3631
Epoch 20/50, Loss: 91788.7716
Epoch 21/50, Loss: 91754.5881
Epoch 22/50, Loss: 91699.9562
Epoch 23/50, Loss: 91630.2152
Epoch 24/50, Loss: 91550.0200
Epoch 25/50, Loss: 91462.1713
Epoch 26/50, Loss: 91369.1791
Epoch 27/50, Loss: 91272.6315
Epoch 28/50, Loss: 91173.6447
Epoch 29/50, Loss: 91073.0866
Epoch 30/50, Loss: 90971.5203
Epoch 31/50, Loss: 90869.3826
Epoch 32/50, Loss: 90766.9158
Epoch 33/50, Loss: 90664.4587
Epoch 34/50, Loss:

In [None]:
# Saving the model in a pickle file needs the pickle module
import pickle

# Take the trained embedding table out of the model once, as a NumPy array
embedding_matrix = model.embeddings.weight.data.numpy()

# Save embeddings to a dictionary: word -> its row of the embedding table
embeddings = {}
for word_id in range(vocab_size):
    embeddings[id_to_word[word_id]] = embedding_matrix[word_id]

# Write the dictionary to disk
with open("custom_embeddings.pkl", "wb") as out_file:
    pickle.dump(embeddings, out_file)

print("Embeddings saved!")


Embeddings saved!


In [None]:
# Load embeddings (the file was written by this notebook; do not use
# pickle.load on files from untrusted sources)
with open("custom_embeddings.pkl", "rb") as in_file:
    loaded_embeddings = pickle.load(in_file)

# Test: Get the embedding for a word
word = "learn"
embedding = loaded_embeddings.get(word)
if embedding is None:
    print(f"Word '{word}' not in vocabulary.")
else:
    print(f"Embedding for '{word}': {embedding}")

Embedding for 'learn': [ 0.98462373 -1.358865   -1.2448189  -0.5340973   0.9747564   0.1968791
  0.45238492 -1.8867493   1.1169325  -0.3150388   0.51736796  0.38134807
 -0.03910092  1.3536413   0.80729246 -0.08366877  0.41622758 -0.2955528
  0.8528182  -0.30366492  1.8349255   0.03945584 -2.5259895  -0.8204535
  0.84046316  0.51380336  0.5347724  -0.2826849  -1.1470565  -2.7635102
 -0.6396008   0.53561383  0.27772895 -0.08337791 -1.9681256   0.39585254
 -0.13285264  0.4746073  -1.6278913  -2.314853   -1.6916685   1.3804845
  0.85183036  0.42107642 -0.3282073   1.8663498  -1.2269049   0.29727605
  0.33903345  0.5911918 ]
