In [61]:
import re
from collections import Counter
from itertools import chain
import numpy as np
import PyPDF2

# Tokenize sentences into words
def tokenize(text):
    """Split raw text into lowercase, letters-only word tokens.

    The text is lowercased, every character that is neither an ASCII letter
    nor whitespace is deleted (not replaced by a space, so "a-b" becomes
    "ab"), and the remainder is split on runs of whitespace.

    Args:
        text (str): Raw input text.

    Returns:
        list[str]: The cleaned tokens, in order of appearance.
    """
    lowered = text.lower()
    letters_only = re.sub(r"[^a-zA-Z\s]", "", lowered)
    return letters_only.split()

# Create context-target pairs
def generate_pairs(tokens, window_size=2):
    """Build skip-gram (target, context) pairs from a token list.

    For every position, each token within ``window_size`` positions to the
    left or right (clipped at the ends of the list) is paired with the token
    at that position. Left-hand neighbours come before right-hand ones.

    Args:
        tokens (list[str]): Token sequence.
        window_size (int): Number of neighbours considered on each side.

    Returns:
        list[tuple[str, str]]: ``(target, context)`` tuples.
    """
    total = len(tokens)
    result = []
    for position, word in enumerate(tokens):
        left = tokens[max(position - window_size, 0):position]
        right = tokens[position + 1:min(position + window_size + 1, total)]
        result.extend((word, neighbour) for neighbour in left + right)
    return result

file = "textbook.pdf"
pdf_reader = PyPDF2.PdfReader(file)

# Gather the text of the pages selected by the slice [50:80]; pages for which
# extract_text() returns nothing are skipped.
page_texts = []
for page in pdf_reader.pages[50:80]:
    extracted_text = page.extract_text()
    if extracted_text:
        page_texts.append(extracted_text)

# Every page's text is terminated by a newline. The corpus now starts empty
# (the previous initial value was a stray backtick character).
text = "".join(page_text + "\n" for page_text in page_texts)
# Example usage
# text = "The quick brown fox jumps over the lazy dog"
tokens = tokenize(text)
pairs = generate_pairs(tokens)
# print(pairs)


In [62]:
# Number of tokens produced by tokenize() from the extracted PDF text.
len(tokens)

10917

In [63]:
# Count every token; the Counter's key order (first appearance) defines the ids
vocab = Counter(tokens)
word_to_id = {word: idx for idx, word in enumerate(vocab)}
id_to_word = dict(enumerate(vocab))
vocab_size = len(vocab)

# Translate each (target, context) word pair into a pair of integer ids
pairs_numeric = [
    (word_to_id[target], word_to_id[context])
    for target, context in pairs
]
print(f"Vocabulary Size: {vocab_size}")


Vocabulary Size: 2218


In [64]:
import torch
import torch.nn as nn

class SkipGramModel(nn.Module):
    """Skip-gram network: an embedding lookup followed by a linear layer.

    Attributes:
        embeddings (nn.Embedding): ``vocab_size`` x ``embedding_dim`` table.
        out_layer (nn.Linear): Maps an embedding to ``vocab_size`` scores.
    """

    def __init__(self, vocab_size, embedding_dim):
        """Create the embedding table and the output projection.

        Args:
            vocab_size (int): Number of distinct word ids.
            embedding_dim (int): Width of each word vector.
        """
        super().__init__()
        self.embeddings = nn.Embedding(vocab_size, embedding_dim)
        self.out_layer = nn.Linear(embedding_dim, vocab_size)

    def forward(self, target):
        """Return one score per vocabulary entry for each id in ``target``.

        Args:
            target (torch.Tensor): Integer word ids.

        Returns:
            torch.Tensor: Output of ``out_layer`` applied to the looked-up
            embeddings (last dimension has size ``vocab_size``).
        """
        return self.out_layer(self.embeddings(target))


In [65]:
import torch.optim as optim

# Hyperparameters
embedding_dim = 200
learning_rate = 0.005
epochs = 20
SEED = 42  # fixed seed so weight initialisation is repeatable across runs

# Seed PyTorch's RNG *before* the model is built: nn.Embedding and nn.Linear
# draw their initial weights from it.
torch.manual_seed(SEED)

# Initialize model, loss, and optimizer
model = SkipGramModel(vocab_size, embedding_dim)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=learning_rate)

# Training loop
def train_model(model, pairs, epochs, optimizer=None, criterion=None, batch_size=1):
    """Train ``model`` on (target_id, context_id) pairs and report the loss.

    The pairs are converted to a single tensor once, then visited in their
    given order in chunks of ``batch_size``. With the default ``batch_size=1``
    every pair triggers its own optimizer step; larger values take one step
    per chunk, which needs far fewer forward/backward passes per epoch.

    Args:
        model (nn.Module): Called with a 1-D tensor of target ids; must return
            one row of class scores per id.
        pairs (Iterable[tuple[int, int]]): ``(target_id, context_id)`` pairs.
        epochs (int): Number of full passes over ``pairs``.
        optimizer (torch.optim.Optimizer, optional): Optimizer that updates
            ``model``. When omitted, a new Adam optimizer over
            ``model.parameters()`` is created using the notebook-level
            ``learning_rate``.
        criterion (callable, optional): Loss taking ``(scores, context_ids)``.
            Defaults to ``nn.CrossEntropyLoss()``.
        batch_size (int): Number of pairs per optimizer step (>= 1).

    Returns:
        list[float]: The summed loss of each epoch, in order (the same values
        that are printed).

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be a positive integer")
    if criterion is None:
        criterion = nn.CrossEntropyLoss()
    if optimizer is None:
        optimizer = optim.Adam(model.parameters(), lr=learning_rate)

    # Build the id tensor once instead of two tiny tensors per pair per epoch.
    # The explicit (n, 2) shape also keeps an empty pair list valid.
    pair_list = list(pairs)
    pair_tensor = torch.tensor(pair_list, dtype=torch.long).reshape(len(pair_list), 2)
    targets = pair_tensor[:, 0]
    contexts = pair_tensor[:, 1]

    history = []
    for epoch in range(epochs):
        total_loss = 0.0
        for start in range(0, len(pair_list), batch_size):
            target_batch = targets[start:start + batch_size]
            context_batch = contexts[start:start + batch_size]

            # Forward pass
            optimizer.zero_grad()
            output = model(target_batch)

            # Calculate loss and backpropagate
            loss = criterion(output, context_batch)
            loss.backward()
            optimizer.step()

            # Scale by the chunk length so the epoch total stays a sum over
            # pairs; this assumes the criterion averages over the batch, as
            # nn.CrossEntropyLoss does by default.
            total_loss += loss.item() * target_batch.shape[0]
        print(f"Epoch {epoch + 1}/{epochs}, Loss: {total_loss:.4f}")
        history.append(total_loss)
    return history

# Run `epochs` passes over all numeric (target, context) pairs; train_model prints the summed loss after each pass.
train_model(model, pairs_numeric, epochs)


Epoch 1/20, Loss: 443479.5322
Epoch 2/20, Loss: 415342.4925
Epoch 3/20, Loss: 391997.7673
Epoch 4/20, Loss: 381237.1948
Epoch 5/20, Loss: 373264.4342
Epoch 6/20, Loss: 369541.2598


KeyboardInterrupt: 

In [None]:
# Save embeddings to a dictionary
# Convert the whole weight matrix to NumPy once. detach() drops autograd
# tracking and cpu() makes the conversion valid wherever the model lives.
embedding_matrix = model.embeddings.weight.detach().cpu().numpy()
embeddings = {id_to_word[idx]: embedding_matrix[idx] for idx in range(vocab_size)}

# Save to a file
import pickle
with open("custom_embeddings.pkl", "wb") as f:
    pickle.dump(embeddings, f)

print("Embeddings saved!")


Embeddings saved!


In [None]:
# Read the pickled word -> vector mapping back from disk
with open("custom_embeddings.pkl", "rb") as f:
    loaded_embeddings = pickle.load(f)

# Example lookup: show one word's vector, or report that the word is unknown
word = "quick"
embedding = loaded_embeddings.get(word)
print(
    f"Word '{word}' not in vocabulary."
    if embedding is None
    else f"Embedding for '{word}': {embedding}"
)

Word 'quick' not in vocabulary.


In [None]:
import numpy as np

class CustomLangChainEmbeddings:
    """Text embedder that averages pre-trained word vectors.

    Exposes ``embed_documents`` and ``embed_query``; both embed a text as the
    mean of the vectors of its in-vocabulary words.

    Attributes:
        embeddings (dict): Mapping from word to its vector, loaded from disk.
    """

    # Same character filter as the notebook's `tokenize` helper, which built
    # the vocabulary: everything except ASCII letters and whitespace is
    # removed. Without it, words with attached punctuation ("fox,") could
    # never match a vocabulary entry.
    _NON_LETTERS = re.compile(r"[^a-zA-Z\s]")

    def __init__(self, embedding_path="custom_embeddings.pkl"):
        """Load the word -> vector mapping from a pickle file.

        Args:
            embedding_path (str): Path of the pickled embedding dictionary.
        """
        # SECURITY: pickle.load can execute arbitrary code embedded in the
        # file. Only point this at embedding files you created or trust.
        with open(embedding_path, "rb") as f:
            self.embeddings = pickle.load(f)

    def _embed_text(self, text):
        """
        Generate an embedding for a given text by averaging embeddings of words in the text.

        The text is lowercased, stripped of non-letter characters and split on
        whitespace; words missing from the vocabulary are ignored.

        Args:
            text (str): Input text.

        Returns:
            List[float]: Text embedding as a list. A zero vector when no word
            of the text is in the vocabulary (an empty list if the embedding
            table itself is empty).
        """
        tokens = self._NON_LETTERS.sub("", text.lower()).split()
        word_embeddings = [self.embeddings[token] for token in tokens if token in self.embeddings]
        if not word_embeddings:  # Handle out-of-vocabulary case
            # Take the dimensionality from any stored vector; an empty table
            # yields a zero-length vector instead of raising StopIteration.
            sample_vector = next(iter(self.embeddings.values()), None)
            dimension = 0 if sample_vector is None else len(sample_vector)
            return np.zeros(dimension).tolist()
        return np.mean(word_embeddings, axis=0).tolist()

    def embed_documents(self, texts):
        """
        Embed a list of documents using custom embeddings.

        Args:
            texts (List[str]): List of documents to embed.

        Returns:
            List[List[float]]: List of embeddings, one per document, in order.
        """
        return [self._embed_text(text) for text in texts]

    def embed_query(self, text):
        """
        Embed a single query using custom embeddings.

        Args:
            text (str): Query text.

        Returns:
            List[float]: Query embedding.
        """
        return self._embed_text(text)
