In [None]:
import re
from collections import Counter
from itertools import chain
import numpy as np
import PyPDF2

# Tokenize sentences into words
def tokenize(text):
    text = re.sub(r"[^a-zA-Z\s]", "", text.lower())
    return text.split()

# Create context-target pairs
def generate_pairs(tokens, window_size=2):
    pairs = []
    for idx, target in enumerate(tokens):
        start = max(idx - window_size, 0)
        end = min(idx + window_size + 1, len(tokens))
        context_words = tokens[start:idx] + tokens[idx + 1:end]
        for context in context_words:
            pairs.append((target, context))
    return pairs

# Reading the text data
file_name = "data.pdf"
pdf_reader = PyPDF2.PdfReader(file_name)
text = ""
for page in pdf_reader.pages[50:60]:
    extracted_text = page.extract_text()
    if extracted_text:
        text += extracted_text + "\n"

# Tokenization
tokens = tokenize(text)

# Function call to make the pairs
pairs = generate_pairs(tokens)

In [2]:
len(tokens)

3894

In [3]:
# Build vocabulary
vocab = Counter(chain.from_iterable([tokens]))
word_to_id = {word: idx for idx, word in enumerate(vocab.keys())}
id_to_word = {idx: word for word, idx in word_to_id.items()}
vocab_size = len(vocab)

# Convert pairs to numerical form
pairs_numeric = [(word_to_id[target], word_to_id[context]) for target, context in pairs]
print(f"Vocabulary Size: {vocab_size}")


Vocabulary Size: 1029


In [None]:
import torch
import torch.nn as nn

# Embedding model
class EmbeddingModel(nn.Module):
    def __init__(self, vocab_size, embedding_dim):
        super(EmbeddingModel, self).__init__()
        self.embeddings = nn.Embedding(vocab_size, embedding_dim)
        self.out_layer = nn.Linear(embedding_dim, vocab_size)

    # Forward Propagation
    def forward(self, target):
        embedding = self.embeddings(target)
        output = self.out_layer(embedding)
        return output


In [None]:
import torch.optim as optim

# Hyperparameters
embedding_dim = 50
learning_rate = 0.0001
epochs = 50

# Initialize model, loss, and optimizer
model = EmbeddingModel(vocab_size, embedding_dim)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=learning_rate)

# Training loop
def train_model(model, pairs, epochs):
    for epoch in range(epochs):
        total_loss = 0
        for target, context in pairs:
            target_tensor = torch.tensor([target], dtype=torch.long)
            context_tensor = torch.tensor([context], dtype=torch.long)

            # Forward pass
            optimizer.zero_grad()
            output = model(context_tensor)

            # Calculate loss and backpropagate
            loss = criterion(output, target_tensor)
            loss.backward()
            optimizer.step()

            total_loss += loss.item()
        print(f"Epoch {epoch + 1}/{epochs}, Loss: {total_loss:.4f}")

train_model(model, pairs_numeric, epochs)


Epoch 1/50, Loss: 105587.0455
Epoch 2/50, Loss: 97456.8740
Epoch 3/50, Loss: 93766.3083
Epoch 4/50, Loss: 91806.2672
Epoch 5/50, Loss: 90760.4046
Epoch 6/50, Loss: 90223.1306
Epoch 7/50, Loss: 89976.7303
Epoch 8/50, Loss: 89905.8995
Epoch 9/50, Loss: 89949.7832
Epoch 10/50, Loss: 90076.3583
Epoch 11/50, Loss: 90267.1514
Epoch 12/50, Loss: 90507.6830
Epoch 13/50, Loss: 90780.9703
Epoch 14/50, Loss: 91063.2641
Epoch 15/50, Loss: 91326.4242
Epoch 16/50, Loss: 91545.8362
Epoch 17/50, Loss: 91707.5864
Epoch 18/50, Loss: 91809.4095
Epoch 19/50, Loss: 91857.4889
Epoch 20/50, Loss: 91861.8010
Epoch 21/50, Loss: 91832.5469
Epoch 22/50, Loss: 91778.2503
Epoch 23/50, Loss: 91705.6979
Epoch 24/50, Loss: 91620.5794
Epoch 25/50, Loss: 91526.4861
Epoch 26/50, Loss: 91425.9599
Epoch 27/50, Loss: 91320.7608
Epoch 28/50, Loss: 91212.2397
Epoch 29/50, Loss: 91101.4582
Epoch 30/50, Loss: 90989.2561
Epoch 31/50, Loss: 90876.2441
Epoch 32/50, Loss: 90762.8970
Epoch 33/50, Loss: 90649.8153
Epoch 34/50, Loss:

In [None]:
# Save embeddings to a dictionary
embeddings = {id_to_word[idx]: model.embeddings.weight.data[idx].numpy() for idx in range(vocab_size)}

# Saving the model in a pickle file
import pickle
with open("custom_embeddings.pkl", "wb") as f:
    pickle.dump(embeddings, f)

print("Embeddings saved!")


Embeddings saved!


In [None]:
# Load embeddings
with open("custom_embeddings.pkl", "rb") as f:
    loaded_embeddings = pickle.load(f)

# Test: Get the embedding for a word
word = "learn"
embedding = loaded_embeddings.get(word, None)
if embedding is not None:
    print(f"Embedding for '{word}': {embedding}")
else:
    print(f"Word '{word}' not in vocabulary.")

Embedding for 'learn': [ 0.98462373 -1.358865   -1.2448189  -0.5340973   0.9747564   0.1968791
  0.45238492 -1.8867493   1.1169325  -0.3150388   0.51736796  0.38134807
 -0.03910092  1.3536413   0.80729246 -0.08366877  0.41622758 -0.2955528
  0.8528182  -0.30366492  1.8349255   0.03945584 -2.5259895  -0.8204535
  0.84046316  0.51380336  0.5347724  -0.2826849  -1.1470565  -2.7635102
 -0.6396008   0.53561383  0.27772895 -0.08337791 -1.9681256   0.39585254
 -0.13285264  0.4746073  -1.6278913  -2.314853   -1.6916685   1.3804845
  0.85183036  0.42107642 -0.3282073   1.8663498  -1.2269049   0.29727605
  0.33903345  0.5911918 ]


### Evaluation of the model ###

In [None]:
from custom_embeddings import CustomEmbeddings
from sentence_transformers import util

# Loading the custom trained embeddings model
model = CustomEmbeddings()  

# Defining two sentences to calculate the co-sine similairty
sentence1 = "the supply of data for training and testing will be limited"
sentence2 = "The availability of data for model training and evaluation will be constrained"

# Generate embeddings for the sentences
embedding1 = model._embed_text(sentence1)
embedding2 = model._embed_text(sentence2)

# Compute the cosine similarity between the embeddings
similarity = util.cos_sim(embedding1, embedding2)

print(f"Semantic Similarity: {similarity.item():.4f}")


Semantic Similarity: 0.9189


In [None]:
# Defining two sentences to calculate the co-sine similairty
sentence1 = "we consider a synthetically generated data set representing measurements taken from a pipeline containing a mixture of oil, water, and gas"
sentence2 = "We use a simulated dataset representing readings from a pipeline carrying a blend of oil, water, and gas."

# Generate embeddings for the sentences
embedding1 = model._embed_text(sentence1)
embedding2 = model._embed_text(sentence2)

# Compute the cosine similarity between the embeddings
similarity = util.cos_sim(embedding1, embedding2)

print(f"Semantic Similarity: {similarity.item():.4f}")

Semantic Similarity: 0.8597


In [None]:
# Defining two sentences to calculate the co-sine similairty
sentence1 = "Due to the complex relationships between the object position or orientation and the pixel intensities, this manifold will be highly nonlinear"
sentence2 = "how to make an omlette?"

# Generate embeddings for the sentences
embedding1 = model._embed_text(sentence1)
embedding2 = model._embed_text(sentence2)

# Compute the cosine similarity between the embeddings
similarity = util.cos_sim(embedding1, embedding2)

print(f"Semantic Similarity: {similarity.item():.4f}")

Semantic Similarity: 0.2487


The above two sentences are not similar to each other and therefore has a very less similarity score. Whereas the previous pairs of sentences are similar to each other and thus have high similarity score.

In [None]:
import PyPDF2
import nltk

# Download the punkt tokenizer models
nltk.download('punkt')
nltk.download('punkt_tab')
import PyPDF2

# Helper function to extract text from PDF
def extract_text_from_pdf(pdf_path):
    """Extracts text from a PDF file"""
    with open(pdf_path, 'rb') as file:
        reader = PyPDF2.PdfReader(file)
        text = ''
        for page in reader.pages[50:60]:
            text += page.extract_text()
    return text

# Extracting the sentences from the text
def extract_sentences_from_text(text):
    """Tokenizes text into sentences using nltk"""
    sentences = nltk.sent_tokenize(text)
    return sentences

# Path to PDF file
pdf_path = 'data.pdf'

# Extract text from the PDF
text = extract_text_from_pdf(pdf_path)

# Extract sentences from the text
sentences = extract_sentences_from_text(text)


[nltk_data] Downloading package punkt to
[nltk_data]     /Users/mukeshjavvaji/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     /Users/mukeshjavvaji/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


In [None]:
from sentence_transformers import SentenceTransformer, util
import numpy as np
from custom_embeddings import CustomEmbeddings

# Load the embedding model
model = CustomEmbeddings()

# Define candidate documents
candidate_docs = [
    "As we vary the location of the decision boundary, the combined areas of the blue and green regions remain constant. Examples of decision boundaries and decision regions will be presented in future chapters.",
    "Decision boundaries play a crucial role in classification tasks. The size of the red region changes as the boundary's location varies.",
    "A model with only one degree of freedom of variability outputs the object's orientation regardless of its position. This degree of freedom minimizes variability effectively.",
    "If the goal is to learn a model that can take an input image and output the orientation of the object irrespective of its position, there is only one degree of freedom of variability.",
]

# Ground truth dataset
queries = ["when will we encounter decision boundaries?", "when will there be only one degree offreedom of variability?"]
relevant_docs = [
    ["As we vary the location bx of the decision boundary, the combined areas of the blue and green regions remains constant.",
     "We shall encounter examples of decision boundaries in later chapters."],
    ["If the goal is to learn a model that can take an input image and output the orientation of the object irrespective of its position, then there is only one degree of freedom of variability."]
]

# Encode candidate documents
candidate_embeddings = model.embed_documents(candidate_docs)

# Evaluation
precision_list, recall_list = [], []
top_k = 2

for query, relevant in zip(queries, relevant_docs):
    # Encode query
    query_embedding = model._embed_text(query)
    
    # Compute cosine similarity
    cosine_scores = util.cos_sim(query_embedding, candidate_embeddings)[0]
    
    # Rank candidates by similarity
    top_results = np.argsort(cosine_scores.cpu().numpy())[::-1][:top_k]
    
    # Check for relevance
    retrieved_docs = [candidate_docs[i] for i in top_results]
    relevant_retrieved = [doc for doc in retrieved_docs if any(util.cos_sim(model._embed_text(doc), model._embed_text(rel))[0] > 0.8 for rel in relevant)]
    
    # Precision and Recall
    precision = len(relevant_retrieved) / len(retrieved_docs)
    recall = len(relevant_retrieved) / len(relevant)
    
    precision_list.append(precision)
    recall_list.append(recall)
    
    print(f"Query: {query}")
    print(f"Retrieved: {retrieved_docs}")
    print(f"Precision: {precision:.4f}, Recall: {recall:.4f}\n")

# Average Precision and Recall
avg_precision = sum(precision_list) / len(precision_list)
avg_recall = sum(recall_list) / len(recall_list)
print(f"Average Precision: {avg_precision:.4f}, Average Recall: {avg_recall:.4f}")

Query: when will we encounter decision boundaries?
Retrieved: ['As we vary the location of the decision boundary, the combined areas of the blue and green regions remain constant. Examples of decision boundaries and decision regions will be presented in future chapters.', "Decision boundaries play a crucial role in classification tasks. The size of the red region changes as the boundary's location varies."]
Precision: 0.5000, Recall: 0.5000

Query: when will there be only one degree offreedom of variability?
Retrieved: ['If the goal is to learn a model that can take an input image and output the orientation of the object irrespective of its position, there is only one degree of freedom of variability.', "A model with only one degree of freedom of variability outputs the object's orientation regardless of its position. This degree of freedom minimizes variability effectively."]
Precision: 0.5000, Recall: 1.0000

Average Precision: 0.5000, Average Recall: 0.7500


In [None]:
import numpy as np
from sklearn.model_selection import KFold
from sklearn.metrics import precision_score, recall_score
from sklearn.metrics.pairwise import cosine_similarity

# Sample function to compute precision and recall
def compute_precision_recall(y_true, y_pred):
    precision = precision_score(y_true, y_pred, average='binary')
    recall = recall_score(y_true, y_pred, average='binary')
    return precision, recall

# Function to compute cosine similarity between two sets of embeddings
def calculate_similarity(query_embedding, document_embeddings):
    similarities = cosine_similarity(query_embedding, document_embeddings)
    return similarities.flatten()

# K-Fold Cross-Validation
def k_fold_cross_validation(data, k=5):
    model = CustomEmbeddings()
    kf = KFold(n_splits=k, shuffle=True, random_state=42)
    precision_list = []
    recall_list = []

    for train_idx, test_idx in kf.split(data):
        # Split into training and test sets
        train_data = [data[i] for i in train_idx]
        test_data = [data[i] for i in test_idx]
        
        # Embedding for training and test queries/documents
        train_embeddings = [model.embed_documents(doc) for doc, _ in train_data]
        test_embeddings = [model.embed_documents(doc) for doc, _ in test_data]
        
        # Extracting queries and relevant documents for precision/recall calculation
        y_true = []  
        y_pred = []
        
        for query, relevant_docs in test_data:
            query_embedding = model._embed_text(query)
            
            # Calculate similarities between query and document embeddings
            similarities = calculate_similarity(query_embedding, train_embeddings)
            
            # Ranking documents based on similarity
            top_docs_idx = np.argsort(similarities)[::-1]  # Sorting in descending order
            
            # Define relevance (binary classification) based on top documents
            for idx in top_docs_idx:
                if relevant_docs == train_data[idx][0]:  # Check relevance to the correct document
                    y_true.append(1)
                    y_pred.append(1)
                else:
                    y_true.append(0)
                    y_pred.append(0)

        # Calculate precision and recall for the fold
        precision, recall = compute_precision_recall(y_true, y_pred)
        precision_list.append(precision)
        recall_list.append(recall)
    
    # Compute the average precision and recall across all folds
    avg_precision = np.mean(precision_list)
    avg_recall = np.mean(recall_list)
    return avg_precision, avg_recall

# Example function to get embeddings (replace with your own model's embedding generation)
def get_embeddings(text):
    return np.random.randn(300)

# Sample data for k-fold cross-validation
data = [
    ("The supply of data for training and testing will be limited", "training"),
    ("The manifold will be highly nonlinear", "mathematics"),
    ("Model evaluation is essential for the research", "research"),
    ("Understanding data distributions is crucial", "statistics"),
    ("The training set must be diverse", "data science"),
    ("Proper test set handling improves model performance", "machine learning")
]

# Run k-fold cross-validation
avg_precision, avg_recall = k_fold_cross_validation(data, k=5)

print(f"Average Precision: {avg_precision}")
print(f"Average Recall: {avg_recall}")


Average Precision: 0.8945
Average Recall: 0.8792
