In [1]:
import torch
from torch import nn
from torch.utils.data import DataLoader, Dataset
from datasets import load_dataset
import re
from torch.nn.functional import cosine_similarity

# Simple tokenizer function
def simple_tokenizer(text):
    text = re.sub(r"[^a-zA-Z\s]", "", text.lower())
    return text.split()

# Vocabulary class to handle word to index mapping
class Vocabulary:
    def __init__(self):
        self.word2idx = {}
        self.idx2word = {}
        self.idx = 0

    def add_word(self, word):
        if word not in self.word2idx:
            self.word2idx[word] = self.idx
            self.idx2word[self.idx] = word
            self.idx += 1

    def __len__(self):
        return len(self.word2idx)

# Dataset class to load and preprocess text data
class TextDataset(Dataset):
    def __init__(self, dataset_split):
        self.pairs = []
        self.labels = []
        self.vocab = Vocabulary()
        self.max_len = 0  # Initialize max_len
        
        # Extract pairs and labels
        for i in range(len(dataset_split)):
            sentence1 = dataset_split[i]['sentence1']
            sentence2 = dataset_split[i]['sentence2']
            label = dataset_split[i]['label']
            self.pairs.append((sentence1, sentence2))
            self.labels.append(label)
            
            # Tokenize and add to vocab
            tokens1 = simple_tokenizer(sentence1)
            tokens2 = simple_tokenizer(sentence2)
            for token in tokens1 + tokens2:
                self.vocab.add_word(token)
            
            # Update max_len for consistent padding
            self.max_len = max(self.max_len, len(tokens1), len(tokens2))

    def __len__(self):
        return len(self.pairs)

    def __getitem__(self, idx):
        sentence1, sentence2 = self.pairs[idx]
        label = self.labels[idx]
        
        indices1 = [self.vocab.word2idx[token] for token in simple_tokenizer(sentence1) if token in self.vocab.word2idx]
        indices2 = [self.vocab.word2idx[token] for token in simple_tokenizer(sentence2) if token in self.vocab.word2idx]
        
        padded1 = indices1 + [0] * (self.max_len - len(indices1))
        padded2 = indices2 + [0] * (self.max_len - len(indices2))
        
        return torch.tensor(padded1), torch.tensor(padded2), torch.tensor(label, dtype=torch.float32)

# Model for training word embeddings
class WordEmbeddingModel(nn.Module):
    def __init__(self, vocab_size, embedding_dim, pad_idx):
        super().__init__()
        self.embeddings = nn.Embedding(vocab_size, embedding_dim, padding_idx=pad_idx)

    def forward(self, inputs):
        embeds = self.embeddings(inputs)
        mask = (inputs != 0).unsqueeze(-1).type(torch.float32)  # Create a mask for non-padding tokens
        embeds_masked = embeds * mask  # Apply the mask to zero out padding embeddings
        # Sum embeddings along the time dimension (dim=1) and divide by the number of non-zero elements in each sequence
        sentence_embeddings = embeds_masked.sum(dim=1) / mask.sum(dim=1)
        return sentence_embeddings


# Contrastive loss function
class ContrastiveLoss(nn.Module):
    def __init__(self, margin=1.0):
        super(ContrastiveLoss, self).__init__()
        self.margin = margin

    def forward(self, output1, output2, label):
        euclidean_distance = nn.functional.pairwise_distance(output1, output2)
        loss_contrastive = torch.mean((1-label) * torch.pow(euclidean_distance, 2) +
                                      (label) * torch.pow(torch.clamp(self.margin - euclidean_distance, min=0.0), 2))
        return loss_contrastive

# Load dataset splits
dataset = load_dataset("PiC/phrase_similarity")
train_dataset = TextDataset(dataset['train'])
dev_dataset = TextDataset(dataset['validation'])

# Prepare DataLoader
train_loader = DataLoader(train_dataset, batch_size=4, shuffle=True)
dev_loader = DataLoader(dev_dataset, batch_size=4, shuffle=False)

# Model and training setup
embedding_dim = 100
model = WordEmbeddingModel(len(train_dataset.vocab), embedding_dim, pad_idx=0)
optimizer = torch.optim.Adam(model.parameters())
criterion = ContrastiveLoss(margin=2.0)

# Training loop
for epoch in range(10):
    total_loss = 0
    for sentence1, sentence2, labels in train_loader:
        optimizer.zero_grad()
        output1 = model(sentence1)
        output2 = model(sentence2)
        loss = criterion(output1, output2, labels.float())
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    print(f'Epoch [{epoch+1}/10], Loss: {total_loss / len(train_loader)}')


Epoch [1/10], Loss: 1.1264337411204588
Epoch [2/10], Loss: 1.0996047176339843
Epoch [3/10], Loss: 1.0719989789240296
Epoch [4/10], Loss: 1.0470329004141483
Epoch [5/10], Loss: 1.024531399332272
Epoch [6/10], Loss: 1.003704953625977
Epoch [7/10], Loss: 0.9846524847602245
Epoch [8/10], Loss: 0.9671675525073798
Epoch [9/10], Loss: 0.9506875607097306
Epoch [10/10], Loss: 0.9353960195587132


In [2]:
import torch
from torch.nn.functional import cosine_similarity

def get_sentence_embeddings(sentences, model, vocab, max_len=30):
    sentence_indices = []
    for sentence in sentences:
        tokens = simple_tokenizer(sentence)
        indices = [vocab.word2idx[token] for token in tokens if token in vocab.word2idx]
        padded_indices = indices + [0] * (max_len - len(indices))
        sentence_indices.append(padded_indices)
    
    sentence_tensor = torch.tensor(sentence_indices)
    model.eval()
    with torch.no_grad():
        embeddings = model(sentence_tensor)
    return embeddings

def compare_sentence_similarity(embedding1, embedding2):
    similarity = cosine_similarity(embedding1.unsqueeze(0), embedding2.unsqueeze(0))
    return similarity.item()



sentences = ["Disciplined student","The fruit is very tasty"]
#embedding dimensions
embeddings = get_sentence_embeddings(sentences, model, train_dataset.vocab)
print("Embedding shapes:", [e.shape for e in embeddings])

# If the embedding shapes are correct, compute similarity
if all(e.dim() == 1 for e in embeddings):
    similarity = compare_sentence_similarity(embeddings[0], embeddings[1])
    print(f"Similarity: {similarity}")
else:
    print("Error: Embeddings are not 1D vectors.")



Embedding shapes: [torch.Size([100]), torch.Size([100])]
Similarity: 0.04329507797956467


In [3]:
import torch
from torch.nn.functional import cosine_similarity

def get_sentence_embeddings(sentences, model, vocab, max_len=30):
    sentence_indices = []
    for sentence in sentences:
        tokens = simple_tokenizer(sentence)
        indices = [vocab.word2idx[token] for token in tokens if token in vocab.word2idx]
        padded_indices = indices + [0] * (max_len - len(indices))
        sentence_indices.append(padded_indices)
    
    sentence_tensor = torch.tensor(sentence_indices)
    model.eval()  
    with torch.no_grad():
        embeddings = model(sentence_tensor)
    return embeddings

def compare_sentence_similarity(embedding1, embedding2):
    similarity = cosine_similarity(embedding1.unsqueeze(0), embedding2.unsqueeze(0))
    return similarity.item()



sentences = ["A major criticism of litigation funding is that its cost is disproportionate to the risk accepted by litigation finance companies.", "A Harsh outspoken discussion of litigation funding is that its cost is disproportionate to the risk accepted by litigation finance companies."]

embeddings = get_sentence_embeddings(sentences, model, train_dataset.vocab)
print("Embedding shapes:", [e.shape for e in embeddings])

# If the embedding shapes are correct, compute similarity
if all(e.dim() == 1 for e in embeddings):
    similarity = compare_sentence_similarity(embeddings[0], embeddings[1])
    print(f"Similarity: {similarity}")
else:
    print("Error: Embeddings are not 1D vectors.")



Embedding shapes: [torch.Size([100]), torch.Size([100])]
Similarity: 0.9591876864433289
