In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import nltk
from nltk.tokenize import word_tokenize
from transformers import BertTokenizer, BertModel
from sklearn.metrics import mean_squared_error
import pandas as pd
from tqdm import tqdm

In [None]:
# Download NLTK resources
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [None]:
# Load pre-trained BERT model and tokenizer
model_name = "bert-base-uncased"
tokenizer = BertTokenizer.from_pretrained(model_name)
bert_model = BertModel.from_pretrained(model_name)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
bert_model.to(device)
bert_model.eval()

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

BertModel(
  (embeddings): BertEmbeddings(
    (word_embeddings): Embedding(30522, 768, padding_idx=0)
    (position_embeddings): Embedding(512, 768)
    (token_type_embeddings): Embedding(2, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): BertEncoder(
    (layer): ModuleList(
      (0-11): 12 x BertLayer(
        (attention): BertAttention(
          (self): BertSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): BertSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
            (dropout): Dropout(p=0.1, inplace=False)
  

In [None]:
# Function to read and preprocess dataset from .txt file
def read_and_preprocess_txt(filepath):
    with open(filepath, 'r', encoding='utf-8-sig') as file:  # Use utf-8-sig to handle BOM
        lines = file.readlines()[1:]  # Skip the header line

    data = []
    for line in lines:
        line = line.strip().split('\t')
        quality = int(line[0])
        sentence1 = line[3]
        sentence2 = line[4]
        data.append({'Quality': quality, '#1 String': sentence1, '#2 String': sentence2})

    df = pd.DataFrame(data)
    df = preprocess_dataset(df)
    return df

In [None]:
# Define Dataset class
class SentenceDataset(Dataset):
    def __init__(self, data):
        self.data = data

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        row = self.data.iloc[idx]
        sentence1 = row['#1 String']
        sentence2 = row['#2 String']
        similarity_score = row['Quality']
        inputs = tokenizer(sentence1, sentence2, return_tensors="pt", padding=True, truncation=True)
        inputs.to(device)
        return inputs, similarity_score

In [None]:
# Define neural network model
class SentenceSimilarityModel(nn.Module):
    def __init__(self, bert_model):
        super(SentenceSimilarityModel, self).__init__()
        self.bert_model = bert_model
        self.fc = nn.Linear(768, 1)

    def forward(self, inputs):
        with torch.no_grad():
            outputs = self.bert_model(**inputs)
        pooled_output = outputs.pooler_output
        similarity_score = torch.sigmoid(self.fc(pooled_output))
        return similarity_score

In [None]:
# Function to preprocess the dataset
def preprocess_dataset(data):
    data['Quality'] = data['Quality'].astype(float)
    return data

In [None]:
# Load and preprocess the dataset
train_data = read_and_preprocess_txt("/content/msr_paraphrase_train.txt")

In [None]:
# Create DataLoader for training with collate_fn
def collate_fn(batch):
    inputs = [item[0] for item in batch]
    labels = [item[1] for item in batch]

    max_length = max(len(inputs[i]['input_ids'][0]) for i in range(len(inputs)))
    for i in range(len(inputs)):
        input_ids = inputs[i]['input_ids'][0]
        attention_mask = inputs[i]['attention_mask'][0]
        pad_length = max_length - len(input_ids)
        inputs[i]['input_ids'] = torch.cat((input_ids, torch.zeros(pad_length, dtype=torch.long)), dim=0)
        inputs[i]['attention_mask'] = torch.cat((attention_mask, torch.zeros(pad_length, dtype=torch.long)), dim=0)

    return {'input_ids': torch.stack([inputs[i]['input_ids'] for i in range(len(inputs))]),
            'attention_mask': torch.stack([inputs[i]['attention_mask'] for i in range(len(inputs))]),
            'labels': torch.tensor(labels)}

In [None]:
train_dataset = SentenceDataset(train_data)
train_loader = DataLoader(train_dataset, batch_size=8, shuffle=True, collate_fn=collate_fn)

In [None]:
# Define the model, loss function, and optimizer
model = SentenceSimilarityModel(bert_model)
model.to(device)
criterion = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr=2e-5)

In [None]:
# Function to train the model
def train_model(model, train_loader, criterion, optimizer, num_epochs=10):
    model.train()
    for epoch in range(num_epochs):
        running_loss = 0.0
        for batch in tqdm(train_loader, desc=f"Epoch {epoch+1}/{num_epochs}"):
            inputs = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)

            optimizer.zero_grad()

            outputs = model({'input_ids': inputs, 'attention_mask': attention_mask})
            loss = criterion(outputs, labels.unsqueeze(1).float())
            loss.backward()
            optimizer.step()

            running_loss += loss.item() * inputs.size(0)

        epoch_loss = running_loss / len(train_loader.dataset)
        print(f"Epoch {epoch+1}/{num_epochs}, Loss: {epoch_loss:.4f}")

In [None]:
# Train the model
train_model(model, train_loader, criterion, optimizer)

Epoch 1/10: 100%|██████████| 510/510 [18:00<00:00,  2.12s/it]


Epoch 1/10, Loss: 0.2181


Epoch 2/10: 100%|██████████| 510/510 [17:32<00:00,  2.06s/it]


Epoch 2/10, Loss: 0.2174


Epoch 3/10: 100%|██████████| 510/510 [17:34<00:00,  2.07s/it]


Epoch 3/10, Loss: 0.2161


Epoch 4/10: 100%|██████████| 510/510 [17:43<00:00,  2.08s/it]


Epoch 4/10, Loss: 0.2156


Epoch 5/10: 100%|██████████| 510/510 [17:46<00:00,  2.09s/it]


Epoch 5/10, Loss: 0.2142


Epoch 6/10: 100%|██████████| 510/510 [17:44<00:00,  2.09s/it]


Epoch 6/10, Loss: 0.2139


Epoch 7/10: 100%|██████████| 510/510 [17:44<00:00,  2.09s/it]


Epoch 7/10, Loss: 0.2131


Epoch 8/10: 100%|██████████| 510/510 [17:43<00:00,  2.09s/it]


Epoch 8/10, Loss: 0.2130


Epoch 9/10: 100%|██████████| 510/510 [17:37<00:00,  2.07s/it]


Epoch 9/10, Loss: 0.2123


Epoch 10/10: 100%|██████████| 510/510 [17:28<00:00,  2.06s/it]

Epoch 10/10, Loss: 0.2118





In [15]:
torch.save(model.state_dict(), "./similarity_model.pt")

In [30]:
# Evaluate the model

def evaluate_model(model, test_loader, threshold=0.5):
    model.eval()
    predictions = []
    labels = []
    correct_predictions = 0
    total_samples = 0

    with torch.no_grad():
        for batch in tqdm(test_loader, desc="Evaluating"):
            inputs = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            batch_labels = batch['labels'].numpy()

            outputs = model(
                {'input_ids': inputs, 'attention_mask': attention_mask})
            predictions.extend(outputs.cpu().numpy())
            labels.extend(batch_labels)

            # Convert similarity scores to binary predictions based on the threshold
            binary_predictions = [
                1 if pred >= threshold else 0 for pred in outputs.cpu().numpy()]

            # Calculate accuracy
            correct_predictions += sum([1 if pred == label else 0 for pred,
                                       label in zip(binary_predictions, batch_labels)])
            total_samples += len(batch_labels)

    mse = mean_squared_error(labels, predictions)
    accuracy = correct_predictions / total_samples
    return mse, accuracy

In [31]:
# Load and preprocess the test dataset
test_data = read_and_preprocess_txt("msr_paraphrase_test.txt")

In [32]:
# Create DataLoader for testing
test_dataset = SentenceDataset(test_data)
test_loader = DataLoader(test_dataset, batch_size=8,shuffle=False, collate_fn=collate_fn)

In [36]:
# Load the trained model
model = SentenceSimilarityModel(bert_model)
# Load the saved model weights
model.load_state_dict(torch.load("similarity_model.pt"))
model.to(device)

SentenceSimilarityModel(
  (bert_model): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12,

In [37]:
# Evaluate the model
mse, accuracy = evaluate_model(model, test_loader)
print(f"Mean Squared Error on test set: {mse:.4f}")
print(f"Accuracy on test set: {accuracy:.4f}")

Evaluating: 100%|██████████| 216/216 [07:26<00:00,  2.07s/it]

Mean Squared Error on test set: 0.2162
Accuracy on test set: 0.6649





In [42]:
def tokenize_and_preprocess(sentence):
    tokens = word_tokenize(sentence)
    return " ".join(tokens)

In [43]:
# Function to compute similarity between two sentences
def compute_similarity(model, sentence1, sentence2):
    model.eval()
    sentence1 = tokenize_and_preprocess(sentence1)
    sentence2 = tokenize_and_preprocess(sentence2)
    if(sentence1==sentence2):
        return 1.00
    else :
        inputs = tokenizer(sentence1, sentence2, return_tensors="pt", padding=True, truncation=True)
        inputs.to(device)
        with torch.no_grad():
            outputs = model(inputs)
        similarity_score = outputs.item()
        return similarity_score

In [44]:
# Define a threshold for considering sentences as paraphrases
threshold = 0.55
# Function to determine if sentences are paraphrases based on similarity score
def are_paraphrases(sentence1, sentence2, threshold):

    similarity_score = compute_similarity(model, sentence1, sentence2)
    print("Similarity score between the sentences:", similarity_score)
    return similarity_score >= threshold

In [55]:
# Example 1 :

sentence1 = "Her laughter echoed through the halls, filling the room with joy and merriment.".strip()
sentence2 = "The sound of her laughter reverberated throughout the corridors, saturating the space with happiness and cheer.".strip()
print("Sentence 1 : ",sentence1)
print("Sentence 2 : ",sentence2)

Sentence 1 :  Her laughter echoed through the halls, filling the room with joy and merriment.
Sentence 2 :  The sound of her laughter reverberated throughout the corridors, saturating the space with happiness and cheer.


In [56]:
if are_paraphrases(sentence1, sentence2, threshold):
    print("The sentences are paraphrases")
else:
    print("The sentences are not paraphrases.")

Similarity score between the sentences: 0.7070431113243103
The sentences are paraphrases


In [57]:
# Example 2 :

sentence1 ="She pirouetted gracefully across the stage, her movements as fluid as a swan gliding on a tranquil lake."
sentence2 = "The ancient ruins stood stoically against the test of time, silent witnesses to the passage of centuries and civilizations."
print("Sentence 1 : ",sentence1)
print("Sentence 2 : ",sentence2)

Sentence 1 :  She pirouetted gracefully across the stage, her movements as fluid as a swan gliding on a tranquil lake.
Sentence 2 :  The ancient ruins stood stoically against the test of time, silent witnesses to the passage of centuries and civilizations.


In [58]:
if are_paraphrases(sentence1, sentence2, threshold):
    print("The sentences are paraphrases")
else:
    print("The sentences are not paraphrases.")

Similarity score between the sentences: 0.28974154591560364
The sentences are not paraphrases.
