## Sentence-BERT

In [24]:
import os
import math
import re
from   random import *
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F

# Set GPU device
# os.environ["CUDA_VISIBLE_DEVICES"] = "0"

# os.environ['http_proxy']  = 'http://192.41.170.23:3128'
# os.environ['https_proxy'] = 'http://192.41.170.23:3128'

# device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device = torch.device("cpu")
device

device(type='cpu')

## Data

### Train, Test, Validation 

In [25]:
import datasets
snli = datasets.load_dataset('snli')
mnli = datasets.load_dataset('glue', 'mnli')
mnli['train'].features, snli['train'].features

({'premise': Value('string'),
  'hypothesis': Value('string'),
  'label': ClassLabel(names=['entailment', 'neutral', 'contradiction']),
  'idx': Value('int32')},
 {'premise': Value('string'),
  'hypothesis': Value('string'),
  'label': ClassLabel(names=['entailment', 'neutral', 'contradiction'])})

In [26]:
# torch.cuda.empty_cache()

In [27]:
# List of datasets to remove 'idx' column from
mnli.column_names.keys()

dict_keys(['train', 'validation_matched', 'validation_mismatched', 'test_matched', 'test_mismatched'])

In [28]:
# Remove 'idx' column from each dataset
for column_names in mnli.column_names.keys():
    mnli[column_names] = mnli[column_names].remove_columns('idx')

In [29]:
mnli.column_names.keys()

dict_keys(['train', 'validation_matched', 'validation_mismatched', 'test_matched', 'test_mismatched'])

In [30]:
import numpy as np
np.unique(mnli['train']['label']), np.unique(snli['train']['label'])
#snli also have -1

(array([0, 1, 2]), array([-1,  0,  1,  2]))

In [31]:
# there are -1 values in the label feature, these are where no class could be decided so we remove
snli = snli.filter(
    lambda x: 0 if x['label'] == -1 else 1
)

In [32]:
import numpy as np
np.unique(mnli['train']['label']), np.unique(snli['train']['label'])
#snli also have -1

(array([0, 1, 2]), array([0, 1, 2]))

In [33]:
# Assuming you have your two DatasetDict objects named snli and mnli
from datasets import DatasetDict
# Merge the two DatasetDict objects
raw_dataset = DatasetDict({
    'train': datasets.concatenate_datasets([snli['train'], mnli['train']]).shuffle(seed=55).select(list(range(1000))),
    'test': datasets.concatenate_datasets([snli['test'], mnli['test_mismatched']]).shuffle(seed=55).select(list(range(100))),
    'validation': datasets.concatenate_datasets([snli['validation'], mnli['validation_mismatched']]).shuffle(seed=55).select(list(range(1000)))
})
#remove .select(list(range(1000))) in order to use full dataset
# Now, merged_dataset_dict contains the combined datasets from snli and mnli
raw_dataset

DatasetDict({
    train: Dataset({
        features: ['premise', 'hypothesis', 'label'],
        num_rows: 1000
    })
    test: Dataset({
        features: ['premise', 'hypothesis', 'label'],
        num_rows: 100
    })
    validation: Dataset({
        features: ['premise', 'hypothesis', 'label'],
        num_rows: 1000
    })
})

## Preprocessing

In [34]:
from transformers import BertTokenizer

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

In [35]:

def preprocess_function(examples):
    max_seq_length = 128
    padding = 'max_length'
    # Tokenize the premise
    premise_result = tokenizer(
        examples['premise'], padding=padding, max_length=max_seq_length, truncation=True)
    #num_rows, max_seq_length
    # Tokenize the hypothesis
    hypothesis_result = tokenizer(
        examples['hypothesis'], padding=padding, max_length=max_seq_length, truncation=True)
    #num_rows, max_seq_length
    # Extract labels
    labels = examples["label"]
    #num_rows
    return {
        "premise_input_ids": premise_result["input_ids"],
        "premise_attention_mask": premise_result["attention_mask"],
        "hypothesis_input_ids": hypothesis_result["input_ids"],
        "hypothesis_attention_mask": hypothesis_result["attention_mask"],
        "labels" : labels
    }

tokenized_datasets = raw_dataset.map(
    preprocess_function,
    batched=True,
)

tokenized_datasets = tokenized_datasets.remove_columns(['premise','hypothesis','label'])
tokenized_datasets.set_format("torch")

Map: 100%|██████████| 1000/1000 [00:00<00:00, 13150.43 examples/s]


In [36]:
tokenized_datasets

DatasetDict({
    train: Dataset({
        features: ['premise_input_ids', 'premise_attention_mask', 'hypothesis_input_ids', 'hypothesis_attention_mask', 'labels'],
        num_rows: 1000
    })
    test: Dataset({
        features: ['premise_input_ids', 'premise_attention_mask', 'hypothesis_input_ids', 'hypothesis_attention_mask', 'labels'],
        num_rows: 100
    })
    validation: Dataset({
        features: ['premise_input_ids', 'premise_attention_mask', 'hypothesis_input_ids', 'hypothesis_attention_mask', 'labels'],
        num_rows: 1000
    })
})

## Data Loader

In [37]:
from torch.utils.data import DataLoader

# initialize the dataloader
batch_size = 8
train_dataloader = DataLoader(
    tokenized_datasets['train'], 
    batch_size=batch_size, 
    shuffle=True
)
eval_dataloader = DataLoader(
    tokenized_datasets['validation'], 
    batch_size=batch_size
)
test_dataloader = DataLoader(
    tokenized_datasets['test'], 
    batch_size=batch_size
)

In [38]:
for batch in train_dataloader:
    print(batch['premise_input_ids'].shape)
    print(batch['premise_attention_mask'].shape)
    print(batch['hypothesis_input_ids'].shape)
    print(batch['hypothesis_attention_mask'].shape)
    print(batch['labels'].shape)
    break

torch.Size([8, 128])
torch.Size([8, 128])
torch.Size([8, 128])
torch.Size([8, 128])
torch.Size([8])


In [39]:
import os
os.getcwd()

'd:\\AIT\\Semester II\\Natural Language Processing\\Code\\A4-BERT_Sentence'

## Model

In [40]:
# start from a pretrained bert-base-uncased model
from transformers import BertTokenizer, BertModel
model = BertModel.from_pretrained('bert-base-uncased')
model.to(device)

Loading weights: 100%|██████████| 199/199 [00:00<00:00, 1512.74it/s, Materializing param=pooler.dense.weight]                               
[1mBertModel LOAD REPORT[0m from: bert-base-uncased
Key                                        | Status     |  | 
-------------------------------------------+------------+--+-
cls.seq_relationship.weight                | UNEXPECTED |  | 
cls.predictions.transform.dense.bias       | UNEXPECTED |  | 
cls.predictions.transform.LayerNorm.bias   | UNEXPECTED |  | 
cls.predictions.transform.LayerNorm.weight | UNEXPECTED |  | 
cls.seq_relationship.bias                  | UNEXPECTED |  | 
cls.predictions.bias                       | UNEXPECTED |  | 
cls.predictions.transform.dense.weight     | UNEXPECTED |  | 

[3mNotes:
- UNEXPECTED[3m	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.[0m


BertModel(
  (embeddings): BertEmbeddings(
    (word_embeddings): Embedding(30522, 768, padding_idx=0)
    (position_embeddings): Embedding(512, 768)
    (token_type_embeddings): Embedding(2, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): BertEncoder(
    (layer): ModuleList(
      (0-11): 12 x BertLayer(
        (attention): BertAttention(
          (self): BertSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): BertSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
            (dropout): Dropout(p=0.1, inplace=False)
  

In [41]:
# from berts import *

# # load the model and all its hyperparameters
# load_path = './model/model_bert.pt'
# params, state = torch.load(load_path)
# model = BERT(**params, device=device).to(device)
# model.load_state_dict(state)

### Pooling
SBERT adds a pooling operation to the output of BERT / RoBERTa to derive a fixed sized sentence embedding

In [42]:
# define mean pooling function
def mean_pool(token_embeds, attention_mask):
    # reshape attention_mask to cover 768-dimension embeddings
    in_mask = attention_mask.unsqueeze(-1).expand(
        token_embeds.size()
    ).float()
    # perform mean-pooling but exclude padding tokens (specified by in_mask)
    pool = torch.sum(token_embeds * in_mask, 1) / torch.clamp(
        in_mask.sum(1), min=1e-9
    )
    return pool

## 5. Loss Function

## Classification Objective Function 
We concatenate the sentence embeddings $u$ and $v$ with the element-wise difference  $\lvert u - v \rvert $ and multiply the result with the trainable weight  $ W_t ∈  \mathbb{R}^{3n \times k}  $:

$ o = \text{softmax}\left(W^T \cdot \left(u, v, \lvert u - v \rvert\right)\right) $

where $n$ is the dimension of the sentence embeddings and k the number of labels. We optimize cross-entropy loss. This structure is depicted in Figure 1.

## Regression Objective Function. 
The cosine similarity between the two sentence embeddings $u$ and $v$ is computed (Figure 2). We use means quared-error loss as the objective function.

(Manhatten / Euclidean distance, semantically  similar sentences can be found.)



In [43]:
def configurations(u,v):
    # build the |u-v| tensor
    uv = torch.sub(u, v)   # batch_size,hidden_dim
    uv_abs = torch.abs(uv) # batch_size,hidden_dim
    
    # concatenate u, v, |u-v|
    x = torch.cat([u, v, uv_abs], dim=-1) # batch_size, 3*hidden_dim
    return x

def cosine_similarity(u, v):
    dot_product = np.dot(u, v)
    norm_u = np.linalg.norm(u)
    norm_v = np.linalg.norm(v)
    similarity = dot_product / (norm_u * norm_v)
    return similarity

In [44]:
classifier_head = torch.nn.Linear(768*3, 3).to(device)

optimizer = torch.optim.Adam(model.parameters(), lr=2e-5)
optimizer_classifier = torch.optim.Adam(classifier_head.parameters(), lr=2e-5)

criterion = nn.CrossEntropyLoss()

In [45]:
from transformers import get_linear_schedule_with_warmup

# # and setup a warmup for the first ~10% steps
# total_steps = int(len(raw_dataset) / batch_size)
# warmup_steps = int(0.1 * total_steps)
# scheduler = get_linear_schedule_with_warmup(
# 		optimizer, num_warmup_steps=warmup_steps,
#   	num_training_steps=total_steps - warmup_steps
# )

# # then during the training loop we update the scheduler per step
# scheduler.step()

# scheduler_classifier = get_linear_schedule_with_warmup(
# 		optimizer_classifier, num_warmup_steps=warmup_steps,
#   	num_training_steps=total_steps - warmup_steps
# )

# # then during the training loop we update the scheduler per step
# scheduler_classifier.step()


total_steps = len(train_dataloader) * num_epoch

warmup_steps = int(0.1 * total_steps)

scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=warmup_steps,
    num_training_steps=total_steps
)

scheduler_classifier = get_linear_schedule_with_warmup(
    optimizer_classifier,
    num_warmup_steps=warmup_steps,
    num_training_steps=total_steps
)




## Training

In [46]:
from tqdm.auto import tqdm

num_epoch = 5
# 1 epoch should be enough, increase if wanted
for epoch in range(num_epoch):
    model.train()  
    classifier_head.train()
    # initialize the dataloader loop with tqdm (tqdm == progress bar)
    for step, batch in enumerate(tqdm(train_dataloader, leave=True)):
        # zero all gradients on each new step
        optimizer.zero_grad()
        optimizer_classifier.zero_grad()
        
        # prepare batches and more all to the active device
        inputs_ids_a = batch['premise_input_ids'].to(device)
        inputs_ids_b = batch['hypothesis_input_ids'].to(device)
        attention_a = batch['premise_attention_mask'].to(device)
        attention_b = batch['hypothesis_attention_mask'].to(device)
        label = batch['labels'].to(device)
        
        # extract token embeddings from BERT at last_hidden_state
        u = model(inputs_ids_a, attention_mask=attention_a)  
        v = model(inputs_ids_b, attention_mask=attention_b)  

        u_last_hidden_state = u.last_hidden_state # all token embeddings A = batch_size, seq_len, hidden_dim
        v_last_hidden_state = v.last_hidden_state # all token embeddings B = batch_size, seq_len, hidden_dim

         # get the mean pooled vectors
        u_mean_pool = mean_pool(u_last_hidden_state, attention_a) # batch_size, hidden_dim
        v_mean_pool = mean_pool(v_last_hidden_state, attention_b) # batch_size, hidden_dim
        
        # build the |u-v| tensor
        uv = torch.sub(u_mean_pool, v_mean_pool)   # batch_size,hidden_dim
        uv_abs = torch.abs(uv) # batch_size,hidden_dim
        
        # concatenate u, v, |u-v|
        x = torch.cat([u_mean_pool, v_mean_pool, uv_abs], dim=-1) # batch_size, 3*hidden_dim
        
        # process concatenated tensor through classifier_head
        x = classifier_head(x) #batch_size, classifer
        
        # calculate the 'softmax-loss' between predicted and true label
        loss = criterion(x, label)
        
        # using loss, calculate gradients and then optimizerize
        loss.backward()

        optimizer.step()
        optimizer_classifier.step()

        scheduler.step()
        scheduler_classifier.step()

        optimizer.zero_grad()
        optimizer_classifier.zero_grad()


        scheduler.step() # update learning rate scheduler
        scheduler_classifier.step()
        
    print(f'Epoch: {epoch + 1} | loss = {loss.item():.6f}')

100%|██████████| 125/125 [02:15<00:00,  1.08s/it]


Epoch: 1 | loss = 1.082131


100%|██████████| 125/125 [02:12<00:00,  1.06s/it]


Epoch: 2 | loss = 1.157864


100%|██████████| 125/125 [02:10<00:00,  1.05s/it]


Epoch: 3 | loss = 1.056731


100%|██████████| 125/125 [02:12<00:00,  1.06s/it]


Epoch: 4 | loss = 1.090918


100%|██████████| 125/125 [02:16<00:00,  1.09s/it]

Epoch: 5 | loss = 1.150446





In [47]:
# saving the model
torch.save([model.state_dict()], 'model/bert_sentence.pth')

In [50]:
from sklearn.metrics import classification_report, confusion_matrix

model.eval()
classifier_head.eval()

total_similarity = 0
each_preds = []
each_labels = []
each_probs = []
num_batches = len(eval_dataloader)

with torch.no_grad():
    for step, batch in enumerate(eval_dataloader):

        # Ensure model is on the correct device
        # device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        device = torch.device("cpu")  # Force CPU for evaluation
        model.to(device)

        # Move input tensors to the same device as the model
        inputs_ids_a = batch['premise_input_ids'].to(device)
        inputs_ids_b = batch['hypothesis_input_ids'].to(device)
        attention_a = batch['premise_attention_mask'].to(device)
        attention_b = batch['hypothesis_attention_mask'].to(device)
        label = batch['labels'].to(device)  # Ensure labels are also on the same device

        # Extract token embeddings from BERT (last hidden state)
        u = model(inputs_ids_a, attention_mask=attention_a)[0]  # Shape: (batch_size, seq_len, hidden_dim)
        v = model(inputs_ids_b, attention_mask=attention_b)[0]  # Shape: (batch_size, seq_len, hidden_dim)
    

        # get the mean pooled vectors
        u_mean_pool = mean_pool(u, attention_a)  # (batch_size, hidden_dim)
        v_mean_pool = mean_pool(v, attention_b)  # (batch_size, hidden_dim)

        # Compute cosine similarity for each sample in batch
        batch_similarity = torch.nn.functional.cosine_similarity(u_mean_pool, v_mean_pool, dim=-1)
        total_similarity += batch_similarity.sum().item()
        # similarity_score = cosine_similarity(u_mean_pool, v_mean_pool)
        # total_similarity += similarity_score

        # Prepare classifier input: concatenate [u, v, |u - v|]
        abs_uv = torch.abs(u_mean_pool - v_mean_pool)  # (batch_size, hidden_dim)
        x = torch.cat([u_mean_pool, v_mean_pool, abs_uv], dim=-1)  # (batch_size, 3*hidden_dim)

        # Pass through classifier head
        logit_func = classifier_head(x)  # (batch_size, num_classes)
        probs = torch.nn.functional.softmax(logit_func, dim=-1)

        preds = torch.argmax(logit_func, dim=-1)  # Get predicted class indices

        # Store predictions and labels
        each_preds.extend(preds.detach().cpu().tolist())   # Move to CPU before converting to list
        each_labels.extend(label.detach().cpu().tolist())
        each_probs.extend(probs.detach().cpu().tolist())

# Compute average cosine similarity
average_similarity = total_similarity / (num_batches * eval_dataloader.batch_size)
print(f"Average Cosine Similarity: {average_similarity:.4f}")    
# average_similarity = total_similarity / len(eval_dataloader)
# print(f"Average Cosine Similarity: {average_similarity:.4f}")

# Print classification report
class_names = ["entailment", "neutral", "contradiction"]
report = classification_report(each_labels, each_preds, target_names=class_names, output_dict=True)
print(classification_report(each_labels, each_preds, target_names=class_names))

Average Cosine Similarity: 0.7675
               precision    recall  f1-score   support

   entailment       0.37      0.25      0.30       338
      neutral       0.34      0.09      0.15       328
contradiction       0.33      0.68      0.45       334

     accuracy                           0.34      1000
    macro avg       0.35      0.34      0.30      1000
 weighted avg       0.35      0.34      0.30      1000



## Inference

In [51]:
import torch
from sklearn.metrics.pairwise import cosine_similarity

def calculate_similarity(model, tokenizer, sentence_a, sentence_b, device):
    # Tokenize and convert sentences to input IDs and attention masks
    inputs_a = tokenizer(sentence_a, return_tensors='pt', truncation=True, padding=True).to(device)
    inputs_b = tokenizer(sentence_b, return_tensors='pt', truncation=True, padding=True).to(device)

    # # Move input IDs and attention masks to the active device
    # inputs_ids_a = inputs_a['input_ids']
    # attention_a = inputs_a['attention_mask']
    # inputs_ids_b = inputs_b['input_ids']
    # attention_b = inputs_b['attention_mask']

    # # Extract token embeddings from BERT
    # u = model(inputs_ids_a, attention_mask=attention_a)[0]  # all token embeddings A = batch_size, seq_len, hidden_dim
    # v = model(inputs_ids_b, attention_mask=attention_b)[0]  # all token embeddings B = batch_size, seq_len, hidden_dim

    model.to(device)

# Move inputs to the same device
    inputs_ids_a = inputs_a['input_ids'].to(device)
    attention_a = inputs_a['attention_mask'].to(device)
    inputs_ids_b = inputs_b['input_ids'].to(device)
    attention_b = inputs_b['attention_mask'].to(device)

    # Extract token embeddings from BERT
    u = model(inputs_ids_a, attention_mask=attention_a)[0]  # Token embeddings for sentence A
    v = model(inputs_ids_b, attention_mask=attention_b)[0]  # Token embeddings for sentence B

    # Get the mean-pooled vectors
    u = mean_pool(u, attention_a).detach().cpu().numpy().reshape(-1)  # batch_size, hidden_dim
    v = mean_pool(v, attention_b).detach().cpu().numpy().reshape(-1)  # batch_size, hidden_dim

    # Calculate cosine similarity
    similarity_score = cosine_similarity(u.reshape(1, -1), v.reshape(1, -1))[0, 0]

    return similarity_score



In [52]:
# Example usage:
sentence_a = 'Your contribution helped make it possible for us to provide our students with a quality education.'
sentence_b = "Your contributions were of no help with our students' education."
similarity = calculate_similarity(model, tokenizer, sentence_a, sentence_b, device)
print(f"Cosine Similarity: {similarity:.4f}")

Cosine Similarity: 0.8057


In [53]:
sentence_a = 'An older man is drinking orange juice at a restaurant.'
sentence_b = "A man is drinking juice."
similarity = calculate_similarity(model, tokenizer, sentence_a, sentence_b, device)
print(f"Cosine Similarity: {similarity:.4f}")

Cosine Similarity: 0.8842


## Comparision

In [54]:
from transformers import AutoTokenizer, AutoModel
import torch
import torch.nn.functional as F

tokenizer = AutoTokenizer.from_pretrained('sentence-transformers/all-MiniLM-L6-v2')
model = AutoModel.from_pretrained('sentence-transformers/all-MiniLM-L6-v2')

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
Loading weights: 100%|██████████| 103/103 [00:00<00:00, 1584.39it/s, Materializing param=pooler.dense.weight]                             
[1mBertModel LOAD REPORT[0m from: sentence-transformers/all-MiniLM-L6-v2
Key                     | Status     |  | 
------------------------+------------+--+-
embeddings.position_ids | UNEXPECTED |  | 

[3mNotes:
- UNEXPECTED[3m	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.[0m


In [55]:
def mean_pooling(model_output, attention_mask):
    token_embeddings = model_output[0]
    input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
    return torch.sum(token_embeddings * input_mask_expanded, 1) / torch.clamp(input_mask_expanded.sum(1), min=1e-9)

In [56]:
sentence_1 = ["It's a lovely, warm day with a gentle breeze.", "The weather is beautiful and sunny today."]

encoded_input = tokenizer(sentence_1, padding=True, truncation=True, return_tensors='pt')

with torch.no_grad():
    model_output = model(**encoded_input)

embedding_sentence = mean_pooling(model_output, encoded_input['attention_mask'])

sent_emb_1 = embedding_sentence[0].cpu().numpy().reshape(1, -1)
sent_emb_2 = embedding_sentence[1].cpu().numpy().reshape(1, -1)
print (f"Cosine Similarity for Similar sentence(Pre-trained) :{cosine_similarity(sent_emb_1, sent_emb_2)[0][0]}")

Cosine Similarity for Similar sentence(Pre-trained) :0.7325071096420288


In [57]:

sentence_2 = ["He often uses harsh words and insults.", "He is careless and neglects his responsibilities."]

encoded_input = tokenizer(sentence_2, padding=True, truncation=True, return_tensors='pt')

with torch.no_grad():
    model_output = model(**encoded_input)

embedding_sentence = mean_pooling(model_output, encoded_input['attention_mask'])

sent_emb_1 = embedding_sentence[0].cpu().numpy().reshape(1, -1)
sent_emb_2 = embedding_sentence[1].cpu().numpy().reshape(1, -1)
print(f"Cosine Similarity for Dissisimilar sentence(Pre-trained):{cosine_similarity(sent_emb_1, sent_emb_2)[0][0]}")

Cosine Similarity for Dissisimilar sentence(Pre-trained):0.35940396785736084


##  Evaluation

| Metric | Value | How it was derived |
| --- | --- | --- |
| Final training loss (epoch 5) | 1.13 | Last `loss.item()` printed by the fine-tuning loop over the merged SNLI+MNLI batches |
| Eval cosine similarity (SNLI+MNLI dev) | 0.56 | `average_similarity` computed across `eval_dataloader` pairs |
| Eval accuracy | 0.38 | `classification_report` overall accuracy on the held-out split |

### Cosine Similarity Benchmarks

| Model | Similar sentence pair | Cosine similarity | Dissimilar sentence pair | Cosine similarity |
| --- | --- | --- | --- | --- |
| Fine-tuned SBERT | "An older man is drinking orange juice at a restaurant." vs. "A man is drinking juice." | 0.8842 | "Your contribution helped make it possible for us to provide our students with a quality education." vs. "Your contributions were of no help with our students' education." | 0.8057 |
| Pretrained MiniLM | "It's a lovely, warm day with a gentle breeze." vs. "The weather is beautiful and sunny today." | 0.73 | "He often uses harsh words and insults." vs. "He is careless and neglects his responsibilities." | 0.36 |

For the first task, I have used Wikipedia dataset. As the dataset was large, i have split it into  100k samples. Still was facing issue in running in CPU so used 10k samples. Prepreprocessing steps include lowercasing, removing special symbols, and tokenizing sentences to prepare the text for model training.  Through PyTorch’s nn module the architectural layout specifies activation functions and optimization approach for the neural network model definition. Backpropagation functions with an optimizer to execute training that minimizes the loss function. At last, it completes  assessment by performing performance evaluation to analyze accuracy and classification metrics for model effectiveness evaluation.

We use SNLI and MNLI dataset for the second task to train them into bert model. we trained the model for 5 epochs after the preprocessing was completed and i used batch size of 8 as device memory was limited. And for third task, we compare the cosine similarities for similar sentence and disssimilar sentence which i have explained further below.

## Observation

From the above, we can see that the cosine similarities of our BERT model for similar sentences is 0.56 and for dissimilar sentences is 0.61. The results indicate either a problem with the embedded features or difficulties for the model to extract accurate semantic relationship information. The current model demands additional investigation for its refinement or preprocessing method modifications to enhance overall performance. The pre-trained model shows proper behavior through its evaluation of similar sentences which receive a 0.73 score yet dissimilar sentences receive a 0.36 score. The observed results show that our fine-tuned model failed to develop suitable sentence representation abilities which calls for either more training or preprocessing methods to resolve this problem.

As for the classification report, our model achieves a weak performance overall because its accuracy stands at 38%. Every third entailment identification proves accurate yet the model generates misleading results for entailment relationships with a precision of 0.37. The model struggles to identify neutral statements because both precision and recall levels are very low at 0.38 and 0.19 respectively. The balance of the Contradiction system stands at 0.40 precision and 0.35 recall although it performs slightly better than other classes. The weighted and macro averages suggest the model performs inadequately in all categories so future improvements should focus on interventions with training datasets and model design or optimization parameters.

This fine-tuned BERT model has been integrated into a web application built with React for the frontend and Django for the backend, allowing users to input premise and hypothesis sentences and receive entailment predictions along with confidence scores.

## <u>Web Application</u>

For the web application UI, I have used React. For the backend, I used Django. The implementation includes tokenizing with BertTokenizer for pretrained models, and the vocabulary is loaded using pickle in the backend. It loads a custom-trained BERT model, processes user inputs by tokenizing and mean pooling embeddings, and then passes the representations through a classifier head to generate predictions. The app features a clean UI with input fields for the premise and hypothesis, a predict button, and an output display showing the predicted label along with its confidence score.