In [1]:
# # To test the model on custom RTE pairs:
# import torch
# from transformers import AutoTokenizer, AutoModelForSequenceClassification

# model_path = "./bert-finetuned-model"

# tokenizer = AutoTokenizer.from_pretrained(model_path)
# model = AutoModelForSequenceClassification.from_pretrained(model_path)

# model.eval()

# def infer(texts, model, tokenizer, device="cuda" if torch.cuda.is_available() else "cpu"):
#     model.to(device)

#     # Tokenize input
#     encodings = tokenizer(texts, padding=True, truncation=True, max_length=512, return_tensors="pt")
#     input_ids = encodings["input_ids"].to(device)
#     attention_mask = encodings["attention_mask"].to(device)

#     # Inference
#     with torch.no_grad():
#         outputs = model(input_ids, attention_mask=attention_mask)
#         logits = outputs.logits

#     # Logits to probabilities
#     probs = torch.nn.functional.softmax(logits, dim=-1)
#     predictions = torch.argmax(probs, dim=-1)

#     id2label = {0: "entailment", 1: "contradiction", 2: "neutral"}
#     results = []
#     for i, (pred, prob) in enumerate(zip(predictions, probs)):
#         label = id2label[pred.item()]
#         confidence = prob[pred].item()
#         results.append({
#             "text": texts[i],
#             "label": label,
#             "confidence": confidence
#         })
    
#     return results

# # Example
# texts = [
#     "Premise: Lee kissed Kim . Hypothesis: Kim was kissed by Lee ."
# ]

# results = infer(texts, model, tokenizer)

# for res in results:
#     print(f"Text: {res['text']}")
#     print(f"Predicted Label: {res['label']} (Confidence: {res['confidence']:.4f})")
#     print("="*60)

Text: Premise: Lee kissed Kim . Hypothesis: Kim was kissed by Lee .
Predicted Label: entailment (Confidence: 0.3742)


  attn_output = torch.nn.functional.scaled_dot_product_attention(


In [2]:
import torch
import pandas as pd
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from sklearn.metrics import accuracy_score, f1_score, classification_report

# Location of the fine-tuned checkpoint and the device used for inference.
model_path = "./bert-finetuned-model"
device = "cuda" if torch.cuda.is_available() else "cpu"

# Load the tokenizer and classifier from the same checkpoint directory.
tokenizer = AutoTokenizer.from_pretrained(model_path)
model = AutoModelForSequenceClassification.from_pretrained(model_path)

# Put the model in evaluation mode, then move it to the selected device.
model.eval()
model.to(device)

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e

In [4]:
# Read the test split from disk.
test_df = pd.read_csv("test_transformed.csv")

# Pull the model inputs and the reference labels out as plain Python lists.
X_test = test_df["text"].tolist()
y_test = test_df["label"].tolist()

print(f"Test samples: {len(X_test)}")
test_df.head()

Test samples: 9824


Unnamed: 0,text,label
0,Premise: This church choir sings to the masses...,2.0
1,Premise: This church choir sings to the masses...,0.0
2,Premise: This church choir sings to the masses...,1.0
3,"Premise: A woman with a green headscarf , blue...",2.0
4,"Premise: A woman with a green headscarf , blue...",0.0


In [5]:
def infer(texts, model, tokenizer, device, max_length=512):
    """Run one batch of texts through the classifier.

    Args:
        texts: Batch of input strings (a list or tuple), or a single string.
        model: Callable invoked as ``model(input_ids, attention_mask=...)``;
            its result must expose a ``logits`` tensor.
        tokenizer: Callable invoked on ``texts`` with ``padding=True``,
            ``truncation=True``, ``max_length=max_length`` and
            ``return_tensors="pt"``; its result must provide ``"input_ids"``
            and ``"attention_mask"`` tensors.
        device: Device the encoded tensors are moved to. The model itself is
            not moved here, so the caller must already have placed it there.
        max_length: Truncation limit forwarded to the tokenizer. Defaults to
            512, the value that was previously hard-coded.

    Returns:
        A ``(predictions, probs)`` tuple of plain Python lists:
        ``predictions`` holds the argmax class index per input and ``probs``
        holds the softmax distribution over classes per input. Both are empty
        when ``texts`` is an empty list or tuple.
    """
    # Nothing to classify: skip the tokenizer and the forward pass entirely
    # instead of handing them an empty batch. A single (possibly empty) string
    # is not a batch and still goes through the normal path.
    if isinstance(texts, (list, tuple)) and len(texts) == 0:
        return [], []

    # Tokenize input
    encodings = tokenizer(
        texts,
        padding=True,
        truncation=True,
        max_length=max_length,
        return_tensors="pt",
    )
    input_ids = encodings["input_ids"].to(device)
    attention_mask = encodings["attention_mask"].to(device)

    # Inference (no gradients are needed for evaluation)
    with torch.no_grad():
        outputs = model(input_ids, attention_mask=attention_mask)
        logits = outputs.logits

    # Logits to probabilities over the last (class) dimension
    probs = torch.nn.functional.softmax(logits, dim=-1)
    predictions = torch.argmax(probs, dim=-1)

    # Hand back plain lists on the CPU so callers can extend/serialize them.
    return predictions.cpu().tolist(), probs.cpu().tolist()

In [8]:
batch_size = 32
all_predictions = []
all_probs = []

# Walk the test set in consecutive slices of `batch_size` texts and collect
# the per-example predictions and probability vectors in order.
for start in range(0, len(X_test), batch_size):
    batch_preds, batch_probs = infer(
        X_test[start:start + batch_size], model, tokenizer, device
    )
    all_predictions += batch_preds
    all_probs += batch_probs

# Translate predicted class ids into their string names.
id2label = {0: "entailment", 1: "contradiction", 2: "neutral"}
all_labels = list(map(id2label.__getitem__, all_predictions))

In [11]:
# Class ids and their display names, in the same order as the id2label
# mapping used for predictions (0 -> entailment, 1 -> contradiction,
# 2 -> neutral).
class_ids = [0, 1, 2]
class_names = ["entailment", "contradiction", "neutral"]

accuracy = accuracy_score(y_test, all_predictions)
f1 = f1_score(y_test, all_predictions, average="macro")

print("="*60)
print(f"Accuracy: {accuracy:.4f}")
print(f"Macro F1 Score: {f1:.4f}")
print("="*60)

# Pass `labels` explicitly so each name in `target_names` is tied to its class
# id. Without it the names are matched positionally to whichever labels happen
# to occur in y_test / all_predictions, which mislabels rows (or fails) as
# soon as one class is absent.
report = classification_report(
    y_test,
    all_predictions,
    labels=class_ids,
    target_names=class_names,
)
print(report)

Accuracy: 0.6403
Macro F1 Score: 0.6378
               precision    recall  f1-score   support

   entailment       0.65      0.73      0.69      3368
contradiction       0.66      0.62      0.64      3237
      neutral       0.61      0.57      0.59      3219

     accuracy                           0.64      9824
    macro avg       0.64      0.64      0.64      9824
 weighted avg       0.64      0.64      0.64      9824



In [10]:
output_file = "bert_test_inference_results.csv"

# One row per test example: the input text, the reference label, the predicted
# class id, and the highest class probability as the confidence.
output_df = pd.DataFrame(
    {
        "text": X_test,
        "true_label": y_test,
        "predicted_label": all_predictions,
        "confidence": list(map(max, all_probs)),
    }
)
output_df.to_csv(output_file, index=False)

print(f"Inference results saved to {output_file}")

Inference results saved to bert_test_inference_results.csv
