In [42]:
import pandas as pd
import torch
from torch.utils.data import DataLoader, Dataset
from sklearn.metrics import accuracy_score

# Load the test dataset
df = pd.read_csv('/kaggle/input/anli-a-large-scale-nli-benchmark-dataset/test_r1.csv')  # Make sure to update the path to your dataset location


In [43]:
df.head()

Unnamed: 0,uid,premise,hypothesis,label,reason
0,4aae63a8-fcf7-406c-a2f3-50c31c5934a9,Ernest Jones is a British jeweller and watchma...,The first Ernest Jones store was opened on the...,0,"The first store was opened in London, which is..."
1,c577b92c-78fb-4e1d-ae1d-34133609c142,Old Trafford is a football stadium in Old Traf...,There are only 10 larger football stadiums in ...,0,The text says that it is the 11th largest foot...
2,26936cd9-1a5a-4a2b-9fca-899d61880ca0,Magnus is a Belgian joint dance project of Tom...,"""The body gave you everything"" album was not r...",0,"it was released on March 29, 2004. ""not this b..."
3,cd977941-273b-4748-a5d2-6c7234a2a302,Shadowboxer is a 2005 crime thriller film dire...,Shadowboxer was written and directed by Lee Da...,1,It is not know who wrote the Shadowboxer. The ...
4,1a9eae8f-27d9-47ba-80b8-7d1402ee524a,"Takaaki Kajita (梶田 隆章 , Kajita Takaaki ) is a ...","Arthur B. McDonald is a Japanese physicist, kn...",2,Arthur B. McDonald is Canadian in the context.


In [44]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 5 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   uid         1000 non-null   object
 1   premise     1000 non-null   object
 2   hypothesis  1000 non-null   object
 3   label       1000 non-null   int64 
 4   reason      1000 non-null   object
dtypes: int64(1), object(4)
memory usage: 39.2+ KB


In [45]:
!pip install peft

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)




In [46]:
from peft import PeftModel, PeftConfig
from transformers import AutoModelForSeq2SeqLM

config = PeftConfig.from_pretrained("lorahub/flan_t5_large-anli_r1")
model = AutoModelForSeq2SeqLM.from_pretrained("google/flan-t5-large")
model = PeftModel.from_pretrained(model, "lorahub/flan_t5_large-anli_r1")

In [47]:
pip install tqdm


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Note: you may need to restart the kernel to use updated packages.


In [48]:
# Define the custom dataset
class ANLIDataset(Dataset):
    def __init__(self, dataframe):
        self.dataframe = dataframe
        self.label_map = {0: "entailment", 1: "neutral", 2: "contradiction"}

    def __len__(self):
        return len(self.dataframe)

    def __getitem__(self, idx):
        item = self.dataframe.iloc[idx]
        premise = item['premise']
        hypothesis = item['hypothesis']
        reason = item['reason']
        label = self.label_map[item['label']]
        # Format the input text in a way that's expected by the T5 model
        input_sequence = f"premise: {premise} hypothesis: {hypothesis} reason: {reason}"
        return input_sequence, label


In [49]:
dataset = ANLIDataset(df)

# DataLoader setup
loader = DataLoader(dataset, batch_size=32, shuffle=False)  


# Move model to GPU if available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)


PeftModelForSeq2SeqLM(
  (base_model): LoraModel(
    (model): T5ForConditionalGeneration(
      (shared): Embedding(32128, 1024)
      (encoder): T5Stack(
        (embed_tokens): Embedding(32128, 1024)
        (block): ModuleList(
          (0): T5Block(
            (layer): ModuleList(
              (0): T5LayerSelfAttention(
                (SelfAttention): T5Attention(
                  (q): lora.Linear(
                    (base_layer): Linear(in_features=1024, out_features=1024, bias=False)
                    (lora_dropout): ModuleDict(
                      (default): Dropout(p=0.1, inplace=False)
                    )
                    (lora_A): ModuleDict(
                      (default): Linear(in_features=1024, out_features=16, bias=False)
                    )
                    (lora_B): ModuleDict(
                      (default): Linear(in_features=16, out_features=1024, bias=False)
                    )
                    (lora_embedding_A): ParameterDict()
       

In [50]:
from tqdm import tqdm
# Evaluation loop with tqdm
predictions = []
labels = []

for batch in tqdm(loader, desc="Evaluating"):
    input_sequences, batch_labels = batch
    # Tokenize the inputs. Adjust as per your tokenizer's requirement
    inputs = tokenizer(input_sequences, return_tensors="pt", padding=True, truncation=True, max_length=512)
    
    # Move tensors to the same device as model
    inputs = {k: v.to(device) for k, v in inputs.items()}

    with torch.no_grad():
        # Forward pass through the model, specifying max_new_tokens
        output_sequences = model.generate(
            input_ids=inputs['input_ids'],
            attention_mask=inputs['attention_mask'],
            max_new_tokens=50  # Adjust as needed
        )
        # Decode the generated sequences to text
        pred_texts = [tokenizer.decode(generated_id, skip_special_tokens=True, clean_up_tokenization_spaces=True) for generated_id in output_sequences]
    predictions.extend(pred_texts)
    labels.extend(batch_labels)



Evaluating: 100%|██████████| 32/32 [00:36<00:00,  1.14s/it]


In [51]:
# Example snippet to debug predictions
for i, (pred, label) in enumerate(zip(predictions, labels)):
    if i < 10:  # Just inspect the first few
        print(f"Prediction: {pred.strip().lower()}, Label: {label.strip().lower()}")

# This gives you a direct comparison for the first few predictions to see if there's a mismatch pattern.


Prediction: yes, Label: entailment
Prediction: yes, Label: entailment
Prediction: yes, Label: entailment
Prediction: no, Label: neutral
Prediction: no, Label: contradiction
Prediction: it's impossible to say, Label: neutral
Prediction: no, Label: neutral
Prediction: no, Label: contradiction
Prediction: no, Label: neutral
Prediction: yes, Label: contradiction


In [52]:
def map_prediction_to_label(prediction):
    mapping = {
        "yes": "entailment",
        "no": "contradiction",
        "it's impossible to say": "neutral"
    }
    return mapping.get(prediction.strip().lower(), "unknown")

mapped_predictions = [map_prediction_to_label(pred) for pred in predictions]

# Recalculate accuracy with the mapped predictions
correct_predictions = sum(1 for mapped_pred, label in zip(mapped_predictions, labels) if mapped_pred == label.strip().lower())
accuracy = correct_predictions / len(labels)
print(f"Corrected Accuracy: {accuracy:.2f}")


Corrected Accuracy: 0.75


In [53]:
# Load the test dataset
df_2 = pd.read_csv('/kaggle/input/anli-a-large-scale-nli-benchmark-dataset/test_r2.csv')  # Make sure to update the path to your dataset location


dataset = ANLIDataset(df_2)

# DataLoader setup
loader = DataLoader(dataset, batch_size=32, shuffle=False)  


# Move model to GPU if available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

PeftModelForSeq2SeqLM(
  (base_model): LoraModel(
    (model): T5ForConditionalGeneration(
      (shared): Embedding(32128, 1024)
      (encoder): T5Stack(
        (embed_tokens): Embedding(32128, 1024)
        (block): ModuleList(
          (0): T5Block(
            (layer): ModuleList(
              (0): T5LayerSelfAttention(
                (SelfAttention): T5Attention(
                  (q): lora.Linear(
                    (base_layer): Linear(in_features=1024, out_features=1024, bias=False)
                    (lora_dropout): ModuleDict(
                      (default): Dropout(p=0.1, inplace=False)
                    )
                    (lora_A): ModuleDict(
                      (default): Linear(in_features=1024, out_features=16, bias=False)
                    )
                    (lora_B): ModuleDict(
                      (default): Linear(in_features=16, out_features=1024, bias=False)
                    )
                    (lora_embedding_A): ParameterDict()
       

In [54]:
from tqdm import tqdm
# Evaluation loop with tqdm
predictions = []
labels = []

for batch in tqdm(loader, desc="Evaluating"):
    input_sequences, batch_labels = batch
    # Tokenize the inputs. Adjust as per your tokenizer's requirement
    inputs = tokenizer(input_sequences, return_tensors="pt", padding=True, truncation=True, max_length=512)
    
    # Move tensors to the same device as model
    inputs = {k: v.to(device) for k, v in inputs.items()}

    with torch.no_grad():
        # Forward pass through the model, specifying max_new_tokens
        output_sequences = model.generate(
            input_ids=inputs['input_ids'],
            attention_mask=inputs['attention_mask'],
            max_new_tokens=50  # Adjust as needed
        )
        # Decode the generated sequences to text
        pred_texts = [tokenizer.decode(generated_id, skip_special_tokens=True, clean_up_tokenization_spaces=True) for generated_id in output_sequences]
    predictions.extend(pred_texts)
    labels.extend(batch_labels)



Evaluating: 100%|██████████| 32/32 [00:34<00:00,  1.09s/it]


In [55]:
def map_prediction_to_label(prediction):
    mapping = {
        "yes": "entailment",
        "no": "contradiction",
        "it's impossible to say": "neutral"
    }
    return mapping.get(prediction.strip().lower(), "unknown")

mapped_predictions = [map_prediction_to_label(pred) for pred in predictions]

# Recalculate accuracy with the mapped predictions
correct_predictions = sum(1 for mapped_pred, label in zip(mapped_predictions, labels) if mapped_pred == label.strip().lower())
accuracy = correct_predictions / len(labels)
print(f"Corrected Accuracy: {accuracy:.2f}")


Corrected Accuracy: 0.64


In [56]:
# Load the test dataset
df_3 = pd.read_csv('/kaggle/input/anli-a-large-scale-nli-benchmark-dataset/test_r3.csv')  # Make sure to update the path to your dataset location


dataset = ANLIDataset(df_3)

# DataLoader setup
loader = DataLoader(dataset, batch_size=32, shuffle=False)  


# Move model to GPU if available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

PeftModelForSeq2SeqLM(
  (base_model): LoraModel(
    (model): T5ForConditionalGeneration(
      (shared): Embedding(32128, 1024)
      (encoder): T5Stack(
        (embed_tokens): Embedding(32128, 1024)
        (block): ModuleList(
          (0): T5Block(
            (layer): ModuleList(
              (0): T5LayerSelfAttention(
                (SelfAttention): T5Attention(
                  (q): lora.Linear(
                    (base_layer): Linear(in_features=1024, out_features=1024, bias=False)
                    (lora_dropout): ModuleDict(
                      (default): Dropout(p=0.1, inplace=False)
                    )
                    (lora_A): ModuleDict(
                      (default): Linear(in_features=1024, out_features=16, bias=False)
                    )
                    (lora_B): ModuleDict(
                      (default): Linear(in_features=16, out_features=1024, bias=False)
                    )
                    (lora_embedding_A): ParameterDict()
       

In [57]:
from tqdm import tqdm
# Evaluation loop with tqdm
predictions = []
labels = []

for batch in tqdm(loader, desc="Evaluating"):
    input_sequences, batch_labels = batch
    # Tokenize the inputs. Adjust as per your tokenizer's requirement
    inputs = tokenizer(input_sequences, return_tensors="pt", padding=True, truncation=True, max_length=512)
    
    # Move tensors to the same device as model
    inputs = {k: v.to(device) for k, v in inputs.items()}

    with torch.no_grad():
        # Forward pass through the model, specifying max_new_tokens
        output_sequences = model.generate(
            input_ids=inputs['input_ids'],
            attention_mask=inputs['attention_mask'],
            max_new_tokens=50  # Adjust as needed
        )
        # Decode the generated sequences to text
        pred_texts = [tokenizer.decode(generated_id, skip_special_tokens=True, clean_up_tokenization_spaces=True) for generated_id in output_sequences]
    predictions.extend(pred_texts)
    labels.extend(batch_labels)



Evaluating: 100%|██████████| 38/38 [00:44<00:00,  1.16s/it]


In [58]:
def map_prediction_to_label(prediction):
    mapping = {
        "yes": "entailment",
        "no": "contradiction",
        "it's impossible to say": "neutral"
    }
    return mapping.get(prediction.strip().lower(), "unknown")

mapped_predictions = [map_prediction_to_label(pred) for pred in predictions]

# Recalculate accuracy with the mapped predictions
correct_predictions = sum(1 for mapped_pred, label in zip(mapped_predictions, labels) if mapped_pred == label.strip().lower())
accuracy = correct_predictions / len(labels)
print(f"Corrected Accuracy: {accuracy:.2f}")


Corrected Accuracy: 0.63
