In [1]:
import pandas as pd
import torch
from torch.utils.data import DataLoader, Dataset
from transformers import AutoTokenizer
from sklearn.metrics import accuracy_score


tokenizer = AutoTokenizer.from_pretrained("ynie/roberta-large-snli_mnli_fever_anli_R1_R2_R3-nli")

tokenizer_config.json:   0%|          | 0.00/49.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/703 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/772 [00:00<?, ?B/s]

In [2]:
# Function to tokenize the premise, hypothesis, and reason
def preprocess_data(df):
    # Concatenate the reason with the hypothesis
    concatenated_hypotheses = df['hypothesis'] + " [SEP] " + df['reason']
    return tokenizer(df['premise'].tolist(), concatenated_hypotheses.tolist(), padding=True, truncation=True, max_length=512, return_tensors="pt")

# Custom Dataset class
class ANLIDataset(Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: val[idx].clone().detach() for key, val in self.encodings.items()}  # Updated line
        item['labels'] = self.labels[idx]  # Assuming labels are simple lists or numpy arrays, no need for tensor conversion here
        return item

    def __len__(self):
        return len(self.labels)

In [3]:
# Load the test dataset
df = pd.read_csv('/kaggle/input/anli-a-large-scale-nli-benchmark-dataset/test_r1.csv')  # Make sure to update the path to your dataset location
tokenized_data = preprocess_data(df)
dataset = ANLIDataset(tokenized_data, df['label'].tolist())


In [4]:
df.head()

Unnamed: 0,uid,premise,hypothesis,label,reason
0,4aae63a8-fcf7-406c-a2f3-50c31c5934a9,Ernest Jones is a British jeweller and watchma...,The first Ernest Jones store was opened on the...,0,"The first store was opened in London, which is..."
1,c577b92c-78fb-4e1d-ae1d-34133609c142,Old Trafford is a football stadium in Old Traf...,There are only 10 larger football stadiums in ...,0,The text says that it is the 11th largest foot...
2,26936cd9-1a5a-4a2b-9fca-899d61880ca0,Magnus is a Belgian joint dance project of Tom...,"""The body gave you everything"" album was not r...",0,"it was released on March 29, 2004. ""not this b..."
3,cd977941-273b-4748-a5d2-6c7234a2a302,Shadowboxer is a 2005 crime thriller film dire...,Shadowboxer was written and directed by Lee Da...,1,It is not know who wrote the Shadowboxer. The ...
4,1a9eae8f-27d9-47ba-80b8-7d1402ee524a,"Takaaki Kajita (梶田 隆章 , Kajita Takaaki ) is a ...","Arthur B. McDonald is a Japanese physicist, kn...",2,Arthur B. McDonald is Canadian in the context.


In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 5 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   uid         1000 non-null   object
 1   premise     1000 non-null   object
 2   hypothesis  1000 non-null   object
 3   label       1000 non-null   int64 
 4   reason      1000 non-null   object
dtypes: int64(1), object(4)
memory usage: 39.2+ KB


In [6]:
from transformers import AutoModelForSequenceClassification

# DataLoader
batch_size = 32
data_loader = DataLoader(dataset, batch_size=batch_size, shuffle=False)

# Model Initialization
model = AutoModelForSequenceClassification.from_pretrained("ynie/roberta-large-snli_mnli_fever_anli_R1_R2_R3-nli")
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

pytorch_model.bin:   0%|          | 0.00/1.43G [00:00<?, ?B/s]

  return self.fget.__get__(instance, owner)()
Some weights of the model checkpoint at ynie/roberta-large-snli_mnli_fever_anli_R1_R2_R3-nli were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


RobertaForSequenceClassification(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(50265, 1024, padding_idx=1)
      (position_embeddings): Embedding(514, 1024, padding_idx=1)
      (token_type_embeddings): Embedding(1, 1024)
      (LayerNorm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0-23): 24 x RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSelfAttention(
              (query): Linear(in_features=1024, out_features=1024, bias=True)
              (key): Linear(in_features=1024, out_features=1024, bias=True)
              (value): Linear(in_features=1024, out_features=1024, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=1024, out_features=1024, bias=True)
 

In [7]:
from tqdm.auto import tqdm

# Evaluation function with progress bar
def evaluate(model, data_loader):
    model.eval()
    predictions, true_labels = [], []
    progress_bar = tqdm(data_loader, desc='Evaluating', unit='batch', leave=False)
    
    with torch.no_grad():
        for batch in progress_bar:
            inputs = {k: v.to(device) for k, v in batch.items() if k != 'labels'}
            labels = batch['labels'].to(device)
            outputs = model(**inputs)
            logits = outputs.logits
            preds = torch.argmax(logits, dim=-1).detach().cpu().numpy()
            labels = labels.detach().cpu().numpy()
            predictions.extend(preds)
            true_labels.extend(labels)
    
    accuracy = accuracy_score(true_labels, predictions)
    return accuracy

# Run evaluation
accuracy = evaluate(model, data_loader)
print(f'Accuracy: {accuracy}')

Evaluating:   0%|          | 0/32 [00:00<?, ?batch/s]

Accuracy: 0.702


In [10]:
# Load Round 2 dataset
df_r2 = pd.read_csv('/kaggle/input/anli-a-large-scale-nli-benchmark-dataset/test_r2.csv')  # Make sure to adjust the path accordingly
tokenized_data_r2 = preprocess_data(df_r2)
dataset_r2 = ANLIDataset(tokenized_data_r2, df_r2['label'].tolist())

# DataLoader for Round 2
data_loader_r2 = DataLoader(dataset_r2, batch_size=16, shuffle=False)

model.eval()  # Set the model to evaluation mode
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

RobertaForSequenceClassification(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(50265, 1024, padding_idx=1)
      (position_embeddings): Embedding(514, 1024, padding_idx=1)
      (token_type_embeddings): Embedding(1, 1024)
      (LayerNorm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0-23): 24 x RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSelfAttention(
              (query): Linear(in_features=1024, out_features=1024, bias=True)
              (key): Linear(in_features=1024, out_features=1024, bias=True)
              (value): Linear(in_features=1024, out_features=1024, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=1024, out_features=1024, bias=True)
 

In [11]:
# Evaluation function
def evaluate(model, data_loader):
    model.eval()
    predictions, true_labels = [], []
    progress_bar = tqdm(data_loader, desc='Evaluating')
    
    with torch.no_grad():
        for batch in progress_bar:
            inputs = {k: v.to(device) for k, v in batch.items() if k != 'labels'}
            labels = batch['labels'].to(device)
            outputs = model(**inputs)
            logits = outputs.logits
            preds = torch.argmax(logits, dim=-1).detach().cpu().numpy()
            labels = labels.detach().cpu().numpy()
            predictions.extend(preds)
            true_labels.extend(labels)
    
    accuracy = accuracy_score(true_labels, predictions)
    return accuracy

# Run evaluation for Round 2
accuracy_r2 = evaluate(model, data_loader_r2)
print(f'Round 2 Accuracy: {accuracy_r2}')

Evaluating:   0%|          | 0/63 [00:00<?, ?it/s]

Round 2 Accuracy: 0.59


In [12]:
# Load Round 3 dataset
df_r3 = pd.read_csv('/kaggle/input/anli-a-large-scale-nli-benchmark-dataset/test_r3.csv')  # Ensure this path points to your Round 3 dataset
tokenized_data_r3 = preprocess_data(df_r3)
dataset_r3 = ANLIDataset(tokenized_data_r3, df_r3['label'].tolist())

# DataLoader for Round 3
data_loader_r3 = DataLoader(dataset_r3, batch_size=16, shuffle=False)

# Model and device setup
model.eval()  # Ensure the model is in evaluation mode
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

RobertaForSequenceClassification(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(50265, 1024, padding_idx=1)
      (position_embeddings): Embedding(514, 1024, padding_idx=1)
      (token_type_embeddings): Embedding(1, 1024)
      (LayerNorm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0-23): 24 x RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSelfAttention(
              (query): Linear(in_features=1024, out_features=1024, bias=True)
              (key): Linear(in_features=1024, out_features=1024, bias=True)
              (value): Linear(in_features=1024, out_features=1024, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=1024, out_features=1024, bias=True)
 

In [13]:
# Evaluation function
def evaluate(model, data_loader):
    model.eval()
    predictions, true_labels = [], []
    progress_bar = tqdm(data_loader, desc='Evaluating')
    
    with torch.no_grad():
        for batch in progress_bar:
            inputs = {k: v.to(device) for k, v in batch.items() if k != 'labels'}
            labels = batch['labels'].to(device)
            outputs = model(**inputs)
            logits = outputs.logits
            preds = torch.argmax(logits, dim=-1).detach().cpu().numpy()
            labels = labels.detach().cpu().numpy()
            predictions.extend(preds)
            true_labels.extend(labels)
    
    accuracy = accuracy_score(true_labels, predictions)
    return accuracy

# Run evaluation for Round 3
accuracy_r3 = evaluate(model, data_loader_r3)
print(f'Round 3 Accuracy: {accuracy_r3}')

Evaluating:   0%|          | 0/75 [00:00<?, ?it/s]

Round 3 Accuracy: 0.5775
