In [8]:
# Import Libraries and Load Model/Tokenizer
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import pandas as pd
import torch
from tqdm.auto import tqdm

model_name = "ynie/roberta-large-snli_mnli_fever_anli_R1_R2_R3-nli"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name)

device = "cuda" if torch.cuda.is_available() else "cpu"
model.to(device)

Some weights of the model checkpoint at ynie/roberta-large-snli_mnli_fever_anli_R1_R2_R3-nli were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


RobertaForSequenceClassification(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(50265, 1024, padding_idx=1)
      (position_embeddings): Embedding(514, 1024, padding_idx=1)
      (token_type_embeddings): Embedding(1, 1024)
      (LayerNorm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0-23): 24 x RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSelfAttention(
              (query): Linear(in_features=1024, out_features=1024, bias=True)
              (key): Linear(in_features=1024, out_features=1024, bias=True)
              (value): Linear(in_features=1024, out_features=1024, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=1024, out_features=1024, bias=True)
 

In [9]:
# MNLI-m test dataset

df_mnli_matched = pd.read_csv('/kaggle/input/nli-dataset-for-sentence-understanding/mnli_test_matched.csv')

# MNLI-mm test dataset

df_mnli_mismatched = pd.read_csv('/kaggle/input/nli-dataset-for-sentence-understanding/mnli_test_mismatched.csv')

# ANLI test dataset round 1

df_anli_r1 = pd.read_csv('/kaggle/input/anli-a-large-scale-nli-benchmark-dataset/test_r1.csv')

# ANLI test dataset round 2

df_anli_r2 = pd.read_csv('/kaggle/input/anli-a-large-scale-nli-benchmark-dataset/test_r2.csv')

# ANLI test dataset round 3

df_anli_r3 = pd.read_csv('/kaggle/input/anli-a-large-scale-nli-benchmark-dataset/test_r3.csv')

# SNLI test dataset

df_snli = pd.read_csv('/kaggle/input/stanford-natural-language-inference-corpus/snli_1.0_test.csv')

In [25]:
def preprocess_data(df, dataset_type):
    # Explicitly copy the DataFrame to avoid SettingWithCopyWarning
    df = df.copy()
    
    if dataset_type in ['mnli_matched', 'mnli_mismatched']:
        df['premise'] = df['premise'].astype(str)
        df['hypothesis'] = df['hypothesis'].astype(str)
        tokenized_inputs = tokenizer(df['premise'].tolist(), df['hypothesis'].tolist(), padding=True, truncation=True, max_length=512, return_tensors="pt")
    elif dataset_type in ['anli_r1', 'anli_r2', 'anli_r3']:
        df['premise'] = df['premise'].astype(str)
        df['hypothesis'] = df['hypothesis'].astype(str)
        df['reason'] = df.get('reason', '').fillna('').astype(str)  # Safely handle 'reason' column
        concatenated_hypotheses = df['hypothesis'] + " [SEP] " + df['reason']
        tokenized_inputs = tokenizer(df['premise'].tolist(), concatenated_hypotheses.tolist(), padding=True, truncation=True, max_length=512, return_tensors="pt")
    elif dataset_type == 'snli':
        df['sentence1'] = df['sentence1'].astype(str)
        df['sentence2'] = df['sentence2'].astype(str)
        tokenized_inputs = tokenizer(df['sentence1'].tolist(), df['sentence2'].tolist(), padding=True, truncation=True, max_length=512, return_tensors="pt")
    else:
        raise ValueError("Dataset type not recognized.")
    return tokenized_inputs


In [26]:
def predict_dataset(df, dataset_type, batch_size=8):
    probabilities = []
    num_batches = (len(df) + batch_size - 1) // batch_size
    
    model.eval()
    
    for i in tqdm(range(num_batches), desc=f"Predicting {dataset_type}"):
        batch = df.iloc[i * batch_size:(i + 1) * batch_size]
        tokenized_inputs = preprocess_data(batch, dataset_type)
        tokenized_inputs = {key: value.to(device) for key, value in tokenized_inputs.items()}
        
        with torch.no_grad():
            outputs = model(**tokenized_inputs)
            probs = torch.softmax(outputs.logits, dim=1).cpu().numpy()
            probabilities.extend(probs)
        
        torch.cuda.empty_cache()
    
    # Convert the probabilities to a DataFrame
    probs_df = pd.DataFrame(probabilities, columns=['Entailment', 'Neutral', 'Contradiction'])
    
    # Save the DataFrame to a CSV file
    probs_df.to_csv(f'/kaggle/working/{dataset_type}_predictions.csv', index=False)
    
    return probabilities


In [27]:
# Make predictions and save to CSV
predict_dataset(df_anli_r1, 'anli_r1', batch_size=32)
predict_dataset(df_anli_r2, 'anli_r2', batch_size=32)
predict_dataset(df_anli_r3, 'anli_r3', batch_size=32)
predict_dataset(df_mnli_matched, 'mnli_matched', batch_size=32)
predict_dataset(df_mnli_mismatched, 'mnli_mismatched', batch_size=32)
predict_dataset(df_snli, 'snli', batch_size=32)

Predicting anli_r1:   0%|          | 0/32 [00:00<?, ?it/s]

Predicting anli_r2:   0%|          | 0/32 [00:00<?, ?it/s]

Predicting anli_r3:   0%|          | 0/38 [00:00<?, ?it/s]

Predicting mnli_matched:   0%|          | 0/307 [00:00<?, ?it/s]

Predicting mnli_mismatched:   0%|          | 0/308 [00:00<?, ?it/s]

Predicting snli:   0%|          | 0/313 [00:00<?, ?it/s]

[array([0.01245091, 0.92709255, 0.06045656], dtype=float32),
 array([0.75276625, 0.2422506 , 0.00498317], dtype=float32),
 array([2.5352481e-04, 4.4935774e-03, 9.9525285e-01], dtype=float32),
 array([0.00584444, 0.9907362 , 0.00341932], dtype=float32),
 array([0.27834842, 0.71857524, 0.00307636], dtype=float32),
 array([0.0028402 , 0.44119415, 0.55596566], dtype=float32),
 array([9.8390770e-01, 1.5161245e-02, 9.3100365e-04], dtype=float32),
 array([5.5926776e-04, 9.9815720e-01, 1.2835017e-03], dtype=float32),
 array([0.06783539, 0.05611929, 0.8760454 ], dtype=float32),
 array([0.00152789, 0.96415716, 0.03431489], dtype=float32),
 array([0.9415842 , 0.05680466, 0.00161102], dtype=float32),
 array([7.3882897e-04, 6.1629355e-02, 9.3763179e-01], dtype=float32),
 array([0.21264993, 0.7832928 , 0.00405736], dtype=float32),
 array([9.8421258e-01, 1.5202362e-02, 5.8507401e-04], dtype=float32),
 array([8.8045635e-05, 2.6302501e-03, 9.9728167e-01], dtype=float32),
 array([4.8785405e-05, 4.315555