In [1]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import pandas as pd
import torch
from tqdm.auto import tqdm


# Short alias -> Hugging Face Hub checkpoint identifier
model_names = dict(
    deberta="microsoft/deberta-large-mnli",
    albert="ynie/albert-xxlarge-v2-snli_mnli_fever_anli_R1_R2_R3-nli",
    roberta="ynie/roberta-large-snli_mnli_fever_anli_R1_R2_R3-nli",
)

# Instantiate every classifier first, then every tokenizer, keyed by alias
models = {
    alias: AutoModelForSequenceClassification.from_pretrained(checkpoint)
    for alias, checkpoint in model_names.items()
}
tokenizers = {
    alias: AutoTokenizer.from_pretrained(checkpoint)
    for alias, checkpoint in model_names.items()
}

# Run on the GPU when one is visible to torch, otherwise on the CPU
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"


config.json:   0%|          | 0.00/729 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/1.62G [00:00<?, ?B/s]

  return self.fget.__get__(instance, owner)()
Some weights of the model checkpoint at microsoft/deberta-large-mnli were not used when initializing DebertaForSequenceClassification: ['config']
- This IS expected if you are initializing DebertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


config.json:   0%|          | 0.00/896 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/890M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/703 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/1.43G [00:00<?, ?B/s]

Some weights of the model checkpoint at ynie/roberta-large-snli_mnli_fever_anli_R1_R2_R3-nli were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


tokenizer_config.json:   0%|          | 0.00/52.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/760k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/156 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/49.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/772 [00:00<?, ?B/s]

In [2]:
# --- MultiNLI test splits (matched / mismatched) ---
df_mnlim = pd.read_csv(
    '/kaggle/input/nli-dataset-for-sentence-understanding/mnli_test_matched.csv'
)
df_mnlimm = pd.read_csv(
    '/kaggle/input/nli-dataset-for-sentence-understanding/mnli_test_mismatched.csv'
)

# --- ANLI test splits, rounds 1 through 3 ---
df_anli1 = pd.read_csv(
    '/kaggle/input/anli-a-large-scale-nli-benchmark-dataset/test_r1.csv'
)
df_anli2 = pd.read_csv(
    '/kaggle/input/anli-a-large-scale-nli-benchmark-dataset/test_r2.csv'
)
df_anli3 = pd.read_csv(
    '/kaggle/input/anli-a-large-scale-nli-benchmark-dataset/test_r3.csv'
)

# --- SNLI test split ---
df_snli = pd.read_csv(
    '/kaggle/input/stanford-natural-language-inference-corpus/snli_1.0_test.csv'
)

In [3]:
# Registry of evaluation frames; the key doubles as the dataset label used in
# progress messages and output file names further down.
datasets = dict(
    mnli_matched=df_mnlim,
    mnli_mismatched=df_mnlimm,
    anli1=df_anli1,
    anli2=df_anli2,
    anli3=df_anli3,
    snli=df_snli,
)

In [4]:
# Preprocessing used for the ANLI datasets: the annotator's reason is appended
# to the hypothesis before tokenization.
def preprocess_data(df, tokenizer_name):
    """Tokenize premise / (hypothesis + reason) pairs from ``df``.

    Args:
        df: DataFrame with ``premise`` and ``hypothesis`` columns and,
            optionally, a ``reason`` column.
        tokenizer_name: Either a key of the module-level ``tokenizers`` dict
            or the tokenizer callable itself. Accepting the object matters
            because the batch-prediction loop hands over the tokenizer
            instance, not its name; a dict lookup with the instance would
            raise ``KeyError``.

    Returns:
        Whatever the tokenizer returns for the premise list and the combined
        hypothesis list (called with ``padding=True``, ``truncation=True``,
        ``max_length=512``, ``return_tensors="pt"``).
    """
    if isinstance(tokenizer_name, str):
        tokenizer = tokenizers[tokenizer_name]
    else:
        tokenizer = tokenizer_name

    hypotheses = df['hypothesis']
    if 'reason' in df.columns:
        reasons = df['reason']
        with_reason = hypotheses + " [SEP] " + reasons.fillna("").astype(str)
        # Rows without a reason keep the bare hypothesis; string + NaN would
        # otherwise produce NaN, which the tokenizer cannot encode.
        concatenated_hypotheses = with_reason.where(reasons.notna(), hypotheses)
    else:
        concatenated_hypotheses = hypotheses

    return tokenizer(
        df['premise'].tolist(),
        concatenated_hypotheses.tolist(),
        padding=True,
        truncation=True,
        max_length=512,
        return_tensors="pt",
    )


In [8]:
def tokenize_and_predict_to_df(tokenizer, model, df, dataset_name, preprocess_fn=None):
    """Run batched inference over ``df`` and return texts with class probabilities.

    Args:
        tokenizer: Tokenizer callable. Invoked with two lists of strings plus
            padding/truncation keyword arguments unless ``preprocess_fn`` is
            given.
        model: Classification model. Called with the tokenized tensors as
            keyword arguments; its result must expose ``.logits``.
        df: DataFrame holding the sentence pairs, either in
            ``sentence1``/``sentence2`` or in ``premise``/``hypothesis`` columns.
        dataset_name: Label shown in the progress-bar description.
        preprocess_fn: Optional ``fn(batch_df, tokenizer)`` that builds the
            model inputs itself, replacing the default tokenization.

    Returns:
        DataFrame with the two text columns followed by ``Entailment``,
        ``Neutral`` and ``Contradiction`` probability columns, one row per
        input row.
    """
    canonical_labels = ['Entailment', 'Neutral', 'Contradiction']
    batch_size = 8  # Adjust based on your GPU memory
    model.eval()
    model.to(device)

    # Determine column names based on dataset; the same pair is used both for
    # tokenization and for the text columns of the result.
    text_columns = ['sentence1', 'sentence2'] if 'sentence1' in df.columns else ['premise', 'hypothesis']
    first_col, second_col = text_columns

    # Checkpoints do not all order their output classes the same way, so take
    # the logit-index -> label mapping from the model config when it names
    # exactly the three NLI classes. Otherwise keep the historical assumption
    # (index 0 = Entailment, 1 = Neutral, 2 = Contradiction).
    # NOTE(review): confirm each checkpoint's `config.id2label` is populated.
    id2label = getattr(getattr(model, 'config', None), 'id2label', None) or {}
    index_to_label = {int(idx): str(label).strip().capitalize() for idx, label in id2label.items()}
    output_labels = [index_to_label.get(i) for i in range(len(canonical_labels))]
    if set(output_labels) != set(canonical_labels):
        output_labels = canonical_labels

    all_probs = []
    for batch_start in tqdm(range(0, len(df), batch_size), desc=f"Predicting {dataset_name}"):
        # iloc slicing clamps at the end of the frame, so the last batch may be short.
        batch = df.iloc[batch_start:batch_start + batch_size]

        if preprocess_fn:
            tokenized_inputs = preprocess_fn(batch, tokenizer)
        else:
            tokenized_inputs = tokenizer(
                batch[first_col].tolist(),
                batch[second_col].tolist(),
                padding=True,
                truncation=True,
                max_length=512,
                return_tensors="pt",
            )

        tokenized_inputs = {key: value.to(device) for key, value in tokenized_inputs.items()}

        with torch.no_grad():
            outputs = model(**tokenized_inputs)
            probs = torch.softmax(outputs.logits, dim=1).cpu().numpy()
            all_probs.extend(probs)

    # Release cached GPU blocks once per dataset instead of once per batch.
    torch.cuda.empty_cache()

    # Label by the model's own class order, then present in the fixed column order.
    probs_df = pd.DataFrame(all_probs, columns=output_labels)[canonical_labels]
    result_df = pd.concat([df[text_columns].reset_index(drop=True), probs_df], axis=1)

    return result_df


In [9]:
def process_datasets_and_save_to_csv(model_name, datasets):
    """Predict every dataset with one model and write one CSV per dataset.

    Args:
        model_name: Key into the module-level ``models`` and ``tokenizers`` dicts.
        datasets: Mapping of dataset name -> DataFrame to run predictions on.

    Each result is written to ``{model_name}_{dataset_name}_results.csv`` in
    the current working directory. Datasets whose name contains ``"anli"`` are
    tokenized through ``preprocess_data``; all others use the default path.
    The model is moved back to the CPU once all datasets are done.
    """
    print(f"Processing with {model_name}")
    model = models[model_name].to(device)
    tokenizer = tokenizers[model_name]

    for dataset_name, frame in datasets.items():
        print(f"Processing dataset: {dataset_name}")

        # Only the ANLI datasets go through the custom preprocessing step.
        custom_preprocess = preprocess_data if "anli" in dataset_name else None
        predictions = tokenize_and_predict_to_df(
            tokenizer,
            model,
            frame,
            dataset_name,
            preprocess_fn=custom_preprocess,
        )

        csv_file_name = f"{model_name}_{dataset_name}_results.csv"
        predictions.to_csv(csv_file_name, index=False)
        print(f"Results for {dataset_name} saved to {csv_file_name}")

    # Move the weights off the accelerator and drop the local references
    # before asking torch to release its cached GPU memory.
    model.to('cpu')
    del model, tokenizer
    torch.cuda.empty_cache()
    print(f"Finished processing and saving results for {model_name}")


In [None]:
for name in model_names.keys():
    process_datasets_and_save_to_csv(name, datasets)


Processing with deberta
Processing dataset: mnli_matched


Predicting mnli_matched:   0%|          | 0/1225 [00:00<?, ?it/s]

Results for mnli_matched saved to deberta_mnli_matched_results.csv
Processing dataset: mnli_mismatched


Predicting mnli_mismatched:   0%|          | 0/1231 [00:00<?, ?it/s]