In [1]:
# Import Libraries and Load Model/Tokenizer
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import pandas as pd
import torch
from tqdm.auto import tqdm

model_name = "ynie/albert-xxlarge-v2-snli_mnli_fever_anli_R1_R2_R3-nli"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name)

device = "cuda" if torch.cuda.is_available() else "cpu"
model.to(device)

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/896 [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/760k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/156 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/890M [00:00<?, ?B/s]

  return self.fget.__get__(instance, owner)()


AlbertForSequenceClassification(
  (albert): AlbertModel(
    (embeddings): AlbertEmbeddings(
      (word_embeddings): Embedding(30000, 128, padding_idx=0)
      (position_embeddings): Embedding(512, 128)
      (token_type_embeddings): Embedding(2, 128)
      (LayerNorm): LayerNorm((128,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0, inplace=False)
    )
    (encoder): AlbertTransformer(
      (embedding_hidden_mapping_in): Linear(in_features=128, out_features=4096, bias=True)
      (albert_layer_groups): ModuleList(
        (0): AlbertLayerGroup(
          (albert_layers): ModuleList(
            (0): AlbertLayer(
              (full_layer_layer_norm): LayerNorm((4096,), eps=1e-12, elementwise_affine=True)
              (attention): AlbertAttention(
                (query): Linear(in_features=4096, out_features=4096, bias=True)
                (key): Linear(in_features=4096, out_features=4096, bias=True)
                (value): Linear(in_features=4096, out_featur

In [2]:
# MNLI-m test dataset

df_mnli_matched = pd.read_csv('/kaggle/input/nli-dataset-for-sentence-understanding/mnli_test_matched.csv')

# MNLI-mm test dataset

df_mnli_mismatched = pd.read_csv('/kaggle/input/nli-dataset-for-sentence-understanding/mnli_test_mismatched.csv')

# ANLI test dataset round 1

df_anli_r1 = pd.read_csv('/kaggle/input/anli-a-large-scale-nli-benchmark-dataset/test_r1.csv')

# ANLI test dataset round 2

df_anli_r2 = pd.read_csv('/kaggle/input/anli-a-large-scale-nli-benchmark-dataset/test_r2.csv')

# ANLI test dataset round 3

df_anli_r3 = pd.read_csv('/kaggle/input/anli-a-large-scale-nli-benchmark-dataset/test_r3.csv')

# SNLI test dataset

df_snli = pd.read_csv('/kaggle/input/stanford-natural-language-inference-corpus/snli_1.0_test.csv')

In [3]:
def preprocess_data(df, dataset_type):
    # Explicitly copy the DataFrame to avoid SettingWithCopyWarning
    df = df.copy()
    
    if dataset_type in ['mnli_matched', 'mnli_mismatched']:
        df['premise'] = df['premise'].astype(str)
        df['hypothesis'] = df['hypothesis'].astype(str)
        tokenized_inputs = tokenizer(df['premise'].tolist(), df['hypothesis'].tolist(), padding=True, truncation=True, max_length=512, return_tensors="pt")
    elif dataset_type in ['anli_r1', 'anli_r2', 'anli_r3']:
        df['premise'] = df['premise'].astype(str)
        df['hypothesis'] = df['hypothesis'].astype(str)
        df['reason'] = df.get('reason', '').fillna('').astype(str)  # Safely handle 'reason' column
        concatenated_hypotheses = df['hypothesis'] + " [SEP] " + df['reason']
        tokenized_inputs = tokenizer(df['premise'].tolist(), concatenated_hypotheses.tolist(), padding=True, truncation=True, max_length=512, return_tensors="pt")
    elif dataset_type == 'snli':
        df['sentence1'] = df['sentence1'].astype(str)
        df['sentence2'] = df['sentence2'].astype(str)
        tokenized_inputs = tokenizer(df['sentence1'].tolist(), df['sentence2'].tolist(), padding=True, truncation=True, max_length=512, return_tensors="pt")
    else:
        raise ValueError("Dataset type not recognized.")
    return tokenized_inputs


In [4]:
def predict_dataset(df, dataset_type, batch_size=32):
    probabilities = []
    num_batches = (len(df) + batch_size - 1) // batch_size
    
    model.eval()
    
    for i in tqdm(range(num_batches), desc=f"Predicting {dataset_type}"):
        batch = df.iloc[i * batch_size:(i + 1) * batch_size]
        tokenized_inputs = preprocess_data(batch, dataset_type)
        tokenized_inputs = {key: value.to(device) for key, value in tokenized_inputs.items()}
        
        with torch.no_grad():
            outputs = model(**tokenized_inputs)
            probs = torch.softmax(outputs.logits, dim=1).cpu().numpy()
            probabilities.extend(probs)
        
        torch.cuda.empty_cache()
    
    # Convert the probabilities to a DataFrame
    probs_df = pd.DataFrame(probabilities, columns=['Entailment', 'Neutral', 'Contradiction'])
    
    # Save the DataFrame to a CSV file
    probs_df.to_csv(f'/kaggle/working/albert_{dataset_type}_predictions.csv', index=False)
    
    return probabilities


In [5]:
# Make predictions and save to CSV
predict_dataset(df_anli_r1, 'anli_r1', batch_size=32)
predict_dataset(df_anli_r2, 'anli_r2', batch_size=32)
predict_dataset(df_anli_r3, 'anli_r3', batch_size=32)
predict_dataset(df_mnli_matched, 'mnli_matched', batch_size=32)
predict_dataset(df_mnli_mismatched, 'mnli_mismatched', batch_size=32)
predict_dataset(df_snli, 'snli', batch_size=32)

Predicting anli_r1:   0%|          | 0/32 [00:00<?, ?it/s]

Predicting anli_r2:   0%|          | 0/32 [00:00<?, ?it/s]

Predicting anli_r3:   0%|          | 0/38 [00:00<?, ?it/s]

Predicting mnli_matched:   0%|          | 0/307 [00:00<?, ?it/s]

Predicting mnli_mismatched:   0%|          | 0/308 [00:00<?, ?it/s]

Predicting snli:   0%|          | 0/313 [00:00<?, ?it/s]

[array([0.00865253, 0.947434  , 0.04391349], dtype=float32),
 array([0.7403315 , 0.25643367, 0.00323485], dtype=float32),
 array([0.00467654, 0.06048067, 0.93484277], dtype=float32),
 array([0.034056  , 0.9566873 , 0.00925674], dtype=float32),
 array([0.49876088, 0.49926996, 0.00196922], dtype=float32),
 array([0.0039013 , 0.09436977, 0.901729  ], dtype=float32),
 array([9.7817034e-01, 2.1051416e-02, 7.7824458e-04], dtype=float32),
 array([0.00494535, 0.9854117 , 0.00964287], dtype=float32),
 array([0.08555043, 0.07503659, 0.839413  ], dtype=float32),
 array([0.00478786, 0.9408013 , 0.05441086], dtype=float32),
 array([0.8867467 , 0.11230092, 0.0009523 ], dtype=float32),
 array([0.00545288, 0.07471507, 0.91983205], dtype=float32),
 array([0.13802557, 0.8554469 , 0.00652753], dtype=float32),
 array([9.738555e-01, 2.585009e-02, 2.944479e-04], dtype=float32),
 array([0.00383149, 0.03916397, 0.95700455], dtype=float32),
 array([0.00132432, 0.00739372, 0.99128187], dtype=float32),
 array([9