In [None]:
!pip install datasets==2.14.6
!pip install transformers
!pip install evaluate
!pip install --no-cache-dir transformers sentencepiece
!pip install accelerate -U

In [None]:
import torch
from datasets import Dataset, DatasetDict
from datasets import concatenate_datasets
from torch.utils.data import DataLoader
import os
from tqdm.notebook import tqdm
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from dataclasses import dataclass
from transformers import AutoTokenizer, AutoModelForMultipleChoice, get_scheduler, TrainingArguments, Trainer
from transformers.tokenization_utils_base import PreTrainedTokenizerBase, PaddingStrategy
from typing import Optional, Union
import evaluate
import accelerate

In [None]:
# Mount Google Drive into the Colab runtime at /content/drive; the training
# data is read from a path under this mount point in a later cell.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Checkpoint name shared by the tokenizer here and the model loaded later.
model_name = "bert-large-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_name)
# NOTE(review): allow_pickle=True makes np.load unpickle Python objects stored in
# the file, which can execute arbitrary code - only load .npy files from a
# trusted source.
train_data = np.load('/content/drive/MyDrive/data/SP-train.npy', allow_pickle=True)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

In [None]:
def convert_to_dataset_type(array):
    """Convert the raw array of example records into a ``Dataset``.

    Args:
        array: Object exposing ``tolist()`` that yields one record per example
            (the notebook passes the array returned by ``np.load``).

    Returns:
        A ``Dataset`` built from the records with split name ``"train"``; the
        id/distractor columns are cast to ``str`` and ``label`` to ``int``.
    """
    frame = pd.DataFrame(array.tolist())

    # One cast table for every column whose dtype is normalised.
    casts = dict.fromkeys(('id', 'distractor1', 'distractor2', 'distractor(unsure)'), str)
    casts['label'] = int
    frame = frame.astype(casts)

    return Dataset.from_pandas(frame, split="train")

def preprocess_function(examples):
    """Tokenize a batch of multiple-choice examples as (question, choice) pairs.

    Each question is paired with every entry of its ``choice_list``; the flat
    tokenizer output is then regrouped so that every field holds one list of
    per-choice encodings per example.

    The number of choices is taken from each example's ``choice_list`` instead
    of being fixed at 4, so batches with a different (or varying) number of
    choices stay correctly aligned. For 4-choice data the output is unchanged.

    Args:
        examples: Batched mapping with a ``"question"`` list and a parallel
            ``"choice_list"`` list of choice lists.

    Returns:
        Dict mapping every tokenizer output field to a list (one entry per
        example) of per-choice values.
    """
    questions = examples["question"]
    choice_lists = examples["choice_list"]

    # Flatten with comprehensions (linear) rather than sum(list_of_lists, []),
    # which re-copies the accumulator on every step.
    first_sentences = [
        question
        for question, choices in zip(questions, choice_lists)
        for _ in choices
    ]
    second_sentences = [choice for choices in choice_lists for choice in choices]

    tokenized_examples = tokenizer(first_sentences, second_sentences, truncation=True, add_special_tokens=True)

    # Regroup the flat encodings example by example using each example's own
    # choice count.
    regrouped = {}
    for key, values in tokenized_examples.items():
        per_example = []
        start = 0
        for choices in choice_lists:
            stop = start + len(choices)
            per_example.append(values[start:stop])
            start = stop
        regrouped[key] = per_example
    return regrouped

In [None]:
# Wrap the raw records in a Dataset, then add the per-choice tokenizer fields
# produced by preprocess_function (applied in batches).
train_dataset = convert_to_dataset_type(train_data)
tokenized_train = train_dataset.map(preprocess_function, batched=True)
print(f"Training set size: {len(tokenized_train)}")

Map:   0%|          | 0/507 [00:00<?, ? examples/s]

Training set size: 507


In [None]:
# Partition the examples by the marker embedded in their id:
#   ids containing "_SR" -> semantic_data, ids containing "_CR" -> context_data,
#   ids containing neither marker -> original_data.
original_data = tokenized_train.filter(lambda data: "_SR" not in data["id"] and "_CR" not in data["id"])
semantic_data = tokenized_train.filter(lambda data: "_SR" in data["id"])
context_data = tokenized_train.filter(lambda data: "_CR" in data["id"])

Filter:   0%|          | 0/507 [00:00<?, ? examples/s]

Filter:   0%|          | 0/507 [00:00<?, ? examples/s]

Filter:   0%|          | 0/507 [00:00<?, ? examples/s]

In [None]:
def split_dataset(dataset):
    """Split a dataset 80/10/10 into train/val/test, preserving row order.

    The first 80% of the rows become ``train``; the remaining 20% are cut in
    half into ``val`` and ``test``. Nothing is shuffled here.

    Args:
        dataset: Object exposing ``to_pandas()`` (a ``Dataset`` in this notebook).

    Returns:
        ``DatasetDict`` with the keys ``"train"``, ``"val"`` and ``"test"``.
    """
    df = dataset.to_pandas()
    train_df, holdout_df = train_test_split(df, test_size=0.2, shuffle=False)
    val_df, test_df = train_test_split(holdout_df, test_size=0.5, shuffle=False)

    # The slices returned by train_test_split keep their original row labels.
    # Without preserve_index=False, from_pandas stores any non-RangeIndex as an
    # extra "__index_level_0__" column, which then leaks into every split.
    return DatasetDict({
        split_name: Dataset.from_pandas(frame, preserve_index=False)
        for split_name, frame in (("train", train_df), ("val", val_df), ("test", test_df))
    })

# Split each question family on its own so every family is represented in
# train, val and test.
original_dataset = split_dataset(original_data)
semantic_dataset = split_dataset(semantic_data)
context_dataset = split_dataset(context_data)

# Merge the per-family splits, then shuffle each merged split with a fixed seed.
family_splits = [original_dataset, semantic_dataset, context_dataset]

train_dataset = concatenate_datasets([family["train"] for family in family_splits]).shuffle(seed=42)
val_dataset = concatenate_datasets([family["val"] for family in family_splits]).shuffle(seed=42)
test_dataset = concatenate_datasets([family["test"] for family in family_splits]).shuffle(seed=42)

my_dataset = DatasetDict({
    "train": train_dataset,
    "val": val_dataset,
    "test": test_dataset,
})

print("Training dataset size:", len(my_dataset['train']))
print("Validation dataset size:", len(my_dataset['val']))
print("Testing dataset size:", len(my_dataset['test']))

Training dataset size: 405
Validation dataset size: 51
Testing dataset size: 51


  block_group = [InMemoryTable(cls._concat_blocks(list(block_group), axis=axis))]
  table = cls._concat_blocks(blocks, axis=0)


In [None]:
@dataclass
class DataCollatorForMultipleChoice:
    """Pad multiple-choice features into ``(batch, num_choices, seq_len)`` tensors.

    Each incoming feature holds, per field, one sequence for every answer
    choice plus a ``label``/``labels`` entry. The choices are flattened, padded
    together through ``tokenizer.pad`` and reshaped back to one row per example.

    Attributes are forwarded unchanged to ``tokenizer.pad``.
    """

    # Tokenizer whose ``pad`` method performs the padding.
    tokenizer: PreTrainedTokenizerBase
    padding: Union[bool, str, PaddingStrategy] = True
    max_length: Optional[int] = None
    pad_to_multiple_of: Optional[int] = None

    def __call__(self, features):
        """Collate ``features`` into a batch dict of tensors.

        Args:
            features: Non-empty list of dicts; every non-label value is indexable
                by choice position, and all features share the same choice count.

        Returns:
            Dict of tensors viewed as ``(batch_size, num_choices, -1)`` plus an
            int64 ``"labels"`` tensor with one entry per feature.
        """
        label_name = "label" if "label" in features[0] else "labels"
        # Read the label instead of popping it: the caller's feature dicts stay
        # intact, so the same features can be collated again without a KeyError.
        labels = [feature[label_name] for feature in features]
        batch_size = len(features)
        num_choices = len(features[0]["input_ids"])

        # One dict per (example, choice), built in a single linear pass
        # rather than with sum(list_of_lists, []).
        flattened_features = [
            {k: v[i] for k, v in feature.items() if k != label_name}
            for feature in features
            for i in range(num_choices)
        ]

        batch = self.tokenizer.pad(
            flattened_features,
            padding=self.padding,
            max_length=self.max_length,
            pad_to_multiple_of=self.pad_to_multiple_of,
            return_tensors="pt",
        )

        # Un-flatten: group each example's choices back together.
        batch = {k: v.view(batch_size, num_choices, -1) for k, v in batch.items()}
        batch["labels"] = torch.tensor(labels, dtype=torch.int64)
        return batch

In [None]:
# Load the "accuracy" metric from the evaluate library; compute_metrics uses it
# to score the Trainer's evaluation predictions.
accuracy = evaluate.load("accuracy")

def compute_metrics(eval_pred):
    """Score evaluation predictions with the module-level ``accuracy`` metric.

    Args:
        eval_pred: Pair ``(predictions, labels)``; the arg-max along axis 1 of
            the predictions is taken as the predicted choice.

    Returns:
        Whatever ``accuracy.compute`` returns for those predictions and labels.
    """
    scores, references = eval_pred
    predicted_choices = np.argmax(scores, axis=1)
    return accuracy.compute(references=references, predictions=predicted_choices)

Downloading builder script:   0%|          | 0.00/4.20k [00:00<?, ?B/s]

In [None]:
def get_final_dataset(dataset):
    """Reduce a tokenized dataset to the columns needed for training.

    Renames ``label`` to ``labels``, drops the raw text/bookkeeping columns and
    switches the output format to torch.

    Args:
        dataset: Dataset-like object providing ``rename_column``,
            ``remove_columns`` and ``set_format``.

    Returns:
        The renamed, column-pruned dataset with its format set to ``"torch"``.
    """
    columns_to_drop = [
        'id', 'question', 'answer',
        'distractor1', 'distractor2', 'distractor(unsure)',
        'choice_list', 'choice_order',
    ]
    model_ready = dataset.rename_column("label", "labels").remove_columns(columns_to_drop)
    model_ready.set_format("torch")
    return model_ready

# Model-ready view of every split: labels renamed, raw columns dropped, torch format.
final_dataset = get_final_dataset(my_dataset)
# Switch off Weights & Biases reporting via its environment variable.
os.environ["WANDB_DISABLED"] = "true"
# Use the GPU when torch can see one, otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
# NOTE(review): ignore_mismatched_sizes=True tolerates checkpoint/model shape
# mismatches instead of raising - confirm it is actually needed for this checkpoint.
model = AutoModelForMultipleChoice.from_pretrained(model_name, ignore_mismatched_sizes=True).to(device)

model.safetensors:   0%|          | 0.00/1.34G [00:00<?, ?B/s]

Some weights of BertForMultipleChoice were not initialized from the model checkpoint at bert-large-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# Optimisation hyper-parameters shared by the optimizer, the scheduler and the
# TrainingArguments built in the next cell.
bs = 4
lr = 3e-5
epochs = 3
num_samples = len(my_dataset["train"])

# Ceiling division: the last, partially filled batch of an epoch is still an
# optimizer step unless dataloader_drop_last is enabled (it is not, here).
# Floor division undercounts (405 // 4 == 101 while the run takes 102 steps per
# epoch), which drives the linear schedule to zero before training finishes.
# Assumes a single device and no gradient accumulation, as configured here.
batches_per_epoch = (num_samples + bs - 1) // bs
num_training_steps = batches_per_epoch * epochs
optimizer = torch.optim.AdamW(model.parameters(), lr = lr)
lr_scheduler = get_scheduler(name = "linear", optimizer = optimizer, num_warmup_steps = 0, num_training_steps = num_training_steps)

In [None]:
# Trainer configuration: evaluate, log and checkpoint once per epoch.
training_arguments = TrainingArguments(
    output_dir = "./BERT_output",
    overwrite_output_dir = True,
    evaluation_strategy = "epoch",
    logging_strategy = "epoch",
    learning_rate = lr,
    num_train_epochs = epochs,
    per_device_train_batch_size = bs,
    per_device_eval_batch_size = bs,
    save_strategy = "epoch",
    # Disable external experiment trackers explicitly rather than relying on
    # the deprecated WANDB_DISABLED environment variable. Note that "none"
    # turns off every reporting integration, not only Weights & Biases.
    report_to = "none"
)

Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


In [None]:
# Assemble the Trainer: batches are padded by the multiple-choice collator, the
# "val" split is used for evaluation, and the optimizer / LR scheduler created
# earlier are passed in explicitly via `optimizers`.
trainer = Trainer(
    model = model,
    args = training_arguments,
    data_collator = DataCollatorForMultipleChoice(tokenizer = tokenizer),
    train_dataset = final_dataset["train"],
    eval_dataset = final_dataset["val"],
    optimizers = (optimizer, lr_scheduler),
    compute_metrics = compute_metrics
)

In [None]:
# Run fine-tuning; the returned object carries the training metrics saved in the next cell.
print(f'Training {model_name}')
train_result = trainer.train()

You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Training bert-large-uncased


Epoch,Training Loss,Validation Loss,Accuracy
1,0.9581,0.70101,0.627451
2,0.3029,0.502093,0.764706
3,0.0896,0.529844,0.784314


In [None]:
# Persist the training metrics and the trainer state through the Trainer helpers.
metrics = train_result.metrics
trainer.save_metrics("train", metrics)
trainer.save_state()

In [None]:
def evaluate_accuracy(dataset):
    """Return the accuracy (in percent) of the global ``model`` on ``dataset``.

    Every sample's question is paired with each of its choices, encoded with
    the global ``tokenizer`` and scored by the model; the highest-scoring choice
    is the prediction.

    Args:
        dataset: Iterable of samples with ``'question'`` (str), ``'choice_list'``
            (sequence of str) and ``'label'`` (index of the correct choice).

    Returns:
        ``100 * correct / total`` as a float, or ``0.0`` when ``dataset`` has
        no samples (instead of raising ``ZeroDivisionError``).
    """
    total_answers = 0
    correct_answers = 0
    model.eval()
    for sample in dataset:
        ques = sample['question'].strip()
        # Use however many choices the sample provides instead of assuming four.
        choices = [choice.strip() for choice in sample['choice_list']]
        true_label = sample['label']

        # Encode (question, choice) pairs the same way as during training.
        # truncation=True matters: without it an over-long pair exceeds the
        # model's maximum input length and fails inside the forward pass.
        inputs = tokenizer(
            [ques] * len(choices),
            choices,
            return_tensors = "pt",
            padding = True,
            truncation = True,
        ).to(device)

        # Only the logits are needed, so no labels are passed and no loss is
        # computed. unsqueeze(0) adds the batch dimension the model expects.
        with torch.no_grad():
            outputs = model(**{k: v.unsqueeze(0) for k, v in inputs.items()})

        predicted_class = outputs.logits.argmax(dim=-1).item()

        if predicted_class == true_label:
            correct_answers += 1

        total_answers += 1

    if total_answers == 0:
        return 0.0
    return 100 * correct_answers / total_answers

In [None]:
# Reload fine-tuned weights from a checkpoint saved by the Trainer.
# NOTE(review): the step number is hard-coded; with 3 epochs of per-epoch saving
# this looks like an intermediate checkpoint rather than the last one - confirm
# it is the intended checkpoint.
checkpoint_path = "./BERT_output/checkpoint-204"
# NOTE(review): ignore_mismatched_sizes=True would re-initialise any weights whose
# shape differs from the checkpoint instead of raising - verify it is needed here.
model = AutoModelForMultipleChoice.from_pretrained(checkpoint_path, ignore_mismatched_sizes=True).to(device)

In [None]:
# Accuracy on each family's held-out test split, then on the combined test split.
original_acc = evaluate_accuracy(original_dataset['test'])
semantic_acc = evaluate_accuracy(semantic_dataset['test'])
context_acc = evaluate_accuracy(context_dataset['test'])
overall_acc = evaluate_accuracy(my_dataset['test'])

In [None]:
# Report the accuracies (in percent, as returned by evaluate_accuracy).
print("Accuracy on Original Dataset:", original_acc)
print("Accuracy on Semantic Dataset:", semantic_acc)
print("Accuracy on Context Dataset:", context_acc)
print("Overall Accuracy:", overall_acc)

Accuracy on Original Dataset: 64.70588235294117
Accuracy on Semantic Dataset: 76.47058823529412
Accuracy on Context Dataset: 58.82352941176471
Overall Accuracy: 66.66666666666666
