In [None]:
!pip install datasets==2.14.6
!pip install transformers
!pip install evaluate
!pip install --no-cache-dir transformers sentencepiece
!pip install accelerate -U

In [None]:
import torch
from datasets import Dataset, DatasetDict
from datasets import concatenate_datasets
from torch.utils.data import DataLoader
import os
from tqdm.notebook import tqdm
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from dataclasses import dataclass
from transformers import AutoTokenizer, AutoModelForMultipleChoice, get_scheduler, TrainingArguments, Trainer
from transformers.tokenization_utils_base import PreTrainedTokenizerBase, PaddingStrategy
from typing import Optional, Union
import evaluate
import accelerate

In [None]:
# Colab-only: mount Google Drive under /content/drive (the training data is read from there).
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# Checkpoint to fine-tune; the RoBERTa checkpoint is kept as a commented-out alternative.
# model_name = "FacebookAI/roberta-large"
model_name = "bert-large-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_name)

# NOTE(review): allow_pickle=True unpickles arbitrary Python objects; only load
# this file when it comes from a trusted source.
train_data_path = '/content/drive/MyDrive/data/SP-train.npy'
train_data = np.load(train_data_path, allow_pickle=True)

In [None]:
def convert_to_dataset_type(array):
    """Turn the loaded array of example records into a `Dataset`.

    The records are expanded into a DataFrame, the id/distractor columns are
    cast to `str` and `label` to `int`, and the frame is wrapped as a Dataset
    tagged with the "train" split.
    """
    frame = pd.DataFrame(array.tolist())
    string_columns = ('id', 'distractor1', 'distractor2', 'distractor(unsure)')
    # One astype call with a per-column dtype mapping.
    dtype_by_column = {name: str for name in string_columns}
    dtype_by_column['label'] = int
    frame = frame.astype(dtype_by_column)
    return Dataset.from_pandas(frame, split="train")

def preprocess_function(examples):
    """Tokenize a batch of multiple-choice examples as (question, choice) pairs.

    Args:
        examples: Batch mapping with a "question" list and a parallel
            "choice_list" list (one list of candidate answers per question).

    Returns:
        dict mapping every key produced by the tokenizer to a list with one
        entry per example; each entry holds that example's per-choice
        encodings, in choice order.
    """
    from itertools import accumulate, chain

    questions = examples["question"]
    choice_lists = examples["choice_list"]

    # Derive the number of choices from the data instead of assuming 4, so each
    # question is repeated exactly once per candidate it actually has.
    choice_counts = [len(choices) for choices in choice_lists]
    first_sentences = [
        question
        for question, count in zip(questions, choice_counts)
        for _ in range(count)
    ]
    # chain.from_iterable flattens in linear time (sum(list_of_lists, []) is quadratic).
    second_sentences = list(chain.from_iterable(choice_lists))

    tokenized_examples = tokenizer(first_sentences, second_sentences, truncation=True, add_special_tokens=True)

    # Slice boundaries that map the flat encodings back onto their examples.
    bounds = list(accumulate(choice_counts, initial=0))
    return {
        key: [values[start:stop] for start, stop in zip(bounds, bounds[1:])]
        for key, values in tokenized_examples.items()
    }

In [None]:
# Build the Dataset from the raw records, then tokenize it batch-wise.
train_dataset = convert_to_dataset_type(train_data)
tokenized_train = train_dataset.map(function=preprocess_function, batched=True)

num_tokenized_examples = len(tokenized_train)
print(f"Training set size: {num_tokenized_examples}")

Map:   0%|          | 0/507 [00:00<?, ? examples/s]

Training set size: 507


In [None]:
def split_dataset(dataset, seed=42):
    """Split a dataset into train/valid/test parts (80% / 10% / 10%).

    Args:
        dataset: Object exposing `to_pandas()`; its rows are shuffled and split.
        seed: Passed as `random_state` to both `train_test_split` calls so the
            partition is reproducible across runs. Pass None for a different
            random split on every call.

    Returns:
        DatasetDict with the keys "train", "valid" and "test".
    """
    df = dataset.to_pandas()
    # 20% is held out first, then halved into validation and test.
    train_df, holdout_df = train_test_split(df, test_size=0.2, shuffle=True, random_state=seed)
    val_df, test_df = train_test_split(holdout_df, test_size=0.5, shuffle=True, random_state=seed)

    # preserve_index=False: the shuffled pandas index would otherwise be stored
    # as an extra `__index_level_0__` column in every split.
    return DatasetDict({
        "train": Dataset.from_pandas(train_df, preserve_index=False),
        "valid": Dataset.from_pandas(val_df, preserve_index=False),
        "test": Dataset.from_pandas(test_df, preserve_index=False),
    })

my_dataset = split_dataset(tokenized_train)

# Print the size of every split, one line per split.
for heading, split_name in (("Training", "train"), ("Validation", "valid"), ("Testing", "test")):
    print(f"{heading} dataset size:", len(my_dataset[split_name]))

Training dataset size: 405
Validation dataset size: 51
Testing dataset size: 51


In [None]:
@dataclass
class DataCollatorForMultipleChoice:
    """
    Data collator that will dynamically pad the inputs for multiple choice received.

    Each feature is a mapping whose non-label values hold one entry per choice.
    The choices of all features are flattened, padded together through
    `tokenizer.pad`, and reshaped to (batch_size, num_choices, -1). The labels
    are returned under the "labels" key as an int64 tensor.
    """

    # Tokenizer whose `pad` method pads the flattened per-choice encodings.
    tokenizer: PreTrainedTokenizerBase
    # The three options below are forwarded unchanged to `tokenizer.pad`.
    padding: Union[bool, str, PaddingStrategy] = True
    max_length: Optional[int] = None
    pad_to_multiple_of: Optional[int] = None

    def __call__(self, features):
        """Collate a list of feature mappings into one padded batch dict."""
        label_name = "label" if "label" in features[0].keys() else "labels"
        # Read the labels without popping them: the caller's feature dicts stay
        # intact, so the same list can be collated more than once.
        labels = [feature[label_name] for feature in features]
        batch_size = len(features)
        # The choice count is taken from the first feature and applied to all.
        num_choices = len(features[0]["input_ids"])

        # One flat entry per (feature, choice), feature-major, without the label.
        flattened_features = [
            {k: v[i] for k, v in feature.items() if k != label_name}
            for feature in features
            for i in range(num_choices)
        ]

        batch = self.tokenizer.pad(
            flattened_features,
            padding=self.padding,
            max_length=self.max_length,
            pad_to_multiple_of=self.pad_to_multiple_of,
            return_tensors="pt",
        )

        # Restore the (batch, choice) structure that was flattened for padding.
        batch = {k: v.view(batch_size, num_choices, -1) for k, v in batch.items()}
        batch["labels"] = torch.tensor(labels, dtype=torch.int64)
        return batch

In [None]:
# Accuracy metric loaded through `evaluate`; compute_metrics below scores predictions with it.
accuracy = evaluate.load("accuracy")

def compute_metrics(eval_pred):
    """Score evaluation output with the module-level `accuracy` metric.

    `eval_pred` unpacks into (predictions, labels); the predicted class of
    each row is the argmax along axis 1.
    """
    scores, references = eval_pred
    predicted_classes = np.argmax(scores, axis=1)
    return accuracy.compute(predictions=predicted_classes, references=references)

Downloading builder script:   0%|          | 0.00/4.20k [00:00<?, ?B/s]

In [None]:
def get_final_dataset(dataset):
    """Prepare a tokenized dataset for the Trainer.

    Renames "label" to "labels", removes the raw columns listed below, and
    switches the output format to torch before returning the result.
    """
    raw_columns = [
        'id', 'question', 'answer', 'distractor1', 'distractor2',
        'distractor(unsure)', 'choice_list', 'choice_order',
    ]
    model_ready = dataset.rename_column("label", "labels").remove_columns(raw_columns)
    # set_format is called for its effect on model_ready; its return value is not used.
    model_ready.set_format("torch")
    return model_ready

tokenized_datasets = get_final_dataset(my_dataset)
# Environment switch that turns off Weights & Biases logging.
os.environ["WANDB_DISABLED"] = "true"

model = AutoModelForMultipleChoice.from_pretrained(model_name, ignore_mismatched_sizes=True)
# Use the GPU when torch can see one, otherwise stay on the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)
model.to(device)

model.safetensors:   0%|          | 0.00/1.34G [00:00<?, ?B/s]

Some weights of BertForMultipleChoice were not initialized from the model checkpoint at bert-large-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


cuda


BertForMultipleChoice(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 1024, padding_idx=0)
      (position_embeddings): Embedding(512, 1024)
      (token_type_embeddings): Embedding(2, 1024)
      (LayerNorm): LayerNorm((1024,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-23): 24 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=1024, out_features=1024, bias=True)
              (key): Linear(in_features=1024, out_features=1024, bias=True)
              (value): Linear(in_features=1024, out_features=1024, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=1024, out_features=1024, bias=True)
              (LayerNorm): LayerNorm((1024,), eps=1

In [None]:
batch_size = 4
lr = 3e-5
num_epochs = 3

# The Trainer keeps the final, smaller batch of each epoch (drop_last is off by
# default), so an epoch has ceil(n / batch_size) optimizer steps. Floor division
# undercounts the total and lets the linear schedule reach zero learning rate
# before training ends. -(-a // b) is ceiling division without an import.
batches_per_epoch = -(-len(my_dataset["train"]) // batch_size)
num_training_steps = batches_per_epoch * num_epochs

optimizer = torch.optim.AdamW(model.parameters(), lr=lr)

# Linear decay from `lr` to 0 over the whole run, without warmup.
lr_scheduler = get_scheduler(
    name="linear", optimizer=optimizer, num_warmup_steps=0, num_training_steps=num_training_steps
)

In [None]:
# Evaluate and log every 20 steps; no checkpoints are written (save_strategy="no").
training_args = TrainingArguments(
    output_dir = "./output",
    evaluation_strategy="steps",
    eval_steps=20,
    logging_steps=20,
    logging_strategy="steps",
    learning_rate=lr,
    num_train_epochs=num_epochs,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    # The string "none" disables all reporting integrations; passing None
    # falls back to reporting to every installed integration.
    report_to="none",
    save_strategy="no"
    )

Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


In [None]:
# Train on the "train" split and evaluate on "valid"; the explicitly built
# optimizer and scheduler are handed over instead of the Trainer's defaults.
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=DataCollatorForMultipleChoice(tokenizer=tokenizer),
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["valid"],
    optimizers=(optimizer, lr_scheduler),
    compute_metrics=compute_metrics
)


In [None]:
# Fine-tune the model; `train_result` is kept because a later cell reads its `.metrics`.
print(f'Training {model_name}')
train_result = trainer.train()

You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Training bert-large-uncased


Step,Training Loss,Validation Loss,Accuracy
20,1.3123,1.400763,0.352941
40,1.2512,1.245125,0.392157
60,0.9833,1.218071,0.470588
80,1.0239,0.957672,0.627451
100,0.7801,0.802805,0.647059
120,0.5137,0.803011,0.72549
140,0.4791,0.868537,0.72549
160,0.4354,0.716551,0.823529
180,0.4071,0.703431,0.823529
200,0.2357,0.60789,0.823529


In [None]:
# Persist the training metrics and the trainer state through the Trainer's own save helpers.
metrics = train_result.metrics
trainer.save_metrics("train", metrics)
trainer.save_state()

In [None]:
def evaluate_accuracy(dataset, pred_list=False):
    """Measure multiple-choice accuracy of the global `model` on raw examples.

    Args:
        dataset: Iterable of examples, each providing "question" (str),
            "choice_list" (sequence of str) and "label" (index of the correct
            choice).
        pred_list: When True, also return the predicted index of every example.

    Returns:
        The accuracy rounded to three decimals, or the tuple
        `(accuracy, predictions)` when `pred_list` is True.
    """
    total_answers = 0
    correct_answers = 0
    predictions = []

    model.eval()
    # Follow the model's own device instead of hard-coding "cuda", so the
    # function also works when the model was placed on the CPU.
    target_device = model.device

    for example in dataset:
        prompt = example['question'].strip()
        true_label = example['label']
        # One (prompt, candidate) pair per choice, however many choices there are.
        pairs = [[prompt, candidate.strip()] for candidate in example['choice_list']]

        # truncation=True matches the training-time preprocessing and keeps
        # over-long inputs within the model's maximum sequence length.
        inputs = tokenizer(pairs, return_tensors="pt", padding=True, truncation=True).to(target_device)

        # Add a leading batch dimension of 1: (num_choices, seq) -> (1, num_choices, seq).
        # No labels are passed because only the logits are needed here.
        with torch.no_grad():
            outputs = model(**{k: v.unsqueeze(0) for k, v in inputs.items()})

        predicted_class = outputs.logits.argmax().item()
        predictions.append(predicted_class)

        if predicted_class == true_label:
            correct_answers += 1
        total_answers += 1

    # Round accuracy to three decimal places
    rounded_accuracy = round(correct_answers / total_answers, 3)
    print("Accuracy is", rounded_accuracy)

    if pred_list:
        return rounded_accuracy, predictions
    return rounded_accuracy

In [None]:
# Score the fine-tuned model on the "test" split, which is not passed to the Trainer.
acc = evaluate_accuracy(my_dataset['test'])

Accuracy is 0.843
