First we need to set up our environment.

This Jupyter Notebook needs the following packages installed:
- [PyTorch](https://pytorch.org/get-started/locally/)
- [transformers](https://huggingface.co/docs/transformers/installation)
- [datasets](https://huggingface.co/docs/datasets/installation)
- [opendelta](https://opendelta.readthedocs.io/en/latest/notes/installation.html)

In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
from torchtext.data import Field, BucketIterator, TabularDataset
from torchtext.data.metrics import bleu_score

import numpy as np
import pandas as pd
import random
import matplotlib.pyplot as plt

# set cache directory if desired
# if you do not set a cache directory then default values are used (usually '~/.cache')
import os
CACHE_DIR=os.path.abspath(os.path.expanduser('path/to/cache')) # I donot understand this line
%set_env TRANSFORMERS_CACHE $CACHE_DIR
%set_env HF_MODULES_CACHE $CACHE_DIR
%set_env HF_DATASETS_CACHE $CACHE_DIR

env: TRANSFORMERS_CACHE=/Users/rongwang/Desktop/DL project/path/to/cache
env: HF_MODULES_CACHE=/Users/rongwang/Desktop/DL project/path/to/cache
env: HF_DATASETS_CACHE=/Users/rongwang/Desktop/DL project/path/to/cache


In [2]:
# import packages
from datasets import load_dataset, load_metric, Dataset
from transformers import (
    AutoTokenizer,
    AutoModel,
    DefaultDataCollator,
    AutoModelForQuestionAnswering,
    DistilBertForQuestionAnswering,
    TrainingArguments,
    Trainer,
    EvalPrediction,
)

We will use the SQuAD dataset which is an english extractive QA dataset for the RC task.

In [34]:
# load dataset
# load dataset

import json
import os
from datasets import Dataset


# The datasets library is a library for loading and preprocessing datasets for machine learning. The Dataset class is a way to handle large datasets in a way that is memory efficient.
    


def read_data(filepath: str):
    with open(filepath, 'r', encoding='utf-8') as f:
        return Dataset.from_list(list(json.load(f).values()))


filepath_1 = os.path.abspath('rc_traindev/squad_train.json')
filepath_2 = os.path.abspath('rc_traindev/squad_dev.json')
filepath_3 = os.path.abspath('rc_traindev/rc_train.json')
filepath_4 = os.path.abspath('rc_traindev/rc_dev.json')

filepath_5 = os.path.abspath('rc_traindev/rc_test_1.json')
filepath_6 = os.path.abspath('rc_traindev/rc_test_2.json')

squad_train = read_data(filepath_1)
squad_dev = read_data(filepath_2)
rc_train = read_data(filepath_3)
rc_dev = read_data(filepath_4)
rc_test = read_data(filepath_5)
rc_final = read_data(filepath_6)

print(squad_train)
print(rc_train)
print(rc_dev)
print(rc_test)
print(rc_final)

    


Dataset({
    features: ['id', 'context', 'question', 'answers'],
    num_rows: 87599
})
Dataset({
    features: ['context', 'question', 'answers', 'id'],
    num_rows: 100
})
Dataset({
    features: ['context', 'question', 'answers', 'id'],
    num_rows: 100
})
Dataset({
    features: ['context', 'question', 'id'],
    num_rows: 177
})
Dataset({
    features: ['context', 'question', 'id'],
    num_rows: 1126
})


In [36]:
with open('predicted_answers_robertalarge_2.json', 'w') as f:
        json.dump(predictions, f)

Next we will preprocess the dataset (training and evaluation data).

In [124]:

# preprocess data

max_length = 500
stride = 128
model_checkpoint ="deepset/roberta-large-squad2"

tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

tokenized_squad_train = preprocess_dataset(squad_train, tokenizer)

tokenized_squad_dev = preprocess_dataset(squad_dev, tokenizer)

tokenized_rc_train = preprocess_dataset(rc_train, tokenizer)
tokenized_rc_dev = preprocess_dataset(rc_dev, tokenizer)



Map:   0%|          | 0/87599 [00:00<?, ? examples/s]

Map:   0%|          | 0/10570 [00:00<?, ? examples/s]

Map:   0%|          | 0/100 [00:00<?, ? examples/s]

Map:   0%|          | 0/100 [00:00<?, ? examples/s]

RC is often evaluated in terms of F1 on the answer word level.
Therefore we use the predictions to extract answers which will be compared with the gold answers.

## Model Training

First we set up the model.

Note that you have to re-run this cell if you want to start from scratch (i.e. using the pre-trained weights). Don't forget to re-initalize your trainer as well!

In [126]:
model_checkpoint = "distilbert-base-cased-distilled-squad"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
#model =  DistilBertForQuestionAnswering.from_pretrained(model_checkpoint)
model =  AutoModelForQuestionAnswering.from_pretrained(model_checkpoint)
model= accelerator.prepare(model)

Here we use the [transformers](https://huggingface.co/docs/transformers)' [Trainer](https://huggingface.co/docs/transformers/main_classes/trainer#transformers.Trainer).
You can modify training using [TrainingArguments](https://huggingface.co/docs/transformers/main_classes/trainer#transformers.TrainingArguments).

In [127]:
# set up training
# you might want to adapt the hyperparameters so that in runs in feasible time on your machine


training_args = TrainingArguments(
    output_dir="./models",
    save_strategy="epoch",
    evaluation_strategy="epoch",
    learning_rate=3e-5,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    gradient_accumulation_steps=32,
    num_train_epochs=5,  # max_steps will override this value
    # max_steps=1000,  # comment out if this is not wanted
    weight_decay=0.01,
    logging_dir='./logs',
    label_names=["start_positions", "end_positions"]
)

# data collator for batching
data_collator = DefaultDataCollator()

# the actual trainer which performs training and evaluation
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_rc_train,
    eval_dataset=tokenized_rc_dev,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=get_evaluate_fn(
        rc_dev,
        tokenized_rc_dev["context_span"],
        tokenized_rc_dev["offset_mapping"],
    ),
)


In [128]:
# do an initial evaluation
trainer.evaluate()

  0%|          | 0/4 [00:00<?, ?it/s]

{'eval_loss': 4.784677028656006,
 'eval_exact_match': 16.0,
 'eval_f1': 18.25,
 'eval_runtime': 22.4068,
 'eval_samples_per_second': 4.463,
 'eval_steps_per_second': 0.179}

In [None]:
the result for #model =  DistilBertForQuestionAnswering.from_pretrained(model_checkpoint)

{'eval_loss': 4.679508686065674, 'eval_exact_match': 24.0, 'eval_f1': 28.49171071176885, 'eval_runtime': 20.2195, 'eval_samples_per_second': 4.946, 'eval_steps_per_second': 0.198, 'epoch': 1.0}
{'eval_loss': 4.362856388092041, 'eval_exact_match': 19.0, 'eval_f1': 24.63933564422695, 'eval_runtime': 20.8955, 'eval_samples_per_second': 4.786, 'eval_steps_per_second': 0.191, 'epoch': 2.0}
{'eval_loss': 4.197320461273193, 'eval_exact_match': 19.0, 'eval_f1': 24.13933564422695, 'eval_runtime': 20.2452, 'eval_samples_per_second': 4.939, 'eval_steps_per_second': 0.198, 'epoch': 3.0}
{'train_runtime': 278.9048, 'train_samples_per_second': 1.076, 'train_steps_per_second': 0.011, 'train_loss': 0.6186825037002563, 'epoch': 3.0}
TrainOutput(global_step=3, training_loss=0.6186825037002563, metrics={'train_runtime': 278.9048, 'train_samples_per_second': 1.076, 'train_steps_per_second': 0.011, 'train_loss': 0.6186825037002563, 'epoch': 3.0})

the result for #model =  AutoModelForQuestionAnswering.from_pretrained(model_checkpoint)
{'eval_loss': 4.582829475402832, 'eval_exact_match': 23.0, 'eval_f1': 27.79257651263465, 'eval_runtime': 20.6068, 'eval_samples_per_second': 4.853, 'eval_steps_per_second': 0.194, 'epoch': 1.0}
{'eval_loss': 4.212737560272217, 'eval_exact_match': 21.0, 'eval_f1': 24.745396250287556, 'eval_runtime': 20.2435, 'eval_samples_per_second': 4.94, 'eval_steps_per_second': 0.198, 'epoch': 2.0}
{'eval_loss': 4.005221843719482, 'eval_exact_match': 18.0, 'eval_f1': 21.14583333333333, 'eval_runtime': 27.3378, 'eval_samples_per_second': 3.658, 'eval_steps_per_second': 0.146, 'epoch': 3.0}
{'eval_loss': 3.9222028255462646, 'eval_exact_match': 15.0, 'eval_f1': 17.233333333333334, 'eval_runtime': 19.6553, 'eval_samples_per_second': 5.088, 'eval_steps_per_second': 0.204, 'epoch': 4.0}
{'eval_loss': 3.880044460296631, 'eval_exact_match': 15.0, 'eval_f1': 17.233333333333334, 'eval_runtime': 22.1952, 'eval_samples_per_second': 4.505, 'eval_steps_per_second': 0.18, 'epoch': 5.0}
{'train_runtime': 492.1944, 'train_samples_per_second': 1.016, 'train_steps_per_second': 0.01, 'train_loss': 0.4992131233215332, 'epoch': 5.0}
TrainOutput(global_step=5, training_loss=0.4992131233215332, metrics={'train_runtime': 492.1944, 'train_samples_per_second': 1.016, 'train_steps_per_second': 0.01, 'train_loss': 0.4992131233215332, 'epoch': 5.0})

In [78]:
# perform training
trainer.train()



  0%|          | 0/5 [00:00<?, ?it/s]

  0%|          | 0/4 [00:00<?, ?it/s]

{'eval_loss': 4.582829475402832, 'eval_exact_match': 23.0, 'eval_f1': 27.79257651263465, 'eval_runtime': 20.6068, 'eval_samples_per_second': 4.853, 'eval_steps_per_second': 0.194, 'epoch': 1.0}


  0%|          | 0/4 [00:00<?, ?it/s]

{'eval_loss': 4.212737560272217, 'eval_exact_match': 21.0, 'eval_f1': 24.745396250287556, 'eval_runtime': 20.2435, 'eval_samples_per_second': 4.94, 'eval_steps_per_second': 0.198, 'epoch': 2.0}


  0%|          | 0/4 [00:00<?, ?it/s]

{'eval_loss': 4.005221843719482, 'eval_exact_match': 18.0, 'eval_f1': 21.14583333333333, 'eval_runtime': 27.3378, 'eval_samples_per_second': 3.658, 'eval_steps_per_second': 0.146, 'epoch': 3.0}


  0%|          | 0/4 [00:00<?, ?it/s]

{'eval_loss': 3.9222028255462646, 'eval_exact_match': 15.0, 'eval_f1': 17.233333333333334, 'eval_runtime': 19.6553, 'eval_samples_per_second': 5.088, 'eval_steps_per_second': 0.204, 'epoch': 4.0}


  0%|          | 0/4 [00:00<?, ?it/s]

{'eval_loss': 3.880044460296631, 'eval_exact_match': 15.0, 'eval_f1': 17.233333333333334, 'eval_runtime': 22.1952, 'eval_samples_per_second': 4.505, 'eval_steps_per_second': 0.18, 'epoch': 5.0}
{'train_runtime': 492.1944, 'train_samples_per_second': 1.016, 'train_steps_per_second': 0.01, 'train_loss': 0.4992131233215332, 'epoch': 5.0}


TrainOutput(global_step=5, training_loss=0.4992131233215332, metrics={'train_runtime': 492.1944, 'train_samples_per_second': 1.016, 'train_steps_per_second': 0.01, 'train_loss': 0.4992131233215332, 'epoch': 5.0})

#   The Trainer class automatically saves the model after training. The model is saved in the directory specified by the output_dir argument in TrainingArguments. You can load the trained model using the from_pretrained method of the model class, like this:
model = ModelClass.from_pretrained("./models")
Replace ModelClass with the actual class of your model. For example, if you're using a BERT model, you would use BertForQuestionAnswering.from_pretrained.

Once you've loaded the trained model, you can use it to make predictions on your new test dataset. You can do this by calling the predict method of the Trainer class, like this:


predictions = trainer.predict(test_dataset)

In [3]:
# load dataset
# load dataset
# import packages
from datasets import load_dataset, load_metric, Dataset
from transformers import (
    AutoTokenizer,
    AutoModel,
    DefaultDataCollator,
    AutoModelForQuestionAnswering,
    DistilBertForQuestionAnswering,
    TrainingArguments,
    Trainer,
    EvalPrediction,
)
    
import json
import os
from datasets import Dataset
from accelerate import Accelerator

# The datasets library is a library for loading and preprocessing datasets for machine learning. The Dataset class is a way to handle large datasets in a way that is memory efficient.
    


def read_data(filepath: str):
    with open(filepath, 'r', encoding='utf-8') as f:
        return Dataset.from_list(list(json.load(f).values()))


filepath_1 = os.path.abspath('rc_traindev/squad_train.json')
filepath_2 = os.path.abspath('rc_traindev/squad_dev.json')
filepath_3 = os.path.abspath('rc_traindev/rc_train.json')
filepath_4 = os.path.abspath('rc_traindev/rc_dev.json')

filepath_5 = os.path.abspath('rc_traindev/rc_test_1.json')

squad_train = read_data(filepath_1)
squad_dev = read_data(filepath_2)
rc_train = read_data(filepath_3)
rc_dev = read_data(filepath_4)
rc_test = read_data(filepath_5)

print(squad_train)
print(rc_train)
print(rc_test)
print(squad_train[0])
print(rc_test)
print(rc_test[0])
    
    


Dataset({
    features: ['id', 'context', 'question', 'answers'],
    num_rows: 87599
})
Dataset({
    features: ['context', 'question', 'answers', 'id'],
    num_rows: 100
})
Dataset({
    features: ['context', 'question', 'id'],
    num_rows: 177
})
{'id': '5733be284776f41900661182', 'context': 'Architecturally, the school has a Catholic character. Atop the Main Building\'s gold dome is a golden statue of the Virgin Mary. Immediately in front of the Main Building and facing it, is a copper statue of Christ with arms upraised with the legend "Venite Ad Me Omnes". Next to the Main Building is the Basilica of the Sacred Heart. Immediately behind the basilica is the Grotto, a Marian place of prayer and reflection. It is a replica of the grotto at Lourdes, France where the Virgin Mary reputedly appeared to Saint Bernadette Soubirous in 1858. At the end of the main drive (and in a direct line that connects through 3 statues and the Gold Dome), is a simple, modern stone statue of Mary.', 'q

In [4]:
max_length = 384
stride = 128
def preprocess_validation_examples(examples):
    questions = [q.strip() for q in examples["question"]]
    inputs = tokenizer(
        questions,
        examples["context"],
        max_length=max_length,
        truncation="only_second",
        stride=stride,
        return_overflowing_tokens=True,
        return_offsets_mapping=True,
        padding="max_length",
    )

    sample_map = inputs.pop("overflow_to_sample_mapping")
    example_ids = []

    for i in range(len(inputs["input_ids"])):
        sample_idx = sample_map[i]
        example_ids.append(examples["id"][sample_idx])

        sequence_ids = inputs.sequence_ids(i)
        offset = inputs["offset_mapping"][i]
        inputs["offset_mapping"][i] = [
            o if sequence_ids[k] == 1 else None for k, o in enumerate(offset)
        ]

    inputs["example_id"] = example_ids
    return inputs
 

In [6]:
def preprocess_validation_dataset(dataset: Dataset, tokenizer, max_length: int = 400, stride: int = 128):
    

    def preprocess_function(examples, tokenizer, max_length):
        questions = [q.strip() for q in examples["question"]]
        inputs = tokenizer(
            questions,
            examples["context"],
            max_length=max_length,
            truncation="only_second",
            stride=stride,
            return_overflowing_tokens=True,
            return_offsets_mapping=True,
            padding="max_length",
        )

        # Create a map from overflowing tokens to the original sample they came from
        sample_map = inputs.pop("overflow_to_sample_mapping")
        inputs["example_id"] = [examples["id"][i] for i in sample_map]

        # Modify the offset mapping to only keep offsets for the context
        for i in range(len(inputs["input_ids"])):
            sequence_ids = inputs.sequence_ids(i)
            offset = inputs["offset_mapping"][i]
            inputs["offset_mapping"][i] = [
                o if sequence_ids[k] == 1 else None for k, o in enumerate(offset)
            ]

        return inputs

    return dataset.map(
        preprocess_function,
        fn_kwargs=dict(tokenizer=tokenizer, max_length=max_length),
        batched=True,
        remove_columns=dataset.column_names,
    )


NameError: name 'Dataset' is not defined

In [7]:
 
model_checkpoint = "huggingface-course/bert-finetuned-squad"

tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

validation_dataset = rc_test.map(
    preprocess_validation_examples,
    batched=True,
    remove_columns=rc_test.column_names,
)
len(rc_test), len(validation_dataset)

Map:   0%|          | 0/177 [00:00<?, ? examples/s]

(177, 551)

In [8]:
#tokenized_rc_dev = preprocess_dataset(rc_dev, tokenizer)


sample = validation_dataset[0]  # Get the first sample from the validation set

# Print the result
print(sample)


{'input_ids': [101, 158, 119, 156, 119, 4792, 1107, 2965, 1104, 2000, 10016, 102, 22143, 1225, 1136, 2519, 2000, 1235, 1103, 1248, 1544, 1104, 1103, 3116, 1432, 119, 1220, 1834, 5006, 1106, 2549, 1763, 2746, 1116, 9926, 119, 138, 8964, 5279, 5223, 1107, 1141, 2447, 119, 1109, 4442, 1110, 17314, 1118, 9150, 3754, 1149, 1104, 1122, 1120, 1344, 2420, 1107, 1103, 1168, 2447, 119, 1247, 1132, 3002, 1104, 1103, 1922, 1606, 18437, 1107, 1594, 1222, 1103, 24640, 1112, 1346, 1112, 1103, 5435, 1432, 119, 1109, 24640, 1173, 1215, 18437, 1106, 2035, 2882, 1980, 119, 4503, 18437, 1127, 1145, 1215, 1106, 4286, 19612, 119, 24249, 1127, 1215, 1111, 3944, 1196, 2256, 1180, 4137, 1293, 1152, 1589, 119, 1109, 2749, 1338, 1164, 1107, 18030, 1559, 119, 7026, 8102, 113, 19290, 22639, 1559, 24458, 114, 1758, 1210, 3501, 3892, 1104, 4018, 117, 1208, 2752, 1106, 1112, 8102, 1116, 12077, 1104, 12153, 131, 122, 119, 1760, 4231, 1107, 4018, 1209, 3118, 1107, 4018, 4895, 5376, 1852, 1118, 170, 2049, 119, 123, 119,

In [31]:
import torch

small_eval_set = rc_test
trained_checkpoint = "huggingface-course/bert-finetuned-squad"

tokenizer = AutoTokenizer.from_pretrained(trained_checkpoint)

eval_set =  small_eval_set.map(
    preprocess_validation_examples,
    batched=True,
    remove_columns=rc_test.column_names,
)


Map:   0%|          | 0/177 [00:00<?, ? examples/s]

In [32]:

model_checkpoint = "bert-base-cased"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

eval_set_for_model = eval_set.remove_columns(["example_id", "offset_mapping"])
eval_set_for_model.set_format("torch")

device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

batch = {k: eval_set_for_model[k].to(device) for k in eval_set_for_model.column_names}

trained_model = AutoModelForQuestionAnswering.from_pretrained(trained_checkpoint).to(device)

with torch.no_grad():
    outputs = trained_model(**batch)

Downloading (…)okenizer_config.json:   0%|          | 0.00/29.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/213k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/436k [00:00<?, ?B/s]

In [33]:
start_logits = outputs.start_logits.cpu().numpy()
end_logits = outputs.end_logits.cpu().numpy()

import collections

example_to_features = collections.defaultdict(list)
for idx, feature in enumerate(validation_dataset):
    example_to_features[feature["example_id"]].append(idx)


In [7]:
import numpy as np

n_best = 20
max_answer_length = 30
predicted = {}

for example in eval_set_for_model:
    example_id = example["id"]
    context = example["context"]
    answers = []

    for feature_index in example_to_features[example_id]:
        start_logit = start_logits[feature_index]
        end_logit = end_logits[feature_index]
        offsets = eval_set["offset_mapping"][feature_index]

        start_indexes = np.argsort(start_logit)[-1 : -n_best - 1 : -1].tolist()
        end_indexes = np.argsort(end_logit)[-1 : -n_best - 1 : -1].tolist()
        for start_index in start_indexes:
            for end_index in end_indexes:
                # Skip answers that are not fully in the context
                if offsets[start_index] is None or offsets[end_index] is None:
                    continue
                # Skip answers with a length that is either < 0 or > max_answer_length.
                if (
                    end_index < start_index
                    or end_index - start_index + 1 > max_answer_length
                ):
                    continue

                answers.append(
                    {
                        "text": context[offsets[start_index][0] : offsets[end_index][1]],
                        "logit_score": start_logit[start_index] + end_logit[end_index],
                    }
                )

    best_answer = max(answers, key=lambda x: x["logit_score"])
    predicted[example_id] = { "answers": {"text": [best_answer["text"]]}, "id": example_id}

NameError: name 'eval_set_for_model' is not defined

In [38]:
 with open('predicted_answers_3.json', 'w') as f:
        json.dump(predicted, f)

In [None]:
###### compare the result with baseline without any fine-tnuning. for rc_test data samples, improve 6% when fine-tuning on rc_train and rc_dev, but not trying on squad_traiin, squad_dev, because of computaional expense on M1 GPU ######

from transformers import pipeline

# Replace this with your own checkpoint
model_checkpoint = "deepset/roberta-large-squad2"
question_answerer = pipeline("question-answering", model=model_checkpoint)

test_data = rc_test  # Your test dataset

predictions = {}

# Make predictions for each context-question pair in the test data
for i, sample in enumerate(test_data):
    context = sample["context"]
    question = sample["question"]
    answer = question_answerer(question=question, context=context)
    
    prediction = {
        "answers": {"text": [answer["answer"]]},
        "id": str(i)
    }
    
    predictions[str(i)] = prediction

with open('predicted_answers_robertalarge_test.json', 'w') as f:
        json.dump(predictions, f)