First we need to set up our environment.

This Jupyter Notebook needs the following packages installed:
- [PyTorch](https://pytorch.org/get-started/locally/)
- [transformers](https://huggingface.co/docs/transformers/installation)
- [datasets](https://huggingface.co/docs/datasets/installation)
- [opendelta](https://opendelta.readthedocs.io/en/latest/notes/installation.html)

In [None]:
# prepare environment in Colab
!pip install torch
!pip install transformers==4.28.0
!pip install datasets
!pip install evaluate
!pip install torchtext==0.6

In [2]:
import torch
import torch.nn as nn
import torch.optim as optim
from torchtext.datasets import Multi30k
from torchtext.data import Field, BucketIterator, TabularDataset
from torchtext.data.metrics import bleu_score

import numpy as np
import pandas as pd
import random
import matplotlib.pyplot as plt

# set cache directory if desired
# if you do not set a cache directory then default values are used (usually '~/.cache')
import os
CACHE_DIR=os.path.abspath(os.path.expanduser('path/to/cache')) # I donot understand this line
%set_env TRANSFORMERS_CACHE $CACHE_DIR
%set_env HF_MODULES_CACHE $CACHE_DIR
%set_env HF_DATASETS_CACHE $CACHE_DIR

env: TRANSFORMERS_CACHE=/Users/rongwang/Desktop/DL project/path/to/cache
env: HF_MODULES_CACHE=/Users/rongwang/Desktop/DL project/path/to/cache
env: HF_DATASETS_CACHE=/Users/rongwang/Desktop/DL project/path/to/cache


In [13]:
import torch
torch.cuda.empty_cache()
torch.backends.mps.is_available()
device = torch.device("mps" if torch.backends.mps.is_available() else "cpu")
## and then move your model and data to the device before you train or eval. Have fun folks!

In [14]:
# import packages
from datasets import load_dataset, load_metric, Dataset
from transformers import (
    AutoTokenizer,
    AutoModel,
    DefaultDataCollator,
    AutoModelForQuestionAnswering,
    DistilBertForQuestionAnswering,
    TrainingArguments,
    Trainer,
    EvalPrediction,
)

In [None]:
# load dataset
# load dataset
import pandas as pd
import numpy as np
from google.colab import drive

drive.mount("/content/gdrive")


import json
import os
from datasets import Dataset

# The datasets library is a library for loading and preprocessing datasets for machine learning. The Dataset class is a way to handle large datasets in a way that is memory efficient.



def read_data(filepath: str):
    with open(filepath, 'r', encoding='utf-8') as f:
        return Dataset.from_list(list(json.load(f).values()))


filepath_1 = os.path.abspath ('/content/gdrive/My Drive/data/DL_project/squad_train.json')
filepath_2 = os.path.abspath ('/content/gdrive/My Drive/data/DL_project/squad_dev.json')
filepath_3 = os.path.abspath ('/content/gdrive/My Drive/data/DL_project/rc_train.json')
filepath_4 = os.path.abspath ('/content/gdrive/My Drive/data/DL_project/rc_dev.json')

filepath_5 = os.path.abspath ('/content/gdrive/My Drive/data/DL_project/rc_test_1.json')

filepath_6 = os.path.abspath ('/content/gdrive/My Drive/data/DL_project/rc_test_1.json')

squad_train = read_data(filepath_1)
squad_dev = read_data(filepath_2)
rc_train = read_data(filepath_3)
rc_dev = read_data(filepath_4)
rc_test = read_data(filepath_5)

print(squad_train)
print(rc_train)
print(rc_dev)
print(rc_test)
print(rc_test[0])
    

In [15]:
# load dataset
# load dataset

import json
import os
from datasets import Dataset


# The datasets library is a library for loading and preprocessing datasets for machine learning. The Dataset class is a way to handle large datasets in a way that is memory efficient.
    

def read_data(filepath: str):
    with open(filepath, 'r', encoding='utf-8') as f:
        return Dataset.from_list(list(json.load(f).values()))


filepath_1 = os.path.abspath('rc_traindev/squad_train.json')
filepath_2 = os.path.abspath('rc_traindev/squad_dev.json')
filepath_3 = os.path.abspath('rc_traindev/rc_train.json')
filepath_4 = os.path.abspath('rc_traindev/rc_dev.json')

filepath_5 = os.path.abspath('rc_traindev/rc_test_1.json')
filepath_6 = os.path.abspath('rc_traindev/rc_test_2.json')

squad_train = read_data(filepath_1)
squad_dev = read_data(filepath_2)
rc_train = read_data(filepath_3)
rc_dev = read_data(filepath_4)
rc_test = read_data(filepath_5)
rc_final = read_data(filepath_6)

print(squad_train)
print(rc_train)
print(rc_dev)
print(rc_test)
print(rc_final)


    


Dataset({
    features: ['id', 'context', 'question', 'answers'],
    num_rows: 87599
})
Dataset({
    features: ['context', 'question', 'answers', 'id'],
    num_rows: 100
})
Dataset({
    features: ['context', 'question', 'answers', 'id'],
    num_rows: 100
})
Dataset({
    features: ['context', 'question', 'id'],
    num_rows: 177
})
Dataset({
    features: ['context', 'question', 'id'],
    num_rows: 1126
})


Next we will preprocess the dataset (training and evaluation data).

In [16]:

def preprocess_dataset(dataset: Dataset, tokenizer, max_length: int = None):
    
    # dataset preprocessing function which can be used with datasets.map
    # modified from https://huggingface.co/docs/transformers/tasks/question_answering
    
    # this function takes examples and extracts span start and span end on the token level as labels,
    # contexts are truncated to fit into the model and samples are padded to the max_length,
    # and information needed for evaluation is extracted
    
    def preprocess_function(examples, tokenizer, max_length):
        questions = [q.strip() for q in examples["question"]]
        inputs = tokenizer(
            questions,
            examples["context"],
            max_length=max_length,
            truncation="only_second",
            return_offsets_mapping=True,
            padding="max_length",
        )

        offset_mapping = inputs["offset_mapping"]
        answers = examples["answers"]
        start_positions = []
        end_positions = []
        context_spans = []

        for i, offset in enumerate(offset_mapping):
            answer = answers[i]
            start_char = answer["answer_start"][0]
            end_char = answer["answer_start"][0] + len(answer["text"][0])
            sequence_ids = inputs.sequence_ids(i)

            # Find the start and end of the context
            idx = 0
            while sequence_ids[idx] != 1:
                idx += 1
            context_start = idx
            while sequence_ids[idx] == 1:
                idx += 1
            context_end = idx - 1
            context_spans.append((context_start, context_end))

            # If the answer is not fully inside the context, label it (0, 0)
            if offset[context_start][0] > end_char or offset[context_end][1] < start_char:
                start_positions.append(0)
                end_positions.append(0)
            else:
                # Otherwise it's the start and end token positions
                idx = context_start
                while idx <= context_end and offset[idx][0] <= start_char:
                    idx += 1
                start_positions.append(idx - 1)

                idx = context_end
                while idx >= context_start and offset[idx][1] >= end_char:
                    idx -= 1
                end_positions.append(idx + 1)

        inputs["start_positions"] = start_positions
        inputs["end_positions"] = end_positions
        inputs["context_span"] = context_spans
        return inputs

    return dataset.map(
        preprocess_function,
        fn_kwargs=dict(tokenizer=tokenizer, max_length=max_length),
        batched=True,
        remove_columns=dataset.column_names,
    )

In [18]:
# preprocess data

max_length = 500 ## max length of context + question in tokens, maybe it was too expensive for the GPU 

model_checkpoint ="deepset/roberta-large-squad2"

tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
model = AutoModelForQuestionAnswering.from_pretrained(model_checkpoint)

tokenized_squad_train = preprocess_dataset(squad_train, tokenizer)

tokenized_squad_dev = preprocess_dataset(squad_dev, tokenizer)

tokenized_rc_train = preprocess_dataset(rc_train, tokenizer)
tokenized_rc_dev = preprocess_dataset(rc_dev, tokenizer)


Map:   0%|          | 0/87599 [00:00<?, ? examples/s]

Map:   0%|          | 0/10570 [00:00<?, ? examples/s]

Map:   0%|          | 0/100 [00:00<?, ? examples/s]

Map:   0%|          | 0/100 [00:00<?, ? examples/s]

In [20]:
def get_evaluate_fn(dataset: Dataset, context_spans, offset_mappings):
    # set up metric and labels
    squad_metric = load_metric("squad")
    
    #Handle missing keys: If some samples might not have an 'answers' key, you can use the dict.get() method, which returns None if the key is not present in the dictionary.
    # Here's how you can modify your code:
    references = [{"answers": sample.get("answers"), "id": sample.get("id")} for sample in dataset]


    # this function extracts the span answers and computes the f1 score
    # note that we use an approximation for extracting the best span, i.e. we do not consider all possibilities here
    def evaluate(predictions: EvalPrediction):
        preds_start_probs = predictions.predictions[0]
        preds_end_probs = predictions.predictions[1]
        preds_start_idx = preds_start_probs.argmax(axis=1)
        preds_end_idx = preds_end_probs.argmax(axis=1)
        predictions = [
            {
                "prediction_text": context[offset_mapping[pred_start_idx][0] : offset_mapping[pred_end_idx][1] + 1]
                if context_start_idx <= pred_start_idx <= context_end_idx
                and context_start_idx <= pred_end_idx <= context_end_idx
                else "",
                "id": sample_id,
            }
            for sample_id, context, (
                context_start_idx,
                context_end_idx,
            ), offset_mapping, pred_start_idx, pred_end_idx in zip(
                dataset["id"], dataset["context"], context_spans, offset_mappings, preds_start_idx, preds_end_idx
            )
        ]
        return squad_metric.compute(predictions=predictions, references=references)

    return evaluate

In [21]:

import torch
torch.cuda.empty_cache()
torch.backends.mps.is_available()
device = torch.device("mps" if torch.backends.mps.is_available() else "cpu")
## and then move your model and data to the device before you train or eval. Have fun folks!

#installing pytorch to run on mac M1, using "mps" instead of 'cuda'.



In [22]:
# set up training arguments and pass to trainer, For few-shot learning, I only trained on rc_train and rc_dev, altogether 200 samples, use 5 epochs 

training_args = TrainingArguments(
    output_dir="./results",    # if running on Colab, set this to "/content/results"
    save_strategy="epoch",
    evaluation_strategy="epoch",
    learning_rate=3e-5,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    gradient_accumulation_steps=32,
    num_train_epochs=5,  # max_steps will override this value
    # max_steps=1000,  # comment out if this is not wanted
    weight_decay=0.01,
    #logging_dir='./logs',
    label_names=["start_positions", "end_positions"]
)

# data collator for batching
data_collator = DefaultDataCollator()

# the actual trainer which performs training and evaluation
trainer = Trainer(
    model= model,
    args=training_args,
    train_dataset=tokenized_rc_train,
    eval_dataset=tokenized_rc_dev,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=get_evaluate_fn(
        rc_dev,
        tokenized_rc_dev["context_span"],
        tokenized_rc_dev["offset_mapping"],
    ),
)

  squad_metric = load_metric("squad")


In [23]:
# do an initial evaluation
trainer.evaluate()

  0%|          | 0/4 [00:00<?, ?it/s]

{'eval_loss': 4.1210832595825195,
 'eval_exact_match': 30.0,
 'eval_f1': 34.61904761904761,
 'eval_runtime': 123.1073,
 'eval_samples_per_second': 0.812,
 'eval_steps_per_second': 0.032}

In [None]:
# perform training
trainer.train()
trainer.save_model()

In [None]:
def preprocess_validation_dataset(dataset: Dataset, tokenizer, max_length: int = 500, stride: int =128 ):
    

    def preprocess_function(examples, tokenizer, max_length):
        questions = [q.strip() for q in examples["question"]]
        inputs = tokenizer(
            questions,
            examples["context"],
            max_length=max_length,
            truncation="only_second",
            stride=stride,
            return_overflowing_tokens=True,
            return_offsets_mapping=True,
            padding="max_length",
        )

        # Create a map from overflowing tokens to the original sample they came from
        sample_map = inputs.pop("overflow_to_sample_mapping")
        inputs["example_id"] = [examples["id"][i] for i in sample_map]

        # Modify the offset mapping to only keep offsets for the context
        for i in range(len(inputs["input_ids"])):
            sequence_ids = inputs.sequence_ids(i)
            offset = inputs["offset_mapping"][i]
            inputs["offset_mapping"][i] = [
                o if sequence_ids[k] == 1 else None for k, o in enumerate(offset)
            ]

        return inputs

    return dataset.map(
        preprocess_function,
        fn_kwargs=dict(tokenizer=tokenizer, max_length=max_length),
        batched=True,
        remove_columns=dataset.column_names,
    )

In [None]:
'''the following code is for the prediction part for test data, without any answers labels.
compared with the prediction part for train and validation data, we don't need to compute the f1 score.
but it is more complicated, it chohoses the n best and set the max length value to improve the quality of answer'''

"""the code is modified based on the code from the following link, https://huggingface.co/learn/nlp-course/chapter7/7?fw=pt#postprocessing
turn discrete code into a function to simply the running."""

import collections


def predict(model_checkpoint, test_data):

    trained_model = AutoModelForQuestionAnswering.from_pretrained(model_checkpoint)

    eval_set_for_model = test_data.remove_columns(["example_id", "offset_mapping"])
    eval_set_for_model.set_format("torch")

    batch = {k: eval_set_for_model[k] for k in eval_set_for_model.column_names}

    with torch.no_grad():
        outputs = trained_model(**batch)

    start_logits = outputs.start_logits.cpu().numpy()
    end_logits = outputs.end_logits.cpu().numpy()

    example_to_features = collections.defaultdict(list)
    for idx, feature in enumerate(test_data):
        example_to_features[feature["example_id"]].append(idx)

    n_best = 20
    max_answer_length = 30
    predicted = {}

    for example in rc_test:
        example_id = example["id"]
        context = example["context"]
        answers = []

        for feature_index in example_to_features[example_id]:
            start_logit = start_logits[feature_index]
            end_logit = end_logits[feature_index]
            offsets = test_data["offset_mapping"][feature_index]

            start_indexes = np.argsort(start_logit)[-1: -n_best - 1: -1].tolist()
            end_indexes = np.argsort(end_logit)[-1: -n_best - 1: -1].tolist()
            for start_index in start_indexes:
                for end_index in end_indexes:
                    # Skip answers that are not fully in the context
                    if offsets[start_index] is None or offsets[end_index] is None:
                        continue
                    # Skip answers with a length that is either < 0 or > max_answer_length.
                    if (
                            end_index < start_index
                            or end_index - start_index + 1 > max_answer_length
                    ):
                        continue

                    answers.append(
                        {
                            "text": context[offsets[start_index][0]: offsets[end_index][1]],
                            "logit_score": start_logit[start_index] + end_logit[end_index],
                        }
                    )

        best_answer = max(answers, key=lambda x: x["logit_score"])
        predicted[example_id] = {"answers": {"text": [best_answer["text"]]}, "id": example_id}

    return predicted


In [None]:

device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")


#model_checkpoint ="deepset/roberta-large-squad2"
#tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
#model =  DistilBertForQuestionAnswering.from_pretrained(model_checkpoint)

model_checkpoint = "./results"

predicted = []

#device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
for test_data in (rc_test, rc_final):
       test_data = preprocess_validation_dataset(test_data, tokenizer)
       predicted.append(predict(model_checkpoint, test_data))

with open('predicted_answers_finetuing_1.json', 'w') as f:
        json.dump(predicted[0], f)
with open('predicted_answers_finetuing_2.json', 'w') as f:
        json.dump(predicted[1], f)
        