In [16]:
import os
import random
import ast
import collections
import numpy as np
import pandas as pd
import spacy
from fastcoref import spacy_component
from tqdm import tqdm
import torch
import datasets
from datasets import load_dataset, load_metric
import transformers
from transformers import default_data_collator
from transformers import AutoTokenizer, AutoModelForQuestionAnswering
from transformers import TrainingArguments, Trainer
from warnings import filterwarnings

filterwarnings("ignore")

In [17]:
# Set seeds for reproducible results
# The same seed value is applied to Python's `random`, NumPy, PyTorch and the transformers helper.
random.seed(420)
np.random.seed(420)
torch.manual_seed(420)
transformers.set_seed(420)

# Both constants are used as default arguments of the feature-preparation functions defined later.
MAX_LENGTH = 512 # Maximum number of tokens in a sequence (tokenizer parameter)
STRIDE = 128 # Stride when splitting the context into smaller parts (tokenizer parameter)

# Coreference Resolution with fastcoref model LingMessCoref

Coreference resolution helps link expressions that refer to the same entity. Doing this can help a model perform well on information retrieval tasks. Let's create a function that performs coreference resolution and replaces all the expressions with the entities that they refer to.

In [3]:
# spaCy pipeline with the fastcoref component. It is built lazily on the first call to
# replace_anaphors_and_cataphors so that (a) the function is always defined, regardless of which
# data directories already exist, and (b) the coreference model is only loaded when it is needed.
_coref_nlp = None

def replace_anaphors_and_cataphors(text):
    """Return `text` with coreference resolution applied.

    The text is run through a spaCy pipeline (`en_core_web_sm`) extended with the
    fastcoref `LingMessCoref` component, and the component's resolved text is returned.

    Args:
        text: The passage to resolve.

    Returns:
        The value of `doc._.resolved_text` produced by fastcoref for `text`.
    """
    global _coref_nlp
    if _coref_nlp is None:
        # Load the spaCy English language model
        _coref_nlp = spacy.load("en_core_web_sm")
        # Add the fastcoref component to the spaCy pipeline
        _coref_nlp.add_pipe("fastcoref", config={'model_architecture': 'LingMessCoref', 'model_path': 'biu-nlp/lingmess-coref', 'device': "cpu"})

    # Process the input text with spaCy
    doc = _coref_nlp(text, component_cfg={"fastcoref": {'resolve_text': True}})
    return doc._.resolved_text

# Data Preprocessing

We'll use the SQuAD dataset to train our model since it is an annotated dataset. Information retrieval is the task of retrieving information from a document. Using SQuAD, we'll perform extractive question answering, which is a form of information retrieval since we have to identify the span of text in the passage that answers a question related to the passage.

SQuAD v1.1 has a lot of rows. Even with a GPU on the free tier of Colab, it is difficult to resolve the text using coreference resolution and train on all the rows. Let's take every 7th row for both the train and validation datasets.

Coreference resolution is performed on the SQuAD subset and saved so that there is no need to resolve the text every time this script is run. The saved data can be loaded straight away. It is better to save the unresolved SQuAD subset as well and load it directly. Due to version errors and library issues, the dataset is saved as a CSV file using pandas instead of using datasets.save_to_disk(): errors were encountered while using datasets.load_from_disk() and the dataset wouldn't load.

In [18]:
do_coref = False


def _squad_csv_paths(resolved):
    """Return (directory, train_csv, val_csv) for the resolved or unresolved SQuAD subset."""
    kind = "resolved" if resolved else "unresolved"
    directory = f"./data/squad_{kind}"
    return directory, f"{directory}/train_{kind}.csv", f"{directory}/val_{kind}.csv"


def _load_saved_squad(resolved):
    """Load a previously saved subset as a DatasetDict, or return None if its CSV files are missing."""
    _, train_csv, val_csv = _squad_csv_paths(resolved)
    if not (os.path.isfile(train_csv) and os.path.isfile(val_csv)):
        return None

    splits = {}
    for split, path in (("train", train_csv), ("validation", val_csv)):
        frame = pd.read_csv(path)
        # CSV stores the answers dict as its string repr; convert it back to a dict for every row
        frame["answers"] = frame["answers"].apply(ast.literal_eval)
        splits[split] = datasets.Dataset.from_pandas(frame)
    return datasets.DatasetDict(splits)


def _save_squad(dataset_dict, resolved):
    """Write the train/validation splits of `dataset_dict` to CSV, creating the target directory."""
    directory, train_csv, val_csv = _squad_csv_paths(resolved)
    os.makedirs(directory, exist_ok=True)  # to_csv does not create missing directories
    pd.DataFrame(dataset_dict["train"]).to_csv(train_csv, index=False)
    pd.DataFrame(dataset_dict["validation"]).to_csv(val_csv, index=False)


def _resolve_contexts(dataset):
    """Return a new Dataset whose `context` column has been passed through coreference resolution.

    Rows returned by indexing a Dataset are plain dicts, so assigning into them does not change the
    dataset; the column is therefore rebuilt through a DataFrame instead.
    """
    frame = pd.DataFrame(dataset)
    frame["context"] = [replace_anaphors_and_cataphors(context) for context in tqdm(frame["context"])]
    return datasets.Dataset.from_pandas(frame)


# if the requested variant is already preprocessed and saved, load it
squad = _load_saved_squad(resolved=do_coref)

if squad is None:
    # start from the saved unresolved subset when available, otherwise build and save it
    squad = _load_saved_squad(resolved=False)
    if squad is None:
        full_squad = load_dataset("squad")  # load squad dataset
        # shuffle and select every 7th row for both train and validation splits
        squad = datasets.DatasetDict(
            {
                split: full_squad[split].shuffle(seed=420).select(range(0, full_squad[split].num_rows, 7))
                for split in ("train", "validation")
            }
        )
        _save_squad(squad, resolved=False)

    if do_coref:  # if text needs to be resolved
        # NOTE(review): answers["answer_start"] is carried over unchanged; resolving the context can
        # shift character offsets, so the saved offsets may no longer point at the answer text.
        squad = datasets.DatasetDict(
            {
                "train": _resolve_contexts(squad["train"]),
                "validation": _resolve_contexts(squad["validation"]),
            }
        )
        _save_squad(squad, resolved=True)

Let's take a look at a random example

In [6]:
# random.randint includes its upper bound, so randint(0, num_rows) could pick an index one past the
# last row; randrange(num_rows) always yields a valid row index.
n = random.randrange(squad["train"].num_rows)
random_example = squad["train"][n]  # fetch the row once instead of once per printed field
print(f"Here is a random SQuAD training set example (row number = {n}): ")
print(random_example["context"])
print(random_example["question"])
print(random_example["answers"])

Here is a random SQuAD training set example (row number = 431): 
Freemasonry, as it exists in various forms all over the world, has a membership estimated by the United Grand Lodge of England at around six million worldwide. The fraternity is administratively organised into independent Grand Lodges (or sometimes Grand Orients), each of which governs its own Masonic jurisdiction, which consists of subordinate (or constituent) Lodges. The largest single jurisdiction, in terms of membership, is the United Grand Lodge of England (with a membership estimated at around a quarter million). The Grand Lodge of Scotland and Grand Lodge of Ireland (taken together) have approximately 150,000 members. In the United States total membership is just under two million.
How many members does the Grand Lodge of Scotland and the Grand Lodge of Ireland have, together?
{'answer_start': [616], 'text': ['150,000']}


# Model Training and Evaluation

Skip this part if you've already trained and evaluated the model. Jump ahead to the "System Testing" section.

## Training

Now let's load the pre-trained model and tokenizer from huggingface. RoBERTa is used instead of BERT since RoBERTa is an optimized version of BERT.

In [17]:
# Identifier of the pre-trained checkpoint that will be fine-tuned
model_checkpoint = "FacebookAI/roberta-base"
# Tokenizer belonging to that checkpoint
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
# The same checkpoint loaded through AutoModelForQuestionAnswering
model = AutoModelForQuestionAnswering.from_pretrained(model_checkpoint)

In [8]:
def prepare_train_features(sample, max_length = MAX_LENGTH, stride = STRIDE):
    """Tokenize a batch of question/context pairs and label every feature with answer token positions.

    Long contexts are split into several overlapping features. For each feature the start and end
    token indices of the first annotated answer are stored in `start_positions` / `end_positions`.
    When a feature has no answer, or the answer does not lie completely inside the feature's part of
    the context, both positions are set to the index of the CLS token.

    Args:
        sample: Batch with `question`, `context` and `answers` columns; each entry of `answers`
            provides `answer_start` and `text` lists.
        max_length: Maximum number of tokens per feature.
        stride: Number of overlapping tokens between consecutive features of one context.

    Returns:
        The tokenizer output without `overflow_to_sample_mapping` / `offset_mapping`, extended with
        `start_positions` and `end_positions`.
    """
    # some questions have leading spaces which need to be removed (the input batch is left untouched)
    questions = [q.lstrip() for q in sample["question"]]
    # Tokenize the sample with truncation and padding, but keep the overflows using a stride.
    # When a context is long, it will be chunked into several features with an overlap of `stride` tokens.
    tokenized_sample = tokenizer(
        questions,
        sample["context"],
        truncation = "only_second",  # truncate context, not the question
        max_length=max_length, # max length of the text that can go to the model
        stride=stride, # overlap between chunks when a context is split into several features
        return_overflowing_tokens=True, # return all the chunks if the context is long
        return_offsets_mapping=True, # return the mapping between the tokens and the character positions
        padding="max_length",
    )

    # Since one example might give us several features if it has a long context, we need a map from a feature to
    # its corresponding example. This key gives us just that.
    sample_mapping = tokenized_sample.pop("overflow_to_sample_mapping")
    # The offset mappings will give us a map from token to character position in the original context.
    # This will help us compute the start_positions and end_positions.
    offset_mapping = tokenized_sample.pop("offset_mapping")

    # Let's label those examples!
    tokenized_sample["start_positions"] = []
    tokenized_sample["end_positions"] = []

    for i, offsets in enumerate(offset_mapping):
        input_ids = tokenized_sample["input_ids"][i] # get the input_ids
        # Features that do not contain the answer are labelled with the CLS token index.
        cls_index = input_ids.index(tokenizer.cls_token_id)

        # Grab the sequence corresponding to that example (to know what is the context and what is the question).
        sequence_ids = tokenized_sample.sequence_ids(i)

        # One example can give several spans, this is the index of the example containing this span of text.
        sample_index = sample_mapping[i]
        answers = sample["answers"][sample_index]

        # No annotated answer: point both labels at CLS.
        if len(answers["answer_start"]) == 0:
            tokenized_sample["start_positions"].append(cls_index)
            tokenized_sample["end_positions"].append(cls_index)
            continue

        # Start/end character index of the answer in the text.
        start_char = answers["answer_start"][0]
        end_char = start_char + len(answers["text"][0])

        # Start token index of the current span in the text.
        token_start_index = 0
        while sequence_ids[token_start_index] != 1:
            token_start_index += 1

        # End token index of the current span in the text.
        token_end_index = len(input_ids) - 1
        while sequence_ids[token_end_index] != 1:
            token_end_index -= 1

        # Detect if the answer is out of the span (in which case this feature is labeled with the CLS index).
        answer_in_span = (
            offsets[token_start_index][0] <= start_char
            and offsets[token_end_index][1] >= end_char
        )
        if not answer_in_span:
            tokenized_sample["start_positions"].append(cls_index)
            tokenized_sample["end_positions"].append(cls_index)
            continue

        # Otherwise move the token_start_index and token_end_index to the two ends of the answer.
        while token_start_index < len(offsets) and offsets[token_start_index][0] <= start_char:
            token_start_index += 1
        tokenized_sample["start_positions"].append(token_start_index - 1)
        while offsets[token_end_index][1] >= end_char:
            token_end_index -= 1
        tokenized_sample["end_positions"].append(token_end_index + 1)

    return tokenized_sample

In [19]:
# Run prepare_train_features over every split in batches; the original columns are removed so that
# only what the function returns is kept.
tokenized_datasets = squad.map(prepare_train_features, batched=True,
                               remove_columns=squad["train"].column_names) # tokenize datasets

Map: 100%|██████████| 12515/12515 [00:03<00:00, 3724.86 examples/s]
Map: 100%|██████████| 1510/1510 [00:00<00:00, 3763.96 examples/s]


In [20]:
# Select the first CUDA device when CUDA is usable
if torch.cuda.is_available():
    torch.cuda.set_device(0)

# Last path component of the checkpoint id, used to name the output directory
model_name = model_checkpoint.split("/")[-1]
# Per-device batch size shared by training and evaluation
batch_size = 4

training_args = TrainingArguments(
    output_dir=f"{model_name}-finetuned-squad-coref-{do_coref}",  # one directory per do_coref setting
    # optimisation
    learning_rate=2e-5,
    weight_decay=0.01,
    num_train_epochs=3,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    gradient_accumulation_steps=4,
    # evaluation / reporting
    evaluation_strategy="epoch",  # evaluate after each epoch
    disable_tqdm=False,  # keep progress bars
    push_to_hub=False,  # keep the model local
)

In [21]:
# define data collator
# Data collator converts the dataset into a batch of tensors
# Use the default collator from transformers; it is handed to the Trainer below.
data_collator = default_data_collator

In [22]:
# Assemble the Trainer from the model, arguments, tokenized splits, collator and tokenizer
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    tokenizer=tokenizer,
)

# Show which device the Trainer selected
print(trainer.args.device)

mps


In [None]:
# Fine-tune the model with the Trainer configured above.
trainer.train() # train using trainer

In [None]:
# Save under a directory name that reflects whether resolved or unresolved text was used
trainer.save_model("./roberta-squad-resolved" if do_coref else "./roberta-squad-unresolved")

Please delete any other directory created by the trainer since that will contain models trained in each epoch.

# Validation

The validation dataset needs some processing since we need to map the output logits to the start and end positions in text and further to the predicted answer text.

In [19]:
# Pick the saved model directory matching the current do_coref setting, then reload the
# tokenizer and model from it and wrap them in a fresh Trainer for prediction.
finetuned_dir = "./roberta-squad-resolved" if do_coref else "./roberta-squad-unresolved"
tokenizer = AutoTokenizer.from_pretrained(finetuned_dir)
model = AutoModelForQuestionAnswering.from_pretrained(finetuned_dir)
trainer = Trainer(model=model, tokenizer=tokenizer)

In [20]:
def prepare_validation_features(examples, max_length = MAX_LENGTH, stride = STRIDE):
    """Tokenize a validation batch and keep what is needed to map logits back to text.

    Every returned feature carries the `example_id` of the row it was cut from, and its
    `offset_mapping` has `None` at every position that is not a context token.

    Args:
        examples: Batch with `id`, `question` and `context` columns.
        max_length: Maximum number of tokens per feature.
        stride: Number of overlapping tokens between consecutive features of one context.

    Returns:
        The tokenizer output extended with `example_id` and the masked `offset_mapping`.
    """
    # Strip leading whitespace from the questions (written back into the batch)
    examples["question"] = [question.lstrip() for question in examples["question"]]

    # Same tokenizer settings as for the training features
    encoded = tokenizer(
        examples["question"],
        examples["context"],
        truncation="only_second",        # only the context may be truncated
        max_length=max_length,
        stride=stride,                   # overlap between consecutive chunks
        return_overflowing_tokens=True,  # keep every chunk of a long context
        return_offsets_mapping=True,     # token -> character span
        padding="max_length",
    )

    # feature index -> index of the example (within this batch) it came from
    feature_to_example = encoded.pop("overflow_to_sample_mapping")

    # sequence id that marks context tokens (the context is the second sequence)
    context_sequence_id = 1
    encoded["example_id"] = []

    feature_count = len(encoded["input_ids"])
    for feature_idx in range(feature_count):
        seq_ids = encoded.sequence_ids(feature_idx)

        # remember which example produced this feature
        source_example = feature_to_example[feature_idx]
        encoded["example_id"].append(examples["id"][source_example])

        # blank out the offsets of every token outside the context
        masked_offsets = []
        for position, span in enumerate(encoded["offset_mapping"][feature_idx]):
            if seq_ids[position] == context_sequence_id:
                masked_offsets.append(span)
            else:
                masked_offsets.append(None)
        encoded["offset_mapping"][feature_idx] = masked_offsets

    return encoded

In [21]:
# Build the validation features in batches; the original columns are dropped, leaving only the
# columns returned by prepare_validation_features.
validation_features = squad["validation"].map(
    prepare_validation_features,
    batched=True,
    remove_columns=squad["validation"].column_names
)

Map: 100%|██████████| 1510/1510 [00:00<00:00, 2892.37 examples/s]


In [22]:
# output.predictions is later unpacked into (start logits, end logits) by postprocess_qa_predictions
output = trainer.predict(validation_features) # get predictions
# set back hidden columns
validation_features.set_format(type=validation_features.format["type"], 
                               columns=list(validation_features.features.keys()))

You're using a RobertaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


100%|██████████| 190/190 [01:32<00:00,  2.05it/s]


In [23]:
# map examples with corresponding features
examples = squad["validation"]
features = validation_features

example_id_to_index = {k: i for i, k in enumerate(examples["id"])}
features_per_example = collections.defaultdict(list)
for i, feature in enumerate(features):
    features_per_example[example_id_to_index[feature["example_id"]]].append(i)

n_best_size will be used to get all possible answers from different start and end positions since the best answer can be a wrong prediction (a span present in the question or outside the context). max_answer_length will eliminate extremely long answers.

In [24]:
def postprocess_qa_predictions(examples, features, raw_predictions, n_best_size = 20, max_answer_length = 30):
    """Turn start/end logits into one predicted answer string per example.

    For every feature of an example, the `n_best_size` highest start and end logits are combined;
    combinations that fall outside the context, run backwards, or span more than
    `max_answer_length` tokens are discarded. The highest scoring remaining span (score = start
    logit + end logit) across all features of the example is cut out of the example's context.

    Args:
        examples: Rows with `id` and `context`.
        features: Features with `example_id` and an `offset_mapping` that is `None` outside the context.
        raw_predictions: Pair `(all_start_logits, all_end_logits)`, indexed by feature.
        n_best_size: How many top start and top end positions to combine per feature.
        max_answer_length: Maximum answer length in tokens.

    Returns:
        OrderedDict mapping example id to predicted answer text ("" when no valid span exists).
    """
    all_start_logits, all_end_logits = raw_predictions
    # Build a map example to its corresponding features.
    example_id_to_index = {k: i for i, k in enumerate(examples["id"])}
    features_per_example = collections.defaultdict(list)
    for i, feature in enumerate(features):
        features_per_example[example_id_to_index[feature["example_id"]]].append(i)

    # The dictionaries we have to fill.
    predictions = collections.OrderedDict()

    # Logging.
    print(f"Post-processing {len(examples)} example predictions split into {len(features)} features.")

    # Let's loop over all the examples!
    for example_index, example in enumerate(tqdm(examples)):
        # Those are the indices of the features associated to the current example.
        feature_indices = features_per_example[example_index]

        valid_answers = []

        context = example["context"]
        # Looping through all the features associated to the current example.
        for feature_index in feature_indices:
            # We grab the predictions of the model for this feature.
            start_logits = all_start_logits[feature_index]
            end_logits = all_end_logits[feature_index]
            # This is what will allow us to map some the positions in our logits to span of texts in the original
            # context.
            offset_mapping = features[feature_index]["offset_mapping"]

            # Go through all possibilities for the `n_best_size` greater start and end logits.
            start_indexes = np.argsort(start_logits)[-1 : -n_best_size - 1 : -1].tolist()
            end_indexes = np.argsort(end_logits)[-1 : -n_best_size - 1 : -1].tolist()
            for start_index in start_indexes:
                for end_index in end_indexes:
                    # Don't consider out-of-scope answers, either because the indices are out of bounds or correspond
                    # to part of the input_ids that are not in the context.
                    if (
                        start_index >= len(offset_mapping)
                        or end_index >= len(offset_mapping)
                        or offset_mapping[start_index] is None
                        or offset_mapping[end_index] is None
                    ):
                        continue
                    # Don't consider answers with a length that is either < 0 or > max_answer_length.
                    if end_index < start_index or end_index - start_index + 1 > max_answer_length:
                        continue

                    start_char = offset_mapping[start_index][0]
                    end_char = offset_mapping[end_index][1]
                    valid_answers.append(
                        {
                            "score": start_logits[start_index] + end_logits[end_index],
                            "text": context[start_char: end_char]
                        }
                    )

        if valid_answers:
            # max() returns the first of equally scored answers, like a stable descending sort would.
            best_answer = max(valid_answers, key=lambda x: x["score"])
        else:
            # In the very rare edge case we have not a single non-null prediction, we create a fake prediction to avoid
            # failure.
            best_answer = {"text": "", "score": 0.0}

        # Let's pick the final answer
        predictions[example["id"]] = best_answer["text"]

    return predictions

In [25]:
# Convert the raw model output into one answer string per validation example
final_predictions = postprocess_qa_predictions(squad["validation"], validation_features, output.predictions)

Post-processing 1510 example predictions split into 1517 features.


100%|██████████| 1510/1510 [00:02<00:00, 677.23it/s]


In [26]:
# NOTE(review): load_metric is reported as deprecated in newer `datasets` releases in favour of the
# separate `evaluate` package — confirm against the installed version.
metric = load_metric("squad") # load squad metric

In [27]:
# One dict per example: the predicted text keyed by example id
formatted_predictions = [{"id": k, "prediction_text": v} for k, v in final_predictions.items()]
# Matching references: the annotated answers keyed by the same example id
references = [{"id": ex["id"], "answers": ex["answers"]} for ex in squad["validation"]]
# Last expression of the cell, so the resulting scores are displayed
metric.compute(predictions=formatted_predictions, references=references)

{'exact_match': 81.52317880794702, 'f1': 89.27663783927457}

In [28]:
# Display the DatasetDict summary (splits, columns and row counts)
squad

DatasetDict({
    train: Dataset({
        features: ['id', 'title', 'context', 'question', 'answers'],
        num_rows: 12515
    })
    validation: Dataset({
        features: ['id', 'title', 'context', 'question', 'answers'],
        num_rows: 1510
    })
})

# System Testing

In [29]:
# spaCy English pipeline that will host the coreference component
nlp = spacy.load("en_core_web_sm")

# Register fastcoref (LingMessCoref weights, running on CPU) in the pipeline
nlp.add_pipe(
    "fastcoref",
    config={
        "model_architecture": "LingMessCoref",
        "model_path": "biu-nlp/lingmess-coref",
        "device": "cpu",
    },
)

def replace_anaphors_and_cataphors(text):
    """Run `text` through the coreference pipeline and return fastcoref's resolved text."""
    resolve_options = {"fastcoref": {"resolve_text": True}}
    resolved_doc = nlp(text, component_cfg=resolve_options)
    return resolved_doc._.resolved_text

03/30/2024 11:26:09 - INFO - 	 missing_keys: []
03/30/2024 11:26:09 - INFO - 	 unexpected_keys: []
03/30/2024 11:26:09 - INFO - 	 mismatched_keys: []
03/30/2024 11:26:09 - INFO - 	 error_msgs: []
03/30/2024 11:26:09 - INFO - 	 Model Parameters: 590.0M, Transformer: 434.6M, Coref head: 155.4M


In [30]:
# Ask which model to test; anything other than "y"/"Y" counts as "no"
is_finetuned = input("Selected fine-tuned model? (Y if model is fine-tuned using the code above, N if not)").lower() == "y"
# The coreference question is only asked for the fine-tuned models
is_coref = is_finetuned and input("With coreference resolution? (Y if you want to resolve the text, N if not)").lower() == "y"

# Map the two answers to the checkpoint to load
if not is_finetuned:
    checkpoint_path = "FacebookAI/roberta-base"
elif is_coref:
    checkpoint_path = "./roberta-squad-resolved"
else:
    checkpoint_path = "./roberta-squad-unresolved"

tokenizer = AutoTokenizer.from_pretrained(checkpoint_path)
model = AutoModelForQuestionAnswering.from_pretrained(checkpoint_path)

model.to("cpu")
print("Model and tokenizer loaded successfully!")

Model and tokenizer loaded successfully!


In [32]:
# Sample passage and question used when the user does not supply their own
context = """Opera refers to a dramatic art form, originating in Europe, in which the 
emotional content is conveyed to the audience as much through music, both vocal and instrumental, 
as it is through the lyrics. By contrast, in musical theater an actor's dramatic performance 
is primary, and the music plays a lesser role. The drama in opera is presented using the 
primary elements of theater such as scenery, costumes, and acting. However, the words of the 
opera, or libretto, are sung rather than spoken. The singers are accompanied by a musical 
ensemble ranging from a small instrumental ensemble to a full symphonic orchestra."""

question = "How is drama presented in Opera?"

change = input("Do you want to input your own context and question or continue with the sample provided? (Y if you want to input your own context and question, N if not)")
change = change.lower() == "y"
if change:
    context = input("Enter the context: ")
    question = input("Enter the question: ")

# Resolve coreferences first when the resolved-text model was selected
if is_coref:
    context = replace_anaphors_and_cataphors(context)

# Truncate the context (never the question) to MAX_LENGTH tokens so that a long user-supplied
# passage cannot exceed the sequence length the model was prepared for.
inputs = tokenizer(
    question,
    context,
    truncation="only_second",
    max_length=MAX_LENGTH,
    return_tensors="pt",
).to("cpu")

with torch.no_grad():
    outputs = model(inputs.input_ids, attention_mask=inputs.attention_mask)

# Most likely start and end token, chosen independently of each other
answer_start_index = outputs.start_logits.argmax()
answer_end_index = outputs.end_logits.argmax()

# If the end index lies before the start index the slice is empty and the answer is ""
predict_answer_tokens = inputs.input_ids[0, answer_start_index : answer_end_index + 1]
tokenizer.decode(predict_answer_tokens, skip_special_tokens=True).lstrip().replace("\n", "") # clean answer

03/30/2024 11:27:24 - INFO - 	 Tokenize 1 inputs...
Map: 100%|██████████| 1/1 [00:00<00:00, 80.31 examples/s]
03/30/2024 11:27:24 - INFO - 	 ***** Running Inference on 1 texts *****
Inference: 100%|██████████| 1/1 [00:01<00:00,  1.43s/it]


'using the primary elements of theater such as scenery, costumes, and acting'

To test all the combinations, simply re-run the last 2 code cells and specify the configuration you want. After testing all the combinations (pretrained RoBERTa, fine-tuned RoBERTa with unresolved text, fine-tuned RoBERTa with resolved text), you'll see that only the fine-tuned combinations give the right answer.