In [None]:
!pip install datasets transformers torch torchvision torchaudio evaluate tqdm
!pip install requests>=2.32.2
!pip install accelerate
!pip install transformers[sentencepiece]
!pip install transformers[torch]



In [None]:
import datasets
import json
import numpy as np
import evaluate
import torch
import torch.nn.functional as F
from datasets import Dataset, DatasetDict
from transformers import AutoTokenizer
from transformers import AutoModelForQuestionAnswering
import collections
from tqdm.auto import tqdm
from transformers import TrainingArguments
from transformers import Trainer

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Load the SQuAD JSON files
with open('/content/drive/MyDrive/Colab Notebooks/E.ON_Data_Challenge/SQuAD/train-v2.0.json') as train:
    train_data = json.load(train)
with open('/content/drive/MyDrive/Colab Notebooks/E.ON_Data_Challenge/SQuAD/dev-v2.0.json') as dev:
    dev_data = json.load(dev)

# Extract the 'data' field which contains the actual dataset
training_data = train_data['data']
validation_data = dev_data['data']

# # Verify the structure of the loaded data
# print("Sample from training data:")
# print(json.dumps(training_data[0], indent=2))


In [None]:
from huggingface_hub import notebook_login
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [None]:
# Function to transform the data into the required format
def transform_data(data):
    transformed_data = {
        'id': [],
        'title': [],
        'context': [],
        'question': [],
        'answers': []
    }
    for article in data['data']:
        title = article['title']
        for paragraph in article['paragraphs']:
            context = paragraph['context']
            for qa in paragraph['qas']:
                transformed_data['id'].append(qa['id'])
                transformed_data['title'].append(title)
                transformed_data['context'].append(context)
                transformed_data['question'].append(qa['question'])
                transformed_data['answers'].append({
                    'text': [answer['text'] for answer in qa['answers']],
                    'answer_start': [answer['answer_start'] for answer in qa['answers']]
                })
    return transformed_data

# Transform the data
train_transformed = transform_data(train_data)
dev_transformed = transform_data(dev_data)

# Create Dataset objects
train_dataset = Dataset.from_dict(train_transformed)
dev_dataset = Dataset.from_dict(dev_transformed)

# Create a DatasetDict with the Dataset objects to prepare it processing
raw_datasets = DatasetDict({
    'train': train_dataset,
    'validation': dev_dataset
})

In [None]:
# Transform the data
train_transformed = transform_data(train_data)
dev_transformed = transform_data(dev_data)

# Create Dataset objects
train_dataset = Dataset.from_dict(train_transformed)
dev_dataset = Dataset.from_dict(dev_transformed)

# Create a DatasetDict with the Dataset objects to prepare it processing
raw_datasets = DatasetDict({
    'train': train_dataset,
    'validation': dev_dataset
})

In [None]:
print(raw_datasets)

DatasetDict({
    train: Dataset({
        features: ['id', 'title', 'context', 'question', 'answers'],
        num_rows: 130319
    })
    validation: Dataset({
        features: ['id', 'title', 'context', 'question', 'answers'],
        num_rows: 11873
    })
})


In [None]:
def remove_empty_answers(dataset):
    def is_valid_example(example):
        return len(example['answers']['text']) > 0

    return dataset.filter(is_valid_example)

In [None]:
# filtering out examples (question-answer pairs) where there are no answers.
# Converts the given SQUAD 2.0 dataset into SQUAD 1.0 dataset

train_dataset_cleaned = remove_empty_answers(raw_datasets["train"])

Filter:   0%|          | 0/130319 [00:00<?, ? examples/s]

In [None]:
# bring the train and validation sets together in a single dictionary

raw_datasets_processed = DatasetDict({
    'train': train_dataset_cleaned,
    'validation': raw_datasets["validation"]
})

In [None]:
# Check the structure
print("Context: ", raw_datasets_processed["train"][3]["context"])
print("Question: ", raw_datasets_processed["train"][3]["question"])
print("Answer: ", raw_datasets_processed["train"][3]["answers"])

Context:  Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ bee-YON-say) (born September 4, 1981) is an American singer, songwriter, record producer and actress. Born and raised in Houston, Texas, she performed in various singing and dancing competitions as a child, and rose to fame in the late 1990s as lead singer of R&B girl-group Destiny's Child. Managed by her father, Mathew Knowles, the group became one of the world's best-selling girl groups of all time. Their hiatus saw the release of Beyoncé's debut album, Dangerously in Love (2003), which established her as a solo artist worldwide, earned five Grammy Awards and featured the Billboard Hot 100 number-one singles "Crazy in Love" and "Baby Boy".
Question:  In what city and state did Beyonce  grow up? 
Answer:  {'answer_start': [166], 'text': ['Houston, Texas']}


In [None]:
raw_datasets_processed["validation"]

Dataset({
    features: ['id', 'title', 'context', 'question', 'answers'],
    num_rows: 11873
})

In [None]:
# Overall, this code is initializing a tokenizer specifically designed
# for the "roberta-base" pre-trained model. This tokenizer will be
# used to process text data in a way that the model can understand.

model_checkpoint = "FacebookAI/roberta-base"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/1.22k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/798k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.11M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/280 [00:00<?, ?B/s]

In [None]:
tokenizer.is_fast

True

In [None]:
max_length = 384 # The maximum length of a feature (question and context)
doc_stride = 128 # The authorized overlap between two part of the context when splitting it is needed.

In [None]:
# For this notebook to work with any kind of models, we need to account for the special case where
# the model expects padding on the left (in which case we switch the order of the question and the context):
pad_on_right = tokenizer.padding_side == "right"

In [None]:
def prepare_train_features(examples):
    # Some of the questions have lots of whitespace on the left, which is not useful and will make the
    # truncation of the context fail (the tokenized question will take a lots of space). So we remove that
    # left whitespace
    examples["question"] = [q.lstrip() for q in examples["question"]]

    # Tokenize our examples with truncation and padding, but keep the overflows using a stride. This results
    # in one example possible giving several features when a context is long, each of those features having a
    # context that overlaps a bit the context of the previous feature.
    tokenized_examples = tokenizer(
        examples["question" if pad_on_right else "context"],
        examples["context" if pad_on_right else "question"],
        truncation="only_second" if pad_on_right else "only_first",
        max_length=max_length,
        stride=doc_stride,
        return_overflowing_tokens=True,
        return_offsets_mapping=True,
        padding="max_length",
    )

    # Since one example might give us several features if it has a long context, we need a map from a feature to
    # its corresponding example. This key gives us just that.
    sample_mapping = tokenized_examples.pop("overflow_to_sample_mapping")
    # The offset mappings will give us a map from token to character position in the original context. This will
    # help us compute the start_positions and end_positions.
    offset_mapping = tokenized_examples.pop("offset_mapping")

    # Let's label those examples!
    tokenized_examples["start_positions"] = []
    tokenized_examples["end_positions"] = []

    for i, offsets in enumerate(offset_mapping):
        # We will label impossible answers with the index of the CLS token.
        input_ids = tokenized_examples["input_ids"][i]
        cls_index = input_ids.index(tokenizer.cls_token_id)

        # Grab the sequence corresponding to that example (to know what is the context and what is the question).
        sequence_ids = tokenized_examples.sequence_ids(i)

        # One example can give several spans, this is the index of the example containing this span of text.
        sample_index = sample_mapping[i]
        answers = examples["answers"][sample_index]
        # If no answers are given, set the cls_index as answer.
        if len(answers["answer_start"]) == 0:
            tokenized_examples["start_positions"].append(cls_index)
            tokenized_examples["end_positions"].append(cls_index)
        else:
            # Start/end character index of the answer in the text.
            start_char = answers["answer_start"][0]
            end_char = start_char + len(answers["text"][0])

            # Start token index of the current span in the text.
            token_start_index = 0
            while sequence_ids[token_start_index] != (1 if pad_on_right else 0):
                token_start_index += 1

            # End token index of the current span in the text.
            token_end_index = len(input_ids) - 1
            while sequence_ids[token_end_index] != (1 if pad_on_right else 0):
                token_end_index -= 1

            # Detect if the answer is out of the span (in which case this feature is labeled with the CLS index).
            if not (offsets[token_start_index][0] <= start_char and offsets[token_end_index][1] >= end_char):
                tokenized_examples["start_positions"].append(cls_index)
                tokenized_examples["end_positions"].append(cls_index)
            else:
                # Otherwise move the token_start_index and token_end_index to the two ends of the answer.
                # Note: we could go after the last offset if the answer is the last word (edge case).
                while token_start_index < len(offsets) and offsets[token_start_index][0] <= start_char:
                    token_start_index += 1
                tokenized_examples["start_positions"].append(token_start_index - 1)
                while offsets[token_end_index][1] >= end_char:
                    token_end_index -= 1
                tokenized_examples["end_positions"].append(token_end_index + 1)

    return tokenized_examples

In [None]:
tokenized_datasets = raw_datasets_processed.map(prepare_train_features, batched=True, remove_columns=raw_datasets_processed["train"].column_names)

Map:   0%|          | 0/86821 [00:00<?, ? examples/s]

Map:   0%|          | 0/11873 [00:00<?, ? examples/s]

In [None]:
# Load the pretrained model for question answering.

model = AutoModelForQuestionAnswering.from_pretrained(model_checkpoint)

config.json:   0%|          | 0.00/692 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/496M [00:00<?, ?B/s]

In [None]:
model_name = model_checkpoint.split("/")[-1]
batch_size = 16

args = TrainingArguments(
    f"{model_name}-finetuned-squad",
    evaluation_strategy = "epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=1,
    weight_decay=0.01,
    push_to_hub=True,
)




In [None]:
# make sure GPU is ready
import tensorflow as tf
tf.test.gpu_device_name()
tf.config.list_physical_devices('GPU')

'/device:GPU:0'

In [None]:
from transformers import default_data_collator
data_collator = default_data_collator

In [None]:
# Instantiate Trainer
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    data_collator=data_collator,
    tokenizer=tokenizer,
)

In [None]:
trainer.train()

Epoch,Training Loss,Validation Loss
1,0.8659,5.364106


TrainOutput(global_step=5487, training_loss=0.9881501695769547, metrics={'train_runtime': 6483.6298, 'train_samples_per_second': 13.539, 'train_steps_per_second': 0.846, 'total_flos': 1.7202276007147008e+16, 'train_loss': 0.9881501695769547, 'epoch': 1.0})

In [None]:
trainer.save_model("squad-trained-1epoch")

NameError: name 'trainer' is not defined

In [None]:
trainer.push_to_hub("f-arnold/roberta-finetuned-squad-1epoch")

CommitInfo(commit_url='https://huggingface.co/f-arnold/roberta-base-finetuned-squad/commit/62dec1deb2dd1c6c14267ffd7386c65f5bb21586', commit_message='f-arnold/roberta-finetuned-squad-1epoch', commit_description='', oid='62dec1deb2dd1c6c14267ffd7386c65f5bb21586', pr_url=None, pr_revision=None, pr_num=None)

In [None]:
# Load the pretrained model for question answering.

model = AutoModelForQuestionAnswering.from_pretrained("f-arnold/roberta-base-finetuned-squad")

In [None]:
# Define a function that generates predictions from the model.

def make_predictions(model, dataset, batch_size=16, no_answer_threshold=0.5):
    model.eval()
    model.to("cuda")  # Move the model to GPU
    predictions = []

    with torch.no_grad():
        for i in tqdm(range(0, len(dataset), batch_size)):
            batch = dataset[i:i + batch_size]

            # Prepare inputs and move them to GPU
            inputs = {
                key: torch.tensor(val).to("cuda")
                for key, val in batch.items()
                if key in ["input_ids", "attention_mask"]
            }
            outputs = model(**inputs)

            start_logits = outputs.start_logits.cpu().numpy()
            end_logits = outputs.end_logits.cpu().numpy()

            # Calculate no-answer probability
            no_answer_prob = F.softmax(torch.tensor((start_logits[:, 0] + end_logits[:, 0]) / 2), dim=-1).cpu().numpy()

            for j, example_id in enumerate(batch["example_id"]):
                start_idx = np.argmax(start_logits[j])
                end_idx = np.argmax(end_logits[j])
                no_ans_prob = no_answer_prob[j]

                # Check if the no-answer probability exceeds the threshold
                if no_ans_prob > no_answer_threshold:
                    prediction = (example_id, None, None, no_ans_prob)
                else:
                    prediction = (example_id, start_idx, end_idx, no_ans_prob)
                predictions.append(prediction)

    return predictions


In [None]:
def postprocess_predictions(dataset, predictions):
    # Aggregate predictions by example_id
    example_predictions = collections.defaultdict(list)
    for example, (example_id, start_idx, end_idx, no_answer_prob) in zip(dataset, predictions):
        offsets = example["offset_mapping"]
        context = example["context"]

        if start_idx is None or end_idx is None or start_idx >= len(offsets) or end_idx >= len(offsets):
            text = ""
        else:
            while start_idx < len(offsets) and offsets[start_idx] is None:
                start_idx += 1
            while end_idx >= 0 and offsets[end_idx] is None:
                end_idx -= 1

            if start_idx >= len(offsets) or end_idx < 0 or offsets[start_idx] is None or offsets[end_idx] is None:
                text = ""
            else:
                start_char = offsets[start_idx][0]
                end_char = offsets[end_idx][1]
                text = context[start_char:end_char]

        example_predictions[example_id].append({
            "text": text,
            "no_answer_prob": no_answer_prob
        })

    # Choose the best prediction per example_id (e.g., the one with the lowest no_answer_prob)
    formatted_predictions = [
        {
            "id": example_id,
            "answer": min(preds, key=lambda x: x["no_answer_prob"])["text"],
            "no_answer_probability": min(preds, key=lambda x: x["no_answer_prob"])["no_answer_prob"]
        }
        for example_id, preds in example_predictions.items()
    ]

    return formatted_predictions

In [None]:
import torch.nn.functional as F

In [None]:
# preprocesses validation data for question answering in a similar way to the training function.
# It performs tokenization, handles potential splitting due to the sliding window,
# and prepares the data for evaluation by the model.

def preprocess_validation_examples(examples):
    questions = [q.strip() for q in examples["question"]]
    inputs = tokenizer(
        questions,
        examples["context"],
        max_length=max_length,
        truncation="only_second",
        stride=stride,
        return_overflowing_tokens=True,
        return_offsets_mapping=True,
        padding="max_length",
    )

    sample_map = inputs.pop("overflow_to_sample_mapping")
    example_ids = []
    contexts = []

    for i in range(len(inputs["input_ids"])):
        sample_idx = sample_map[i]
        example_ids.append(examples["id"][sample_idx])
        contexts.append(examples["context"][sample_idx])

        sequence_ids = inputs.sequence_ids(i)
        offset = inputs["offset_mapping"][i]
        inputs["offset_mapping"][i] = [
            o if sequence_ids[k] == 1 else None for k, o in enumerate(offset)
        ]

    inputs["example_id"] = example_ids
    inputs["context"] = contexts
    return inputs

In [None]:
max_length = 384
stride = 128
validation_dataset = raw_datasets["validation"].map(
    preprocess_validation_examples,
    batched=True,
    remove_columns=raw_datasets["validation"].column_names,
)

Map:   0%|          | 0/11873 [00:00<?, ? examples/s]

In [None]:
predictions = make_predictions(model, validation_dataset)

  0%|          | 0/761 [00:00<?, ?it/s]

In [None]:
predictions

[('56ddde6b9a695914005b9628', 53, 53, 0.03785553),
 ('56ddde6b9a695914005b9629', 39, 44, 0.05775123),
 ('56ddde6b9a695914005b962a', 77, 81, 0.066653505),
 ('56ddde6b9a695914005b962b', 85, 86, 0.057829574),
 ('56ddde6b9a695914005b962c', 164, 165, 0.054816753),
 ('5ad39d53604f3c001a3fe8d1', 16, 18, 0.05132028),
 ('5ad39d53604f3c001a3fe8d2', 48, 48, 0.03750651),
 ('5ad39d53604f3c001a3fe8d3', 102, 104, 0.045478437),
 ('5ad39d53604f3c001a3fe8d4', 156, 162, 0.07493337),
 ('56dddf4066d3e219004dad5f', 209, 212, 0.054222982),
 ('56dddf4066d3e219004dad60', 124, 127, 0.17690685),
 ('56dddf4066d3e219004dad61', 43, 43, 0.052062754),
 ('5ad3a266604f3c001a3fea27', 23, 27, 0.05197306),
 ('5ad3a266604f3c001a3fea28', 32, 34, 0.042466108),
 ('5ad3a266604f3c001a3fea29', 31, 33, 0.046454336),
 ('5ad3a266604f3c001a3fea2a', 123, 124, 0.091768704),
 ('5ad3a266604f3c001a3fea2b', 180, 180, 0.036447797),
 ('56dde0379a695914005b9636', 99, 103, 0.032027263),
 ('56dde0379a695914005b9637', 94, 96, 0.22896332),
 ('5a

In [None]:
final_predictions = postprocess_predictions(validation_dataset, predictions)

In [None]:
final_predictions

[{'id': '56ddde6b9a695914005b9628',
  'answer': 'France',
  'no_answer_probability': 0.03785553},
 {'id': '56ddde6b9a695914005b9629',
  'answer': '10th and 11th centuries',
  'no_answer_probability': 0.05775123},
 {'id': '56ddde6b9a695914005b962a',
  'answer': 'Denmark, Iceland and Norway',
  'no_answer_probability': 0.066653505},
 {'id': '56ddde6b9a695914005b962b',
  'answer': 'Rollo',
  'no_answer_probability': 0.057829574},
 {'id': '56ddde6b9a695914005b962c',
  'answer': '10th',
  'no_answer_probability': 0.054816753},
 {'id': '5ad39d53604f3c001a3fe8d1',
  'answer': 'The Normans',
  'no_answer_probability': 0.05132028},
 {'id': '5ad39d53604f3c001a3fe8d2',
  'answer': 'Normandy',
  'no_answer_probability': 0.03750651},
 {'id': '5ad39d53604f3c001a3fe8d3',
  'answer': 'West Francia',
  'no_answer_probability': 0.045478437},
 {'id': '5ad39d53604f3c001a3fe8d4',
  'answer': 'first half of the 10th century',
  'no_answer_probability': 0.07493337},
 {'id': '56dddf4066d3e219004dad5f',
  'ans

In [None]:
# Create the no-answer probabilities dictionary
na_probs_dict = {pred['id']: pred['no_answer_probability'] for pred in final_predictions}

# Convert to dictionary format
pred_dict = {pred['id']: pred['answer'] for pred in final_predictions}

In [None]:
def save_predictions(predictions, filename):
    with open(filename, 'w') as f:
        json.dump(predictions, f)

In [None]:
def save_na_probs(na_probs_dict, filename):
    # Convert numpy float32 to regular Python floats for JSON serialization
    modified_dict = {k: float(v) for k, v in na_probs_dict.items()}
    with open(filename, 'w') as f:
        json.dump(modified_dict, f, indent=2)

In [None]:
# Save your predictions to a file in your mounted drive
save_predictions(pred_dict, '/content/drive/MyDrive/Colab Notebooks/E.ON_Data_Challenge/model_outputs/pred_roberta_finetuned.json')


In [None]:
save_na_probs(na_probs_dict, '/content/drive/MyDrive/Colab Notebooks/E.ON_Data_Challenge/model_outputs/na_probs_roberta_finetuned.json')

In [None]:
# Paths to the evaluation script and data files
eval_script_path = '/content/drive/MyDrive/Colab Notebooks/E.ON_Data_Challenge/SQuAD/evaluate-v2.0.py'
data_file_path = '/content/drive/MyDrive/Colab Notebooks/E.ON_Data_Challenge/SQuAD/dev-v2.0.json'
pred_file_path = '/content/drive/MyDrive/Colab Notebooks/E.ON_Data_Challenge/model_outputs/pred_roberta_finetuned.json'
na_prob_file_path = '/content/drive/MyDrive/Colab Notebooks/E.ON_Data_Challenge/model_outputs/na_probs_roberta_finetuned.json'
out_file_path = '/content/drive/MyDrive/Colab Notebooks/E.ON_Data_Challenge/model_outputs/eval_results_roberta_finetuned.json'
out_image_dir = '/content/drive/MyDrive/Colab Notebooks/E.ON_Data_Challenge/model_outputs/images_roberta_finetuned'

In [None]:
# Command to run the evaluation script
command = [
    'python', eval_script_path,
    data_file_path,
    pred_file_path,
    '-n', na_prob_file_path,
    '-o', out_file_path,
    '-p', out_image_dir
]

In [None]:
import subprocess
# Use the evaluation script
subprocess.run(command, check=False, stdout=subprocess.PIPE, stderr=subprocess.PIPE)

CompletedProcess(args=['python', '/content/drive/MyDrive/Colab Notebooks/E.ON_Data_Challenge/SQuAD/evaluate-v2.0.py', '/content/drive/MyDrive/Colab Notebooks/E.ON_Data_Challenge/SQuAD/dev-v2.0.json', '/content/drive/MyDrive/Colab Notebooks/E.ON_Data_Challenge/model_outputs/pred_roberta_finetuned.json', '-n', '/content/drive/MyDrive/Colab Notebooks/E.ON_Data_Challenge/model_outputs/na_probs_roberta_finetuned.json', '-o', '/content/drive/MyDrive/Colab Notebooks/E.ON_Data_Challenge/model_outputs/eval_results_roberta_finetuned.json', '-p', '/content/drive/MyDrive/Colab Notebooks/E.ON_Data_Challenge/model_outputs/images_roberta_finetuned'], returncode=0, stdout=b'', stderr=b'')