In [None]:
# !pip install datasets transformers torch torchvision torchaudio evaluate tqdm requests>=2.32.1 accelerate transformers[sentencepiece] transformers[torch]
!pip install datasets transformers torch torchvision torchaudio evaluate tqdm
!pip install requests>=2.32.1
!pip install accelerate
!pip install transformers[sentencepiece]
!pip install transformers[torch]

Collecting datasets
  Downloading datasets-2.20.0-py3-none-any.whl (547 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m547.8/547.8 kB[0m [31m7.0 MB/s[0m eta [36m0:00:00[0m
Collecting evaluate
  Downloading evaluate-0.4.2-py3-none-any.whl (84 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m6.3 MB/s[0m eta [36m0:00:00[0m
Collecting pyarrow>=15.0.0 (from datasets)
  Downloading pyarrow-16.1.0-cp310-cp310-manylinux_2_28_x86_64.whl (40.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m40.8/40.8 MB[0m [31m40.2 MB/s[0m eta [36m0:00:00[0m
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m18.9 MB/s[0m eta [36m0:00:00[0m
Collecting requests>=2.32.2 (from datasets)
  Downloading requests-2.32.3-py3-none-any.whl (64 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [None]:
from datasets import Dataset, DatasetDict
import json
from tqdm.auto import tqdm
from transformers import AutoTokenizer, AutoModelForQuestionAnswering, TrainingArguments, Trainer, default_data_collator
import tensorflow as tf
import torch
import numpy as np
import collections

In [None]:
# Mount Google Drive so the dataset, logs and outputs under /content/drive are reachable.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Load the SQuAD JSON files from the mounted Drive.
# The encoding is passed explicitly so decoding does not depend on the
# platform's default locale (JSON text is UTF-8).
with open('/content/drive/MyDrive/Colab Notebooks/E.ON_Data_Challenge/SQuAD/train-v2.0.json', encoding='utf-8') as f:
    train_data = json.load(f)

with open('/content/drive/MyDrive/Colab Notebooks/E.ON_Data_Challenge/SQuAD/dev-v2.0.json', encoding='utf-8') as g:
    dev_data = json.load(g)


In [None]:
# Log in to the Hugging Face Hub interactively (the training arguments below set push_to_hub=True).
from huggingface_hub import notebook_login
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [None]:
# Function to transform the data into the required format
def transform_data(data):
    """Flatten nested SQuAD articles into parallel column lists.

    Args:
        data: Iterable of article dicts, each with a 'title' and a list of
            'paragraphs'; every paragraph has a 'context' and a list of 'qas',
            and every qa has 'id', 'question' and 'answers' (a list of dicts
            with 'text' and 'answer_start').

    Returns:
        A dict with the keys 'id', 'title', 'context', 'question' and
        'answers'. Each value is a list with one entry per question; an
        'answers' entry is a dict holding the 'text' and 'answer_start' lists
        for that question (both empty when the question has no answers).
    """
    ids, titles, contexts, questions, answer_sets = [], [], [], [], []

    for article in data:
        article_title = article['title']
        for paragraph in article['paragraphs']:
            paragraph_context = paragraph['context']
            for qa in paragraph['qas']:
                ids.append(qa['id'])
                # Title and context are repeated for every question they belong to.
                titles.append(article_title)
                contexts.append(paragraph_context)
                questions.append(qa['question'])
                qa_answers = qa['answers']
                answer_sets.append({
                    'text': [entry['text'] for entry in qa_answers],
                    'answer_start': [entry['answer_start'] for entry in qa_answers],
                })

    return {
        'id': ids,
        'title': titles,
        'context': contexts,
        'question': questions,
        'answers': answer_sets,
    }

# Transform the data: the article list sits under the top-level 'data' key of each JSON file.
train_transformed = transform_data(train_data['data'])
dev_transformed = transform_data(dev_data['data'])

# Create Dataset objects and DatasetDict.
# The dev file is used as the 'validation' split.
raw_datasets = DatasetDict({
    'train': Dataset.from_dict(train_transformed),
    'validation': Dataset.from_dict(dev_transformed)
})

In [None]:
# Sanity check: show the columns and row counts of both splits.
print(raw_datasets)

DatasetDict({
    train: Dataset({
        features: ['id', 'title', 'context', 'question', 'answers'],
        num_rows: 130319
    })
    validation: Dataset({
        features: ['id', 'title', 'context', 'question', 'answers'],
        num_rows: 11873
    })
})


In [None]:
# Choose the checkpoint that both the tokenizer and the model are loaded from.
# NOTE(review): the recorded output of the model-loading cell further down mentions
# "FacebookAI/roberta-large", so the saved run appears to have used that option — confirm.
# model_checkpoint = "roberta-base" # if you want to start fresh
model_checkpoint = "ozgurkk/roberta-large-finetuned-squad" # if you want to take on a fine-tuned model
# model_checkpoint = "FacebookAI/roberta-large"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]



config.json:   0%|          | 0.00/482 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

In [None]:
# The preprocessing below requests offset mappings from the tokenizer, which needs a fast tokenizer.
tokenizer.is_fast

True

In [None]:
max_length = 384 # The maximum length of a feature (question and context)
doc_stride = 128 # The authorized overlap between two parts of the context when splitting is needed.

In [None]:
# For this notebook to work with any kind of model, we need to account for the special case where
# the model expects padding on the left (in which case we switch the order of the question and the context).
# Note: the feature-preparation functions below recompute this flag locally.
pad_on_right = tokenizer.padding_side == "right"

In [None]:
# Function to prepare training features
def prepare_train_features(examples):
    """Tokenize a batch of examples and label every feature with answer token positions.

    A long context is split into several overlapping features, so one example
    may yield more than one feature. The window size and overlap come from the
    notebook-level `max_length` and `doc_stride` settings; the notebook-level
    `tokenizer` is used for tokenization.

    Args:
        examples: Batch in column format with "question", "context" and
            "answers" entries; each answers item holds "text" and
            "answer_start" lists. The "question" column is left-stripped in
            place.

    Returns:
        The tokenizer output without "overflow_to_sample_mapping" and
        "offset_mapping", plus "start_positions" and "end_positions" (one
        value per feature). A feature is labelled with the CLS token index when
        its example has no answer or when the answer is not fully inside the
        feature's context window. Only the first listed answer is used.
    """
    pad_on_right = tokenizer.padding_side == "right"
    # sequence_ids() value that marks context tokens (the second sequence when padding on the right).
    context_seq_id = 1 if pad_on_right else 0
    examples["question"] = [q.lstrip() for q in examples["question"]]
    tokenized_examples = tokenizer(
        examples["question" if pad_on_right else "context"],
        examples["context" if pad_on_right else "question"],
        truncation="only_second" if pad_on_right else "only_first",
        max_length=max_length,
        stride=doc_stride,
        return_overflowing_tokens=True,
        return_offsets_mapping=True,
        padding="max_length",
    )
    # Maps each feature back to the example it was cut from.
    sample_mapping = tokenized_examples.pop("overflow_to_sample_mapping")
    # Per-token (start_char, end_char) spans, used to turn character positions into token positions.
    offset_mapping = tokenized_examples.pop("offset_mapping")
    tokenized_examples["start_positions"] = []
    tokenized_examples["end_positions"] = []
    for i, offsets in enumerate(offset_mapping):
        input_ids = tokenized_examples["input_ids"][i]
        cls_index = input_ids.index(tokenizer.cls_token_id)
        sequence_ids = tokenized_examples.sequence_ids(i)
        sample_index = sample_mapping[i]
        answers = examples["answers"][sample_index]
        if len(answers["answer_start"]) == 0:
            # Unanswerable question: point both labels at the CLS token.
            tokenized_examples["start_positions"].append(cls_index)
            tokenized_examples["end_positions"].append(cls_index)
        else:
            start_char = answers["answer_start"][0]
            end_char = start_char + len(answers["text"][0])
            # First and last token of the context inside this feature.
            token_start_index = 0
            while sequence_ids[token_start_index] != context_seq_id:
                token_start_index += 1
            token_end_index = len(input_ids) - 1
            while sequence_ids[token_end_index] != context_seq_id:
                token_end_index -= 1
            if not (offsets[token_start_index][0] <= start_char and offsets[token_end_index][1] >= end_char):
                # The answer is not fully contained in this window.
                tokenized_examples["start_positions"].append(cls_index)
                tokenized_examples["end_positions"].append(cls_index)
            else:
                # Walk inwards to the tokens that bound the answer span; each loop
                # overshoots by one token, hence the -1 / +1 corrections.
                while token_start_index < len(offsets) and offsets[token_start_index][0] <= start_char:
                    token_start_index += 1
                tokenized_examples["start_positions"].append(token_start_index - 1)
                while offsets[token_end_index][1] >= end_char:
                    token_end_index -= 1
                tokenized_examples["end_positions"].append(token_end_index + 1)
    return tokenized_examples

In [None]:
# Look up the tokenizer's CLS token; its ID stays None when no CLS token is defined.
has_cls_token = tokenizer.cls_token is not None
cls_token_id = None
if has_cls_token:
    cls_token_id = tokenizer.cls_token_id

# Report the token and the ID that was found.
print(f"CLS Token: {tokenizer.cls_token}")
print(f"CLS Token ID: {cls_token_id}")

# Summarise whether an ID was obtained.
if cls_token_id is None:
    print("CLS Token is not available in the tokenizer")
else:
    print(f"CLS Token ID is valid: {cls_token_id} is in the tokenizer vocabulary")

CLS Token: <s>
CLS Token ID: 0
CLS Token ID is valid: 0 is in the tokenizer vocabulary


In [None]:
# Apply the training preprocessing to both splits in batches; the original columns
# are dropped so that only the tokenizer outputs and the position labels remain.
tokenized_datasets = raw_datasets.map(prepare_train_features, batched=True, remove_columns=raw_datasets["train"].column_names)

Map:   0%|          | 0/130319 [00:00<?, ? examples/s]

Map:   0%|          | 0/11873 [00:00<?, ? examples/s]

In [None]:
# Preprocessing is done at this stage, so we can continue with fine-tuning.
# The row counts are higher than in raw_datasets because long contexts are split into several features.
tokenized_datasets

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'start_positions', 'end_positions'],
        num_rows: 131823
    })
    validation: Dataset({
        features: ['input_ids', 'attention_mask', 'start_positions', 'end_positions'],
        num_rows: 12165
    })
})

In [None]:
# Load the question-answering model from the same checkpoint as the tokenizer.
model = AutoModelForQuestionAnswering.from_pretrained(model_checkpoint)

model.safetensors:   0%|          | 0.00/1.42G [00:00<?, ?B/s]

Some weights of RobertaForQuestionAnswering were not initialized from the model checkpoint at FacebookAI/roberta-large and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# Show the available GPU and its current memory usage before training.
!nvidia-smi

Thu Jul 11 11:25:15 2024       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 535.104.05             Driver Version: 535.104.05   CUDA Version: 12.2     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|   0  NVIDIA L4                      Off | 00000000:00:03.0 Off |                    0 |
| N/A   34C    P8              12W /  72W |      1MiB / 23034MiB |      0%      Default |
|                                         |                      |                  N/A |
+-----------------------------------------+----------------------+----------------------+
                                                                    

In [None]:
batch_size = 16  # per-device batch size, used for both training and evaluation
# Training arguments
training_args = TrainingArguments(
    output_dir="roberta-large-finetuned-squad",
    # Evaluate and checkpoint once per epoch: load_best_model_at_end needs the
    # two strategies to match. save_steps is not passed because it only applies
    # when save_strategy="steps".
    eval_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
    greater_is_better=False,  # a lower evaluation loss is better
    learning_rate=2e-5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    gradient_accumulation_steps=4,  # gradients are accumulated over 4 batches per optimizer step
    num_train_epochs=1,
    weight_decay=0.01,
    # warmup_steps=1000  # Adding warmup steps
    warmup_ratio=0.1,  # 10% of the total steps are used for warmup
    fp16=True,
    logging_dir='/content/drive/MyDrive/Colab Notebooks/E.ON_Data_Challenge/model_outputs/tb_logs_roberta_large',
    logging_steps=500,
    report_to="tensorboard",
    run_name="roberta-large-finetuning",
    remove_unused_columns=True,
    push_to_hub=True,
    hub_model_id="ozgurkk/roberta-large-finetuned-squad",
)



In [None]:
# Features are already padded to a fixed length during preprocessing, so the default collator is used.
# Note: the Trainer cell below passes default_data_collator directly rather than this alias.
data_collator = default_data_collator

In [None]:
# Instantiate Trainer with the tokenized train/validation splits.
# The validation split here carries start/end labels, so the per-epoch evaluation reports a loss.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    data_collator=default_data_collator,
    tokenizer=tokenizer
)

In [None]:
# Run fine-tuning with the arguments configured above.
trainer.train()

Epoch,Training Loss,Validation Loss


Epoch,Training Loss,Validation Loss
0,0.7381,0.659502


TrainOutput(global_step=2059, training_loss=1.0646058518426866, metrics={'train_runtime': 6705.5381, 'train_samples_per_second': 19.659, 'train_steps_per_second': 0.307, 'total_flos': 9.17859694459945e+16, 'train_loss': 1.0646058518426866, 'epoch': 0.999635878140551})

In [None]:
# trainer.save_model("/content/drive/MyDrive/Colab Notebooks/E.ON_Data_Challenge/SQuAD/roberta-base-finetuned-squad")

In [None]:
# Upload the trained model to the Hub repository configured via hub_model_id.
trainer.push_to_hub()

CommitInfo(commit_url='https://huggingface.co/ozgurkk/roberta-large-finetuned-squad/commit/3687cb48b057344edec4e6fe525cb7571a127263', commit_message='End of training', commit_description='', oid='3687cb48b057344edec4e6fe525cb7571a127263', pr_url=None, pr_revision=None, pr_num=None)

We have one logit for each feature and each token.
The most obvious way to predict an answer for each feature is to take the index for the maximum of the start logits as a start position and the index of the maximum of the end logits as an end position.

This will work great in a lot of cases, but what if this prediction gives us something impossible: the start position could be greater than the end position, or point to a span of text in the question instead of the answer. In that case, we might want to look at the second best prediction to see if it gives a possible answer and select that instead.

However, picking the second best answer is not as easy as picking the best one: is it the second best index in the start logits with the best index in the end logits? Or the best index in the start logits with the second best index in the end logits? And if that second best answer is not possible either, it gets even trickier for the third best answer.

To rank our answers, we will use the score obtained by adding the start and end logits. We won't try to order all the possible answers; instead we limit ourselves to the top candidates, controlled by a hyper-parameter we call `n_best_size`. We'll pick the `n_best_size` best indices in the start and end logits and gather all the answers they predict. After checking that each one is valid, we will sort them by their score and keep the best one. Here is how we would do this on the first feature in the batch:

And then we can sort the `valid_answers` according to their `score` and only keep the best one. The only point left is how to check a given span is inside the context (and not the question) and how to get back the text inside. To do this, we need to add two things to our validation features:
- the ID of the example that generated the feature (since each example can generate several features, as seen before);
- the offset mapping that will give us a map from token indices to character positions in the context.

That's why we will re-process the validation set with the following function, slightly different from `prepare_train_features`:

In [None]:
def prepare_validation_features(examples):
    """Tokenize a batch of examples for prediction, keeping what post-processing needs.

    Uses the same windowing as the training preprocessing (the notebook-level
    `max_length` and `doc_stride` settings with the notebook-level
    `tokenizer`), but produces no labels.

    Args:
        examples: Batch in column format with "id", "question" and "context"
            entries. The "question" column is left-stripped in place.

    Returns:
        The tokenizer output without "overflow_to_sample_mapping", plus an
        "example_id" entry giving the id of the example each feature came
        from. In "offset_mapping", the offsets of every token that is not part
        of the context are replaced by None.
    """
    examples["question"] = [q.lstrip() for q in examples["question"]]
    pad_on_right = tokenizer.padding_side == "right"
    # sequence_ids() value that marks context tokens (the second sequence when padding on the right).
    context_index = 1 if pad_on_right else 0
    tokenized_examples = tokenizer(
        examples["question" if pad_on_right else "context"],
        examples["context" if pad_on_right else "question"],
        truncation="only_second" if pad_on_right else "only_first",
        max_length=max_length,
        stride=doc_stride,
        return_overflowing_tokens=True,
        return_offsets_mapping=True,
        padding="max_length",
    )
    # Maps each feature back to the example it was cut from.
    sample_mapping = tokenized_examples.pop("overflow_to_sample_mapping")
    tokenized_examples["example_id"] = []
    for i in range(len(tokenized_examples["input_ids"])):
        sequence_ids = tokenized_examples.sequence_ids(i)
        sample_index = sample_mapping[i]
        tokenized_examples["example_id"].append(examples["id"][sample_index])
        # Blank out non-context offsets so post-processing can tell whether a
        # token position lies inside the context.
        tokenized_examples["offset_mapping"][i] = [
            (o if sequence_ids[k] == context_index else None)
            for k, o in enumerate(tokenized_examples["offset_mapping"][i])
        ]
    return tokenized_examples

In [None]:
# Apply the prediction-time preprocessing to the validation split.
# The original columns are removed; example ids and offsets are kept by the function itself.
validation_features = raw_datasets["validation"].map(
    prepare_validation_features,
    batched=True,
    remove_columns=raw_datasets["validation"].column_names
)

Map:   0%|          | 0/11873 [00:00<?, ? examples/s]

In [None]:
# Inspect the columns and the number of validation features.
validation_features

Dataset({
    features: ['input_ids', 'attention_mask', 'offset_mapping', 'example_id'],
    num_rows: 12165
})

In [None]:
# Get predictions for each feature; `.predictions` is later unpacked as a
# (start_logits, end_logits) pair by the post-processing function.
raw_predictions = trainer.predict(validation_features)

In [None]:
# Inspect the raw prediction output (start and end logit arrays).
raw_predictions

PredictionOutput(predictions=(array([[  2.4824219,  -6.4023438,  -7.0703125, ..., -10.4453125,
        -10.4453125, -10.4453125],
       [  3.6210938,  -7.75     ,  -8.21875  , ..., -10.171875 ,
        -10.171875 , -10.171875 ],
       [  3.9863281,  -8.1015625,  -9.2265625, ..., -10.2109375,
        -10.2109375, -10.2109375],
       ...,
       [  6.       ,  -8.0859375, -10.1328125, ..., -10.3125   ,
        -10.3125   , -10.3125   ],
       [  6.1367188,  -9.390625 ,  -9.828125 , ..., -10.296875 ,
        -10.296875 , -10.296875 ],
       [  6.2148438,  -9.5390625,  -9.9140625, ..., -10.28125  ,
        -10.28125  , -10.28125  ]], dtype=float32), array([[  2.1855469,  -7.3984375,  -7.3007812, ..., -11.125    ,
        -11.125    , -11.125    ],
       [  3.3691406,  -8.1484375,  -8.640625 , ..., -10.859375 ,
        -10.859375 , -10.859375 ],
       [  3.6894531,  -9.3515625,  -9.5390625, ..., -10.9453125,
        -10.9453125, -10.9453125],
       ...,
       [  5.8203125,  -8.3906

In [None]:
# The trainer hides the columns that are not used by the model
# (here example_id and offset_mapping), which we need for post-processing.
# Reset the format so that every column is visible again.
validation_features.set_format(type=validation_features.format["type"], columns=list(validation_features.features.keys()))

We're almost ready for our post-processing function. The last bit to deal with is the impossible answer (when `squad_v2 = True`). The code above only keeps answers that are inside the context; we also need to grab the score for the impossible answer (which has start and end indices corresponding to the index of the CLS token). When one example gives several features, we have to predict the impossible answer when all the features give a high score to the impossible answer (since one feature could predict the impossible answer just because the answer isn't in the part of the context it has access to), which is why the score of the impossible answer for one example is the *minimum* of the scores for the impossible answer in each feature generated by the example.

We then predict the impossible answer when that score is greater than the score of the best non-impossible answer. All combined together, this gives us this post-processing function:

In [None]:
# The dataset is SQuAD v2.0, which includes unanswerable questions; this flag
# selects the matching metric and prediction format further down.
squad_v2 = True

In [None]:
# final_predictions = postprocess_qa_predictions(raw_datasets["validation"], validation_features, raw_predictions.predictions)

Post-processing 11873 example predictions split into 12165 features.


  0%|          | 0/11873 [00:00<?, ?it/s]

In [None]:
def softmax(x):
    """Return the softmax of `x`, normalised along axis 0.

    The overall maximum is subtracted before exponentiating so that large
    inputs do not overflow.
    """
    shifted = np.subtract(x, np.max(x))
    exponentials = np.exp(shifted)
    normaliser = exponentials.sum(axis=0)
    return exponentials / normaliser

In [None]:
def postprocess_qa_predictions(
    examples, features, raw_predictions, tokenizer, n_best_size=20, max_answer_length=30, squad_v2=True
):
    """Turn per-feature start/end logits into one answer string per example.

    Args:
        examples: The original examples. Must support column access
            (`examples["id"]`), `len()` and iteration over rows that have "id"
            and "context" entries.
        features: The preprocessed features, indexable by position and
            iterable; each row needs "example_id", "input_ids" and
            "offset_mapping" (with None for tokens outside the context).
        raw_predictions: A `(all_start_logits, all_end_logits)` pair, indexed
            by feature position.
        tokenizer: Used only for its `cls_token_id`, which locates the
            null-answer position in each feature.
        n_best_size: Number of top start and top end indices to combine per
            feature.
        max_answer_length: Maximum answer length in tokens.
        squad_v2: When True, the null (empty) answer is also considered.

    Returns:
        If `squad_v2` is False, an OrderedDict mapping example id to the best
        answer text. If `squad_v2` is True, a pair `(predictions,
        no_answer_probs)` of OrderedDicts keyed by example id; the answer is
        "" when the best span does not score above the null answer, and the
        probability is the softmax weight of the null score against the best
        span score (1.0 for an example that has no features at all).
    """
    all_start_logits, all_end_logits = raw_predictions
    # Build a map example to its corresponding features.
    example_id_to_index = {k: i for i, k in enumerate(examples["id"])}
    features_per_example = collections.defaultdict(list)
    for i, feature in enumerate(features):
        features_per_example[example_id_to_index[feature["example_id"]]].append(i)

    # The dictionaries we have to fill.
    predictions = collections.OrderedDict()
    no_answer_probs = collections.OrderedDict()

    # Logging.
    print(f"Post-processing {len(examples)} example predictions split into {len(features)} features.")

    # Let's loop over all the examples!
    for example_index, example in enumerate(tqdm(examples)):
        # Those are the indices of the features associated with the current example.
        feature_indices = features_per_example[example_index]

        min_null_score = float('inf')  # Only used if squad_v2 is True.
        valid_answers = []

        context = example["context"]
        # Looping through all the features associated with the current example.
        for feature_index in feature_indices:
            # We grab the predictions of the model for this feature.
            start_logits = all_start_logits[feature_index]
            end_logits = all_end_logits[feature_index]
            # Fetch the feature row once; it is needed for both the offsets and the input ids.
            feature = features[feature_index]
            # This is what will allow us to map some positions in our logits to spans of texts in the original context.
            offset_mapping = feature["offset_mapping"]

            # Update minimum null prediction.
            cls_index = feature["input_ids"].index(tokenizer.cls_token_id)
            feature_null_score = start_logits[cls_index] + end_logits[cls_index]
            if feature_null_score < min_null_score:
                min_null_score = feature_null_score

            # Go through all possibilities for the `n_best_size` greater start and end logits.
            start_indexes = np.argsort(start_logits)[-1 : -n_best_size - 1 : -1].tolist()
            end_indexes = np.argsort(end_logits)[-1 : -n_best_size - 1 : -1].tolist()
            for start_index in start_indexes:
                for end_index in end_indexes:
                    # Don't consider out-of-scope answers, either because the indices are out of bounds or correspond
                    # to part of the input_ids that are not in the context.
                    if (
                        start_index >= len(offset_mapping)
                        or end_index >= len(offset_mapping)
                        or offset_mapping[start_index] is None
                        or offset_mapping[end_index] is None
                    ):
                        continue
                    # Don't consider answers with a length that is either < 0 or > max_answer_length.
                    if end_index < start_index or end_index - start_index + 1 > max_answer_length:
                        continue

                    start_char = offset_mapping[start_index][0]
                    end_char = offset_mapping[end_index][1]
                    valid_answers.append(
                        {
                            "score": float(start_logits[start_index] + end_logits[end_index]),
                            "text": context[start_char: end_char]
                        }
                    )

        if len(valid_answers) > 0:
            # Only the top-scoring candidate is needed, so a full sort is unnecessary.
            best_answer = max(valid_answers, key=lambda x: x["score"])
        else:
            # In the very rare edge case we have not a single non-null prediction, we create a fake prediction to avoid
            # failure.
            best_answer = {"text": "", "score": 0.0}

        # Let's pick our final answer: the best one or the null answer (only for squad_v2)
        if not squad_v2:
            predictions[example["id"]] = best_answer["text"]
        else:
            answer = best_answer["text"] if best_answer["score"] > min_null_score else ""
            predictions[example["id"]] = answer
            if feature_indices:
                no_answer_prob = softmax([best_answer["score"], min_null_score])[1]
            else:
                # No feature means no null score was observed (it is still +inf),
                # which would turn the softmax into NaN; the answer is "" here.
                no_answer_prob = 1.0
            no_answer_probs[example["id"]] = float(no_answer_prob)

    if squad_v2:
        return predictions, no_answer_probs
    else:
        return predictions

In [None]:
# Post-process the raw logits into one answer per validation example; with
# squad_v2=True the function also returns the per-example no-answer probabilities.
final_predictions, na_probs_dict = postprocess_qa_predictions(
    raw_datasets["validation"], validation_features, raw_predictions.predictions, tokenizer, squad_v2=True
)

  0%|          | 0/11873 [00:00<?, ?it/s]

In [None]:
# Inspect the predicted answers (an empty string means "no answer").
final_predictions

OrderedDict([('56ddde6b9a695914005b9628', 'France'),
             ('56ddde6b9a695914005b9629', '10th and 11th centuries'),
             ('56ddde6b9a695914005b962a', 'Denmark, Iceland and Norway'),
             ('56ddde6b9a695914005b962b', 'Rollo'),
             ('56ddde6b9a695914005b962c', '10th century'),
             ('5ad39d53604f3c001a3fe8d1', ''),
             ('5ad39d53604f3c001a3fe8d2', ''),
             ('5ad39d53604f3c001a3fe8d3', ''),
             ('5ad39d53604f3c001a3fe8d4', ''),
             ('56dddf4066d3e219004dad5f', 'William the Conqueror'),
             ('56dddf4066d3e219004dad60', 'Richard I of Normandy'),
             ('56dddf4066d3e219004dad61', 'Christian'),
             ('5ad3a266604f3c001a3fea27', ''),
             ('5ad3a266604f3c001a3fea28', ''),
             ('5ad3a266604f3c001a3fea29', ''),
             ('5ad3a266604f3c001a3fea2a', 'Richard I of Normandy'),
             ('5ad3a266604f3c001a3fea2b', ''),
             ('56dde0379a695914005b9636', 'Norseman, Vik

In [None]:
# Load the SQuAD metric that matches the dataset version.
# NOTE(review): `datasets.load_metric` is deprecated in favour of the `evaluate`
# library (installed at the top of this notebook); consider `evaluate.load(...)`
# here — confirm against the installed `datasets` version.
from datasets import load_metric
metric = load_metric("squad_v2" if squad_v2 else "squad")

  metric = load_metric("squad_v2" if squad_v2 else "squad")


Downloading builder script:   0%|          | 0.00/2.25k [00:00<?, ?B/s]

Downloading extra modules:   0%|          | 0.00/3.19k [00:00<?, ?B/s]

The repository for squad_v2 contains custom code which must be executed to correctly load the dataset. You can inspect the repository content at https://hf.co/datasets/squad_v2.
You can avoid this prompt in future by passing the argument `trust_remote_code=True`.

Do you wish to run the custom code? [y/N] y


In [None]:
# Put the predictions in the record format passed to the metric; the
# no-answer probability is included only when squad_v2 is enabled.
formatted_predictions = [
    {"id": qid, "prediction_text": text, "no_answer_probability": na_probs_dict[qid]}
    if squad_v2
    else {"id": qid, "prediction_text": text}
    for qid, text in final_predictions.items()
]

In [None]:
# Ground-truth answers for every validation example, keyed by the same ids as the predictions.
references = [{"id": ex["id"], "answers": ex["answers"]} for ex in raw_datasets["validation"]]

In [None]:
# Compute the metric by comparing the formatted predictions with the references.
metric_result = metric.compute(predictions=formatted_predictions, references=references)
print(metric_result)

In [None]:
# Re-check the validation features (columns and row count).
validation_features

Dataset({
    features: ['input_ids', 'attention_mask', 'offset_mapping', 'example_id'],
    num_rows: 12165
})

In [None]:
# Inspect the records that were passed to the metric.
formatted_predictions

[{'id': '56ddde6b9a695914005b9628',
  'prediction_text': 'France',
  'no_answer_probability': 2.4876490010363496e-05},
 {'id': '56ddde6b9a695914005b9629',
  'prediction_text': '10th and 11th centuries',
  'no_answer_probability': 0.0002689534841585773},
 {'id': '56ddde6b9a695914005b962a',
  'prediction_text': 'Denmark, Iceland and Norway',
  'no_answer_probability': 0.015483987303625265},
 {'id': '56ddde6b9a695914005b962b',
  'prediction_text': 'Rollo',
  'no_answer_probability': 0.11162799104060678},
 {'id': '56ddde6b9a695914005b962c',
  'prediction_text': '10th century',
  'no_answer_probability': 8.597842999974016e-05},
 {'id': '5ad39d53604f3c001a3fe8d1',
  'prediction_text': '',
  'no_answer_probability': 0.9999461154717781},
 {'id': '5ad39d53604f3c001a3fe8d2',
  'prediction_text': '',
  'no_answer_probability': 0.9859092655162919},
 {'id': '5ad39d53604f3c001a3fe8d3',
  'prediction_text': '',
  'no_answer_probability': 0.9999393720159937},
 {'id': '5ad39d53604f3c001a3fe8d4',
  'pre

In [None]:
# Map question id -> predicted answer text, the layout written to the predictions JSON file below.
predictions_dict = {pred['id']: pred['prediction_text'] for pred in formatted_predictions}

In [None]:
# Create the no-answer probabilities dictionary (question id -> probability).
# Note: this rebinds na_probs_dict, which the post-processing call already returned,
# and requires the 'no_answer_probability' key, i.e. squad_v2 must have been True.
na_probs_dict = {pred['id']: pred['no_answer_probability'] for pred in formatted_predictions}

In [None]:
def save_predictions(predictions, filename):
    """Write `predictions` to `filename` as JSON, replacing any existing file.

    Args:
        predictions: A JSON-serialisable object.
        filename: Path of the file to write.
    """
    with open(filename, mode='w') as out_file:
        json.dump(predictions, out_file)

In [None]:
# Save the predictions and no-answer probabilities to JSON files.
# These two files are the inputs of the evaluation script invoked further down.
save_predictions(predictions_dict, '/content/drive/MyDrive/Colab Notebooks/E.ON_Data_Challenge/model_outputs/predictions_roberta_large.json')
save_predictions(na_probs_dict, '/content/drive/MyDrive/Colab Notebooks/E.ON_Data_Challenge/model_outputs/na_probs_roberta_large.json')

In [None]:
# Push once more to the Hub, this time with an explicit commit message.
trainer.push_to_hub(commit_message="Training complete")

CommitInfo(commit_url='https://huggingface.co/ozgurkk/roberta-large-finetuned-squad/commit/2f03dbf6187571b1bd1b11cbe0380ccfc2dc23f4', commit_message='Training complete', commit_description='', oid='2f03dbf6187571b1bd1b11cbe0380ccfc2dc23f4', pr_url=None, pr_revision=None, pr_num=None)

In [None]:
# Paths to the evaluation script and data files.
# pred_file_path and na_prob_file_path are the files written by save_predictions above;
# out_file_path and out_image_dir are passed to the script as its output locations.
eval_script_path = '/content/drive/MyDrive/Colab Notebooks/E.ON_Data_Challenge/SQuAD/evaluate-v2.0.py'
data_file_path = '/content/drive/MyDrive/Colab Notebooks/E.ON_Data_Challenge/SQuAD/dev-v2.0.json'
pred_file_path = '/content/drive/MyDrive/Colab Notebooks/E.ON_Data_Challenge/model_outputs/predictions_roberta_large.json'
na_prob_file_path = '/content/drive/MyDrive/Colab Notebooks/E.ON_Data_Challenge/model_outputs/na_probs_roberta_large.json'
out_file_path = '/content/drive/MyDrive/Colab Notebooks/E.ON_Data_Challenge/model_outputs/eval_results_roberta_large.json'
out_image_dir = '/content/drive/MyDrive/Colab Notebooks/E.ON_Data_Challenge/model_outputs/images_roberta_large'

In [None]:
# Command to run the evaluation script.
# sys.executable is used instead of a bare 'python' so the script runs with the
# same interpreter (and installed packages) as this notebook's kernel.
# The command is a list of arguments, so the spaces in the Drive paths need no quoting.
import sys

command = [
    sys.executable, eval_script_path,
    data_file_path,
    pred_file_path,
    '-n', na_prob_file_path,
    '-o', out_file_path,
    '-p', out_image_dir
]

In [None]:
# Use the evaluation script; check=True raises CalledProcessError if it exits with a non-zero status.
import subprocess
subprocess.run(command, check=True)

CompletedProcess(args=['python', '/content/drive/MyDrive/Colab Notebooks/E.ON_Data_Challenge/SQuAD/evaluate-v2.0.py', '/content/drive/MyDrive/Colab Notebooks/E.ON_Data_Challenge/SQuAD/dev-v2.0.json', '/content/drive/MyDrive/Colab Notebooks/E.ON_Data_Challenge/model_outputs/predictions_roberta_large.json', '-n', '/content/drive/MyDrive/Colab Notebooks/E.ON_Data_Challenge/model_outputs/na_probs_roberta_large.json', '-o', '/content/drive/MyDrive/Colab Notebooks/E.ON_Data_Challenge/model_outputs/eval_results_roberta_large.json', '-p', '/content/drive/MyDrive/Colab Notebooks/E.ON_Data_Challenge/model_outputs/images_roberta_large'], returncode=0)

In [None]:
import json

In [None]:
# Load the predictions and no-answer probabilities
# NOTE(review): these paths (SQuAD/predictions_roberta_v2.json and SQuAD/na_probs_roberta_v2.json)
# differ from the files written by save_predictions earlier in this notebook
# (model_outputs/*_roberta_large.json), so this appears to inspect the output of
# a different run — confirm which files are intended.
with open('/content/drive/MyDrive/Colab Notebooks/E.ON_Data_Challenge/SQuAD/predictions_roberta_v2.json') as f:
    predictions = json.load(f)

with open('/content/drive/MyDrive/Colab Notebooks/E.ON_Data_Challenge/SQuAD/na_probs_roberta_v2.json') as f:
    na_probs = json.load(f)


In [None]:
# Inspect the first ten predictions and their no-answer probabilities.
# An empty prediction means the model chose "no answer".
for qid, pred in list(predictions.items())[:10]:
    print(f"Question ID: {qid}")
    print(f"Prediction: {pred}")
    print(f"No-Answer Probability: {na_probs[qid]}")
    print()

Question ID: 56ddde6b9a695914005b9628
Prediction: France
No-Answer Probability: 1.2380037591258592e-09

Question ID: 56ddde6b9a695914005b9629
Prediction: 10th and 11th centuries
No-Answer Probability: 9.307120690588079e-11

Question ID: 56ddde6b9a695914005b962a
Prediction: Denmark, Iceland and Norway
No-Answer Probability: 0.0009584648438362855

Question ID: 56ddde6b9a695914005b962b
Prediction: Rollo
No-Answer Probability: 6.096360547613931e-06

Question ID: 56ddde6b9a695914005b962c
Prediction: 10th
No-Answer Probability: 2.754689953456816e-09

Question ID: 5ad39d53604f3c001a3fe8d1
Prediction: 
No-Answer Probability: 0.9999999838824869

Question ID: 5ad39d53604f3c001a3fe8d2
Prediction: 
No-Answer Probability: 0.9999963950646829

Question ID: 5ad39d53604f3c001a3fe8d3
Prediction: West Francia
No-Answer Probability: 0.1364357087286803

Question ID: 5ad39d53604f3c001a3fe8d4
Prediction: 
No-Answer Probability: 0.9999999998804943

Question ID: 56dddf4066d3e219004dad5f
Prediction: William the

In [None]:
# Log in to the Hugging Face Hub again before loading the model for the manual tests below.
from huggingface_hub import notebook_login
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [None]:
# now it seems like the model is working fine - lets check some answers first and then build and interface
# test some questions and answers
from transformers import AutoTokenizer, AutoModelForQuestionAnswering, pipeline

# Checkpoint to serve: a Hub repository id (a local directory also works)
model_name = "ozgurkk/roberta-base-finetuned-squad"  # Replace with your model path or name

# Tokenizer and question-answering model are both loaded from that checkpoint
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForQuestionAnswering.from_pretrained(model_name)

# Wrap model + tokenizer in a question-answering pipeline for inference
qa_pipeline = pipeline(
    task='question-answering',
    model=model,
    tokenizer=tokenizer,
)

# Define a function to handle no-answer logic
def get_answer(question, context, threshold=0.5):
    """Answer ``question`` from ``context``, returning an empty answer when unsure.

    Args:
        question: Question text.
        context: Passage the answer is extracted from.
        threshold: Minimum pipeline score required to keep the extracted answer.

    Returns:
        A dict with three keys:
        ``answer`` - the extracted span, or ``""`` when the prediction is
        rejected (score below ``threshold`` or a zero-length span);
        ``score`` - the score reported by the pipeline;
        ``no_answer_probability`` - computed here as ``1 - score``.
    """
    # A blank question or context cannot have an answer. The QA pipeline raises
    # on empty inputs, which would surface as an error in the UI whenever a
    # textbox is left empty, so report "no answer" explicitly instead.
    if not (question and question.strip() and context and context.strip()):
        return {"answer": "", "score": 0.0, "no_answer_probability": 1.0}

    result = qa_pipeline(question=question, context=context)
    answer = result['answer']
    score = result['score']
    start = result['start']
    end = result['end']

    # Apply the no-answer logic: drop low-confidence predictions and empty spans.
    if score < threshold or start == end:
        answer = ""

    return {"answer": answer, "score": score, "no_answer_probability": 1 - score}

# Smoke test: one question the context answers and one it does not
context = "The Eiffel Tower is located in Paris and is one of the most famous landmarks in the world."
question_1 = "Where is the Eiffel Tower located?"
question_2 = "Who designed the Statue of Liberty?"

for sample_question in (question_1, question_2):
    print(get_answer(sample_question, context))

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/1.41k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/798k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.11M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/958 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/705 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/496M [00:00<?, ?B/s]

{'answer': 'Paris', 'score': 0.9989422559738159, 'no_answer_probability': 0.001057744026184082}
{'answer': '', 'score': 1.0751947110421156e-09, 'no_answer_probability': 0.9999999989248053}


In [None]:
# lets move on with the interface
!pip install transformers gradio

Collecting gradio
  Downloading gradio-4.37.2-py3-none-any.whl (12.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.3/12.3 MB[0m [31m64.3 MB/s[0m eta [36m0:00:00[0m
Collecting aiofiles<24.0,>=22.0 (from gradio)
  Downloading aiofiles-23.2.1-py3-none-any.whl (15 kB)
Collecting fastapi (from gradio)
  Downloading fastapi-0.111.0-py3-none-any.whl (91 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m92.0/92.0 kB[0m [31m14.1 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting ffmpy (from gradio)
  Downloading ffmpy-0.3.2.tar.gz (5.5 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting gradio-client==1.0.2 (from gradio)
  Downloading gradio_client-1.0.2-py3-none-any.whl (318 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m318.2/318.2 kB[0m [31m38.7 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting httpx>=0.24.1 (from gradio)
  Downloading httpx-0.27.0-py3-none-any.whl (75 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━

In [None]:
import gradio as gr

def qa_interface(question, context):
    """Gradio callback: run ``get_answer`` and unpack its dict for the UI.

    Returns a 3-tuple ``(answer, score, no_answer_probability)``, one value
    per output component of the interface.
    """
    outcome = get_answer(question, context)
    fields = ('answer', 'score', 'no_answer_probability')
    return tuple(outcome[name] for name in fields)

# Build the Gradio UI: two text inputs are passed to qa_interface and its
# three return values are shown in three text outputs.
input_components = [
    gr.Textbox(lines=2, label="Question"),
    gr.Textbox(lines=5, label="Context"),
]
output_components = [
    gr.Textbox(label=caption)
    for caption in ("Answer", "Score", "No Answer Probability")
]

iface = gr.Interface(
    fn=qa_interface,
    inputs=input_components,
    outputs=output_components,
    title="Question Answering Model with No-Answer Handling",
    description="Ask a question based on the provided context. The model will return the answer, the confidence score, and the probability of no answer.",
)

In [None]:
# Start the Gradio app. In this Colab run Gradio enabled sharing on its own and
# served the app on a temporary public *.gradio.live URL (see the output below);
# pass share=False to launch() explicitly to turn that off.
iface.launch()

Setting queue=True in a Colab notebook requires sharing enabled. Setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).

Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
Running on public URL: https://6aabd11781340a4862.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from Terminal to deploy to Spaces (https://huggingface.co/spaces)


