In [3]:
!pip install datasets transformers torch torchvision torchaudio evaluate tqdm
# !pip install requests==2.31.0
!pip install requests>=2.32.1
!pip install accelerate
!pip install transformers[sentencepiece]
!pip install accelerate -U
!pip install transformers[torch]




Collecting datasets
  Downloading datasets-2.20.0-py3-none-any.whl (547 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m547.8/547.8 kB[0m [31m5.8 MB/s[0m eta [36m0:00:00[0m
Collecting evaluate
  Downloading evaluate-0.4.2-py3-none-any.whl (84 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m6.1 MB/s[0m eta [36m0:00:00[0m
Collecting pyarrow>=15.0.0 (from datasets)
  Downloading pyarrow-16.1.0-cp310-cp310-manylinux_2_28_x86_64.whl (40.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m40.8/40.8 MB[0m [31m9.6 MB/s[0m eta [36m0:00:00[0m
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m12.8 MB/s[0m eta [36m0:00:00[0m
Collecting requests>=2.32.2 (from datasets)
  Downloading requests-2.32.3-py3-none-any.whl (64 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [4]:
import datasets
import json
import numpy as np
import evaluate
import torch
import torch.nn.functional as F
from datasets import Dataset, DatasetDict
from transformers import AutoTokenizer
from transformers import AutoModelForQuestionAnswering
import collections
from tqdm.auto import tqdm
from transformers import TrainingArguments
from transformers import Trainer

In [5]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [6]:
# Load the SQuAD JSON files
with open('/content/drive/MyDrive/Colab Notebooks/E.ON_Data_Challenge/SQuAD/train-v2.0.json') as f:
    train_data = json.load(f)

with open('/content/drive/MyDrive/Colab Notebooks/E.ON_Data_Challenge/SQuAD/dev-v2.0.json') as g:
    dev_data = json.load(g)

In [7]:
# Extract the 'data' field which contains the actual dataset
training_data = train_data['data']
validation_data = dev_data['data']

# Verify the structure of the loaded data
print("Sample from training data:")
print(json.dumps(training_data[0], indent=2))

Sample from training data:
{
  "title": "Beyonc\u00e9",
  "paragraphs": [
    {
      "qas": [
        {
          "question": "When did Beyonce start becoming popular?",
          "id": "56be85543aeaaa14008c9063",
          "answers": [
            {
              "text": "in the late 1990s",
              "answer_start": 269
            }
          ],
          "is_impossible": false
        },
        {
          "question": "What areas did Beyonce compete in when she was growing up?",
          "id": "56be85543aeaaa14008c9065",
          "answers": [
            {
              "text": "singing and dancing",
              "answer_start": 207
            }
          ],
          "is_impossible": false
        },
        {
          "question": "When did Beyonce leave Destiny's Child and become a solo singer?",
          "id": "56be85543aeaaa14008c9066",
          "answers": [
            {
              "text": "2003",
              "answer_start": 526
            }
          ],
   

In [8]:
# Overall, this function takes raw data with articles, questions, and answers,
# and transforms it into a structured dictionary separating titles, contexts,
# questions, answer texts, and answer starting positions.

# Function to transform the data into the required format
def transform_data(data):
    transformed_data = {
        'id': [],
        'title': [],
        'context': [],
        'question': [],
        'answers': []
    }
    for article in data['data']:
        title = article['title']
        for paragraph in article['paragraphs']:
            context = paragraph['context']
            for qa in paragraph['qas']:
                transformed_data['id'].append(qa['id'])
                transformed_data['title'].append(title)
                transformed_data['context'].append(context)
                transformed_data['question'].append(qa['question'])
                transformed_data['answers'].append({
                    'text': [answer['text'] for answer in qa['answers']],
                    'answer_start': [answer['answer_start'] for answer in qa['answers']]
                })
    return transformed_data


In [9]:
# Transform the data
train_transformed = transform_data(train_data)
dev_transformed = transform_data(dev_data)

# Create Dataset objects
train_dataset = Dataset.from_dict(train_transformed)
dev_dataset = Dataset.from_dict(dev_transformed)

# Create a DatasetDict with the Dataset objects to prepare it processing
raw_datasets = DatasetDict({
    'train': train_dataset,
    'validation': dev_dataset
})

print(raw_datasets)

def remove_empty_answers(dataset):
    def is_valid_example(example):
        return len(example['answers']['text']) > 0

    return dataset.filter(is_valid_example)


DatasetDict({
    train: Dataset({
        features: ['id', 'title', 'context', 'question', 'answers'],
        num_rows: 130319
    })
    validation: Dataset({
        features: ['id', 'title', 'context', 'question', 'answers'],
        num_rows: 11873
    })
})


In [10]:
# filtering out examples (question-answer pairs) where there are no answers.
# This ensures your training data focuses on questions with actual answers.

train_dataset_cleaned = remove_empty_answers(raw_datasets["train"])

# bring the train and validation sets together in a single dictionary

raw_datasets_processed = DatasetDict({
    'train': train_dataset_cleaned,
    'validation': raw_datasets["validation"]
})

raw_datasets["validation"]


Filter:   0%|          | 0/130319 [00:00<?, ? examples/s]

Dataset({
    features: ['id', 'title', 'context', 'question', 'answers'],
    num_rows: 11873
})

In [11]:
from huggingface_hub import notebook_login
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [12]:
# Overall, this code is initializing a tokenizer specifically designed
# for the "bert-base-cased" pre-trained model. This tokenizer will be i am seeing some empty stuff that is not
# used to process text data in a way that the model can understand.
# model_checkpoint = "ozgurkk/bert-base-uncased-finetuned-squad" # if you want to take on a fine-tuned model
# tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
#model_checkpoint = "bert-base-uncased"
model_checkpoint="albert-base-v2"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

tokenizer.is_fast

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]



config.json:   0%|          | 0.00/684 [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/760k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.31M [00:00<?, ?B/s]

True

In [13]:
# Takes questions and contexts as input and uses a pre-trained tokenizer to convert them into
# sequences of tokens that the model can understand.
# Using a sliding window to handle long texts, the code creates multiple training examples from a single long context,
# increasing the training data size and allowing the model to learn from different parts of the context.
# Based on the answer's starting position in the original context, it tries to predict the corresponding
# starting and ending positions of the answer within the tokenized sequence (chunks).
# This prediction is a starting point for the model to learn during training, helping it identify answer locations within text.

def preprocess_training_examples(examples, max_length=384, stride=128):
    questions = examples["question"]
    contexts = examples["context"]
    answers = examples["answers"]

    tokenized_examples = tokenizer(
        questions,
        contexts,
        max_length=max_length,
        truncation="only_second",
        stride=stride,
        return_overflowing_tokens=True,
        return_offsets_mapping=True,
        padding="max_length"
    )

    sample_mapping = tokenized_examples.pop("overflow_to_sample_mapping")
    offset_mapping = tokenized_examples.pop("offset_mapping")

    start_positions = []
    end_positions = []

    for i, offsets in enumerate(offset_mapping):
        input_ids = tokenized_examples["input_ids"][i]
        cls_index = input_ids.index(tokenizer.cls_token_id)

        sequence_ids = tokenized_examples.sequence_ids(i)
        sample_index = sample_mapping[i]
        answer = answers[sample_index]
        start_char = answer["answer_start"][0]
        end_char = start_char + len(answer["text"][0])
        token_start_index = 0
        while sequence_ids[token_start_index] != 1:
            token_start_index += 1

        token_end_index = len(input_ids) - 1
        while sequence_ids[token_end_index] != 1:
            token_end_index -= 1

        if not (offsets[token_start_index][0] <= start_char and offsets[token_end_index][1] >= end_char):
            start_positions.append(cls_index)
            end_positions.append(cls_index)
        else:
            while token_start_index < len(offsets) and offsets[token_start_index][0] <= start_char:
                token_start_index += 1
            start_positions.append(token_start_index - 1)
            while offsets[token_end_index][1] >= end_char:
                token_end_index -= 1
            end_positions.append(token_end_index + 1)

    tokenized_examples["start_positions"] = start_positions
    tokenized_examples["end_positions"] = end_positions
    tokenized_examples["offset_mapping"] = offset_mapping  # offset mapping
    tokenized_examples["overflow_to_sample_mapping"] = sample_mapping  # overflow to sample mapping

    return tokenized_examples

In [14]:
# Apply the preprocessing function to the training dataset
train_dataset_processed = raw_datasets_processed["train"].map(
    preprocess_training_examples,
    batched=True,
    remove_columns=raw_datasets_processed["train"].column_names
)


Map:   0%|          | 0/86821 [00:00<?, ? examples/s]

In [15]:
# preprocesses validation data for question answering in a similar way to the training function.
# It performs tokenization, handles potential splitting due to the sliding window,
# and prepares the data for evaluation by the model.

def preprocess_validation_examples(examples):
    questions = [q.strip() for q in examples["question"]]
    inputs = tokenizer(
        questions,
        examples["context"],
        max_length=max_length,
        truncation="only_second",
        stride=stride,
        return_overflowing_tokens=True,
        return_offsets_mapping=True,
        padding="max_length",
    )

    sample_map = inputs.pop("overflow_to_sample_mapping")
    example_ids = []
    contexts = []

    for i in range(len(inputs["input_ids"])):
        sample_idx = sample_map[i]
        example_ids.append(examples["id"][sample_idx])
        contexts.append(examples["context"][sample_idx])

        sequence_ids = inputs.sequence_ids(i)
        offset = inputs["offset_mapping"][i]
        inputs["offset_mapping"][i] = [
            o if sequence_ids[k] == 1 else None for k, o in enumerate(offset)
        ]

    inputs["example_id"] = example_ids
    inputs["context"] = contexts
    return inputs

In [16]:
max_length = 384
stride = 128
validation_dataset = raw_datasets_processed["validation"].map(
    preprocess_validation_examples,
    batched=True,
    remove_columns=raw_datasets["validation"].column_names,
)


Map:   0%|          | 0/11873 [00:00<?, ? examples/s]

In [17]:
# Load the pretrained model for question answering.

model = AutoModelForQuestionAnswering.from_pretrained(model_checkpoint)

args = TrainingArguments(
    "albert-base-v2-finetuned-squad",
    evaluation_strategy="steps",
    eval_steps=1000,           # Evaluate every 1000 steps
    save_steps=1000,           # Save checkpoint every 1000 steps
    learning_rate=2e-5,
    num_train_epochs=3,
    weight_decay=0.01,
    fp16=True,
    push_to_hub=True,
    logging_steps=100,         # Log every 100 steps
)

model.safetensors:   0%|          | 0.00/47.4M [00:00<?, ?B/s]

Some weights of AlbertForQuestionAnswering were not initialized from the model checkpoint at albert-base-v2 and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [18]:
# make sure GPU is ready
import tensorflow as tf
tf.test.gpu_device_name()

tf.config.list_physical_devices('GPU')

trainer = Trainer(
    model=model,
    args=args,
    train_dataset=train_dataset_processed,
    eval_dataset=validation_dataset,
    tokenizer=tokenizer,
)

trainer.train()


Step,Training Loss,Validation Loss
1000,1.1301,No log
2000,1.0475,No log
3000,1.0573,No log
4000,0.9843,No log
5000,1.0293,No log
6000,0.9761,No log
7000,0.9846,No log
8000,0.9391,No log
9000,0.9018,No log
10000,0.8811,No log


TrainOutput(global_step=32946, training_loss=0.7301377189519055, metrics={'train_runtime': 13329.3495, 'train_samples_per_second': 19.773, 'train_steps_per_second': 2.472, 'total_flos': 4365288803040768.0, 'train_loss': 0.7301377189519055, 'epoch': 3.0})

In [19]:
# Define a function that generates predictions from the model.

def make_predictions(model, dataset, batch_size=16, no_answer_threshold=0.5):
    model.eval()
    model.to("cuda")  # Move the model to GPU
    predictions = []

    with torch.no_grad():
        for i in tqdm(range(0, len(dataset), batch_size)):
            batch = dataset[i:i + batch_size]

            # Prepare inputs and move them to GPU
            inputs = {
                key: torch.tensor(val).to("cuda")
                for key, val in batch.items()
                if key in ["input_ids", "attention_mask"]
            }
            outputs = model(**inputs)

            start_logits = outputs.start_logits.cpu().numpy()
            end_logits = outputs.end_logits.cpu().numpy()

            # Calculate no-answer probability
            no_answer_prob = F.softmax(torch.tensor((start_logits[:, 0] + end_logits[:, 0]) / 2), dim=-1).cpu().numpy()

            for j, example_id in enumerate(batch["example_id"]):
                start_idx = np.argmax(start_logits[j])
                end_idx = np.argmax(end_logits[j])
                no_ans_prob = no_answer_prob[j]

                # Check if the no-answer probability exceeds the threshold
                if no_ans_prob > no_answer_threshold:
                    prediction = (example_id, None, None, no_ans_prob)
                else:
                    prediction = (example_id, start_idx, end_idx, no_ans_prob)
                predictions.append(prediction)

    return predictions

def postprocess_predictions(dataset, predictions):
    # Aggregate predictions by example_id
    example_predictions = collections.defaultdict(list)
    for example, (example_id, start_idx, end_idx, no_answer_prob) in zip(dataset, predictions):
        offsets = example["offset_mapping"]
        context = example["context"]

        if start_idx is None or end_idx is None or start_idx >= len(offsets) or end_idx >= len(offsets):
            text = ""
        else:
            while start_idx < len(offsets) and offsets[start_idx] is None:
                start_idx += 1
            while end_idx >= 0 and offsets[end_idx] is None:
                end_idx -= 1

            if start_idx >= len(offsets) or end_idx < 0 or offsets[start_idx] is None or offsets[end_idx] is None:
                text = ""
            else:
                start_char = offsets[start_idx][0]
                end_char = offsets[end_idx][1]
                text = context[start_char:end_char]

        example_predictions[example_id].append({
            "text": text,
            "no_answer_prob": no_answer_prob
        })

    # Choose the best prediction per example_id (e.g., the one with the lowest no_answer_prob)
    formatted_predictions = [
        {
            "id": example_id,
            "answer": min(preds, key=lambda x: x["no_answer_prob"])["text"],
            "no_answer_probability": min(preds, key=lambda x: x["no_answer_prob"])["no_answer_prob"]
        }
        for example_id, preds in example_predictions.items()
    ]

    return formatted_predictions


In [20]:
import torch.nn.functional as F

predictions = make_predictions(model, validation_dataset)

# predictions

final_predictions = postprocess_predictions(validation_dataset, predictions)

# final_predictions


  0%|          | 0/761 [00:00<?, ?it/s]

In [25]:
# Create the no-answer probabilities dictionary
na_probs_dict = {pred['id']: pred['no_answer_probability'] for pred in final_predictions}

# Convert to dictionary format
pred_dict = {pred['id']: pred['answer'] for pred in final_predictions}

def save_predictions(predictions, filename):
    with open(filename, 'w') as f:
        json.dump(predictions, f)

def save_na_probs(na_probs_dict, filename):
    # Convert numpy float32 to regular Python floats for JSON serialization
    modified_dict = {k: float(v) for k, v in na_probs_dict.items()}
    with open(filename, 'w') as f:
        json.dump(modified_dict, f, indent=2)

In [27]:
# Save your predictions to a file in your mounted drive
save_predictions(pred_dict, '/content/drive/MyDrive/Colab Notebooks/E.ON_Data_Challenge/pred_albert_fine_tuned_squad.json')

# Save your predictions to a file in your mounted drive
save_na_probs(na_probs_dict, '/content/drive/MyDrive/Colab Notebooks/E.ON_Data_Challenge/na_probs_albert_fine_tuned_squad.json')

# Convert to dictionary format
pred_dict = {pred['id']: pred['answer'] for pred in data}

NameError: name 'data' is not defined

In [28]:
# Paths to the evaluation script and data files
eval_script_path = '/content/drive/MyDrive/Colab Notebooks/E.ON_Data_Challenge/SQuAD/evaluate-v2.0.py'
data_file_path = '/content/drive/MyDrive/Colab Notebooks/E.ON_Data_Challenge/SQuAD/dev-v2.0.json'
pred_file_path = '/content/drive/MyDrive/Colab Notebooks/E.ON_Data_Challenge/model_outputs/pred_albert_fine_tuned_squad.json'
na_prob_file_path = '/content/drive/MyDrive/Colab Notebooks/E.ON_Data_Challenge/model_outputs/na_probs_albert_fine_tuned_squad.json'
out_file_path = '/content/drive/MyDrive/Colab Notebooks/E.ON_Data_Challenge/model_outputs/eval_results_albert_fine_tuned_squad.json'
out_image_dir = '/content/drive/MyDrive/Colab Notebooks/E.ON_Data_Challenge/model_outputs/images_albert_fine_tuned_squad'


In [29]:
# Command to run the evaluation script
command = [
    'python', eval_script_path,
    data_file_path,
    pred_file_path,
    '-n', na_prob_file_path,
    '-o', out_file_path,
    '-p', out_image_dir
]

trainer.push_to_hub(commit_message="Training complete")

import subprocess
# Use the evaluation script
subprocess.run(command, check=True)

Upload 2 LFS files:   0%|          | 0/2 [00:00<?, ?it/s]

model.safetensors:   0%|          | 0.00/44.4M [00:00<?, ?B/s]

events.out.tfevents.1720670604.bd990d8cf098.612.0:   0%|          | 0.00/82.6k [00:00<?, ?B/s]

CompletedProcess(args=['python', '/content/drive/MyDrive/Colab Notebooks/E.ON_Data_Challenge/SQuAD/evaluate-v2.0.py', '/content/drive/MyDrive/Colab Notebooks/E.ON_Data_Challenge/SQuAD/dev-v2.0.json', '/content/drive/MyDrive/Colab Notebooks/E.ON_Data_Challenge/model_outputs/pred_albert_fine_tuned_squad.json', '-n', '/content/drive/MyDrive/Colab Notebooks/E.ON_Data_Challenge/model_outputs/na_probs_albert_fine_tuned_squad.json', '-o', '/content/drive/MyDrive/Colab Notebooks/E.ON_Data_Challenge/model_outputs/eval_results_albert_fine_tuned_squad.json', '-p', '/content/drive/MyDrive/Colab Notebooks/E.ON_Data_Challenge/model_outputs/images_albert_fine_tuned_squad'], returncode=0)