In [1]:

!pip install simpletransformers

Collecting simpletransformers
  Downloading simpletransformers-0.70.1-py3-none-any.whl.metadata (42 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/42.4 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m[90m━━━━━━━━━━━[0m [32m30.7/42.4 kB[0m [31m645.2 kB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m42.4/42.4 kB[0m [31m578.1 kB/s[0m eta [36m0:00:00[0m
Collecting datasets (from simpletransformers)
  Downloading datasets-3.0.0-py3-none-any.whl.metadata (19 kB)
Collecting seqeval (from simpletransformers)
  Downloading seqeval-1.2.2.tar.gz (43 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.6/43.6 kB[0m [31m1.3 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting tensorboardx (from simpletransformers)
  Downloading tensorboardX-2.6.2.2-py2.py3-none-any.whl.metadata (5.8 kB)
Collecting wandb>=0.

In [4]:
# Install Kaggle
!pip install kaggle

# Make a directory for Kaggle and move the kaggle.json file
!mkdir -p ~/.kaggle
!cp kaggle.json ~/.kaggle/

# Set permissions for the Kaggle API token
!chmod 600 ~/.kaggle/kaggle.json


cp: cannot stat 'kaggle.json': No such file or directory
chmod: cannot access '/root/.kaggle/kaggle.json': No such file or directory


In [5]:
# Download the Stanford Question Answering Dataset from Kaggle
!kaggle datasets download -d stanfordu/stanford-question-answering-dataset

# Unzip the dataset
!unzip stanford-question-answering-dataset.zip


Dataset URL: https://www.kaggle.com/datasets/stanfordu/stanford-question-answering-dataset
License(s): CC-BY-SA-4.0
Downloading stanford-question-answering-dataset.zip to /content
 80% 7.00M/8.73M [00:00<00:00, 73.3MB/s]
100% 8.73M/8.73M [00:00<00:00, 86.7MB/s]
Archive:  stanford-question-answering-dataset.zip
  inflating: dev-v1.1.json           
  inflating: train-v1.1.json         


In [2]:
import json
import logging
from simpletransformers.question_answering import QuestionAnsweringModel, QuestionAnsweringArgs
import torch
from transformers import BertTokenizerFast, BertForQuestionAnswering, Trainer, TrainingArguments, BertTokenizer
from datasets import Dataset

In [6]:
import json

# Load the training dataset
with open("train-v1.1.json", "r") as read_file:
    train = json.load(read_file)

# Check the structure of the dataset
print(train.keys())


dict_keys(['data', 'version'])


In [7]:
with open("dev-v1.1.json", "r") as read_file:
    test = json.load(read_file)

# Check the structure of the test dataset
print(test.keys())

dict_keys(['data', 'version'])


In [8]:
tokenizer = BertTokenizerFast.from_pretrained("bert-base-uncased")
config = {"max_length": 384}

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]



In [9]:
# Preprocess function
def preprocess_function(question, context, answer_start_char, answer_end_char):
    inputs = tokenizer(
        question,
        context,
        max_length=config["max_length"],
        truncation="only_second",
        return_offsets_mapping=True,
        padding="max_length",
    )

    offset = inputs.pop("offset_mapping")
    sequence_ids = inputs.sequence_ids()

    # Finding the start and end positions of the context in the tokenized input
    context_start = sequence_ids.index(1)
    context_end = len(sequence_ids) - sequence_ids[::-1].index(1)

    context_offsets = offset[context_start:context_end]

    # Create a mapping of character index to token index
    charcter_pos_to_token_pos = {}
    for token_pos, (char_start, char_end) in enumerate(context_offsets):
        for i in range(char_start, char_end):
            charcter_pos_to_token_pos[i] = token_pos + context_start

    start_pos = charcter_pos_to_token_pos.get(answer_start_char, 0)
    end_pos = charcter_pos_to_token_pos.get(
        answer_end_char - 1,
        0 if start_pos == 0 else context_end - 1
    )

    inputs["start_positions"] = start_pos
    inputs["end_positions"] = end_pos

    return inputs

In [10]:
# Create dataset from JSON data
def create_dataset(data):
    input_ids_list = []
    attention_mask_list = []
    start_positions_list = []
    end_positions_list = []

    for article in data['data']:
        for paragraph in article['paragraphs']:
            context = paragraph['context']
            for qa in paragraph['qas']:
                question = qa['question']
                answer_start_char = qa['answers'][0]['answer_start']
                answer_text = qa['answers'][0]['text']
                answer_end_char = answer_start_char + len(answer_text)

                inputs = preprocess_function(question, context, answer_start_char, answer_end_char)

                input_ids_list.append(inputs["input_ids"])
                attention_mask_list.append(inputs["attention_mask"])
                start_positions_list.append(inputs["start_positions"])
                end_positions_list.append(inputs["end_positions"])

    return Dataset.from_dict({
        "input_ids": input_ids_list,
        "attention_mask": attention_mask_list,
        "start_positions": start_positions_list,
        "end_positions": end_positions_list,
    })

In [11]:
train_dataset = create_dataset(train)
eval_dataset = create_dataset(test)

In [12]:
# Define token-level IoU custom metric
def token_level_iou(pred_start, pred_end, true_start, true_end):
    pred_range = set(range(pred_start, pred_end + 1))
    true_range = set(range(true_start, true_end + 1))

    intersection = len(pred_range & true_range)
    union = len(pred_range | true_range)

    return intersection / union if union != 0 else 0

In [13]:
train_dataset = train_dataset.select(range(1000))  # Train on a subset of 1000 examples
eval_dataset = eval_dataset.select(range(200))

In [14]:
# Model initialization
model = BertForQuestionAnswering.from_pretrained("distilbert-base-uncased")


# Training arguments
training_args = TrainingArguments(
    output_dir='./results',
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=3,
    weight_decay=0.01,
    logging_dir='./logs',
)

# Trainer initialization
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    tokenizer=tokenizer,
)

# Train the model
trainer.train()

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

You are using a model of type distilbert to instantiate a model of type bert. This is not supported for all configurations of models and can yield errors.


model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of BertForQuestionAnswering were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['embeddings.LayerNorm.bias', 'embeddings.LayerNorm.weight', 'embeddings.position_embeddings.weight', 'embeddings.token_type_embeddings.weight', 'embeddings.word_embeddings.weight', 'encoder.layer.0.attention.output.LayerNorm.bias', 'encoder.layer.0.attention.output.LayerNorm.weight', 'encoder.layer.0.attention.output.dense.bias', 'encoder.layer.0.attention.output.dense.weight', 'encoder.layer.0.attention.self.key.bias', 'encoder.layer.0.attention.self.key.weight', 'encoder.layer.0.attention.self.query.bias', 'encoder.layer.0.attention.self.query.weight', 'encoder.layer.0.attention.self.value.bias', 'encoder.layer.0.attention.self.value.weight', 'encoder.layer.0.intermediate.dense.bias', 'encoder.layer.0.intermediate.dense.weight', 'encoder.layer.0.output.LayerNorm.bias', 'encoder.layer.0.output.LayerNorm.weight', 'encoder.layer.0.output.dense.bi

<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter, or press ctrl+c to quit:[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


Epoch,Training Loss,Validation Loss


Epoch,Training Loss,Validation Loss
1,No log,4.371204
2,No log,4.205629
3,No log,4.313974


TrainOutput(global_step=375, training_loss=4.5620712890625, metrics={'train_runtime': 16507.6939, 'train_samples_per_second': 0.182, 'train_steps_per_second': 0.023, 'total_flos': 587917702656000.0, 'train_loss': 4.5620712890625, 'epoch': 3.0})

In [15]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertForQuestionAnswering.from_pretrained('bert-base-uncased')



model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForQuestionAnswering were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [16]:
# Inference pipeline function
def inference_pipeline(question, context):
    # Encode the inputs
    inputs = tokenizer.encode_plus(question, context, return_tensors='pt')
    input_ids = inputs['input_ids']
    token_type_ids = inputs['token_type_ids']

    # Get the model's predictions
    with torch.no_grad():
        outputs = model(input_ids, token_type_ids=token_type_ids)
        start_scores = outputs.start_logits
        end_scores = outputs.end_logits

    # Debugging: Print the scores and input IDs
    print("Input IDs:", input_ids)
    print("Start Scores:", start_scores)
    print("End Scores:", end_scores)

    # Find the tokens with the highest start and end scores
    answer_start = torch.argmax(start_scores)
    answer_end = torch.argmax(end_scores) + 1

    # Debugging: Print the start and end positions
    print("Answer Start:", answer_start.item())
    print("Answer End:", answer_end.item())

    # Decode the tokens back to the answer text
    answer = tokenizer.convert_tokens_to_string(
        tokenizer.convert_ids_to_tokens(input_ids[0][answer_start:answer_end])
    )

    return answer

In [19]:
# Example usage
question = "is pineapple on pizza valid?"
context = "pineapple on pizza is not valid"
predicted_answer = inference_pipeline(question, context)
print(f"Predicted Answer: {predicted_answer}")

Input IDs: tensor([[  101,  2003,  7222, 23804,  2006, 10733,  9398,  1029,   102,  7222,
         23804,  2006, 10733,  2003,  2025,  9398,   102]])
Start Scores: tensor([[-0.4286, -0.2445,  0.0743,  0.2734,  0.2520,  0.5112,  0.4023,  0.2635,
          0.1748,  0.3832,  0.3307,  0.3569,  0.3961,  0.5658,  0.3751,  0.2475,
          0.2169]])
End Scores: tensor([[ 0.1831, -0.2110,  0.3067,  0.0387,  0.2324, -0.0229,  0.1233, -0.2044,
         -0.3808,  0.2892,  0.1006,  0.3261, -0.1435, -0.0938,  0.1049,  0.3666,
         -0.4063]])
Answer Start: 13
Answer End: 16
Predicted Answer: is not valid
