In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [1]:
import json
from transformers import DistilBertTokenizerFast, DistilBertForQuestionAnswering, Trainer, TrainingArguments, pipeline
from datasets import Dataset
import numpy as np
with open('/kaggle/input/stanford-question-answering-dataset/train-v1.1.json') as train_file:
    train = json.load(train_file)

with open('/kaggle/input/stanford-question-answering-dataset/dev-v1.1.json') as dev_file:
    dev = json.load(dev_file)

In [2]:
def prepare_dataset(data):
    contexts = []
    questions = []
    answers = []

    for article in data['data']:
        for paragraph in article['paragraphs']:
            context = paragraph['context']
            for qa in paragraph['qas']:
                question = qa['question']
                answer = qa['answers'][0]  
                answer['text'] = answer['text']
                answer['answer_start'] = answer['answer_start']

                contexts.append(context)
                questions.append(question)
                answers.append(answer)
    
    return Dataset.from_dict({'context': contexts, 'question': questions, 'answers': answers})

train = prepare_dataset(train)
dev = prepare_dataset(dev)
train[0]

{'context': 'Architecturally, the school has a Catholic character. Atop the Main Building\'s gold dome is a golden statue of the Virgin Mary. Immediately in front of the Main Building and facing it, is a copper statue of Christ with arms upraised with the legend "Venite Ad Me Omnes". Next to the Main Building is the Basilica of the Sacred Heart. Immediately behind the basilica is the Grotto, a Marian place of prayer and reflection. It is a replica of the grotto at Lourdes, France where the Virgin Mary reputedly appeared to Saint Bernadette Soubirous in 1858. At the end of the main drive (and in a direct line that connects through 3 statues and the Gold Dome), is a simple, modern stone statue of Mary.',
 'question': 'To whom did the Virgin Mary allegedly appear in 1858 in Lourdes France?',
 'answers': {'answer_start': 515, 'text': 'Saint Bernadette Soubirous'}}

In [3]:
tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-uncased')

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]



In [4]:
def preprocess_function(examples):
    questions = [q.strip() for q in examples['question']]
    inputs = tokenizer(
        questions,
        examples['context'],
        max_length=384,
        truncation=True,
        padding="max_length",
        return_offsets_mapping=True,  
        return_tensors="pt"
    )
    
    start_positions = []
    end_positions = []
    
    for i, answer in enumerate(examples['answers']):
        start_positions.append(answer['answer_start'])
        end_positions.append(answer['answer_start'] + len(answer['text']))
    
    inputs.update({
        "start_positions": start_positions,
        "end_positions": end_positions,
    })
    
    return inputs
tokenized_train_dataset = train.map(preprocess_function, batched=True)
tokenized_dev_dataset = dev.map(preprocess_function, batched=True)

Map:   0%|          | 0/87599 [00:00<?, ? examples/s]

Map:   0%|          | 0/10570 [00:00<?, ? examples/s]

In [5]:
model = DistilBertForQuestionAnswering.from_pretrained('distilbert-base-uncased')

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of DistilBertForQuestionAnswering were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [6]:
training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="steps",
    eval_steps=500,
    learning_rate=5e-5,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=64,
    num_train_epochs=2,
    weight_decay=0.01,
    fp16=True,  
    gradient_accumulation_steps=2,
    dataloader_num_workers=2,  
    dataloader_pin_memory=True,
)



In [7]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train_dataset,
    eval_dataset=tokenized_dev_dataset)

In [8]:
def compute_iou(predictions, references):
    ious = []
    for pred, ref in zip(predictions, references):
        pred_tokens = set(range(pred['start_positions'], pred['end_positions']))
        ref_tokens = set(range(ref['start_positions'], ref['end_positions']))
        intersection = len(pred_tokens & ref_tokens)
        union = len(pred_tokens | ref_tokens)
        iou = intersection / union if union != 0 else 0
        ious.append(iou)
    return {"token_level_iou": np.mean(ious)}

In [9]:
def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    pred_starts = predictions[0].argmax(-1)
    pred_ends = predictions[1].argmax(-1)
    label_starts = labels['start_positions']
    label_ends = labels['end_positions']

    pred = [{'start_positions': start, 'end_positions': end} for start, end in zip(pred_starts, pred_ends)]
    ref = [{'start_positions': start, 'end_positions': end} for start, end in zip(label_starts, label_ends)]

    return compute_iou(pred, ref)

In [None]:
trainer.train()

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
[34m[1mwandb[0m: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

  ········································


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Step,Training Loss,Validation Loss


In [None]:
trainer.save_model("/kaggle/working/")
from transformers import pipeline
qa = pipeline("question-answering", model=model, tokenizer=tokenizer)

In [None]:
context = "Hugging Face Inc. is a company based in New York City. Its headquarters are in DUMBO, therefore very close to the Manhattan Bridge."
question = "Where is Hugging Face based?"

result = qa({
    'context': context,
    'question': question
})

print("Prediction:", result)

In [None]:
context = "My name is Riya Dedhia. I study in NMIMS."
question = "Where does Riya study?"

result = qa({
    'context': context,
    'question': question
})

print("Prediction:", result)