NLP — Building a Question Answering model

In [3]:
# ! pip install transformers
# ! pip install datasets 
# !pip install git+https://github.com/huggingface/transformers.git

In [4]:
import transformers # importing the required libraries
from transformers import AutoTokenizer
import torch
import pandas as pd

The cache for model files in Transformers v4.22.0 has been updated. Migrating your old cache. This is a one-time only operation. You can interrupt this and resume the migration later on by calling `transformers.utils.move_cache()`.


Moving 0 files to the new cache system


0it [00:00, ?it/s]

In [38]:
from datasets import load_dataset # loading the dataset directly from hugging face datasets
train = load_dataset("squad",split = "train[:50%]")
val = load_dataset("squad", split = "validation[:50%]")




In [39]:
train # train data

Dataset({
    features: ['id', 'title', 'context', 'question', 'answers'],
    num_rows: 43800
})

In [40]:
val # test data

Dataset({
    features: ['id', 'title', 'context', 'question', 'answers'],
    num_rows: 5285
})

In [41]:
from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")

Data Preprocessing

In [42]:
def preprocess_function(examples): 
    questions = [q.strip() for q in examples["question"]]
    inputs = tokenizer(
        questions,
        examples["context"],
        max_length=384,
        truncation="only_second",
        return_offsets_mapping=True,
        padding="max_length",
    )

    offset_mapping = inputs.pop("offset_mapping")
    answers = examples["answers"]
    start_positions = []
    end_positions = []

    for i, offset in enumerate(offset_mapping):
        answer = answers[i]
        start_char = answer["answer_start"][0]
        end_char = answer["answer_start"][0] + len(answer["text"][0])
        sequence_ids = inputs.sequence_ids(i)

        # Find the start and end of the context
        idx = 0
        while sequence_ids[idx] != 1:
            idx += 1
        context_start = idx
        while sequence_ids[idx] == 1:
            idx += 1
        context_end = idx - 1

        # If the answer is not fully inside the context, label it (0, 0)
        if offset[context_start][0] > end_char or offset[context_end][1] < start_char:
            start_positions.append(0)
            end_positions.append(0)
        else:
            # Otherwise it's the start and end token positions
            idx = context_start
            while idx <= context_end and offset[idx][0] <= start_char:
                idx += 1
            start_positions.append(idx - 1)

            idx = context_end
            while idx >= context_start and offset[idx][1] >= end_char:
                idx -= 1
            end_positions.append(idx + 1)

    inputs["start_positions"] = start_positions
    inputs["end_positions"] = end_positions
    return inputs

In [43]:
tokenized_squad_train = train.map(preprocess_function, batched=True)
tokenized_squad_val = val.map(preprocess_function, batched=True)

  0%|          | 0/44 [00:00<?, ?ba/s]

  0%|          | 0/6 [00:00<?, ?ba/s]

In [44]:
from transformers import DefaultDataCollator

data_collator = DefaultDataCollator(return_tensors ='tf')

prearing Train and Validation 

In [45]:
tf_train_set = tokenized_squad_train.to_tf_dataset(
    columns=["attention_mask", "input_ids", "start_positions", "end_positions"],
    # dummy_labels=True,
    shuffle=True,
    batch_size=16,
    collate_fn=data_collator,
)

tf_validation_set = tokenized_squad_val.to_tf_dataset(
    columns=["attention_mask", "input_ids", "start_positions", "end_positions"],
    # dummy_labels=True,
    shuffle=False,
    batch_size=16,
    collate_fn=data_collator,
)

In [46]:
from transformers import TFAutoModelForQuestionAnswering 
model_checkpoint = "distilbert-base-uncased"
model = TFAutoModelForQuestionAnswering.from_pretrained(model_checkpoint) # loading the pretrained Model 

Some layers from the model checkpoint at distilbert-base-uncased were not used when initializing TFDistilBertForQuestionAnswering: ['activation_13', 'vocab_transform', 'vocab_layer_norm', 'vocab_projector']
- This IS expected if you are initializing TFDistilBertForQuestionAnswering from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFDistilBertForQuestionAnswering from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some layers of TFDistilBertForQuestionAnswering were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['qa_outputs', 'dropout_59']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [47]:
from tensorflow import keras
optimizer = keras.optimizers.Adam(learning_rate=2e-5)

In [48]:
import tensorflow as tf
keras.mixed_precision.set_global_policy("mixed_float16")

model.compile(optimizer=optimizer, metrics = ["accuracy"])

No loss specified in compile() - the model's internal loss computation will be used as the loss. Don't panic - this is a common way to train TensorFlow models in Transformers! To disable this behaviour please pass a loss argument, or explicitly pass `loss=None` if you do not want your model to compute a loss.


In [49]:
model.fit(x=tf_train_set, validation_data=tf_validation_set, epochs=5) # training the model

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x7f6dc95fce10>

Testing the model

In [82]:
def question_answer_model(text,question): # creating a function that takes text and question and returns answer
    inputs = tokenizer([text], [question], return_tensors="np")
    outputs = model(inputs)
    start_position = tf.argmax(outputs.start_logits, axis=1)
    end_position = tf.argmax(outputs.end_logits, axis=1)
    # print(int(start_position), int(end_position[0]))
    answer = inputs["input_ids"][0, int(start_position) : int(end_position) + 1]
    return tokenizer.decode(answer)



In [99]:
context = """ The option to convert models between FARM and transformers gives freedom to the user and let people easily switch between frameworks.  """
question = "Why is model conversion important?"


In [100]:
question_answer_model(context,question) # sample 1

'let people easily switch between frameworks'

In [87]:
question_answer_model("john lives in hyderabad, he is a good teacher","who is john?")  # sample 2

'teacher'

In [91]:
question_answer_model("My name is Sarah and I live in London","where do i live?")  # sample 3

'london'

In [94]:
question_answer_model("My name is Sarah and I live in London","who am i?")   # sample 4

'sarah'

In [97]:
question_answer_model("RCB is the best cricket team in IPL, this team won the cup in year 2022","in which year RCB won the cup?")  # sample 5

'2022'