In [1]:
! pip install datasets
! pip install transformers
! pip install evaluate

Collecting datasets
  Downloading datasets-2.14.5-py3-none-any.whl (519 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m519.6/519.6 kB[0m [31m8.1 MB/s[0m eta [36m0:00:00[0m
Collecting dill<0.3.8,>=0.3.0 (from datasets)
  Downloading dill-0.3.7-py3-none-any.whl (115 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m115.3/115.3 kB[0m [31m13.7 MB/s[0m eta [36m0:00:00[0m
Collecting multiprocess (from datasets)
  Downloading multiprocess-0.70.15-py310-none-any.whl (134 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m13.8 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0.0,>=0.14.0 (from datasets)
  Downloading huggingface_hub-0.18.0-py3-none-any.whl (301 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m302.0/302.0 kB[0m [31m27.6 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: dill, multiprocess, huggingface-hub, datasets
Successfully installed datasets-2.1

In [2]:
import datasets
from transformers import AutoTokenizer, DefaultDataCollator, create_optimizer, TFAutoModelForQuestionAnswering
import tensorflow as tf
from transformers.keras_callbacks import PushToHubCallback
from transformers import pipeline
from huggingface_hub import notebook_login

In [3]:
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [4]:
squad_dataset = datasets.load_dataset("squad", split="train[:3600]")

Downloading builder script:   0%|          | 0.00/5.27k [00:00<?, ?B/s]

Downloading metadata:   0%|          | 0.00/2.36k [00:00<?, ?B/s]

Downloading readme:   0%|          | 0.00/7.67k [00:00<?, ?B/s]

Downloading data files:   0%|          | 0/2 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/8.12M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/1.05M [00:00<?, ?B/s]

Extracting data files:   0%|          | 0/2 [00:00<?, ?it/s]

Generating train split:   0%|          | 0/87599 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/10570 [00:00<?, ? examples/s]

In [5]:
# newsqa_dataset =  datasets.load_dataset("lucadiliello/newsqa")

In [6]:
squad_dataset

Dataset({
    features: ['id', 'title', 'context', 'question', 'answers'],
    num_rows: 3600
})

In [7]:
squad_dataset = squad_dataset.train_test_split(test_size=0.2)

In [8]:
squad_dataset["train"]

Dataset({
    features: ['id', 'title', 'context', 'question', 'answers'],
    num_rows: 2880
})

In [9]:
squad_dataset["train"]["question"]

['What was the average vertical slip in the Beichaun Fault?',
 'In what year did the opening of a theology library at Notre Dame occur?',
 'In the 1990s, how many people were indicted for war crimes that were officially defined as genocide?',
 "Who's death caused this protest?",
 'In what era was Frédéric active in?',
 'Gurian created what in 1939 at Notre Dame?',
 'In what year did Nintendo reveal the start of development on what would become Twilight Princess?',
 'Who ended up with the 200 letters from Sand to Chopin?',
 'Who did Frédéric visit Berlin with in September 1828?',
 'Who wrote the book Khubilai Khan?',
 'iPods released before what year had issues with distorted bass?',
 'What soft drink company has Beyoncé worked with since 2002?',
 'What album caused a lawsuit to be filed in 2001?',
 'What is its rank in popularion?',
 "Where did Twilight Princess rank on Nintendo Power's list of Nintendo games in the 2000s?",
 'For what movie did Beyonce receive  her first Golden Globe 

In [10]:
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")

Downloading (…)okenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

In [11]:
def preproc(example_set):
  qs = [q.strip() for q in example_set["question"]]
  inputs = tokenizer(
      qs,
      example_set["context"],
      max_length = 300,
      truncation="only_second",
      return_offsets_mapping=True,
      padding="max_length",
  )


  offset_map = inputs.pop("offset_mapping")
  ans = example_set["answers"]
  start_posns = []
  end_posns = []

  for i,offset in enumerate(offset_map):
    an = ans[i]
    start_char = an["answer_start"][0]
    end_char = an["answer_start"][0] + len(an["text"][0])
    sequence_ids = inputs.sequence_ids(i)

    # finding the start and end of the context
    idx = 0
    while sequence_ids[idx] != 1:
        idx += 1
    context_start = idx
    while sequence_ids[idx] == 1:
        idx += 1
    context_end = idx - 1

    # If the answer is not fully inside the context, label it (0, 0)
    if offset[context_start][0] > end_char or offset[context_end][1] < start_char:
      start_posns.append(0)
      end_posns.append(0)
    else:
      # Otherwise it's the start and end token positions
      idx = context_start
      while idx <= context_end and offset[idx][0] <= start_char:
        idx += 1
      start_posns.append(idx - 1)

      idx = context_end
      while idx >= context_start and offset[idx][1] >= end_char:
        idx -= 1
      end_posns.append(idx + 1)

  inputs["start_positions"] = start_posns
  inputs["end_positions"] = end_posns
  return inputs

In [12]:
tokenized_squad = squad_dataset.map(preproc, batched=True, remove_columns=squad_dataset["train"].column_names)

Map:   0%|          | 0/2880 [00:00<?, ? examples/s]

Map:   0%|          | 0/720 [00:00<?, ? examples/s]

In [13]:
data_collator = DefaultDataCollator(return_tensors="tf")

In [14]:
batch_size = 16
num_epochs = 2
total_train_steps = (len(tokenized_squad["train"]) // batch_size) * num_epochs

optimizer, schedule = create_optimizer(
    init_lr=2e-5,
    num_warmup_steps=0,
    num_train_steps=total_train_steps,
)

In [15]:
model = TFAutoModelForQuestionAnswering.from_pretrained("distilbert-base-uncased")

Downloading model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFDistilBertForQuestionAnswering: ['vocab_layer_norm.bias', 'vocab_projector.bias', 'vocab_transform.bias', 'vocab_layer_norm.weight', 'vocab_transform.weight']
- This IS expected if you are initializing TFDistilBertForQuestionAnswering from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFDistilBertForQuestionAnswering from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
Some weights or buffers of the TF 2.0 model TFDistilBertForQuestionAnswering were not initialized from the PyTorch model and are newly initialized: ['qa_outputs.weight', 'qa_outputs.bias']
You should probably TRAIN this model on a down-stream task to be able to use it

In [16]:
tf_train_set = model.prepare_tf_dataset(
    tokenized_squad["train"],
    shuffle=True,
    batch_size=16,
    collate_fn=data_collator,
)

In [17]:
tokenized_squad

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'start_positions', 'end_positions'],
        num_rows: 2880
    })
    test: Dataset({
        features: ['input_ids', 'attention_mask', 'start_positions', 'end_positions'],
        num_rows: 720
    })
})

In [18]:
tf_validation_set = model.prepare_tf_dataset(
    tokenized_squad["test"],
    shuffle=False,
    batch_size=16,
    collate_fn=data_collator,
)

In [19]:
model.compile(optimizer=optimizer)

In [20]:
callback = PushToHubCallback(
    output_dir="raajan_qa",
    tokenizer=tokenizer,
)

Cloning https://huggingface.co/raajanwankhade/raajan_qa into local empty directory.


Download file tf_model.h5:   0%|          | 1.45k/253M [00:00<?, ?B/s]

Clean file tf_model.h5:   0%|          | 1.00k/253M [00:00<?, ?B/s]

In [21]:
model.fit(x=tf_train_set, validation_data=tf_validation_set, epochs=2, callbacks=[callback])

Epoch 1/2
Epoch 2/2


<keras.callbacks.History at 0x7dc9b2701390>

In [34]:
question = "Who was the first one to walk out?"
context = "Virat Kohli was the second one to walk out after Rohit Sharma."

In [35]:
answerer = pipeline("question-answering", model="raajan_qa")
answerer(question=question, context=context)

Some layers from the model checkpoint at raajan_qa were not used when initializing TFDistilBertForQuestionAnswering: ['dropout_19']
- This IS expected if you are initializing TFDistilBertForQuestionAnswering from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFDistilBertForQuestionAnswering from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some layers of TFDistilBertForQuestionAnswering were not initialized from the model checkpoint at raajan_qa and are newly initialized: ['dropout_159']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


{'score': 0.3405703008174896, 'start': 49, 'end': 61, 'answer': 'Rohit Sharma'}