# Model 4 - DistilBERT

In [None]:
import numpy as np

import pandas as pd

import tensorflow as tf
import tensorflow_datasets as tfds

from transformers import pipeline

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.27.4-py3-none-any.whl (6.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.8/6.8 MB[0m [31m51.8 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1
  Downloading tokenizers-0.13.3-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m92.3 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.11.0
  Downloading huggingface_hub-0.13.4-py3-none-any.whl (200 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m200.1/200.1 KB[0m [31m17.7 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: tokenizers, huggingface-hub, transformers
Successfully installed huggingface-hub-0.13.4 tokenizers-0.13.3 transformers-4.27.4


In [None]:
# Shell escape: print the version of the `python` executable found on the shell PATH.
!python --version

Python 3.9.16


In [None]:
# Display the version string of the imported TensorFlow package.
tf.__version__

'2.12.0'

# 1. Load data

In [None]:
# Fetch SQuAD v1.1 through TFDS, then pull out its training and validation splits
dataset = tfds.load('squad/v1.1')
train_ds, val_ds = dataset['train'], dataset['validation']

Downloading and preparing dataset Unknown size (download: Unknown size, generated: Unknown size, total: Unknown size) to /root/tensorflow_datasets/squad/v1.1/3.0.0...


Dl Completed...: 0 url [00:00, ? url/s]

Dl Size...: 0 MiB [00:00, ? MiB/s]

Extraction completed...: 0 file [00:00, ? file/s]

Generating splits...:   0%|          | 0/2 [00:00<?, ? splits/s]



Generating train examples...: 0 examples [00:00, ? examples/s]

Shuffling /root/tensorflow_datasets/squad/v1.1/3.0.0.incomplete2VPJSU/squad-train.tfrecord*...:   0%|         …

Generating validation examples...: 0 examples [00:00, ? examples/s]

Shuffling /root/tensorflow_datasets/squad/v1.1/3.0.0.incomplete2VPJSU/squad-validation.tfrecord*...:   0%|    …

Dataset squad downloaded and prepared to /root/tensorflow_datasets/squad/v1.1/3.0.0. Subsequent calls will reuse this data.


In [None]:
# Extract relevant data from training and validation datasets
def extract_data(instance):
    """Reduce one example to the four fields used by the rest of the notebook.

    Only the first entry of ``answers['text']`` and ``answers['answer_start']``
    is kept; any further entries are ignored.

    Returns:
        Tuple ``(context, question, first answer text, first answer start)``.
    """
    return (
        instance['context'],
        instance['question'],
        instance['answers']['text'][0],
        instance['answers']['answer_start'][0],
    )

# Column labels shared by the training and validation frames
headers = ['Paragraph', 'Question', 'Answer', 'Answer Start']

# Materialise each mapped split as a DataFrame (one row per extracted tuple)
train_ds = pd.DataFrame(train_ds.map(extract_data).as_numpy_iterator())
val_ds = pd.DataFrame(val_ds.map(extract_data).as_numpy_iterator())

for frame in (train_ds, val_ds):
    frame.columns = headers

In [None]:
train_ds.head()

Unnamed: 0,Paragraph,Question,Answer,Answer Start
0,b'The difference in the above factors for the ...,b'What is one use that would require an antenn...,b'mobile phones',427
1,"b""The coronation of Charlemagne as emperor on ...",b'About how many counts existed in the Carolin...,b'300',1020
2,b'Plant responses to climate and other environ...,b'How can climate changes be determined from s...,b'fossil pollen deposits in sediments',339
3,b'The Tucson metro area is served by many loca...,"b""What is Tucson's Fox station?""",b'KMSB-TV 11',347
4,"b""Situated on one of the world's largest natur...",b'What is the size of New York City in square ...,b'305',367


# 2. Get answer's end character position

In [None]:
# Get start and end character position of answer in paragraph
def get_answer_char_pos(row):
    """Return the ``[start, end]`` character span of the answer in the paragraph.

    Args:
        row: Mapping (for example a DataFrame row) with the keys
            ``'Paragraph'``, ``'Answer'`` and ``'Answer Start'``. The two text
            fields may be ``str`` or UTF-8 encoded ``bytes``.

    Returns:
        A two-element list ``[start, end]`` with ``end = start + len(answer)``
        (exclusive), measured in characters of the decoded paragraph.
    """
    paragraph, answer = row['Paragraph'], row['Answer']
    answer_start = row['Answer Start']

    # Work in characters, not bytes: the model predictions these labels are
    # compared against are offsets into the decoded text, and len()/slicing on
    # raw UTF-8 bytes drifts as soon as a non-ASCII character appears.
    if isinstance(paragraph, bytes):
        paragraph = paragraph.decode('utf-8')
    if isinstance(answer, bytes):
        answer = answer.decode('utf-8')

    answer_len = len(answer)

    # Trust the annotated offset first; only if it does not line up, check
    # whether the annotation is off by one or two characters. Probing the
    # shifted positions first would wrongly move a correct annotation whenever
    # the answer text also happens to match one character earlier.
    for shift in (0, 1, 2):
        start = answer_start - shift
        # Skip negative starts: a negative slice index would wrap to the end.
        if start >= 0 and paragraph[start:start + answer_len] == answer:
            return [start, start + answer_len]

    # No alignment found: fall back to the annotated offset unchanged.
    return [answer_start, answer_start + answer_len]

# Replace the answer text with the span computed by get_answer_char_pos,
# then discard the original start-offset column
train_ds = (
    train_ds
    .assign(Answer=train_ds.apply(get_answer_char_pos, axis=1))
    .drop(columns='Answer Start')
)

val_ds = (
    val_ds
    .assign(Answer=val_ds.apply(get_answer_char_pos, axis=1))
    .drop(columns='Answer Start')
)

In [None]:
train_ds.head()

Unnamed: 0,Paragraph,Question,Answer
0,b'The difference in the above factors for the ...,b'What is one use that would require an antenn...,"[427, 440]"
1,"b""The coronation of Charlemagne as emperor on ...",b'About how many counts existed in the Carolin...,"[1020, 1023]"
2,b'Plant responses to climate and other environ...,b'How can climate changes be determined from s...,"[339, 374]"
3,b'The Tucson metro area is served by many loca...,"b""What is Tucson's Fox station?""","[347, 357]"
4,"b""Situated on one of the world's largest natur...",b'What is the size of New York City in square ...,"[367, 370]"


# 3. Run DistilBERT model

In [None]:
# Training split: decode the byte strings to text and collect the answer spans
paragraph_train = [raw.decode('utf-8') for raw in train_ds['Paragraph'].tolist()]
question_train = [raw.decode('utf-8') for raw in train_ds['Question'].tolist()]
train_labels = train_ds['Answer'].tolist()

# Validation split: same preparation
paragraph_val = [raw.decode('utf-8') for raw in val_ds['Paragraph'].tolist()]
question_val = [raw.decode('utf-8') for raw in val_ds['Question'].tolist()]
val_labels = val_ds['Answer'].tolist()

In [None]:
def _predict_spans(qa_model, paragraphs, questions):
    """Call the QA model once per (paragraph, question) pair.

    Returns a list of ``[start, end]`` pairs read from the ``'start'`` and
    ``'end'`` entries of each prediction, in input order.
    """
    spans = []
    for context, query in zip(paragraphs, questions):
        result = qa_model(question=query, context=context)
        spans.append([result['start'], result['end']])
    return spans


model = pipeline('question-answering', model='distilbert-base-uncased-distilled-squad')

# Predict answer spans for every training and validation example
train_predictions = _predict_spans(model, paragraph_train, question_train)
val_predictions = _predict_spans(model, paragraph_val, question_val)

Downloading (…)lve/main/config.json:   0%|          | 0.00/451 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/265M [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

# 4. Evaluate DistilBERT model

In [None]:
def exact_match(y_true, y_pred):
    """Score 1.0 when the predicted span equals the reference span, else 0.0.

    Both arguments are indexable, with the span start at position 0 and the
    span end at position 1.
    """
    same_start = y_pred[0] == y_true[0]
    same_end = y_pred[1] == y_true[1]
    return 1.0 if same_start and same_end else 0.0

def f1_score(y_true, y_pred):
    """Character-overlap F1 between a reference span and a predicted span.

    Args:
        y_true: Reference span as ``[start, end]``.
        y_pred: Predicted span as ``[start, end]``.

    Both spans use an exclusive ``end`` (``end = start + len(answer)``), so a
    span covers the character positions ``start .. end - 1``.

    Returns:
        The harmonic mean of precision and recall over covered character
        positions, as a ``float``; ``0.0`` when the spans do not overlap or
        either span is empty.
    """
    start_true, end_true = y_true[0], y_true[1]
    start_pred, end_pred = y_pred[0], y_pred[1]

    # Span sizes with an exclusive end. Using `end + 1` here would count one
    # character past each answer, which gives adjacent spans a spurious overlap.
    n_pred = max(0, end_pred - start_pred)
    n_true = max(0, end_true - start_true)

    # Length of the intersection of the two half-open intervals.
    n_common = max(0, min(end_pred, end_true) - max(start_pred, start_true))

    precision = n_common / n_pred if n_pred > 0 else 0
    recall = n_common / n_true if n_true > 0 else 0
    if precision + recall > 0:
        return float(2 * precision * recall / (precision + recall))
    return 0.0

In [None]:
# Running totals for both metrics over the training pairs
exact_match_res = f1_score_res = 0
count = 0

# enumerate(..., start=1) leaves `count` equal to the number of pairs visited
for count, (y_true, y_pred) in enumerate(zip(train_labels, train_predictions), start=1):
    exact_match_res += exact_match(y_true, y_pred)
    f1_score_res += f1_score(y_true, y_pred)

# Turn the totals into per-example means
exact_match_res /= count
f1_score_res /= count

print('Exact match:', exact_match_res)
print('F1 score:', f1_score_res)

Exact match: 0.6869370654916152
F1 score: 0.8765599626472513


In [None]:
# Running totals for both metrics over the validation pairs
exact_match_res = f1_score_res = 0
count = 0

# enumerate(..., start=1) leaves `count` equal to the number of pairs visited
for count, (y_true, y_pred) in enumerate(zip(val_labels, val_predictions), start=1):
    exact_match_res += exact_match(y_true, y_pred)
    f1_score_res += f1_score(y_true, y_pred)

# Turn the totals into per-example means
exact_match_res /= count
f1_score_res /= count

print('Exact match:', exact_match_res)
print('F1 score:', f1_score_res)

Exact match: 0.5698202459791863
F1 score: 0.7725855536405953


In [None]:
# Persist the validation span predictions as a NumPy array in model4.npy
np.save('model4.npy', np.array(val_predictions))