In [1]:
!pip install datasets evaluate
!pip install accelerate -U
!pip install transformers[torch]

Collecting datasets
  Downloading datasets-2.16.1-py3-none-any.whl (507 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m507.1/507.1 kB[0m [31m3.5 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting evaluate
  Downloading evaluate-0.4.1-py3-none-any.whl (84 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m11.6 MB/s[0m eta [36m0:00:00[0m
Collecting dill<0.3.8,>=0.3.0 (from datasets)
  Downloading dill-0.3.7-py3-none-any.whl (115 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m115.3/115.3 kB[0m [31m16.4 MB/s[0m eta [36m0:00:00[0m
Collecting multiprocess (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl (134 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m17.6 MB/s[0m eta [36m0:00:00[0m
Collecting responses<0.19 (from evaluate)
  Downloading responses-0.18.0-py3-none-any.whl (38 kB)
INFO: pip is looking at multiple versions of multiprocess

In [2]:
from huggingface_hub import notebook_login

notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [4]:
# Print all the available datasets
from huggingface_hub import list_datasets
print([dataset.id for dataset in list_datasets()])

IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



### Dataset

In [66]:
from datasets import load_dataset

squad = load_dataset("squad", split="train[:10]")

In [67]:
display(squad)
display(len(squad))

Dataset({
    features: ['id', 'title', 'context', 'question', 'answers'],
    num_rows: 10
})

10

In [68]:
squad = squad.train_test_split(test_size=0.2)

In [69]:
squad

DatasetDict({
    train: Dataset({
        features: ['id', 'title', 'context', 'question', 'answers'],
        num_rows: 8
    })
    test: Dataset({
        features: ['id', 'title', 'context', 'question', 'answers'],
        num_rows: 2
    })
})

In [70]:
squad["train"][0]

{'id': '5733be284776f4190066117f',
 'title': 'University_of_Notre_Dame',
 'context': 'Architecturally, the school has a Catholic character. Atop the Main Building\'s gold dome is a golden statue of the Virgin Mary. Immediately in front of the Main Building and facing it, is a copper statue of Christ with arms upraised with the legend "Venite Ad Me Omnes". Next to the Main Building is the Basilica of the Sacred Heart. Immediately behind the basilica is the Grotto, a Marian place of prayer and reflection. It is a replica of the grotto at Lourdes, France where the Virgin Mary reputedly appeared to Saint Bernadette Soubirous in 1858. At the end of the main drive (and in a direct line that connects through 3 statues and the Gold Dome), is a simple, modern stone statue of Mary.',
 'question': 'What is in front of the Notre Dame Main Building?',
 'answers': {'text': ['a copper statue of Christ'], 'answer_start': [188]}}

In [71]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")

In [78]:
def preprocess_function(examples):
    questions = [q.strip() for q in examples["question"]]
    print("len(questions)           :", len(questions))
    print("questions[:3]            :", questions[:3])
    inputs = tokenizer(
        questions,
        examples["context"],
        max_length=384,
        truncation="only_second",
        return_offsets_mapping=True,
        padding="max_length",
    )

    print("type(inputs)             :", type(inputs))
    offset_mapping = inputs.pop("offset_mapping")
    answers = examples["answers"]
    start_positions = []
    end_positions = []

    for i, offset in enumerate(offset_mapping):
        print("i   --  offset           :", i, " -- ", offset)
        answer = answers[i]
        start_char = answer["answer_start"][0]
        end_char = answer["answer_start"][0] + len(answer["text"][0])
        sequence_ids = inputs.sequence_ids(i)

        # Find the start and end of the context
        idx = 0
        while sequence_ids[idx] != 1:
            idx += 1
        context_start = idx
        while sequence_ids[idx] == 1:
            idx += 1
        context_end = idx - 1

        # If the answer is not fully inside the context, label it (0, 0)
        if offset[context_start][0] > end_char or offset[context_end][1] < start_char:
            start_positions.append(0)
            end_positions.append(0)
        else:
            # Otherwise it's the start and end token positions
            idx = context_start
            while idx <= context_end and offset[idx][0] <= start_char:
                idx += 1
            start_positions.append(idx - 1)

            idx = context_end
            while idx >= context_start and offset[idx][1] >= end_char:
                idx -= 1
            end_positions.append(idx + 1)

    inputs["start_positions"] = start_positions
    inputs["end_positions"] = end_positions
    print("start_positions          :", start_positions)
    print("end_positions            :", end_positions)
    return inputs

In [79]:
squad.map(preprocess_function, batched=True, remove_columns=squad["train"].column_names)

Map:   0%|          | 0/8 [00:00<?, ? examples/s]

len(questions)           : 8
questions[:3]            : ['What is in front of the Notre Dame Main Building?', 'To whom did the Virgin Mary allegedly appear in 1858 in Lourdes France?', 'The Basilica of the Sacred heart at Notre Dame is beside to which structure?']
type(inputs)             : <class 'transformers.tokenization_utils_base.BatchEncoding'>
i   --  offset           : 0  --  [(0, 0), (0, 4), (5, 7), (8, 10), (11, 16), (17, 19), (20, 23), (24, 29), (30, 34), (35, 39), (40, 48), (48, 49), (0, 0), (0, 13), (13, 15), (15, 16), (17, 20), (21, 27), (28, 31), (32, 33), (34, 42), (43, 52), (52, 53), (54, 58), (59, 62), (63, 67), (68, 76), (76, 77), (77, 78), (79, 83), (84, 88), (89, 91), (92, 93), (94, 100), (101, 107), (108, 110), (111, 114), (115, 121), (122, 126), (126, 127), (128, 139), (140, 142), (143, 148), (149, 151), (152, 155), (156, 160), (161, 169), (170, 173), (174, 180), (181, 183), (183, 184), (185, 187), (188, 189), (190, 196), (197, 203), (204, 206), (207, 213), (214,

Map:   0%|          | 0/2 [00:00<?, ? examples/s]

len(questions)           : 2
questions[:3]            : ['What is the Grotto at Notre Dame?', 'What is the daily student paper at Notre Dame called?']
type(inputs)             : <class 'transformers.tokenization_utils_base.BatchEncoding'>
i   --  offset           : 0  --  [(0, 0), (0, 4), (5, 7), (8, 11), (12, 14), (14, 18), (19, 21), (22, 27), (28, 32), (32, 33), (0, 0), (0, 13), (13, 15), (15, 16), (17, 20), (21, 27), (28, 31), (32, 33), (34, 42), (43, 52), (52, 53), (54, 58), (59, 62), (63, 67), (68, 76), (76, 77), (77, 78), (79, 83), (84, 88), (89, 91), (92, 93), (94, 100), (101, 107), (108, 110), (111, 114), (115, 121), (122, 126), (126, 127), (128, 139), (140, 142), (143, 148), (149, 151), (152, 155), (156, 160), (161, 169), (170, 173), (174, 180), (181, 183), (183, 184), (185, 187), (188, 189), (190, 196), (197, 203), (204, 206), (207, 213), (214, 218), (219, 223), (224, 226), (226, 229), (229, 232), (233, 237), (238, 241), (242, 248), (249, 250), (250, 252), (252, 254), (254, 2

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'start_positions', 'end_positions'],
        num_rows: 8
    })
    test: Dataset({
        features: ['input_ids', 'attention_mask', 'start_positions', 'end_positions'],
        num_rows: 2
    })
})

### Train

In [90]:
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")

In [91]:
def preprocess_function(examples):
    questions = [q.strip() for q in examples["question"]]
    inputs = tokenizer(
        questions,
        examples["context"],
        max_length=384,
        truncation="only_second",
        return_offsets_mapping=True,
        padding="max_length",
    )

    offset_mapping = inputs.pop("offset_mapping")
    answers = examples["answers"]
    start_positions = []
    end_positions = []

    for i, offset in enumerate(offset_mapping):
        answer = answers[i]
        start_char = answer["answer_start"][0]
        end_char = answer["answer_start"][0] + len(answer["text"][0])
        sequence_ids = inputs.sequence_ids(i)

        # Find the start and end of the context
        idx = 0
        while sequence_ids[idx] != 1:
            idx += 1
        context_start = idx
        while sequence_ids[idx] == 1:
            idx += 1
        context_end = idx - 1

        # If the answer is not fully inside the context, label it (0, 0)
        if offset[context_start][0] > end_char or offset[context_end][1] < start_char:
            start_positions.append(0)
            end_positions.append(0)
        else:
            # Otherwise it's the start and end token positions
            idx = context_start
            while idx <= context_end and offset[idx][0] <= start_char:
                idx += 1
            start_positions.append(idx - 1)

            idx = context_end
            while idx >= context_start and offset[idx][1] >= end_char:
                idx -= 1
            end_positions.append(idx + 1)

    inputs["start_positions"] = start_positions
    inputs["end_positions"] = end_positions
    return inputs

In [92]:
squad = load_dataset("squad", split="train[:6000]")
squad = squad.train_test_split(test_size=0.1)
tokenized_squad = squad.map(preprocess_function, batched=True, remove_columns=squad["train"].column_names)

Map:   0%|          | 0/5400 [00:00<?, ? examples/s]

Map:   0%|          | 0/600 [00:00<?, ? examples/s]

In [93]:
from transformers import DefaultDataCollator

data_collator = DefaultDataCollator()

#### distilbert-base-uncased

In [94]:
from transformers import AutoModelForQuestionAnswering, TrainingArguments, Trainer

model = AutoModelForQuestionAnswering.from_pretrained("distilbert-base-uncased")

Some weights of DistilBertForQuestionAnswering were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['qa_outputs.weight', 'qa_outputs.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [95]:
training_args = TrainingArguments(
    output_dir="./HF-QA-distilbert-base-uncased",
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=5,
    weight_decay=0.01,
    push_to_hub=True,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_squad["train"],
    eval_dataset=tokenized_squad["test"],
    tokenizer=tokenizer,
    data_collator=data_collator,
)

trainer.train()

Epoch,Training Loss,Validation Loss
1,No log,1.769254
2,2.683000,1.425018
3,1.263900,1.406477
4,1.263900,1.427117
5,0.837600,1.469188


TrainOutput(global_step=1690, training_loss=1.4978435268063517, metrics={'train_runtime': 1107.9034, 'train_samples_per_second': 24.37, 'train_steps_per_second': 1.525, 'total_flos': 2645725275648000.0, 'train_loss': 1.4978435268063517, 'epoch': 5.0})

In [96]:
trainer.push_to_hub()

model.safetensors:   0%|          | 0.00/265M [00:00<?, ?B/s]

events.out.tfevents.1707043680.44011b116f8e.216.1:   0%|          | 0.00/6.32k [00:00<?, ?B/s]

Upload 2 LFS files:   0%|          | 0/2 [00:00<?, ?it/s]

CommitInfo(commit_url='https://huggingface.co/rinogrego/HF-QA-distilbert-base-uncased/commit/1d01e51152590f7fcfca2ea04c9e39e6ac4ded3e', commit_message='End of training', commit_description='', oid='1d01e51152590f7fcfca2ea04c9e39e6ac4ded3e', pr_url=None, pr_revision=None, pr_num=None)

#### bert-base-uncased

In [97]:
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

tokenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

In [98]:
squad = load_dataset("squad", split="train[:6000]")
squad = squad.train_test_split(test_size=0.1)
tokenized_squad = squad.map(preprocess_function, batched=True, remove_columns=squad["train"].column_names)
data_collator = DefaultDataCollator()

Map:   0%|          | 0/5400 [00:00<?, ? examples/s]

Map:   0%|          | 0/600 [00:00<?, ? examples/s]

In [99]:
model = AutoModelForQuestionAnswering.from_pretrained("bert-base-uncased")

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForQuestionAnswering were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['qa_outputs.weight', 'qa_outputs.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [100]:
training_args = TrainingArguments(
    output_dir="./HF-QA-bert-base-uncased",
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=5,
    weight_decay=0.01,
    push_to_hub=True,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_squad["train"],
    eval_dataset=tokenized_squad["test"],
    tokenizer=tokenizer,
    data_collator=data_collator,
)

trainer.train()

Epoch,Training Loss,Validation Loss
1,No log,1.544884
2,2.371200,1.287398
3,1.018800,1.281919
4,1.018800,1.363779
5,0.596900,1.415748


TrainOutput(global_step=1690, training_loss=1.234756587101863, metrics={'train_runtime': 2180.2349, 'train_samples_per_second': 12.384, 'train_steps_per_second': 0.775, 'total_flos': 5291259323904000.0, 'train_loss': 1.234756587101863, 'epoch': 5.0})

In [101]:
trainer.push_to_hub()

model.safetensors:   0%|          | 0.00/436M [00:00<?, ?B/s]

events.out.tfevents.1707044823.44011b116f8e.216.2:   0%|          | 0.00/6.41k [00:00<?, ?B/s]

Upload 2 LFS files:   0%|          | 0/2 [00:00<?, ?it/s]

CommitInfo(commit_url='https://huggingface.co/rinogrego/HF-QA-bert-base-uncased/commit/cf3e02f633e2260be9efee52ca8bef2815638f4a', commit_message='End of training', commit_description='', oid='cf3e02f633e2260be9efee52ca8bef2815638f4a', pr_url=None, pr_revision=None, pr_num=None)

### Inference

In [102]:
question = "How many programming languages does BLOOM support?"
context = "BLOOM has 176 billion parameters and can generate text in 46 languages natural languages and 13 programming languages."

In [103]:
from transformers import pipeline

question_answerer = pipeline("question-answering", model="HF-QA-distilbert-base-uncased")
question_answerer(question=question, context=context)

{'score': 0.3968351185321808,
 'start': 58,
 'end': 95,
 'answer': '46 languages natural languages and 13'}

### Comparison

In [105]:
question = "How many programming languages does BLOOM support?"
context = "BLOOM has 176 billion parameters and can generate text in 46 languages natural languages and 13 programming languages."

qa_distilbert = pipeline(
    "question-answering",
    model="HF-QA-distilbert-base-uncased",
)
qa_distilbert_tokenizer = pipeline(
    "question-answering",
    model="HF-QA-distilbert-base-uncased",
    tokenizer="distilbert-base-uncased"
)
qa_bert = pipeline(
    "question-answering",
    model="HF-QA-bert-base-uncased",
)
qa_bert_tokenizer = pipeline(
    "question-answering",
    model="HF-QA-bert-base-uncased",
    tokenizer="bert-base-uncased"
)

display(qa_distilbert(question=question, context=context))
display(qa_distilbert_tokenizer(question=question, context=context))
display(qa_bert(question=question, context=context))
display(qa_bert_tokenizer(question=question, context=context))

{'score': 0.3968351185321808,
 'start': 58,
 'end': 95,
 'answer': '46 languages natural languages and 13'}

{'score': 0.3968351185321808,
 'start': 58,
 'end': 95,
 'answer': '46 languages natural languages and 13'}

{'score': 0.3634815216064453,
 'start': 58,
 'end': 95,
 'answer': '46 languages natural languages and 13'}

{'score': 0.3634815216064453,
 'start': 58,
 'end': 95,
 'answer': '46 languages natural languages and 13'}

In [109]:
unmatched_tokenizer = pipeline(
    "question-answering",
    model="HF-QA-distilbert-base-uncased",
    tokenizer="bert-base-uncased"
)
unmatched_tokenizer(question=question, context=context)

TypeError: DistilBertForQuestionAnswering.forward() got an unexpected keyword argument 'token_type_ids'

In [110]:
unmatched_tokenizer = pipeline(
    "question-answering",
    model="HF-QA-bert-base-uncased",
    tokenizer="distilbert-base-uncased"
)
unmatched_tokenizer(question=question, context=context)

{'score': 0.6742727160453796, 'start': 58, 'end': 60, 'answer': '46'}

### Bahasa Indonesia

In [111]:
question2 = "Berapa banyak bahasa pemrograman yang disupport oleh BLOOM?"
context2 = "BLOOM memiliki 176 miliar parameters dan dapat menciptakan teks dalam 46 bahasa natural dan 13 bahasa pemrograman."

In [113]:
qa_distilbert = pipeline(
    "question-answering",
    model="HF-QA-distilbert-base-uncased",
)
qa_distilbert_tokenizer = pipeline(
    "question-answering",
    model="HF-QA-distilbert-base-uncased",
    tokenizer="distilbert-base-uncased"
)
qa_bert = pipeline(
    "question-answering",
    model="HF-QA-bert-base-uncased",
)
qa_bert_tokenizer = pipeline(
    "question-answering",
    model="HF-QA-bert-base-uncased",
    tokenizer="bert-base-uncased"
)

display(qa_distilbert(question=question2, context=context2))
display(qa_distilbert_tokenizer(question=question2, context=context2))
display(qa_bert(question=question2, context=context2))
display(qa_bert_tokenizer(question=question2, context=context2))

{'score': 0.04805443435907364, 'start': 92, 'end': 94, 'answer': '13'}

{'score': 0.04805443435907364, 'start': 92, 'end': 94, 'answer': '13'}

{'score': 0.11119908839464188,
 'start': 0,
 'end': 36,
 'answer': 'BLOOM memiliki 176 miliar parameters'}

{'score': 0.11119908839464188,
 'start': 0,
 'end': 36,
 'answer': 'BLOOM memiliki 176 miliar parameters'}

In [118]:
question3 = "Ada berapa bahasa asli yang dapat BLOOM mengerti?"
context3 = "BLOOM memiliki 176 miliar parameters dan dapat menciptakan teks dalam 46 bahasa asli dan 13 bahasa pemrograman."

In [119]:
qa_distilbert = pipeline(
    "question-answering",
    model="HF-QA-distilbert-base-uncased",
)
qa_distilbert_tokenizer = pipeline(
    "question-answering",
    model="HF-QA-distilbert-base-uncased",
    tokenizer="distilbert-base-uncased"
)
qa_bert = pipeline(
    "question-answering",
    model="HF-QA-bert-base-uncased",
)
qa_bert_tokenizer = pipeline(
    "question-answering",
    model="HF-QA-bert-base-uncased",
    tokenizer="bert-base-uncased"
)

display(qa_distilbert(question=question3, context=context3))
display(qa_distilbert_tokenizer(question=question3, context=context3))
display(qa_bert(question=question3, context=context3))
display(qa_bert_tokenizer(question=question3, context=context3))

{'score': 0.232313334941864,
 'start': 89,
 'end': 110,
 'answer': '13 bahasa pemrograman'}

{'score': 0.232313334941864,
 'start': 89,
 'end': 110,
 'answer': '13 bahasa pemrograman'}

{'score': 0.27737724781036377,
 'start': 85,
 'end': 110,
 'answer': 'dan 13 bahasa pemrograman'}

{'score': 0.27737724781036377,
 'start': 85,
 'end': 110,
 'answer': 'dan 13 bahasa pemrograman'}

In [121]:
question4 = "berapa parameter yang dimiliki BLOOM?"
context4 = "BLOOM memiliki 176 miliar parameter dan dapat menciptakan teks dalam 46 bahasa asli dan 13 bahasa pemrograman."

In [122]:
display(qa_distilbert(question=question4, context=context4))
display(qa_bert(question=question4, context=context4))

{'score': 0.05159865319728851,
 'start': 6,
 'end': 25,
 'answer': 'memiliki 176 miliar'}

{'score': 0.2570013403892517,
 'start': 0,
 'end': 35,
 'answer': 'BLOOM memiliki 176 miliar parameter'}