In [27]:
from google.colab import drive
drive.mount('/content/drive')


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [28]:
import json

file_path = "/content/drive/MyDrive/HnM_BerT/qa_pairs.json"

with open(file_path, "r") as file:
    json_string = file.read()
    full_qa_data = json.loads(json_string)
    
# Calculate the number of elements to keep
tenp_percent = 0.03
num_elements = int(len(full_qa_data) * tenp_percent)

# Keep only the first num_elements of the data
qa_data = full_qa_data[:num_elements]


In [29]:
!pip install torch


!pip install datasets


!pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [30]:


import json
import torch
from datasets import Dataset
from transformers import BertForQuestionAnswering, BertTokenizerFast, TrainingArguments, Trainer


In [31]:


# Convert 'answer' field to a string
def convert_answers_to_str(data):
    for item in data:
        if not isinstance(item['answer'], str):
            item['answer'] = str(item['answer'])
    return data


In [32]:



str_qa_data = convert_answers_to_str(qa_data)



In [33]:

# Convert the JSON data to a Hugging Face Dataset
dataset = Dataset.from_dict({k: [d[k] for d in str_qa_data] for k in str_qa_data[0].keys()})
train_dataset, val_dataset = dataset.train_test_split(test_size=0.1).values()


In [34]:

def tokenize_function(examples):
    return tokenizer(
        examples["question"],
        examples["context"],
        truncation=True,
        padding="max_length",
        max_length=384
    )



In [35]:

from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained('bert-base-cased')

def tokenize_function(examples):
    return tokenizer(
        examples["question"],
        examples["context"],
        truncation=True,
        padding="max_length",
        max_length=384
    )

train_dataset = train_dataset.map(tokenize_function, batched=True)
val_dataset = val_dataset.map(tokenize_function, batched=True)



Map:   0%|          | 0/404 [00:00<?, ? examples/s]

Map:   0%|          | 0/45 [00:00<?, ? examples/s]

In [36]:


# Set the start and end token positions for the answers in the context
def add_token_positions(batch):
    start_positions, end_positions = [], []
    for i, answer in enumerate(batch["answer"]):
        start_idx = batch["context"][i].find(answer)
        end_idx = start_idx + len(answer)

        start_positions.append(batch["input_ids"][i].index(tokenizer.encode(answer, add_special_tokens=False)[0]))
        end_positions.append(batch["input_ids"][i].index(tokenizer.encode(answer, add_special_tokens=False)[-1]))

    batch["start_positions"] = start_positions
    batch["end_positions"] = end_positions
    return batch


In [37]:



train_dataset = train_dataset.map(add_token_positions, batched=True)
val_dataset = val_dataset.map(add_token_positions, batched=True)



Map:   0%|          | 0/404 [00:00<?, ? examples/s]

Map:   0%|          | 0/45 [00:00<?, ? examples/s]

In [38]:
pip install --upgrade transformers


Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [39]:

# Set up the training arguments
training_args = TrainingArguments(
    output_dir='./results',  # output directory
    num_train_epochs=2,  # total number of training epochs
    per_device_train_batch_size=16,  # batch size per device during training
    per_device_eval_batch_size=64,  # batch size for evaluation
    warmup_steps=500,  # number of warmup steps for learning rate scheduler
    weight_decay=0.01,  # strength of weight decay
    logging_dir='./logs',  # directory for storing logs
    logging_steps=10,
)

In [40]:
from transformers import AutoModelForQuestionAnswering


from transformers import AutoTokenizer, AutoModelForQuestionAnswering

# Load the tokenizer and model
tokenizer = AutoTokenizer.from_pretrained('distilbert-base-cased')
model = AutoModelForQuestionAnswering.from_pretrained('distilbert-base-cased')


Some weights of the model checkpoint at distilbert-base-cased were not used when initializing DistilBertForQuestionAnswering: ['vocab_transform.bias', 'vocab_projector.weight', 'vocab_projector.bias', 'vocab_transform.weight', 'vocab_layer_norm.bias', 'vocab_layer_norm.weight']
- This IS expected if you are initializing DistilBertForQuestionAnswering from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForQuestionAnswering from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForQuestionAnswering were not initialized from the model checkpoint at distilbert-base-cased and are newly initialized: ['qa_outputs.weight', 'qa_outputs.bias']
You should probably TRAIN this model on

In [41]:
from transformers import default_data_collator, TrainingArguments, Trainer
from transformers import AutoTokenizer, AutoModelForQuestionAnswering
import torch

# Define a default data collator
data_collator = default_data_collator

In [42]:
# Define the Trainer object
trainer = Trainer(
    model=model,                         # the instantiated Transformers model to be trained
    args=training_args,                  # training arguments, defined above
    train_dataset=train_dataset,         # training dataset
    eval_dataset=val_dataset,            # evaluation dataset
    data_collator=data_collator
)

In [43]:
# Train the model
trainer.train()

Step,Training Loss
10,5.9627
20,5.9306
30,5.8583
40,5.733
50,5.4542


TrainOutput(global_step=52, training_loss=5.7655631212087775, metrics={'train_runtime': 1736.1862, 'train_samples_per_second': 0.465, 'train_steps_per_second': 0.03, 'total_flos': 79175778619392.0, 'train_loss': 5.7655631212087775, 'epoch': 2.0})

In [54]:
from transformers import Trainer, TrainingArguments


In [55]:
training_args = TrainingArguments(
    output_dir="./results",
    per_device_eval_batch_size=8,
)

trainer = Trainer(
    model=model,
    args=training_args,
    tokenizer=tokenizer,
    eval_dataset=val_dataset,
)


In [56]:
eval_results = trainer.evaluate()


You're using a DistilBertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


In [63]:
import os

model_save_directory = "/content/drive/MyDrive/HnM_BerT/Bert_Model"
tokenizer_save_directory = "/content/drive/MyDrive/HnM_BerT/Bert_Tokenizer"

os.makedirs(model_save_directory, exist_ok=True)
os.makedirs(tokenizer_save_directory, exist_ok=True)

model.save_pretrained(model_save_directory)
tokenizer.save_pretrained(tokenizer_save_directory)


('/content/drive/MyDrive/HnM_BerT/Bert_Tokenizer/tokenizer_config.json',
 '/content/drive/MyDrive/HnM_BerT/Bert_Tokenizer/special_tokens_map.json',
 '/content/drive/MyDrive/HnM_BerT/Bert_Tokenizer/vocab.txt',
 '/content/drive/MyDrive/HnM_BerT/Bert_Tokenizer/added_tokens.json')