<a href="https://colab.research.google.com/github/suji2804/ai-engineer-roadmap/blob/main/Q%26A.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
import pandas as pd
from datasets import Dataset
from transformers import BertTokenizerFast
import json # Import the json module

tokenizer = BertTokenizerFast.from_pretrained("bert-base-uncased")

def flatten_json(data):
    """Flattens the nested JSON structure into a list of dictionaries."""
    flattened_data = []
    for item in data.get("data", []):
        for paragraph in item.get("paragraphs", []):
            context = paragraph.get("context", "")
            for qa in paragraph.get("qas", []):
                flattened_data.append({
                    "id": qa.get("id", ""),
                    "question": qa.get("question", ""),
                    "context": context,
                    "answers": qa.get("answers", [])
                })
    return flattened_data

# Load the JSON data using pandas and flatten it
with open("custom_train.json", "r") as f:
    train_json_data = json.load(f)

with open("custom_val.json", "r") as f:
    val_json_data = json.load(f)

train_flattened = flatten_json(train_json_data)
val_flattened = flatten_json(val_json_data)

# Convert flattened data to pandas DataFrames
train_df_flattened = pd.DataFrame(train_flattened)
val_df_flattened = pd.DataFrame(val_flattened)

# Convert flattened pandas DataFrames to Dataset objects
train_dataset_flattened = Dataset.from_pandas(train_df_flattened)
val_dataset_flattened = Dataset.from_pandas(val_df_flattened)

print("Flattened Train Dataset:")
print(train_dataset_flattened)
print("\nFlattened Validation Dataset:")
print(val_dataset_flattened)

def tokenize_function(examples):
    """Tokenizes question and context pairs."""
    return tokenizer(
        examples["question"],
        examples["context"],
        truncation="only_second",
        max_length=384,
        padding="max_length",
        return_offsets_mapping=True,
    )

# Apply the tokenize function to the flattened datasets using map
tokenized_train_dataset = train_dataset_flattened.map(tokenize_function, batched=True)
tokenized_val_dataset = val_dataset_flattened.map(tokenize_function, batched=True)

print("\nTokenized Flattened Train Dataset:")
print(tokenized_train_dataset)
print("\nTokenized Flattened Validation Dataset:")
print(tokenized_val_dataset)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Flattened Train Dataset:
Dataset({
    features: ['id', 'question', 'context', 'answers'],
    num_rows: 1
})

Flattened Validation Dataset:
Dataset({
    features: ['id', 'question', 'context', 'answers'],
    num_rows: 1
})


Map:   0%|          | 0/1 [00:00<?, ? examples/s]

Map:   0%|          | 0/1 [00:00<?, ? examples/s]


Tokenized Flattened Train Dataset:
Dataset({
    features: ['id', 'question', 'context', 'answers', 'input_ids', 'token_type_ids', 'attention_mask', 'offset_mapping'],
    num_rows: 1
})

Tokenized Flattened Validation Dataset:
Dataset({
    features: ['id', 'question', 'context', 'answers', 'input_ids', 'token_type_ids', 'attention_mask', 'offset_mapping'],
    num_rows: 1
})


In [3]:
# Load Model and Set Up Trainer
from transformers import BertForQuestionAnswering, TrainingArguments, Trainer
from transformers import default_data_collator

model = BertForQuestionAnswering.from_pretrained("bert-base-uncased")
training_args = TrainingArguments(
    output_dir="./bert-qa-results",
    num_train_epochs=3,
    per_device_train_batch_size=12,
    per_device_eval_batch_size=12,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir="./logs",
    logging_steps=10,
    eval_strategy="epoch", # Changed from evaluation_strategy
    save_strategy="epoch",
    load_best_model_at_end=True,
)

Some weights of BertForQuestionAnswering were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [8]:
def preprocess_function(examples):
    questions = [q.strip() for q in examples["question"]]
    inputs = tokenizer(
        questions,
        examples["context"],
        max_length=384,
        truncation="only_second",
        return_offsets_mapping=True,
        padding="max_length",
    )

    offset_mapping = inputs.pop("offset_mapping")
    answers = examples["answers"]
    start_positions = []
    end_positions = []

    for i, offset in enumerate(offset_mapping):
        answer = answers[i][0]
        start_char = answer["answer_start"]
        end_char = start_char + len(answer["text"])

        sequence_ids = inputs.sequence_ids(i)

        # Find the start and end of the context
        idx = 0
        while sequence_ids[idx] != 1:
            idx += 1
        context_start = idx
        while sequence_ids[idx] == 1:
            idx += 1
        context_end = idx - 1

        # If the answer is not within the context, label it as (0, 0)
        if offset[context_start][0] > start_char or offset[context_end][1] < end_char:
            start_positions.append(0)
            end_positions.append(0)
        else:
            # Otherwise it's the start and end token positions
            idx = context_start
            while idx <= context_end and offset[idx][0] <= start_char:
                idx += 1
            start_positions.append(idx - 1)

            idx = context_end
            while idx >= context_start and offset[idx][1] >= end_char:
                idx -= 1
            end_positions.append(idx + 1)

    inputs["start_positions"] = start_positions
    inputs["end_positions"] = end_positions
    return inputs

# Apply the preprocess function to the flattened datasets using map
tokenized_train_dataset = train_dataset_flattened.map(preprocess_function, batched=True, remove_columns=train_dataset_flattened.column_names)
tokenized_val_dataset = val_dataset_flattened.map(preprocess_function, batched=True, remove_columns=val_dataset_flattened.column_names)

# Set the format to torch for training, including the new label columns
tokenized_train_dataset.set_format("torch", columns=["input_ids", "attention_mask", "start_positions", "end_positions"])
tokenized_val_dataset.set_format("torch", columns=["input_ids", "attention_mask", "start_positions", "end_positions"])


print("\nTokenized Train Dataset with Labels:")
print(tokenized_train_dataset)
print("\nTokenized Validation Dataset with Labels:")
print(tokenized_val_dataset)

Map:   0%|          | 0/1 [00:00<?, ? examples/s]

Map:   0%|          | 0/1 [00:00<?, ? examples/s]


Tokenized Train Dataset with Labels:
Dataset({
    features: ['input_ids', 'token_type_ids', 'attention_mask', 'start_positions', 'end_positions'],
    num_rows: 1
})

Tokenized Validation Dataset with Labels:
Dataset({
    features: ['input_ids', 'token_type_ids', 'attention_mask', 'start_positions', 'end_positions'],
    num_rows: 1
})


In [9]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train_dataset,
    eval_dataset=tokenized_val_dataset,
    data_collator=default_data_collator,
)

trainer.train()

Epoch,Training Loss,Validation Loss
1,No log,6.208492
2,No log,6.205532
3,No log,6.199537


TrainOutput(global_step=3, training_loss=6.193918863932292, metrics={'train_runtime': 93.8407, 'train_samples_per_second': 0.032, 'train_steps_per_second': 0.032, 'total_flos': 587917702656.0, 'train_loss': 6.193918863932292, 'epoch': 3.0})

In [8]:
#Run Inference
from transformers import BertForQuestionAnswering, BertTokenizerFast, pipeline

# Load the model and tokenizer from the local directory
model_path = "./bert-qa-results"
model = BertForQuestionAnswering.from_pretrained(model_path)
tokenizer = BertTokenizerFast.from_pretrained(model_path)

# Create the question answering pipeline with the local model and tokenizer
question_answerer = pipeline("question-answering", model=model, tokenizer=tokenizer)

# Example 1: The capital of France
context1 = "The capital of France is Paris."
question1 = "What is the capital of France?"
result1 = question_answerer(question=question1, context=context1)
print(f"Example 1:\nQuestion: {question1}\nContext: {context1}\nAnswer: {result1['answer']}\n")

# Example 2: Information about the Eiffel Tower
context2 = "The Eiffel Tower is a wrought-iron lattice tower on the Champ de Mars in Paris, France. It was named after the engineer Gustave Eiffel, whose company designed and built the tower."
question2 = "Who designed the Eiffel Tower?"
result2 = question_answerer(question=question2, context=context2)
print(f"Example 2:\nQuestion: {question2}\nContext: {context2}\nAnswer: {result2['answer']}\n")

# Example 3: A simple fact
context3 = "The Earth is the third planet from the Sun and the only astronomical object known to harbor life."
question3 = "What is the third planet from the Sun?"
result3 = question_answerer(question=question3, context=context3)
print(f"Example 3:\nQuestion: {question3}\nContext: {context3}\nAnswer: {result3['answer']}\n")

HFValidationError: Repo id must use alphanumeric chars or '-', '_', '.', '--' and '..' are forbidden, '-' and '.' cannot start or end the name, max length is 96: './bert-qa-results'.