In [2]:
from huggingface_hub import notebook_login

# Authenticate against the Hugging Face Hub via the interactive widget.
# The training cell sets push_to_hub=True, so a valid login is required first.
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [3]:
from datasets import load_dataset

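# TODO: later cells read `dataset`, but this load is commented out, so the
# notebook fails with a NameError on a fresh kernel (Restart & Run All).
# Fill in the Hub dataset id and uncomment:
# dataset = load_dataset('')['train'].train_test_split(test_size=0.2)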

In [4]:
# Peek at the first training example to check the column layout.
# Requires `dataset` to already be defined in the kernel.
dataset["train"][0]

{'Story': "CHAPTER XXIV \n\nEnid, my early and my only love, I thought, but that your father came between, In former days you saw me favourably, And if it were so, do not keep it back, Make me a little happier, let me know it.--TENNYSON \n\nThe foreign tour proved a great success. The summer in the Alps was delightful. The complete change gave Bertha new life, bodily strength first returning, and then mental activity. The glacier system was a happy exchange for her _ego_, and she observed and enjoyed with all the force of her acute intelligence and spirit of inquiry, while Phoebe was happy in doing her duty by profiting by all opportunities of observation, in taking care of Maria and listening to Mervyn, and Miss Charlecote enjoyed scenery, poetry, art, and natural objects with relish keener than even that of her young friends, who were less impressible to beauty in every shape. \n\nMervyn behaved very well to her, knowing himself bound to make the journey agreeable to her; he was cons

In [5]:
from transformers import AutoTokenizer

# Tokenizer for the same checkpoint that is loaded as the QA model further down;
# preprocess_function reads this global.
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-cased-distilled-squad")

In [6]:
def _locate_answer_tokens(offsets, sequence_ids, span_start, span_end):
    """Map a character span in the story to (start_token, end_token), or None if it cannot be labelled."""
    if span_start is None or span_end is None or span_start > span_end:
        return None

    # Tokens whose sequence id is 1 belong to the second segment (the story).
    context_tokens = [idx for idx, seq_id in enumerate(sequence_ids) if seq_id == 1]
    if not context_tokens:
        return None
    context_start, context_end = context_tokens[0], context_tokens[-1]

    # The span must lie fully inside the (possibly truncated) story window.
    # Negative spans also fail this check.
    if span_start < offsets[context_start][0] or span_end > offsets[context_end][1]:
        return None

    # Last story token that starts at or before the span start.
    start_token = context_start
    while start_token <= context_end and offsets[start_token][0] <= span_start:
        start_token += 1
    start_token -= 1

    # First story token that ends at or after the span end.
    end_token = context_end
    while end_token >= context_start and offsets[end_token][1] >= span_end:
        end_token -= 1
    end_token += 1

    if end_token < start_token:
        return None
    return start_token, end_token


def preprocess_function(examples):
    """Tokenize a batch of (question, story) pairs and label the answer token span.

    Args:
        examples: A batch dict with the columns "Story", "Question",
            "span_start" and "span_end". The span columns are assumed to be
            character offsets into the raw "Story" text with an exclusive end
            (TODO confirm against the dataset card).

    Returns:
        The tokenizer output (padded/truncated to 512 tokens) with three extra
        columns:
          * "start_positions" / "end_positions": token indices of the answer
            span, or 0/0 when the span is missing or was truncated away.
          * "extracted_answers": the story text covered by those tokens
            ("" when the example is labelled 0/0).

    Relies on the module-level `tokenizer`, which must be a fast tokenizer
    because offset mappings and sequence ids are used.
    """
    questions = [question.strip() for question in examples["Question"]]
    # Stories are deliberately not stripped: removing leading whitespace would
    # shift every character offset relative to span_start/span_end.
    stories = examples["Story"]
    span_starts = examples["span_start"]
    span_ends = examples["span_end"]

    inputs = tokenizer(
        questions,
        stories,
        max_length=512,
        truncation="only_second",
        padding="max_length",
        return_offsets_mapping=True,
    )

    offset_mapping = inputs.pop("offset_mapping")
    start_positions = []
    end_positions = []
    extracted_answers = []

    for i, offset in enumerate(offset_mapping):
        located = _locate_answer_tokens(
            offset, inputs.sequence_ids(i), span_starts[i], span_ends[i]
        )

        if located is None:
            # Unanswerable within this window: point both labels at token 0.
            start_positions.append(0)
            end_positions.append(0)
            extracted_answers.append("")  # Store an empty string for evaluation
            continue

        start_token, end_token = located
        start_positions.append(start_token)
        end_positions.append(end_token)

        # Recover the answer text from the story using the token offsets.
        char_start = offset[start_token][0]
        char_end = offset[end_token][1]
        extracted_answers.append(stories[i][char_start:char_end].strip())

    inputs["start_positions"] = start_positions
    inputs["end_positions"] = end_positions
    inputs["extracted_answers"] = extracted_answers

    return inputs

In [7]:
# Apply preprocess_function to every split in batches. The original columns are
# dropped so that only the tokenizer output and the label columns remain.
tokenized_dataset = dataset.map(preprocess_function, batched=True, remove_columns=dataset["train"].column_names)

Map:   0%|          | 0/53288 [00:00<?, ? examples/s]

Map:   0%|          | 0/13323 [00:00<?, ? examples/s]

In [8]:
from transformers import DefaultDataCollator

# Plain batching collator; preprocess_function already pads every example to
# max_length, so no padding should be needed at collation time.
data_collator = DefaultDataCollator()

In [9]:
from transformers import AutoModelForQuestionAnswering, TrainingArguments, Trainer

# Start from the SQuAD-distilled DistilBERT QA checkpoint (same id as the tokenizer).
model = AutoModelForQuestionAnswering.from_pretrained("distilbert-base-cased-distilled-squad")

In [10]:
import intel_extension_for_pytorch as ipex

# `optim` was never imported in this notebook, so the original line raised a NameError.
from torch import optim

# IPEX expects the model to be in training mode when an optimizer is supplied;
# from_pretrained() returns the model in eval mode.
model.train()

# NOTE(review): the Trainer cell does not receive this optimizer, so Trainer builds
# its own from TrainingArguments (learning_rate=2e-5). Pass
# `optimizers=(optimizer, None)` to Trainer if this one is meant to be used, and
# reconcile the learning rate first.
optimizer = optim.Adam(model.parameters(), lr=0.001)

# Apply Intel Extension for PyTorch optimizations to the model/optimizer pair.
model, optimizer = ipex.optimize(model, optimizer=optimizer)



In [None]:
# Fine-tuning configuration: one epoch, evaluation at the end of each epoch,
# and the result pushed to the Hugging Face Hub (needs the login from the first cell).
training_args = TrainingArguments(
    output_dir="qa_finetuned_model",
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=55,
    per_device_eval_batch_size=55,
    num_train_epochs=1,
    weight_decay=0.01,
    push_to_hub=True,
)

# NOTE(review): no `optimizers=` argument is passed here, so the `optimizer`
# created in the IPEX cell is not handed to the Trainer — confirm this is intended.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["test"],
    tokenizer=tokenizer,
    data_collator=data_collator,
)

# Run fine-tuning.
trainer.train()

In [8]:
from transformers import AutoTokenizer, AutoModelForQuestionAnswering
from datasets import load_dataset
import intel_extension_for_pytorch as ipex
import torch

# # Load your custom dataset
# dataset = load_dataset("DATASET")
# dataset = load_dataset('DATASET')['train'].train_test_split(train_size=10, test_size=10)

# Access the test split of your dataset
# NOTE(review): `dataset` must already exist in the kernel; the loads above are commented out.
eval_dataset = dataset["test"]

# Load the tokenizer and model
# NOTE(review): this loads the base SQuAD checkpoint, not the fine-tuned weights in
# "qa_finetuned_model", and it rebinds the global `tokenizer`/`model` — confirm
# that a baseline evaluation is what is intended here.
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-cased-distilled-squad")
model = AutoModelForQuestionAnswering.from_pretrained("distilbert-base-cased-distilled-squad")

# Inference-mode IPEX optimization (no optimizer) expects the model in eval mode.
model.eval()
model = ipex.optimize(model)

# Initialize variables to keep track of correct and total predictions
correct_predictions = 0
total_predictions = 0

# Iterate through the evaluation dataset
for idx, example in enumerate(eval_dataset):
    question = example["Question"]
    context = example["Story"]
    answer = example["Answer"]

    # Tokenize the input text. Stories can exceed the model's 512-position limit,
    # so truncate the context (never the question), as preprocessing does.
    inputs = tokenizer(
        question,
        context,
        max_length=512,
        truncation="only_second",
        return_tensors="pt",
    )

    # Use the model to predict the answer; no gradients are needed for evaluation.
    with torch.no_grad():
        outputs = model(**inputs)

    # Extract the predicted answer span (an empty string when end < start).
    start_logits, end_logits = outputs.start_logits, outputs.end_logits
    start_index = int(torch.argmax(start_logits))
    end_index = int(torch.argmax(end_logits))
    predicted_tokens = tokenizer.convert_ids_to_tokens(
        inputs["input_ids"][0][start_index:end_index + 1]
    )
    predicted_answer = tokenizer.convert_tokens_to_string(predicted_tokens)

    # Exact-match check, ignoring surrounding whitespace on both sides.
    if predicted_answer.strip() == answer.strip():
        correct_predictions += 1

    total_predictions += 1

    # Print the progress indicator
    print(f"Evaluating {idx + 1}/{len(eval_dataset)}", end="\r")

# Calculate the accuracy (guard against an empty evaluation split)
accuracy = correct_predictions / total_predictions if total_predictions else 0.0

# Print the final accuracy
print(f"\nAccuracy on the test dataset: {accuracy * 100:.2f}%")

Evaluating 10/10
Accuracy on the test dataset: 20.00%
