In [1]:
from transformers import AutoModelForQuestionAnswering, AutoTokenizer
from datasets import load_dataset
from transformers import TrainingArguments, Trainer
import torch

In [2]:
import os

# Expose only device index "0" to CUDA-aware libraries used by this kernel.
os.environ.update(CUDA_VISIBLE_DEVICES="0")

In [3]:
# Pull just the first 2% of each SQuAD split so the notebook stays quick to run.
dataset = load_dataset(
    "rajpurkar/squad",
    split={"train": "train[:2%]", "validation": "validation[:2%]"},
)

# Show one raw record from each split as a sanity check of the schema.
for _split_name in ("train", "validation"):
    print(dataset[_split_name][0])

{'id': '5733be284776f41900661182', 'title': 'University_of_Notre_Dame', 'context': 'Architecturally, the school has a Catholic character. Atop the Main Building\'s gold dome is a golden statue of the Virgin Mary. Immediately in front of the Main Building and facing it, is a copper statue of Christ with arms upraised with the legend "Venite Ad Me Omnes". Next to the Main Building is the Basilica of the Sacred Heart. Immediately behind the basilica is the Grotto, a Marian place of prayer and reflection. It is a replica of the grotto at Lourdes, France where the Virgin Mary reputedly appeared to Saint Bernadette Soubirous in 1858. At the end of the main drive (and in a direct line that connects through 3 statues and the Gold Dome), is a simple, modern stone statue of Mary.', 'question': 'To whom did the Virgin Mary allegedly appear in 1858 in Lourdes France?', 'answers': {'text': ['Saint Bernadette Soubirous'], 'answer_start': [515]}}
{'id': '56be4db0acb8001400a502ec', 'title': 'Super_Bow

In [4]:
# Checkpoint shared by the tokenizer and the question-answering model.
model_name = "bert-base-uncased"

# Both objects are loaded from the same checkpoint so their vocabularies match.
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForQuestionAnswering.from_pretrained(model_name)

Some weights of BertForQuestionAnswering were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [5]:
def train_data(examples):
    """Tokenize a batch of SQuAD-style examples and label each window's answer span.

    Every (question, context) pair is encoded with the module-level
    ``tokenizer``. Contexts that do not fit in 128 tokens are split into
    overlapping windows (stride 32), so one example may yield several features.
    For each window the token indices of the first answer are recorded; windows
    that do not fully contain the answer (or examples without an answer) are
    labelled with the index of the ``[CLS]`` token for both start and end.

    Args:
        examples: Batched mapping with the keys ``"question"``, ``"context"``
            and ``"answers"``; each ``answers`` entry holds the parallel lists
            ``"text"`` and ``"answer_start"`` (character offsets into the
            context).

    Returns:
        The tokenizer output with ``start_positions`` and ``end_positions``
        added, and with ``overflow_to_sample_mapping`` / ``offset_mapping``
        removed.
    """
    tokenized_examples = tokenizer(
        examples["question"],
        examples["context"],
        truncation="only_second",
        max_length=128,
        stride=32,
        return_overflowing_tokens=True,
        return_offsets_mapping=True,
        padding="max_length",
    )
    # Window index -> index of the example it was cut from.
    sample_mapping = tokenized_examples.pop("overflow_to_sample_mapping")
    # Per-token (char_start, char_end) pairs; special and pad tokens are (0, 0).
    offset_mapping = tokenized_examples.pop("offset_mapping")

    start_positions = []
    end_positions = []

    for i, offsets in enumerate(offset_mapping):
        input_ids = tokenized_examples["input_ids"][i]
        cls_index = input_ids.index(tokenizer.cls_token_id)
        sequence_ids = tokenized_examples.sequence_ids(i)

        answers = examples["answers"][sample_mapping[i]]

        # Unanswerable example: there is no span to label, so point at [CLS]
        # instead of raising IndexError on the empty lists.
        if not answers["answer_start"] or not answers["text"]:
            start_positions.append(cls_index)
            end_positions.append(cls_index)
            continue

        answer_start = answers["answer_start"][0]
        answer_end = answer_start + len(answers["text"][0])

        # First and last token of the context segment (sequence id 1).
        context_start = 0
        while sequence_ids[context_start] != 1:
            context_start += 1

        context_end = len(input_ids) - 1
        while sequence_ids[context_end] != 1:
            context_end -= 1

        # The answer is not fully inside this window -> label with [CLS].
        if offsets[context_start][0] > answer_start or offsets[context_end][1] < answer_end:
            start_positions.append(cls_index)
            end_positions.append(cls_index)
            continue

        # Both scans are confined to the context tokens. [SEP] and padding have
        # offset (0, 0), which would satisfy ``offset <= answer_start`` forever
        # and push the start label onto a pad token when the answer begins in
        # the last context token of the window.
        token_start_index = context_start
        while token_start_index <= context_end and offsets[token_start_index][0] <= answer_start:
            token_start_index += 1
        start_positions.append(token_start_index - 1)

        token_end_index = context_end
        while token_end_index >= context_start and offsets[token_end_index][1] >= answer_end:
            token_end_index -= 1
        end_positions.append(token_end_index + 1)

    tokenized_examples["start_positions"] = start_positions
    tokenized_examples["end_positions"] = end_positions
    return tokenized_examples

# Apply the same preprocessing to both splits, dropping the raw text columns so
# only the model-ready features remain.
train_dataset, validation_dataset = (
    dataset[split_name].map(
        train_data,
        batched=True,
        remove_columns=dataset[split_name].column_names,
    )
    for split_name in ("train", "validation")
)


In [6]:
# Hyper-parameters and bookkeeping for the fine-tuning run.
training_args = TrainingArguments(
    # Where checkpoints and logs are written.
    output_dir="./results",
    logging_dir="./logs",
    save_steps=500,
    # Optimisation schedule.
    num_train_epochs=3,
    per_device_train_batch_size=16,
    # Evaluation is triggered on a step schedule.
    evaluation_strategy="steps",
    per_device_eval_batch_size=64,
)

# Wire the model, the arguments and the preprocessed splits together.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=validation_dataset,
)

# Run fine-tuning; the returned TrainOutput is shown as the cell result.
trainer.train()

Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
wandb: Currently logged in as: sanjayrampalakkat (sanjayrampalakkat-christ-university). Use `wandb login --relogin` to force relogin


VBox(children=(Label(value='Waiting for wandb.init()...\r'), FloatProgress(value=0.011277777777609622, max=1.0…

  0%|          | 0/777 [00:00<?, ?it/s]

{'loss': 1.1772, 'grad_norm': 9.693568229675293, 'learning_rate': 1.7824967824967826e-05, 'epoch': 1.93}


  0%|          | 0/5 [00:00<?, ?it/s]

{'eval_loss': 1.4234824180603027, 'eval_runtime': 28.1069, 'eval_samples_per_second': 11.385, 'eval_steps_per_second': 0.178, 'epoch': 1.93}
{'train_runtime': 3158.4737, 'train_samples_per_second': 3.932, 'train_steps_per_second': 0.246, 'train_loss': 0.8820332020252675, 'epoch': 3.0}


TrainOutput(global_step=777, training_loss=0.8820332020252675, metrics={'train_runtime': 3158.4737, 'train_samples_per_second': 3.932, 'train_steps_per_second': 0.246, 'total_flos': 811326429665280.0, 'train_loss': 0.8820332020252675, 'epoch': 3.0})

In [12]:
def ask_question(question, context, model, tokenizer):
    """Extract the answer to ``question`` from ``context`` with a QA model.

    The question/context pair is encoded as a single sequence, the model scores
    every token as a possible answer start and end, and the tokens between the
    highest-scoring start and end (inclusive) are decoded back to text.

    Args:
        question: The question text.
        context: The passage to search for the answer.
        model: A question-answering model whose output exposes ``start_logits``
            and ``end_logits``. It is moved to the GPU when one is available.
        tokenizer: The tokenizer matching ``model``.

    Returns:
        The decoded answer string. It is empty when the predicted end falls
        before the predicted start.
    """
    # Check if CUDA is available, otherwise use CPU
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model = model.to(device)

    # Truncate only the context so an over-long passage cannot exceed the
    # model's maximum input length and crash the forward pass.
    inputs = tokenizer(
        question,
        context,
        add_special_tokens=True,
        truncation="only_second",
        return_tensors="pt",
    )
    input_ids = inputs["input_ids"].tolist()[0]

    inputs = {k: v.to(device) for k, v in inputs.items()}

    # Inference must run in eval mode (dropout off, so answers are
    # deterministic) and without autograd bookkeeping. The previous mode is
    # restored afterwards so a later training run is unaffected.
    was_training = getattr(model, "training", False)
    model.eval()
    try:
        with torch.no_grad():
            outputs = model(**inputs)
    finally:
        if was_training:
            model.train()

    # Plain ints for list slicing; +1 makes the end index inclusive.
    answer_start = int(torch.argmax(outputs.start_logits))
    answer_end = int(torch.argmax(outputs.end_logits)) + 1

    answer_tokens = tokenizer.convert_ids_to_tokens(input_ids[answer_start:answer_end])
    return tokenizer.convert_tokens_to_string(answer_tokens)

# Reference passage that the answer span is extracted from.
context = '''The Great Wall of China, built between the 5th century BC and the 16th century, is a series of fortifications made
             of stone, brick, tamped earth, wood, and other materials. It was constructed to protect the Chinese states and
             empires against the raids and invasions of the various nomadic groups of the Eurasian Steppe.'''


# `model` and `tokenizer` are the module-level objects created earlier in this
# notebook (`model` is the one passed to the Trainer above).
# NOTE: input() blocks waiting for the user, so this cell cannot run unattended.
question=input('Enter your question')
answer = ask_question(question, context, model, tokenizer)
print(answer)

china
