**Installing necessary packages and importing necessary libraries**

In [None]:
from google.colab import drive
drive.mount('/content/drive')
!pip install -q simpletransformers datasets sacrebleu evaluate torch accelerate tqdm
import re, torch, collections, evaluate, datasets, warnings
warnings.filterwarnings("ignore")
import numpy as np
import pandas as pd
from datasets import Dataset, DatasetDict
from sklearn.model_selection import train_test_split
from tqdm.auto import tqdm
from torch.utils.data import DataLoader
from torch.optim import AdamW
from accelerate import Accelerator
from transformers import AutoTokenizer, AutoModelForQuestionAnswering, default_data_collator, get_scheduler, pipeline

Mounted at /content/drive
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m42.4/42.4 kB[0m [31m2.5 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m51.8/51.8 kB[0m [31m4.4 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.6/43.6 kB[0m [31m3.2 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m44.3/44.3 kB[0m [31m3.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m316.3/316.3 kB[0m [31m15.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m480.6/480.6 kB[0m [31m34.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m104.0/104.0 kB[0m [31m9.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.0/84.0 k

**Reading the dataset**

In [None]:
# Load the SQuAD question/answer table that was exported to Google Drive as CSV,
# then peek at one random row to confirm the columns look right.
df = pd.read_csv(filepath_or_buffer='/content/drive/MyDrive/SQuAD_csv.csv')
df.sample(n=1)

Unnamed: 0.1,Unnamed: 0,context,question,id,answer_start,answer
37997,148,"By August, the KPA had pushed back the ROK Arm...",How did the KPA hurt the Republic of Korea's i...,57269eab708984140094cbeb,209,by killing civil servants and intellectuals


**Checking for missing values**

In [None]:
# Per-column count of missing cells (isna is the canonical spelling of isnull).
df.isna().sum()

Unnamed: 0,0
Unnamed: 0,0
context,0
question,0
id,0
answer_start,0
answer,3


**3 missing values found in the answer column. Dropping those rows and resetting the index**

In [None]:
# Discard every row containing a missing value and renumber the remaining rows from 0.
df = df.dropna().reset_index(drop=True)

**Looking for and removing duplicates**

In [None]:
# Print the shape before and after de-duplication so any removed rows are visible.
print(df.shape)
df = df.drop_duplicates(keep='first')
print(df.shape)

(86818, 6)
(86818, 6)


**Resetting index**

In [None]:
# Renumber the row labels from 0 and discard the previous index (drop=True).
df = df.reset_index(drop=True)

**Overwriting the 'id' column with a sequential integer index**

In [None]:
# Replace the original SQuAD ids with sequential integers 0..n-1.
# The length is taken from the frame itself rather than hard-coded, so the
# assignment keeps working if the cleaning steps above keep a different
# number of rows.
df['id'] = np.arange(len(df))

**Defining a function to find the answer start indices and applying the function to each row in the dataframe**

In [None]:
def find_answer_start(context, answer):
    """Locate the first literal occurrence of ``answer`` inside ``context``.

    The answer text is escaped before searching, so regex metacharacters in it
    are matched verbatim.

    Returns:
        The 0-based character offset of the first match, or -1 when the answer
        does not occur in the context.
    """
    found = re.compile(re.escape(answer)).search(context)
    return found.start() if found is not None else -1

# Recompute answer_start for every row as the first position of the answer
# text inside its context (-1 when the text is not found).
df['answer_start'] = [
    find_answer_start(row_context, row_answer)
    for row_context, row_answer in zip(df['context'], df['answer'])
]
df.sample(n=1)

Unnamed: 0.1,Unnamed: 0,context,question,id,answer_start,answer
25038,27,"In 1997, OMB issued a Federal Register notice ...",In what year did OMB revise the standards for ...,25038,3,1997


**Checking the entries for which the answer was not found in the context**

In [None]:
# Rows whose answer text could not be located in the context (marked with -1).
no_answer = df.loc[df['answer_start'].eq(-1)]
print(no_answer.shape)

(1806, 6)


**No answers were found for 1806 entries**

**Removing the entries for which the answer was not found in the context**

In [None]:
# Keep only rows whose answer was located in the context.
# .copy() gives an independent frame: new columns are assigned to `df` in the
# following cell, and assigning to a boolean-mask slice is what pandas flags
# with SettingWithCopyWarning (currently hidden by warnings.filterwarnings).
df = df[df['answer_start'] != -1].copy()

**Converting 'answer' and 'answer_start' columns to list format as required by the model and saving them in separate columns**

In [None]:
# Wrap each answer text and each start offset in a one-element list.
answer_lists = [[text] for text in df['answer']]
answer_start_lists = [[start] for start in df['answer_start']]

# Store the wrapped values next to the originals for now.
df['answer_new'] = answer_lists
df['answer_start_new'] = answer_start_lists

df.sample(n=1)

Unnamed: 0.1,Unnamed: 0,context,question,id,answer_start,answer,answer_new,answer_start_new
39479,231,"However, this was not always the case; in the ...",What did Barthes believe a wrestler give the a...,39479,1012,a theatrical spectacle,[a theatrical spectacle],[1012]


**Dropping the previously present 'answer' and 'answer_start' columns and renaming the columns now present**

In [None]:
# Replace the scalar answer columns with their list-valued counterparts:
# drop the originals, then give the *_new columns the original names.
df = (
    df.drop(columns=['answer', 'answer_start'])
      .rename(columns={'answer_new': 'answer', 'answer_start_new': 'answer_start'})
)
df.sample(n=1)

Unnamed: 0.1,Unnamed: 0,context,question,id,answer,answer_start
70204,106,Vacuum is useful in a variety of processes and...,Hight to ultra-high vacuums removes what obstr...,70204,"[obstruction of air,]",[537]


**Making train, test and validation splits**

In [None]:
# Hold out 20% of the rows as the test set (fixed seed for reproducibility).
train_old, test = train_test_split (df, test_size =0.2, random_state =6)

# Split the remaining 80% again 80/20, giving roughly 64% train / 16% validation / 20% test overall.
train, validation = train_test_split (train_old, test_size =0.2, random_state =6)

**Converting individual datasets to Arrow format for some preprocessing and removing the newly made index column**

In [None]:
# Convert each pandas split to a datasets.Dataset and drop the
# '__index_level_0__' column that from_pandas creates from the DataFrame index.
train = Dataset.from_pandas(train).remove_columns(['__index_level_0__'])
test = Dataset.from_pandas(test).remove_columns(['__index_level_0__'])
validation = Dataset.from_pandas(validation).remove_columns(['__index_level_0__'])

**Converting the answers to appropriate dictionary format**

In [None]:
def _to_squad_record(example):
    """Return the example's id/context/question plus a SQuAD-style ``answers`` dict.

    ``answers`` has the keys ``text`` and ``answer_start``, taken from the
    example's list-valued ``answer`` and ``answer_start`` columns.
    """
    return {
        'id': example['id'],
        'context': example['context'],
        'question': example['question'],
        'answers': {'text': example['answer'], 'answer_start': example['answer_start']},
    }


# The same conversion is applied to every split.
train = train.map(_to_squad_record)

validation = validation.map(_to_squad_record)

test = test.map(_to_squad_record)

Map:   0%|          | 0/54407 [00:00<?, ? examples/s]

Map:   0%|          | 0/13602 [00:00<?, ? examples/s]

Map:   0%|          | 0/17003 [00:00<?, ? examples/s]

**Removing 'answer' and 'answer_start' columns since we no longer need them**

In [None]:
# The flat answer columns are now duplicated inside 'answers'; remove them from every split.
obsolete_columns = ['answer', 'answer_start']

train = train.remove_columns(obsolete_columns)
test = test.remove_columns(obsolete_columns)
validation = validation.remove_columns(obsolete_columns)

train

Dataset({
    features: ['Unnamed: 0', 'context', 'question', 'id', 'answers'],
    num_rows: 54407
})

**Combining the 3 datasets in to a single dataset**

In [None]:
# Bundle the three splits under their conventional names.
ds = DatasetDict({
    'train': train,
    'test': test,
    'validation': validation,
})

ds

DatasetDict({
    train: Dataset({
        features: ['Unnamed: 0', 'context', 'question', 'id', 'answers'],
        num_rows: 54407
    })
    test: Dataset({
        features: ['Unnamed: 0', 'context', 'question', 'id', 'answers'],
        num_rows: 17003
    })
    validation: Dataset({
        features: ['Unnamed: 0', 'context', 'question', 'id', 'answers'],
        num_rows: 13602
    })
})

**Initializing the model and tokenizer**

In [None]:
# Hub checkpoint that is fine-tuned below; the tokenizer is loaded from the same checkpoint.
model_checkpoint = "bert-base-cased"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

tokenizer_config.json:   0%|          | 0.00/49.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/213k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/436k [00:00<?, ?B/s]

**Inserting special tokens in the context and question columns of train dataset**

In [None]:
# Tokenize the first training example as a (question, context) pair and decode
# it back to text to see where the special tokens were inserted.
first_example = ds["train"][0]
context, question = first_example["context"], first_example["question"]

inputs = tokenizer(question, context)
tokenizer.decode(inputs["input_ids"])

"[CLS] The Manila Carnival was dissolved after what year? [SEP] The term Carnival is traditionally used in areas with a large Catholic presence. However, the Philippines, a predominantly Roman Catholic country, does not celebrate Carnival anymore since the dissolution of the Manila Carnival after 1939, the last carnival in the country. In historically Lutheran countries, the celebration is known as Fastelavn, and in areas with a high concentration of Anglicans and Methodists, pre - Lenten celebrations, along with penitential observances, occur on Shrove Tuesday. In Eastern Orthodox nations, Maslenitsa is celebrated during the last week before Great Lent. In German - speaking Europe and the Netherlands, the Carnival season traditionally opens on 11 / 11 ( often at 11 : 11 a. m. ). This dates back to celebrations before the Advent season or with harvest celebrations of St. Martin ' s Day. [SEP]"

**Limiting the input to 100 tokens, using a sliding window with a stride of 50 tokens and returning overflowing tokens**

In [None]:
# Truncate only the context (second sequence) to fit 100 tokens and return the
# overflow as additional windows that overlap by `stride` tokens.
inputs = tokenizer(
    question,
    context,
    max_length=100,
    truncation="only_second",
    stride=50,
    return_overflowing_tokens=True,
)

# One decoded line per window.
for window_ids in inputs["input_ids"]:
    print(tokenizer.decode(window_ids))

[CLS] The Manila Carnival was dissolved after what year? [SEP] The term Carnival is traditionally used in areas with a large Catholic presence. However, the Philippines, a predominantly Roman Catholic country, does not celebrate Carnival anymore since the dissolution of the Manila Carnival after 1939, the last carnival in the country. In historically Lutheran countries, the celebration is known as Fastelavn, and in areas with a high concentration of Anglicans and Methodists, pre - Lenten celebrations, along with penitential o [SEP]
[CLS] The Manila Carnival was dissolved after what year? [SEP] 1939, the last carnival in the country. In historically Lutheran countries, the celebration is known as Fastelavn, and in areas with a high concentration of Anglicans and Methodists, pre - Lenten celebrations, along with penitential observances, occur on Shrove Tuesday. In Eastern Orthodox nations, Maslenitsa is celebrated during the last week before Great Lent. In German - speaking Europe [SEP]


**Finding the end character of the answer in the context by setting offset mapping = True**

In [None]:
# Same windowed tokenization, additionally requesting the offset mapping.
# The resulting keys (shown below) include 'offset_mapping' and 'overflow_to_sample_mapping'.
inputs = tokenizer(question, context, max_length=100, truncation="only_second", stride=50,
                   return_overflowing_tokens=True, return_offsets_mapping=True)
inputs.keys()

dict_keys(['input_ids', 'token_type_ids', 'attention_mask', 'offset_mapping', 'overflow_to_sample_mapping'])

**Creating a function for preprocessing training dataset**

In [None]:
# Token length of every feature: passed as max_length together with padding="max_length".
max_length = 384
# Passed as the tokenizer's `stride` when a long context is split into overflowing windows.
stride = 128


def preprocess_training_examples(examples):
    """Tokenize a batch of examples and label each feature with answer token positions.

    Reads the module-level ``tokenizer``, ``max_length`` and ``stride``.

    Args:
        examples: a batch with "question", "context" and "answers" columns;
            each entry of "answers" provides "text" and "answer_start" lists,
            of which only the first element is used.

    Returns:
        The tokenizer output with "offset_mapping" and
        "overflow_to_sample_mapping" removed and with "start_positions" /
        "end_positions" added (one pair per feature). A feature whose context
        window does not fully contain the answer is labelled (0, 0).
    """
    questions = [q.strip() for q in examples["question"]]
    inputs = tokenizer(
        questions,
        examples["context"],
        max_length=max_length,
        truncation="only_second",
        stride=stride,
        return_overflowing_tokens=True,
        return_offsets_mapping=True,
        padding="max_length",
    )
    # One example can produce several features (windows); sample_map records
    # which example each feature came from.
    offset_mapping = inputs.pop("offset_mapping")
    sample_map = inputs.pop("overflow_to_sample_mapping")
    answers = examples["answers"]
    start_positions = []
    end_positions = []

    for i, offset in enumerate(offset_mapping):
        sample_idx = sample_map[i]
        answer = answers[sample_idx]
        # Character span [start_char, end_char) of the answer in the original context.
        start_char = answer["answer_start"][0]
        end_char = answer["answer_start"][0] + len(answer["text"][0])
        sequence_ids = inputs.sequence_ids(i)

        # Find the start and end of the context
        # (tokens whose sequence id is 1; the question is the first sequence).
        idx = 0
        while sequence_ids[idx] != 1:
            idx += 1
        context_start = idx
        while sequence_ids[idx] == 1:
            idx += 1
        context_end = idx - 1

        # If the answer is not fully inside the context, label is (0, 0)
        if offset[context_start][0] > start_char or offset[context_end][1] < end_char:
            start_positions.append(0)
            end_positions.append(0)
        else:
            # Otherwise it's the start and end token positions
            # Walk forward past every token starting at or before start_char;
            # the last such token (idx - 1) is the answer's first token.
            idx = context_start
            while idx <= context_end and offset[idx][0] <= start_char:
                idx += 1
            start_positions.append(idx - 1)

            # Walk backward past every token ending at or after end_char;
            # the last such token (idx + 1) is the answer's final token.
            idx = context_end
            while idx >= context_start and offset[idx][1] >= end_char:
                idx -= 1
            end_positions.append(idx + 1)

    inputs["start_positions"] = start_positions
    inputs["end_positions"] = end_positions
    return inputs

**Applying the function to training dataset**

In [None]:
# Tokenize and label the training split in batches; the original columns are
# removed so only the tokenizer outputs and the position labels remain.
train_dataset = ds["train"].map(preprocess_training_examples, batched=True,
                                remove_columns=ds["train"].column_names)
# (number of examples, number of features) - the second is larger because long
# contexts overflow into several windows.
len(ds["train"]), len(train_dataset)

Map:   0%|          | 0/54407 [00:00<?, ? examples/s]

(54407, 55115)

**Creating a function for preprocessing validation dataset**

In [None]:
def preprocess_validation_examples(examples):
    """Tokenize a batch of examples for evaluation.

    Reads the module-level ``tokenizer``, ``max_length`` and ``stride``.

    Args:
        examples: a batch with "question", "context" and "id" columns.

    Returns:
        The tokenizer output without "overflow_to_sample_mapping" and with:
        * "example_id": for every feature, the id of the example it was cut from;
        * "offset_mapping": kept, but every entry that does not belong to the
          context (sequence id != 1) is replaced by None.
    """
    cleaned_questions = [text.strip() for text in examples["question"]]
    inputs = tokenizer(
        cleaned_questions,
        examples["context"],
        truncation="only_second",
        max_length=max_length,
        stride=stride,
        padding="max_length",
        return_offsets_mapping=True,
        return_overflowing_tokens=True,
    )

    # Translate each feature's source-example index into that example's id.
    feature_to_example = inputs.pop("overflow_to_sample_mapping")
    inputs["example_id"] = [examples["id"][source] for source in feature_to_example]

    # Blank out offsets of question / special / padding tokens so that only
    # context tokens can later be chosen as answer boundaries.
    for feature_idx, offsets in enumerate(inputs["offset_mapping"]):
        seq_ids = inputs.sequence_ids(feature_idx)
        inputs["offset_mapping"][feature_idx] = [
            span if seq_id == 1 else None for span, seq_id in zip(offsets, seq_ids)
        ]

    return inputs

**Applying the function to validation dataset**

In [None]:
# Tokenize the validation split in batches, keeping only the columns produced
# by preprocess_validation_examples.
validation_dataset = ds["validation"].map(preprocess_validation_examples, batched=True,
                                          remove_columns=ds["validation"].column_names)
# (number of examples, number of features)
len(ds["validation"]), len(validation_dataset)

Map:   0%|          | 0/13602 [00:00<?, ? examples/s]

(13602, 13772)

**Using a default model for the QA pipeline to generate some predictions on a small part of the validation set**

In [None]:
# First 100 validation examples, used for a quick sanity check.
small_eval_set = ds["validation"].select(range(100))
trained_checkpoint = "distilbert-base-cased-distilled-squad"

# preprocess_validation_examples reads the global `tokenizer`, so it is
# temporarily rebound to the tokenizer of the checkpoint used for this check.
tokenizer = AutoTokenizer.from_pretrained (trained_checkpoint)
eval_set = small_eval_set.map(preprocess_validation_examples, batched=True,
                              remove_columns=ds["validation"].column_names)

tokenizer_config.json:   0%|          | 0.00/49.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/473 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/213k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/436k [00:00<?, ?B/s]

Map:   0%|          | 0/100 [00:00<?, ? examples/s]

**Initializing the tokenizer again**

In [None]:
# Rebind the global tokenizer to the one belonging to model_checkpoint again.
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

**Removing the columns of validation dataset that are not expected by the model and converting the predictions to numpy arrays**

In [None]:
# Keep only the tensor columns the model accepts and expose them as torch tensors.
eval_set_for_model = eval_set.remove_columns(["example_id", "offset_mapping"])
eval_set_for_model.set_format("torch")

# Use the GPU when one is available.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
trained_model = AutoModelForQuestionAnswering.from_pretrained(trained_checkpoint).to(device)

# The whole small evaluation set is sent through the model as a single batch.
batch = {name: eval_set_for_model[name].to(device) for name in eval_set_for_model.column_names}

with torch.no_grad():
    outputs = trained_model(**batch)

# Bring the logits back to the CPU as numpy arrays for post-processing.
start_logits, end_logits = (
    logits.cpu().numpy() for logits in (outputs.start_logits, outputs.end_logits)
)

model.safetensors:   0%|          | 0.00/261M [00:00<?, ?B/s]

**Mapping each example in small validation dataset to the corresponding features in validation dataset**

In [None]:
# example id -> indices of every feature (window) that was cut from that example
example_to_features = collections.defaultdict(list)
for feature_idx, source_id in enumerate(eval_set["example_id"]):
    example_to_features[source_id].append(feature_idx)

**Generating predicted answers for evaluation**

In [None]:
# Number of top start / end logits considered per feature.
n_best = 20
# Longest answer, in tokens, that is accepted as a candidate.
max_answer_length = 30
predicted_answers = []

# Read the offsets column once instead of indexing eval_set["offset_mapping"]
# inside the loop, which fetches the whole column again for every feature.
feature_offsets = eval_set["offset_mapping"]

for example in small_eval_set:
    example_id = example["id"]
    context = example["context"]
    answers = []

    for feature_index in example_to_features[example_id]:
        start_logit = start_logits[feature_index]
        end_logit = end_logits[feature_index]
        offsets = feature_offsets[feature_index]

        # Indices of the n_best highest logits, best first.
        start_indexes = np.argsort(start_logit)[-1 : -n_best - 1 : -1].tolist()
        end_indexes = np.argsort(end_logit)[-1 : -n_best - 1 : -1].tolist()
        for start_index in start_indexes:
            for end_index in end_indexes:
                # Skip answers that are not fully in the context
                if offsets[start_index] is None or offsets[end_index] is None:
                    continue
                # Skip answers with a length that is either < 0 or > max_answer_length.
                if (
                    end_index < start_index
                    or end_index - start_index + 1 > max_answer_length
                ):
                    continue

                answers.append(
                    {
                        "text": context[offsets[start_index][0] : offsets[end_index][1]],
                        "logit_score": start_logit[start_index] + end_logit[end_index],
                    }
                )

    # An example can end up with no valid candidate span; max() on an empty
    # list raises ValueError, so fall back to an empty prediction (the same
    # convention compute_metrics uses).
    if answers:
        best_answer = max(answers, key=lambda x: x["logit_score"])
        predicted_answers.append({"id": example_id, "prediction_text": best_answer["text"]})
    else:
        predicted_answers.append({"id": example_id, "prediction_text": ""})

**Initializing evaluation metric and theoretical answers**

In [None]:
# Load the "squad" metric from the evaluate library.
metric = evaluate.load("squad")
# Reference answers in the {"id", "answers"} form passed to metric.compute as `references`.
theoretical_answers = [{"id": ex["id"], "answers": ex["answers"]} for ex in small_eval_set]

Downloading builder script:   0%|          | 0.00/4.53k [00:00<?, ?B/s]

Downloading extra modules:   0%|          | 0.00/3.32k [00:00<?, ?B/s]

**Defining the function to compute metrics**

In [None]:
def compute_metrics(start_logits, end_logits, features, examples):
    """Turn per-feature logits into one text prediction per example and score them.

    Reads the module-level ``n_best``, ``max_answer_length`` and ``metric``.

    Args:
        start_logits: per-feature start logits, indexable by feature position.
        end_logits: per-feature end logits, indexable by feature position.
        features: tokenized features providing "example_id" and "offset_mapping".
        examples: the source examples providing "id", "context" and "answers".

    Returns:
        Whatever ``metric.compute`` returns for the predictions and references.
    """
    # example id -> positions of the features cut from that example
    features_by_example = collections.defaultdict(list)
    for feature_pos, feature in enumerate(features):
        features_by_example[feature["example_id"]].append(feature_pos)

    predicted_answers = []
    for example in tqdm(examples):
        example_id, context = example["id"], example["context"]
        candidates = []

        # Collect candidate spans from every feature belonging to this example.
        for feature_pos in features_by_example[example_id]:
            feature_start = start_logits[feature_pos]
            feature_end = end_logits[feature_pos]
            offsets = features[feature_pos]["offset_mapping"]

            # Indices of the n_best highest logits, best first.
            top_starts = np.argsort(feature_start)[-1 : -n_best - 1 : -1].tolist()
            top_ends = np.argsort(feature_end)[-1 : -n_best - 1 : -1].tolist()
            for start_index in top_starts:
                for end_index in top_ends:
                    # Both boundaries must be context tokens (offset not None).
                    if offsets[start_index] is None or offsets[end_index] is None:
                        continue
                    # Reject reversed spans and spans longer than max_answer_length.
                    span_length = end_index - start_index + 1
                    if span_length < 1 or span_length > max_answer_length:
                        continue

                    candidates.append(
                        {
                            "text": context[offsets[start_index][0] : offsets[end_index][1]],
                            "logit_score": feature_start[start_index] + feature_end[end_index],
                        }
                    )

        # Highest combined logit wins; no candidate at all means an empty prediction.
        if candidates:
            best_text = max(candidates, key=lambda cand: cand["logit_score"])["text"]
        else:
            best_text = ""
        predicted_answers.append({"id": example_id, "prediction_text": best_text})

    theoretical_answers = [
        {"id": reference["id"], "answers": reference["answers"]} for reference in examples
    ]
    return metric.compute(predictions=predicted_answers, references=theoretical_answers)

**Converting train and validation datasets to torch format and preparing train and validation dataloader**

In [None]:
# Expose both datasets as torch tensors; the validation copy first loses the
# two bookkeeping columns that are not model inputs.
train_dataset.set_format("torch")
validation_set = validation_dataset.remove_columns(["example_id", "offset_mapping"])
validation_set.set_format("torch")

# Training batches are shuffled; evaluation keeps the dataset order.
train_dataloader = DataLoader(
    dataset=train_dataset,
    batch_size=15,
    shuffle=True,
    collate_fn=default_data_collator,
)
eval_dataloader = DataLoader(
    dataset=validation_set,
    batch_size=1,
    collate_fn=default_data_collator,
)

**Initializing the model, adam optimizer, and accelerator**

In [None]:
# Load model_checkpoint with a question-answering head; per the warning printed
# below, the qa_outputs weights are newly initialized and must be trained.
model = AutoModelForQuestionAnswering.from_pretrained(model_checkpoint)
optimizer = AdamW(model.parameters(), lr=2e-5)
accelerator = Accelerator()
# From here on, use the objects returned by prepare() rather than the originals.
model, optimizer, train_dataloader, eval_dataloader = accelerator.prepare(model, optimizer,
                                                                          train_dataloader, eval_dataloader)

model.safetensors:   0%|          | 0.00/436M [00:00<?, ?B/s]

Some weights of BertForQuestionAnswering were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


**Setting training configurations**

In [None]:
num_train_epochs = 3
# One optimizer update per batch, so updates per epoch equals the number of batches.
num_update_steps_per_epoch = len(train_dataloader)
num_training_steps = num_train_epochs * num_update_steps_per_epoch

# "linear" schedule with no warm-up steps, spanning num_training_steps.
lr_scheduler = get_scheduler("linear", optimizer=optimizer, num_warmup_steps=0,
                             num_training_steps=num_training_steps,)

**Defining model name and output directory**

In [None]:
model_name = "qa_model"
# Directory that receives the fine-tuned model and tokenizer after every epoch.
# This must be a directory: it previously pointed at the dataset CSV file
# itself, which save_pretrained cannot write a checkpoint into. The path
# matches the directory the inference cells at the end of the notebook load from.
output_dir = "/content/drive/MyDrive/qa/model"

**Model training**

In [None]:
# One progress-bar tick per optimizer update across all epochs.
progress_bar = tqdm(range(num_training_steps))

for epoch in range(num_train_epochs):
    # Training
    model.train()
    for step, batch in enumerate(train_dataloader):
        # The batch contains start_positions / end_positions, so the model
        # output carries the loss.
        outputs = model(**batch)
        loss = outputs.loss
        # Backward pass goes through the accelerator rather than loss.backward().
        accelerator.backward(loss)

        # Update the weights, advance the learning-rate schedule, then clear the gradients.
        optimizer.step()
        lr_scheduler.step()
        optimizer.zero_grad()
        progress_bar.update(1)

    # Save a checkpoint at the end of every epoch (nothing is uploaded here).
    # NOTE(review): output_dir must be a directory path - confirm before training.
    accelerator.wait_for_everyone()
    unwrapped_model = accelerator.unwrap_model(model)
    unwrapped_model.save_pretrained(output_dir, save_function=accelerator.save)
    # Only the main process writes the tokenizer files.
    if accelerator.is_main_process:
        tokenizer.save_pretrained(output_dir)

  0%|          | 0/11025 [00:00<?, ?it/s]

In [None]:
# Context passage of test example 141.
test['context'][141]

"In 1840, Louis Philippe I obtained permission from the British to return Napoleon's remains to France. On 15 December 1840, a state funeral was held. The hearse proceeded from the Arc de Triomphe down the Champs-Élysées, across the Place de la Concorde to the Esplanade des Invalides and then to the cupola in St Jérôme's Chapel, where it remained until the tomb designed by Louis Visconti was completed. In 1861, Napoleon's remains were entombed in a porphyry sarcophagus in the crypt under the dome at Les Invalides."

In [None]:
# Question of test example 141.
test['question'][141]

" In what building was the cupola where Napoleon's remains were first placed located?"

In [None]:
# Reference answer text(s) of test example 141.
test['answers'][141]['text']

["St Jérôme's Chapel"]

**Taking user input and finding answers by the trained model**

In [None]:
from transformers import AutoModelForQuestionAnswering, AutoTokenizer

# Define the model and tokenizer
# NOTE(review): this rebinds the global names `model`, `tokenizer` and
# `model_name`, so the objects used by the training loop are no longer
# reachable through them. "bert-base-uncased" is a base checkpoint: the warning
# printed below reports that its qa_outputs weights are newly initialized, i.e.
# this is not the fine-tuned model. Presumably the intent was to load the
# fine-tuned checkpoint from its output directory - confirm.
model_name = "bert-base-uncased"  # Replace with your desired model
model = AutoModelForQuestionAnswering.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForQuestionAnswering were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

In [None]:
from transformers import AutoModelForQuestionAnswering, AutoTokenizer

# Assuming 'model' is your trained Hugging Face model
# NOTE(review): when the notebook is run top to bottom, `model` at this point is
# the bert-base-uncased model loaded in the previous cell (untrained QA head),
# not the model fine-tuned in the training loop - verify which model should be
# written here before relying on this directory for inference.
# This also rebinds the global `output_dir`.
output_dir = "/content/drive/MyDrive/qa_model/"

# Save model weights
model.save_pretrained(output_dir)

# Save tokenizer
# The tokenizer is re-downloaded from the Hub and stored next to the model files.
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")  # Replace with your tokenizer
tokenizer.save_pretrained(output_dir)


('/content/drive/MyDrive/qa_model/tokenizer_config.json',
 '/content/drive/MyDrive/qa_model/special_tokens_map.json',
 '/content/drive/MyDrive/qa_model/vocab.txt',
 '/content/drive/MyDrive/qa_model/added_tokens.json',
 '/content/drive/MyDrive/qa_model/tokenizer.json')

In [None]:
# Step 1: Mount Google Drive
from google.colab import drive
drive.mount('/content/drive')

# Step 2: Import necessary libraries
from transformers import pipeline

# Step 3: Define the fine-tuned model directory.
# A dedicated name is used so the global `model_checkpoint` (the Hub checkpoint
# name used for tokenization and training) is not overwritten with a Drive path.
# The directory is the one the other inference cells in this notebook load
# from; "/content/drive/MyDrive/qa_model/" holds the bert-base-uncased model
# saved with a newly initialized QA head, which produces meaningless answers.
qa_model_dir = "/content/drive/MyDrive/qa/model"  # Update to your trained model directory

# Step 4: Load the trained model into the question-answering pipeline
question_answerer = pipeline("question-answering", model=qa_model_dir, tokenizer=qa_model_dir)

# Step 5: Provide inputs (context and question) and get the answer
context = input("Enter the context: ")
question = input("Enter the question: ")

try:
    # Step 6: Generate the answer
    answer = question_answerer(question=question, context=context)['answer']
    print("Answer:", answer)
except Exception as e:
    # Report the failure instead of raising so the interactive cell finishes cleanly.
    print("Error:", e)


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Enter the context: BeyoncÃ© Giselle Knowles-Carter (/biËËˆjÉ’nseÉª/ bee-YON-say) (born September 4, 1981) is an American singer, songwriter, record producer and actress. Born and raised in Houston, Texas, she performed in various singing and dancing competitions as a child, and rose to fame in the late 1990s as lead singer of R&B girl-group Destiny's Child. Managed by her father, Mathew Knowles, the group became one of the world's best-selling girl groups of all time. Their hiatus saw the release of BeyoncÃ©'s debut album, Dangerously in Love (2003), which established her as a solo artist worldwide, earned five Grammy Awards and featured the Billboard Hot 100 number-one singles "Crazy in Love" and "Baby Boy".
Enter the question: What areas did Beyonce compete in when she was growing up?
Answer: Grammy Awards and featured the Billboard Hot 100 number-one sing

In [None]:
# Step 1: Mount Google Drive
from google.colab import drive
drive.mount('/content/drive')

# Step 2: Import necessary libraries
from transformers import pipeline

# Step 3: Define the fine-tuned model directory.
# A dedicated name is used so the global `model_checkpoint` (the Hub checkpoint
# name used for tokenization and training) is not overwritten with a Drive
# path, which would break re-running the earlier cells.
qa_model_dir = "/content/drive/MyDrive/qa/model"  # Update to your trained model directory

# Step 4: Load the trained model into the question-answering pipeline
question_answerer = pipeline("question-answering", model=qa_model_dir, tokenizer=qa_model_dir)

# Step 5: Provide inputs (context and question) and get the answer
context = input("Enter the context: ")
question = input("Enter the question: ")

try:
    # Step 6: Generate the answer
    answer = question_answerer(question=question, context=context)['answer']
    print("Answer:", answer)
except Exception as e:
    # Report the failure instead of raising so the interactive cell finishes cleanly.
    print("Error:", e)


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Enter the context: BeyoncÃ© Giselle Knowles-Carter (/biËËˆjÉ’nseÉª/ bee-YON-say) (born September 4, 1981) is an American singer, songwriter, record producer and actress. Born and raised in Houston, Texas, she performed in various singing and dancing competitions as a child, and rose to fame in the late 1990s as lead singer of R&B girl-group Destiny's Child. Managed by her father, Mathew Knowles, the group became one of the world's best-selling girl groups of all time. Their hiatus saw the release of BeyoncÃ©'s debut album, Dangerously in Love (2003), which established her as a solo artist worldwide, earned five Grammy Awards and featured the Billboard Hot 100 number-one singles "Crazy in Love" and "Baby Boy".
Enter the question: When did Beyonce leave Destiny's Child and become a solo singer?
Answer: 2003


In [None]:
# Step 1: Mount Google Drive
from google.colab import drive
drive.mount('/content/drive')

# Step 2: Import necessary libraries
from transformers import pipeline

# Step 3: Define the fine-tuned model directory.
# A dedicated name is used so the global `model_checkpoint` (the Hub checkpoint
# name used for tokenization and training) is not overwritten with a Drive
# path, which would break re-running the earlier cells.
qa_model_dir = "/content/drive/MyDrive/qa/model"  # Update to your trained model directory

# Step 4: Load the trained model into the question-answering pipeline
question_answerer = pipeline("question-answering", model=qa_model_dir, tokenizer=qa_model_dir)

# Step 5: Provide inputs (context and question) and get the answer
context = input("Enter the context: ")
question = input("Enter the question: ")

try:
    # Step 6: Generate the answer
    answer = question_answerer(question=question, context=context)['answer']
    print("Answer:", answer)
except Exception as e:
    # Report the failure instead of raising so the interactive cell finishes cleanly.
    print("Error:", e)


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Enter the context: BeyoncÃ© Giselle Knowles-Carter (/biËËˆjÉ’nseÉª/ bee-YON-say) (born September 4, 1981) is an American singer, songwriter, record producer and actress. Born and raised in Houston, Texas, she performed in various singing and dancing competitions as a child, and rose to fame in the late 1990s as lead singer of R&B girl-group Destiny's Child. Managed by her father, Mathew Knowles, the group became one of the world's best-selling girl groups of all time. Their hiatus saw the release of BeyoncÃ©'s debut album, Dangerously in Love (2003), which established her as a solo artist worldwide, earned five Grammy Awards and featured the Billboard Hot 100 number-one singles "Crazy in Love" and "Baby Boy".
Enter the question: When did Beyonce start becoming popular?
Answer: late 1990s
