In [1]:
import pandas as pd

# Location of the CSV with the QA examples (edit to point at your own file)
file_path = 'generated_data.csv'
df = pd.read_csv(file_path)

# Quick look at the top of the table (context / question / answer columns)
print(df.head(n=5))


                                             context  \
0  The heart is a muscular organ in humans that p...   
1  The human brain is responsible for controlling...   
2  Water is essential for life and makes up about...   
3  Diabetes is a disease that occurs when your bl...   
4          The lungs occurs when high blood glucose.   

                                       question    answer  
0                       What organ pumps blood?     heart  
1                         What controls memory?     brain  
2                   What is essential for life?     water  
3  What disease occurs with high blood glucose?  diabetes  
4    What lungs occurs when high blood glucose?     lungs  


In [2]:
from transformers import AutoTokenizer

# Tokenizer matching the uncased BERT checkpoint used throughout this notebook
checkpoint_name = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(checkpoint_name)

# Preprocessing function for tokenization
def preprocess_data(examples):
    """Tokenize one (question, context) row and label the answer's token span.

    Args:
        examples: A single row (e.g. a pandas Series or a dict) exposing the
            string fields 'question', 'context' and 'answer'.

    Returns:
        The tokenizer output for the (question, context) pair, padded/truncated
        to 512 tokens, with two extra entries:
        'start_positions' and 'end_positions' -- the indices, within
        'input_ids', of the first and last token of the answer (inclusive).
        Both are 0 (the [CLS] token) when the answer cannot be located in the
        context or was cut off by truncation.

    Notes:
        Relies on the module-level `tokenizer` being a "fast" tokenizer,
        because `char_to_token` is only available on fast tokenizers.
    """
    import re  # function-scope import keeps this cell self-contained

    question = examples['question']
    context = examples['context']
    answer = examples['answer'].strip()

    # Tokenize the pair: the question is sequence 0, the context is sequence 1
    tokenized_input = tokenizer(question, context, truncation=True, padding="max_length", max_length=512)

    # Default label: [CLS] at index 0, used when no valid answer span exists
    start_token, end_token = 0, 0

    # The checkpoint is uncased, so locate the answer ignoring case. The match
    # offsets refer to the ORIGINAL context string, which is what the
    # tokenizer's character-to-token alignment is based on.
    match = re.search(re.escape(answer), context, flags=re.IGNORECASE) if answer else None
    if match is not None:
        # The model expects token indices, not character offsets, so convert
        # the first and last character of the answer to their token indices.
        first_tok = tokenized_input.char_to_token(match.start(), sequence_index=1)
        last_tok = tokenized_input.char_to_token(match.end() - 1, sequence_index=1)
        # char_to_token returns None when the character has no token, e.g.
        # because that part of the context was truncated away.
        if first_tok is not None and last_tok is not None:
            start_token, end_token = first_tok, last_tok

    tokenized_input['start_positions'] = start_token
    tokenized_input['end_positions'] = end_token

    return tokenized_input

# Run the preprocessing row by row. Note that `df` is rebound here to the
# per-row results, so this cell presumably cannot be re-run without first
# reloading the CSV.
df = df.apply(preprocess_data, axis="columns")


  from .autonotebook import tqdm as notebook_tqdm


In [3]:
import torch
from torch.utils.data import Dataset

# Custom Dataset class for Question Answering
class QADataset(Dataset):
    """Torch dataset over pre-tokenized question-answering examples.

    Args:
        df: A sized iterable of mapping-like examples (for instance a pandas
            Series of tokenizer outputs). Every example must provide
            'input_ids', 'attention_mask', 'start_positions' and
            'end_positions'. If every example also provides
            'token_type_ids', those are forwarded as well so the model can
            tell the question segment from the context segment.
    """

    def __init__(self, df):
        self.data = df
        examples = list(df)

        self.input_ids = [item['input_ids'] for item in examples]
        self.attention_mask = [item['attention_mask'] for item in examples]
        self.start_positions = [item['start_positions'] for item in examples]
        self.end_positions = [item['end_positions'] for item in examples]

        # Segment ids are optional: only keep them when all examples have them,
        # so datasets built without them behave exactly as before.
        has_segments = bool(examples) and all('token_type_ids' in item for item in examples)
        self.token_type_ids = (
            [item['token_type_ids'] for item in examples] if has_segments else None
        )

    def __len__(self):
        """Return the number of examples."""
        return len(self.data)

    def __getitem__(self, idx):
        """Return example `idx` as a dict of tensors."""
        sample = {
            'input_ids': torch.tensor(self.input_ids[idx]),
            'attention_mask': torch.tensor(self.attention_mask[idx]),
            'start_positions': torch.tensor(self.start_positions[idx]),
            'end_positions': torch.tensor(self.end_positions[idx])
        }
        if self.token_type_ids is not None:
            sample['token_type_ids'] = torch.tensor(self.token_type_ids[idx])
        return sample

# Wrap the tokenized rows in the torch Dataset
qa_dataset = QADataset(df)

# Sanity check: show the first example's tensors
first_example = qa_dataset[0]
print(first_example)


{'input_ids': tensor([  101,  2054,  5812, 15856,  2668,  1029,   102,  1996,  2540,  2003,
         1037, 13472,  5812,  1999,  4286,  2008, 15856,  2668,  2083,  1996,
        25022, 11890, 20350,  2100,  2291,  1012,   102,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0, 

In [None]:
from transformers import Trainer, TrainingArguments, AutoModelForQuestionAnswering

# Pre-trained BERT encoder with a question-answering head on top
model = AutoModelForQuestionAnswering.from_pretrained("bert-base-uncased")

# Hyper-parameters and output locations for the run
training_args = TrainingArguments(
    # where checkpoints and logs are written
    output_dir="./results",
    logging_dir="./logs",
    logging_steps=10,
    # schedule
    num_train_epochs=3,
    warmup_steps=500,
    weight_decay=0.01,
    # batch sizes (per device) for training and evaluation
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
)

# The same dataset is used for training and evaluation here; swap in a
# held-out validation set for eval_dataset if one is available.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=qa_dataset,
    eval_dataset=qa_dataset,
)

# Run fine-tuning
trainer.train()





Some weights of BertForQuestionAnswering were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
