In [1]:
import torch
from transformers import AutoModelForQuestionAnswering, AutoTokenizer, TrainingArguments, Trainer
from datasets import load_dataset
import evaluate
import numpy as np
from torch.utils.data import DataLoader
from transformers import AdamW
from tqdm import tqdm  # For progress bar

In [2]:
# 1. Model and Tokenizer (same as before)
model_name = "distilbert-base-cased-distilled-squad"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForQuestionAnswering.from_pretrained(model_name)

In [3]:
# 2. Streaming Datasets (same as before)
scienceqa_train = load_dataset("derek-thomas/ScienceQA", "default", split="train", streaming=True)
scienceqa_validation = load_dataset("derek-thomas/ScienceQA", "default", split="validation", streaming=True)

In [15]:
# 3. Preprocessing Function (same as before)
# Tokenization function
def preprocess_function(examples):
    """
    Tokenizes the input questions and answers.
    """
#     print(f'keys {examples.keys()}')
    inputs = [f"Question: {q} Context: {c} Answer Choices: {a}" 
              for q, c, a in zip(examples["question"], examples["subject"], examples["choices"])]
#     print(f'inputs 0 {inputs[0]}')
#     print(f'example question {examples["question"][0]} ')
#     print(f'example choices {examples["choices"][0]} ')
#     print(f'example answer {examples["answer"][0]} ')
#     answers = [ [ans,examples["choices"][t][ans]] 
#               for t in examples["answer"][t][ans]]
    answers = []
    questions = []
    index = 0
    for e in examples["question"]:
        choices = examples["choices"][index]
        question_and_choices = e +" Choices are "+ ", ".join(choices)+ "."
#         print(f'question {question_and_choices}')
        answer_index = examples["answer"][index]
        answer = choices[answer_index]
#         print(f'ans: {answer}')
        questions.append(question_and_choices)
        answers.append(answer)
#         question_answer_pairs.append({"question": question_and_choices, 
#                                      "answer": answer})
        index+=1
        
    
#     print(f'sample answers {answers[0]}')
#     labels = [tokenizer(ans, padding="max_length", truncation=True, max_length=128).input_ids 
#               for ans in examples["answer"]]
    tokenized_answers = [tokenizer(ans, padding="max_length", truncation=True, max_length=256) for ans in answers]
    

    # Tokenize the inputs
    
    tokenized_inputs = tokenizer(questions, padding="max_length", truncation=True, max_length=256)
    
    tokenized_inputs["answers"] = tokenized_answers
    return tokenized_inputs


In [16]:
# Define training arguments
training_args = TrainingArguments(
    output_dir="./scienceqa_finetuned",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    learning_rate=5e-5,
    num_train_epochs=3,
    weight_decay=0.01,
    save_total_limit=2,
    logging_dir="./logs",
    logging_steps=500,
    fp16=True if torch.cuda.is_available() else False,  # Use mixed precision training if GPU available
    max_steps=5,  # Explicitly set max steps to avoid error
)
    



In [17]:
# 4. Data Loaders
batch_size = 8  # Adjust as needed

train_dataset = scienceqa_train.map(
    preprocess_function, batched=True, remove_columns=scienceqa_train.column_names
).with_format("torch") # Important: Convert to PyTorch tensors

eval_dataset = scienceqa_validation.map(
    preprocess_function, batched=True, remove_columns=scienceqa_validation.column_names
).with_format("torch")

train_dataloader = DataLoader(train_dataset, batch_size=batch_size)
eval_dataloader = DataLoader(eval_dataset, batch_size=batch_size)

In [18]:
# 5. Training Loop
# device = torch.device("cuda" if torch.cuda.is_available() else "mps")
# model.to(device)

# optimizer = AdamW(model.parameters(), lr=5e-5)  # Learning rate
# num_epochs = 3
metric = evaluate.load("squad")

In [19]:
# Initialize Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    tokenizer=tokenizer,
)

  trainer = Trainer(
max_steps is given, it will override any value given in num_train_epochs


In [20]:
# Fine-tune the model
# trainer.train()

In [21]:
fp16=True if torch.cuda.is_available() else False,  # Use mixed precision training if GPU available


In [22]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

optimizer = AdamW(model.parameters(), lr=5e-5)  # Learning rate
num_epochs = 3
metric = evaluate.load("squad")

for epoch in range(num_epochs):
    model.train()
    progress_bar = tqdm(desc=f"Epoch {epoch+1}") # Progress bar
    for batch in train_dataloader:
        print(f'batch {batch}')
        batch = {torch.FloatTensor(k).to(device): torch.FloatTensor(v).to(device) for k, v in batch.items()} # Move batch to device
        outputs = model(**batch)
        loss = outputs.loss
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()
        progress_bar.update(1)
    progress_bar.close()

Epoch 1: 0it [00:41, ?it/s]


RuntimeError: Could not infer dtype of tokenizers.Encoding