In [11]:
import torch
import torchvision
import transformers
import numpy as np
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForCausalLM, AutoModelForQuestionAnswering
from peft import LoraConfig, get_peft_model

In [2]:
# Load the TriviaQA "rc.nocontext" configuration from the Hugging Face Hub.
dataset = load_dataset(
    path="mandarjoshi/trivia_qa",
    name="rc.nocontext",
)

In [3]:
# Hub identifier of the checkpoint; used for both the tokenizer and the model.
MODEL_NAME = "Qwen/Qwen2.5-0.5B-Instruct"
# Value forwarded to the tokenizer as `max_length` (counted in tokens).
MAX_LENGTH = 256

In [7]:
class DataProcessor:
    """Turn question/answer records into tokenized prompts.

    Each example is rendered as ``"question: <q>\\n answer: <a>"`` and passed
    through the tokenizer that belongs to ``MODEL_NAME``.
    """

    def __init__(self, MODEL_NAME, MAX_LENGTH, dataset, mode="train"):
        """Load the tokenizer and store the processing settings.

        Args:
            MODEL_NAME: Checkpoint name handed to ``AutoTokenizer.from_pretrained``.
            MAX_LENGTH: Upper bound (in tokens) applied when tokenizing.
            dataset: Dataset object kept on the instance for later use.
            mode: Free-form mode label stored on the instance (default "train").
        """
        self.MODEL_NAME = MODEL_NAME
        self.tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
        self.dataset = dataset
        self.MAX_LENGTH = MAX_LENGTH
        self.mode = mode

    @staticmethod
    def _answer_text(answer):
        """Return the plain answer text for one example.

        In the TriviaQA schema (per the dataset card) ``answer`` is a record
        with a ``"value"`` field next to aliases and other metadata.
        Interpolating that record directly would put the whole dict repr into
        the prompt, so the ``"value"`` entry is extracted here. Anything that
        is not a dict, or a dict without ``"value"``, is returned unchanged.
        """
        if isinstance(answer, dict):
            return answer.get("value", answer)
        return answer

    def tokenizer_function(self, examples):
        """Format and tokenize a batch of examples.

        Args:
            examples: Mapping with parallel ``"question"`` and ``"answer"``
                lists, as supplied by ``Dataset.map(..., batched=True)``.

        Returns:
            Whatever ``self.tokenizer`` returns for the list of formatted
            strings (padded to the longest item in the batch and truncated to
            ``self.MAX_LENGTH`` tokens).
        """
        formatted_data = [
            f"question: {q}\n answer: {self._answer_text(a)}"
            for q, a in zip(examples["question"], examples["answer"])
        ]
        return self.tokenizer(
            formatted_data,
            padding=True,
            # Without truncation=True the tokenizer does not enforce
            # max_length, so over-long sequences would pass through uncut.
            truncation=True,
            max_length=self.MAX_LENGTH,
        )

In [8]:
# Build the shared processor; constructing it loads the tokenizer for MODEL_NAME.
data_processor = DataProcessor(
    MODEL_NAME=MODEL_NAME,
    MAX_LENGTH=MAX_LENGTH,
    dataset=dataset,
)

In [9]:
# Tokenize the training split; batched=True hands lists of examples to the function.
train_split = dataset["train"]
train_dataset = train_split.map(
    data_processor.tokenizer_function,
    batched=True,
)

Map:   0%|          | 0/138384 [00:00<?, ? examples/s]

Map: 100%|██████████| 138384/138384 [00:56<00:00, 2429.90 examples/s]


In [10]:
# Tokenize the validation split with the same processor used for training.
validation_dataset = dataset["validation"].map(
    data_processor.tokenizer_function,
    batched=True,
)

Map: 100%|██████████| 17944/17944 [00:06<00:00, 2615.64 examples/s]


In [12]:
# Load the base causal language model that the LoRA adapters will attach to.
model = AutoModelForCausalLM.from_pretrained(MODEL_NAME)

# LoRA settings: rank-8 adapters on the attention query/key/value projections.
lora_config = LoraConfig(
    base_model_name_or_path=MODEL_NAME,
    # Declare the task so PEFT wraps the model in its causal-LM-specific class
    # instead of the generic PeftModel wrapper.
    task_type="CAUSAL_LM",
    r=8,
    target_modules=['q_proj', 'k_proj', 'v_proj'],
    lora_alpha=32,
    lora_dropout=0.1
)

lora_model = get_peft_model(model, lora_config)