<a href="https://colab.research.google.com/github/reshalfahsi/qa-gpt2-lora/blob/master/Question_Answering_GPT_2_PEFT_LoRA.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

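# **Question Answering with GPT-2 Fine-Tuned via PEFT (LoRA)**

This notebook loads GPT-2 in 8-bit, attaches LoRA adapters, fine-tunes them on SQuAD v2 prompts of the form `CONTEXT / QUESTION / ANSWER`, scores a random sample of the validation split with BLEU, and finally runs an interactive inference example.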

## **Important Libraries**

### **Install**

In [None]:
# Install the `uv` package manager by piping Astral's install script into `sh`.
# NOTE(review): this runs a remote script without verification — confirm that is acceptable here.
!curl -LsSf https://astral.sh/uv/install.sh | sh

In [None]:
# Install the libraries imported below into the system interpreter using `uv`.
# NOTE(review): no versions are pinned, so a fresh run may resolve different releases.
!uv pip install -q --no-cache-dir --system transformers peft datasets
!uv pip install -q --no-cache-dir --system bitsandbytes accelerate
!uv pip install -q --no-cache-dir --system huggingface-hub
!uv pip install -q --no-cache-dir --system evaluate

### **Import**

In [None]:
from transformers import AutoModelForCausalLM, AutoTokenizer
from transformers import TrainingArguments, Trainer, BitsAndBytesConfig
from transformers import DataCollatorForLanguageModeling, pipeline

from peft import LoraConfig, get_peft_model, PeftModel
from datasets import load_dataset
from evaluate import load as load_evaluate

from IPython.display import display, Markdown
from tqdm.auto import tqdm

import warnings
import random
import torch
import os

import numpy as np

# Silence every warning category for the rest of the session to keep cell output short.
# NOTE(review): this also hides deprecation warnings raised by the libraries used below.
warnings.filterwarnings("ignore")

## **Configuration**

In [None]:
# Root folder for everything this run writes to disk.
EXPERIMENT_DIR = "experiment/"

# `makedirs` creates missing parents, so a single call provisions both the
# root folder and its "model" sub-folder.
os.makedirs(os.path.join(EXPERIMENT_DIR, "model"), exist_ok=True)

In [None]:
# Base checkpoint used for both the tokenizer and the causal language model.
MODEL_NAME = "gpt2"
# Folder where the trained model and tokenizer are saved and later reloaded from.
FINETUNED_MODEL = os.path.join(EXPERIMENT_DIR, "model")
# Dataset identifier passed to `load_dataset`.
DATASET_NAME = "squad_v2"
# Metric identifier passed to `evaluate.load`.
METRIC_NAME = "bleu"
# Forwarded as `max_order` when the metric is computed; presumably 1 limits BLEU
# to unigram precision — confirm against the metric card.
MAX_ORDER = 1

In [None]:
# LoRA hyperparameters, forwarded to LoraConfig as `r`, `lora_alpha` and
# `lora_dropout` respectively.
LORA_RANK = 4
LORA_ALPHA = 8
LORA_DROPOUT = 0.2

In [None]:
# Per-device training batch size handed to TrainingArguments.
BATCH_PER_DEVICE = 5
# Number of gradient-accumulation steps handed to TrainingArguments.
GRADIENT_ACCUMULATION_STEP = 10

In [None]:
# Learning rate handed to TrainingArguments.
LEARNING_RATE = 5e-5
# Number of shuffled validation examples scored in the testing section.
SAMPLE_TEST_SIZE = 100

In [None]:
# Step-based schedule forwarded to TrainingArguments
# (eval_steps, logging_steps, save_steps, warmup_steps, max_steps).
EVAL_STEP = 200
LOGGING_STEP = 200
SAVE_STEP = 100
WARMUP_STEP = 100
MAX_STEP = 1200

In [None]:
# Upper bound on newly generated tokens (`max_new_tokens`) during testing and inference.
MAX_TOKEN = 128
# Training prompts are truncated to this many tokens when they are tokenized.
MAX_LENGTH = 1024

In [None]:
# Process-wide switches read by the libraries used later in the notebook.
# NOTE(review): presumably WANDB_DISABLED turns off Weights & Biases logging and
# PYTORCH_CUDA_ALLOC_CONF tunes PyTorch's CUDA allocator — confirm against their docs.
os.environ.update(
    {
        "WANDB_DISABLED": "true",
        "PYTORCH_CUDA_ALLOC_CONF": "expandable_segments:True",
    }
)

## **Dataset**

### **Load**

In [None]:
# Load the dataset named by DATASET_NAME. The untokenized copy is kept under its
# own name because the testing section samples raw validation examples from it.
qa_dataset_orig = load_dataset(DATASET_NAME)

In [None]:
# Load the tokenizer that matches the base checkpoint and report its key
# special tokens.
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

print(
    f"The max model length is {tokenizer.model_max_length} for this model, "
    "although the actual embedding size for GPT small is 768"
)
print(
    "The beginning of sequence "
    f"token {tokenizer.convert_ids_to_tokens(tokenizer.bos_token_id)} "
    f"has the id {tokenizer.bos_token_id}"
)
print(
    "The end of "
    f"sequence token {tokenizer.convert_ids_to_tokens(tokenizer.eos_token_id)} "
    f"has the id {tokenizer.eos_token_id}"
)

# Reuse the end-of-sequence token for padding so batches can be padded by the
# data collator during training.
tokenizer.pad_token = tokenizer.eos_token

print(
    "The padding "
    f"token {tokenizer.convert_ids_to_tokens(tokenizer.pad_token_id)} "
    f"has the id {tokenizer.pad_token_id}"
)

### **Utils**

In [None]:
def create_prompt(context, question, answer):
    """Build the CONTEXT / QUESTION / ANSWER training prompt for one example.

    Args:
        context: Passage text placed under the ``CONTEXT:`` header.
        question: Question text placed under the ``QUESTION:`` header.
        answer: Mapping whose ``"text"`` entry is a list of answer strings.
            An empty list marks the question as unanswerable.

    Returns:
        The formatted prompt string. One answer is picked at random when
        several are available; ``"Cannot Find Answer"`` is used when none is.
    """
    candidates = answer["text"]
    # Unanswerable questions get a fixed sentinel so the model can learn to abstain.
    chosen = random.choice(candidates) if len(candidates) >= 1 else "Cannot Find Answer"

    return (
        f"CONTEXT:\n{context}\n"
        f"\nQUESTION:\n{question}\n"
        f"\nANSWER:\n{chosen}"
    )


def _tokenize_sample(sample):
    """Render one dataset record as a prompt and tokenize it, truncated to MAX_LENGTH tokens."""
    prompt = create_prompt(
        sample["context"],
        sample["question"],
        sample["answers"],
    )
    return tokenizer(prompt, max_length=MAX_LENGTH, truncation=True)


# Tokenize every record of every split; the tokenizer's output fields are
# added alongside the original columns.
qa_dataset = qa_dataset_orig.map(_tokenize_sample)

## **Model**

### **Load**

In [None]:
# Load the base checkpoint with 8-bit loading requested through bitsandbytes;
# device placement is delegated to `device_map="auto"`.
quantization_config = BitsAndBytesConfig(load_in_8bit=True)
base_model = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME,
    device_map="auto",
    quantization_config=quantization_config,
)

### **Info**

In [None]:
# Snapshot of (name, tensor) pairs so they can be sliced by position.
params = list(base_model.named_parameters())

print(f"The GPT-2 model has {len(params)} different named parameters.\n")

# Each section pairs its heading with the slice of parameters it lists.
# NOTE(review): the slice boundaries assume the usual GPT-2 parameter ordering — confirm.
sections = (
    ("==== Embedding Layer ====\n", params[0:2]),
    ("\n==== First Transformer ====\n", params[2:14]),
    ("\n==== Output Layer ====\n", params[-2:]),
)

for heading, group in sections:
    print(heading)
    for p in group:
        # Left-align the name in 55 columns, right-align the shape in 12.
        print(f"{p[0]:<55} {str(tuple(p[1].size())):>12}")

### **PEFT with LoRA**

In [None]:
# Module names that receive LoRA adapters.
# NOTE(review): presumably GPT-2's attention and MLP projection layers — confirm
# against the parameter names printed in the Info section.
lora_target_modules = ["c_attn", "c_fc", "c_proj"]

lora_config = LoraConfig(
    task_type="CAUSAL_LM",
    r=LORA_RANK,
    lora_alpha=LORA_ALPHA,
    lora_dropout=LORA_DROPOUT,
    target_modules=lora_target_modules,
)

# Wrap the base model so training goes through the PEFT/LoRA wrapper.
model = get_peft_model(base_model, lora_config)

## **Training**

In [None]:
# Optimisation schedule and bookkeeping, driven by the configuration constants.
training_args = TrainingArguments(
    output_dir=EXPERIMENT_DIR,
    per_device_train_batch_size=BATCH_PER_DEVICE,
    gradient_accumulation_steps=GRADIENT_ACCUMULATION_STEP,
    learning_rate=LEARNING_RATE,
    warmup_steps=WARMUP_STEP,
    max_steps=MAX_STEP,
    eval_strategy="steps",
    eval_steps=EVAL_STEP,
    logging_steps=LOGGING_STEP,
    save_steps=SAVE_STEP,
    fp16=True,
    report_to="none",
)

# Causal-LM collation: `mlm=False` disables masked-language-model masking.
lm_collator = DataCollatorForLanguageModeling(tokenizer, mlm=False)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=qa_dataset["train"],
    eval_dataset=qa_dataset["validation"],
    data_collator=lm_collator,
)
trainer.train()

In [None]:
# Persist the trained model and the tokenizer side by side so the testing and
# inference sections can reload both from FINETUNED_MODEL.
# NOTE(review): for a PEFT-wrapped model this presumably writes only the adapter weights — confirm.
model.save_pretrained(FINETUNED_MODEL)
tokenizer.save_pretrained(FINETUNED_MODEL)

## **Testing**

In [None]:
# Reload the fine-tuned adapter and tokenizer from disk so this section does
# not depend on the in-memory objects left over from training.
model = PeftModel.from_pretrained(base_model, FINETUNED_MODEL)
tokenizer = AutoTokenizer.from_pretrained(FINETUNED_MODEL)
tokenizer.pad_token = tokenizer.eos_token

qa_metric = load_evaluate(METRIC_NAME)
predictions = list()
references = list()

# A fresh random seed is drawn on every run, so the evaluated subset (and the
# reported score) changes between runs; SEED is kept so it can be inspected.
SEED = int(np.random.randint(2147483647))
test_data = qa_dataset_orig["validation"].shuffle(seed=SEED).select(
    range(SAMPLE_TEST_SIZE)
)

device = "cuda" if torch.cuda.is_available() else "cpu"
# The model may keep writing after the answer; anything from a further
# "ANSWER:" header onwards is discarded.
answer_marker = "\n\nANSWER:\n"

# Loop-invariant setup: inference mode and active adapters.
model.eval()
model.enable_adapter_layers()

for qa_data in tqdm(test_data):

    # The metric expects a list of reference strings per prediction, so only
    # the answer texts are kept (not the whole answers mapping).
    gold_answers = list(qa_data["answers"]["text"])
    if len(gold_answers) < 1:
        gold_answers = ["Cannot Find Answer"]
    references.append(gold_answers)

    context = qa_data["context"]
    question = qa_data["question"]

    template = (
        f"CONTEXT:\n{context}\n"
        f"\nQUESTION:\n{question}\n"
        "\nANSWER:\n"
    )

    inputs = tokenizer(
        template,
        return_tensors="pt",
        return_token_type_ids=False,
    ).to(device)
    prompt_length = inputs["input_ids"].shape[1]

    # Mixed precision is only enabled when a GPU is actually present.
    with torch.autocast(device_type="cuda", enabled=(device == "cuda")):
        output_tokens_qa = model.generate(
            **inputs,
            max_new_tokens=MAX_TOKEN,
            # Use the real padding token id (not the token-type id).
            pad_token_id=tokenizer.pad_token_id,
        )

    # Decode only the newly generated tokens: slicing by prompt length is
    # robust even when decoding does not reproduce the prompt text exactly.
    answer = tokenizer.decode(
        output_tokens_qa[0][prompt_length:],
        skip_special_tokens=True,
    )

    # `str.find` returns -1 when the marker is absent; truncate only on a hit
    # so the last character of the answer is never dropped by accident.
    marker_index = answer.find(answer_marker)
    if marker_index != -1:
        answer = answer[:marker_index]

    predictions.append(answer)

In [None]:
# Score the generated answers against the gold answers collected above.
metric_inputs = {
    "predictions": predictions,
    "references": references,
    "max_order": MAX_ORDER,
}
results = qa_metric.compute(**metric_inputs)
print(results)

## **Inference**

### **Utils**

In [None]:
# Reload the saved adapter on top of `base_model`, plus the saved tokenizer,
# and reuse the end-of-sequence token for padding.
# NOTE(review): this repeats the reload done at the start of the testing section.
model = PeftModel.from_pretrained(base_model, FINETUNED_MODEL)
tokenizer = AutoTokenizer.from_pretrained(FINETUNED_MODEL)
tokenizer.pad_token = tokenizer.eos_token

In [None]:
def make_inference(context, question):
    """Generate an answer for ``question`` given ``context`` and render it as Markdown.

    The prompt uses the same CONTEXT / QUESTION / ANSWER layout as training.
    The decoded text (prompt plus generated answer) is cut just before a
    second ``ANSWER:`` header, if the model produced one, and displayed with
    the three headers in bold.

    Args:
        context: Passage the answer should be drawn from.
        question: Question to ask about the passage.

    Returns:
        None. The result is shown through ``IPython.display``.
    """
    template = (
        f"CONTEXT:\n{context}\n"
        f"\nQUESTION:\n{question}\n"
        "\nANSWER:\n"
    )
    answer_marker = "\n\nANSWER:\n"
    device = "cuda" if torch.cuda.is_available() else "cpu"

    # Turn the prompt into tokens and move them next to the model.
    inputs = tokenizer(
        template,
        return_tensors="pt",
        return_token_type_ids=False,
    ).to(device)

    # Mixed precision is only enabled when a GPU is actually present.
    with torch.autocast(device_type="cuda", enabled=(device == "cuda")):
        model.enable_adapter_layers()
        output_tokens_qa = model.generate(
            **inputs,
            max_new_tokens=MAX_TOKEN,
            # Use the real padding token id (not the token-type id).
            pad_token_id=tokenizer.pad_token_id,
        )

    result = tokenizer.decode(output_tokens_qa[0], skip_special_tokens=True)

    # The first marker belongs to the prompt itself; a second one means the
    # model started another answer block, which is discarded. `str.find`
    # returns -1 on a miss, so truncate only on a real hit.
    first_marker = result.find(answer_marker)
    if first_marker != -1:
        second_marker = result.find(answer_marker, first_marker + 1)
        if second_marker != -1:
            result = result[:second_marker]

    # Emphasise the section headers for Markdown rendering.
    for header in ("CONTEXT:", "QUESTION:", "ANSWER:"):
        result = result.replace(header, f"**{header}**")

    # Display results.
    display(Markdown("\n# Question-Answering with GPT-2 \n"))
    display(Markdown("\n  \n"))
    display(Markdown(result))

### **Run**

In [None]:
# Example inputs for `make_inference`; the `@param` markers expose them as form fields.
CONTEXT = "The trophy doesn't fit into the brown suitcase because it's too small." # @param {type:"string"}
QUESTION = "What is too small?"  # @param {type:"string"}


In [None]:
# Generate and render the model's answer for the CONTEXT / QUESTION pair above.
make_inference(CONTEXT, QUESTION)