<a href="https://colab.research.google.com/github/reshalfahsi/qa-gpt2-lora/blob/master/Question_Answering_GPT_2_PEFT_LoRA.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **Question-Answering using GPT-2's PEFT with LoRA**

## **Important Libraries**

### **Install**

In [40]:
!curl -LsSf https://astral.sh/uv/install.sh | sh

downloading uv 0.5.21 x86_64-unknown-linux-gnu
no checksums to verify
installing to /root/.local/bin
  uv
  uvx
everything's installed!


In [41]:
!uv pip install -q --no-cache-dir --system transformers peft datasets
!uv pip install -q --no-cache-dir --system bitsandbytes accelerate
!uv pip install -q --no-cache-dir --system huggingface-hub
!uv pip install -q --no-cache-dir --system evaluate

### **Import**

In [42]:
from transformers import AutoModelForCausalLM, AutoTokenizer
from transformers import TrainingArguments, Trainer, BitsAndBytesConfig
from transformers import DataCollatorForLanguageModeling, pipeline

from peft import LoraConfig, get_peft_model, PeftModel
from datasets import load_dataset
from evaluate import load as load_evaluate

from IPython.display import display, Markdown
from tqdm.auto import tqdm

import warnings
import random
import torch
import os

import numpy as np

warnings.filterwarnings("ignore")

## **Configuration**

In [43]:
os.makedirs("experiment", exist_ok=True)
os.makedirs("experiment/model", exist_ok=True)
EXPERIMENT_DIR = "experiment/"

In [44]:
MODEL_NAME = "gpt2"
FINETUNED_MODEL = os.path.join(EXPERIMENT_DIR, "model")
DATASET_NAME = "squad_v2"
METRIC_NAME = "bleu"
MAX_ORDER = 1

In [45]:
LORA_DROPOUT = 2e-1
LORA_RANK = 4
LORA_ALPHA = 8

In [46]:
BATCH_PER_DEVICE = 5
GRADIENT_ACCUMULATION_STEP = 10

In [47]:
LEARNING_RATE = 5e-5
SAMPLE_TEST_SIZE = 100

In [48]:
EVAL_STEP = int(2e2)
LOGGING_STEP = int(2e2)
SAVE_STEP = int(1e2)
WARMUP_STEP = int(1e2)
MAX_STEP = int(1.2e3)

In [49]:
MAX_TOKEN = 128
MAX_LENGTH = 1024

In [50]:
os.environ["WANDB_DISABLED"] = "true"
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"

## **Dataset**

### **Load**

In [51]:
qa_dataset_orig = load_dataset(DATASET_NAME)

In [52]:
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

print(
    f"The max model length is {tokenizer.model_max_length} for this model, "
    "although the actual embedding size for GPT small is 768"
)
print(
    "The beginning of sequence "
    f"token {tokenizer.convert_ids_to_tokens(tokenizer.bos_token_id)} "
    f"token has the id {tokenizer.bos_token_id}"
)
print(
    "The end of "
    f"sequence token {tokenizer.convert_ids_to_tokens(tokenizer.eos_token_id)} "
    f"has the id {tokenizer.eos_token_id}"
)

tokenizer.pad_token = tokenizer.eos_token

print(
    "The padding "
    f"token {tokenizer.convert_ids_to_tokens(tokenizer.pad_token_id)} "
    f"has the id {tokenizer.pad_token_id}"
)

The max model length is 1024 for this model, although the actual embedding size for GPT small is 768
The beginning of sequence token <|endoftext|> token has the id 50256
The end of sequence token <|endoftext|> has the id 50256
The padding token <|endoftext|> has the id 50256


### **Utils**

In [53]:
def create_prompt(context, question, answer):
    if len(answer["text"]) < 1:
        answer = "Cannot Find Answer"
    else:
        index = random.randint(0, len(answer["text"])-1)
        answer = answer["text"][index]

    prompt_template = (
        f"CONTEXT:\n{context}\n"
        f"\nQUESTION:\n{question}\n"
        f"\nANSWER:\n{answer}"
    )

    return prompt_template


qa_dataset = qa_dataset_orig.map(
    lambda samples: tokenizer(
        create_prompt(
            samples["context"],
            samples["question"],
            samples["answers"],
        ),
        max_length=MAX_LENGTH,
        truncation=True,
    )
)

Map:   0%|          | 0/11873 [00:00<?, ? examples/s]

## **Model**

### **Load**

In [54]:
quantization_config = BitsAndBytesConfig(load_in_8bit=True)
base_model = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME,
    device_map="auto",
    quantization_config=quantization_config,
)

### **Info**

In [55]:
params = list(base_model.named_parameters())

print(
    'The GPT-2 model has {:} different named parameters.\n'.format(len(params))
)

print('==== Embedding Layer ====\n')

for p in params[0:2]:
    print("{:<55} {:>12}".format(p[0], str(tuple(p[1].size()))))

print('\n==== First Transformer ====\n')

for p in params[2:14]:
    print("{:<55} {:>12}".format(p[0], str(tuple(p[1].size()))))

print('\n==== Output Layer ====\n')

for p in params[-2:]:
    print("{:<55} {:>12}".format(p[0], str(tuple(p[1].size()))))

The GPT-2 model has 148 different named parameters.

==== Embedding Layer ====

transformer.wte.weight                                  (50257, 768)
transformer.wpe.weight                                   (1024, 768)

==== First Transformer ====

transformer.h.0.ln_1.weight                                   (768,)
transformer.h.0.ln_1.bias                                     (768,)
transformer.h.0.attn.c_attn.weight                       (2304, 768)
transformer.h.0.attn.c_attn.bias                             (2304,)
transformer.h.0.attn.c_proj.weight                        (768, 768)
transformer.h.0.attn.c_proj.bias                              (768,)
transformer.h.0.ln_2.weight                                   (768,)
transformer.h.0.ln_2.bias                                     (768,)
transformer.h.0.mlp.c_fc.weight                          (3072, 768)
transformer.h.0.mlp.c_fc.bias                                (3072,)
transformer.h.0.mlp.c_proj.weight                        (768,

### **PEFT with LoRA**

In [56]:
lora_config = LoraConfig(
    r=LORA_RANK,
    lora_alpha=LORA_ALPHA,
    target_modules=[
        "c_attn",
        "c_fc",
        "c_proj",
    ],
    lora_dropout=LORA_DROPOUT,
    task_type="CAUSAL_LM",
)
model = get_peft_model(base_model, lora_config)

## **Training**

In [57]:
trainer = Trainer(
    model=model,
    train_dataset=qa_dataset["train"],
    eval_dataset=qa_dataset["validation"],
    args=TrainingArguments(
        output_dir=EXPERIMENT_DIR,
        per_device_train_batch_size=BATCH_PER_DEVICE,
        gradient_accumulation_steps=GRADIENT_ACCUMULATION_STEP,
        eval_strategy="steps",
        eval_steps=EVAL_STEP,
        save_steps=SAVE_STEP,
        warmup_steps=WARMUP_STEP,
        max_steps=MAX_STEP,
        logging_steps=LOGGING_STEP,
        learning_rate=LEARNING_RATE,
        fp16=True,
        report_to="none",
    ),
    data_collator=DataCollatorForLanguageModeling(
        tokenizer,
        mlm=False,
    ),
)
trainer.train()

Step,Training Loss,Validation Loss
200,34.6552,3.053641
400,31.6501,2.994655
600,31.2964,2.975799
800,31.1063,2.967888
1000,31.1492,2.964367
1200,31.0633,2.963042


TrainOutput(global_step=1200, training_loss=31.820098063151043, metrics={'train_runtime': 3772.961, 'train_samples_per_second': 15.903, 'train_steps_per_second': 0.318, 'total_flos': 8351521283635200.0, 'train_loss': 31.820098063151043, 'epoch': 0.4604051565377532})

In [58]:
model.save_pretrained(FINETUNED_MODEL)
tokenizer.save_pretrained(FINETUNED_MODEL)

('experiment/model/tokenizer_config.json',
 'experiment/model/special_tokens_map.json',
 'experiment/model/vocab.json',
 'experiment/model/merges.txt',
 'experiment/model/added_tokens.json',
 'experiment/model/tokenizer.json')

## **Testing**

In [59]:
model = PeftModel.from_pretrained(base_model, FINETUNED_MODEL)
tokenizer = AutoTokenizer.from_pretrained(FINETUNED_MODEL)
tokenizer.pad_token = tokenizer.eos_token

qa_metric = load_evaluate(METRIC_NAME)
predictions = list()
references = list()

SEED = int(np.random.randint(2147483647))
test_data = qa_dataset_orig["validation"].shuffle(seed=SEED).select(
    range(SAMPLE_TEST_SIZE)
)


for qa_data in tqdm(test_data):

    answer = qa_data['answers']

    if len(answer["text"]) < 1:
        answer = ["Cannot Find Answer"]

    references.append(answer)

    context = qa_data['context']
    question = qa_data['question']

    template = (
        f"CONTEXT:\n{context}\n"
        f"\nQUESTION:\n{question}\n"
        "\nANSWER:\n"
    )

    pred_index = len(template)

    input = tokenizer(
        template,
        return_tensors='pt',
        return_token_type_ids=False,
    )
    input = input.to(device='cuda' if torch.cuda.is_available() else "cpu")

    with torch.cuda.amp.autocast():
        model.enable_adapter_layers()
        output_tokens_qa = model.generate(
            **input,
            max_new_tokens=MAX_TOKEN,
            pad_token_id=tokenizer.pad_token_type_id,
        )

    answer = tokenizer.decode(
        output_tokens_qa[0],
        skip_special_tokens=True,
    ).replace(template, '')
    try:
        answer = answer[:answer.find("\n\nANSWER:\n")]
    except:
        pass

    predictions.append(answer)

  0%|          | 0/100 [00:00<?, ?it/s]

In [60]:
results = qa_metric.compute(
    predictions=predictions,
    references=references,
    max_order=MAX_ORDER,
)
print(results)

{'bleu': 0.16666666666666669, 'precisions': [0.16666666666666666], 'brevity_penalty': 1.0, 'length_ratio': 3.029126213592233, 'translation_length': 624, 'reference_length': 206}


## **Inference**

### **Utils**

In [65]:
model = PeftModel.from_pretrained(base_model, FINETUNED_MODEL)
tokenizer = AutoTokenizer.from_pretrained(FINETUNED_MODEL)
tokenizer.pad_token = tokenizer.eos_token

In [66]:
def make_inference(context, question):

    template = (
        f"CONTEXT:\n{context}\n"
        f"\nQUESTION:\n{question}\n"
        "\nANSWER:\n"
    )

    # turn the input into tokens
    input = tokenizer(
        template,
        return_tensors='pt',
        return_token_type_ids=False,
    )
    # move the tokens onto the GPU, for inference
    input = input.to(device='cuda' if torch.cuda.is_available() else "cpu")

    # make an inference
    with torch.cuda.amp.autocast():
        model.enable_adapter_layers()
        output_tokens_qa = model.generate(
            **input,
            max_new_tokens=MAX_TOKEN,
            pad_token_id=tokenizer.pad_token_type_id,
        )

    result = tokenizer.decode(output_tokens_qa[0], skip_special_tokens=True)
    try:
        result = result[
            :result.find(
                "\n\nANSWER:\n",
                result.find("\n\nANSWER:\n") + 1,
            )
        ]
    except:
        pass

    result = result.replace(
        "CONTEXT:", "**CONTEXT:**"
    ).replace(
        "QUESTION:", "**QUESTION:**"
    ).replace(
        "ANSWER:", "**ANSWER:**"
    )

    # display results
    display(Markdown("\n# Question-Answering with GPT-2 \n"))
    display(
        Markdown(
            (result)
        )
    )

### **Run**

In [71]:
CONTEXT = "The trophy doesn't fit into the brown suitcase because it's too small." # @param {type:"string"}
QUESTION = "What is too small?"  # @param {type:"string"}


In [72]:
# ANSWER:
make_inference(CONTEXT, QUESTION)


# Question-Answering with GPT-2 


**CONTEXT:**
The trophy doesn't fit into the brown suitcase because it's too small.

**QUESTION:**
What is too small?

**ANSWER:**
brown suitcase.