[Source link](https://github.com/keitazoumana/Medium-Articles-Notebooks/blob/main/FLAN_T5_Finetuning_QA_Yahoo_Data.ipynb)

[Theory](https://www.datacamp.com/tutorial/flan-t5-tutorial)

[Dataset](https://huggingface.co/datasets/FinGPT/fingpt-fiqa_qa?row=1)

Instruction fine tuning

# Install libraries

In [None]:
!pip install nltk
!pip install datasets
!pip install transformers[torch]
!pip install tokenizers
!pip install evaluate
!pip install rouge_score
!pip install sentencepiece
!pip install huggingface_hub

!pip install peft
!pip install -q -U bitsandbytes scipy einops accelerate trl

Collecting datasets
  Downloading datasets-2.18.0-py3-none-any.whl (510 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m510.5/510.5 kB[0m [31m10.5 MB/s[0m eta [36m0:00:00[0m
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m16.9 MB/s[0m eta [36m0:00:00[0m
Collecting xxhash (from datasets)
  Downloading xxhash-3.4.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (194 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m194.1/194.1 kB[0m [31m24.4 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting multiprocess (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl (134 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m15.1 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: xxhash, dill, multiprocess, datasets
Successfully installed datase

# Import libraries

In [None]:
import nltk
import evaluate
import numpy as np
from datasets import load_dataset
from transformers import T5Tokenizer, DataCollatorForSeq2Seq
from transformers import T5ForConditionalGeneration, Seq2SeqTrainingArguments, Seq2SeqTrainer
from sklearn.model_selection import train_test_split

from transformers import BitsAndBytesConfig
import bitsandbytes as bnb
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training, PeftModel, PeftConfig
import torch

# Evaluation before fine tuning

In [None]:
!pip install evaluate
!pip install datasets
!pip install rouge_score

In [2]:
# Import the load function from the evaluate module
from evaluate import load
from datasets import load_dataset
from tqdm.auto import tqdm
import torch

In [4]:
fiqa_dataset = load_dataset("FinGPT/fingpt-fiqa_qa")

In [5]:
# The dataset only provides a "train" split, so hold out 30% of it as a test set.
# A fixed seed makes the held-out rows identical on every run, which keeps the
# ROUGE numbers computed on them comparable between runs.
# Caution: run this cell once per load; running it again would split the
# already reduced "train" split a second time.
fiqa_dataset = fiqa_dataset["train"].train_test_split(test_size=0.3, seed=42)
fiqa_dataset

DatasetDict({
    train: Dataset({
        features: ['input', 'output', 'instruction'],
        num_rows: 11977
    })
    test: Dataset({
        features: ['input', 'output', 'instruction'],
        num_rows: 5133
    })
})

In [8]:
# Build the prompt text "<instruction> : <question>" for every test row.
# This is the same layout preprocess_function uses for the training inputs.
test_split = fiqa_dataset["test"]
inputs = [
    instruction + " : " + question
    for instruction, question in zip(test_split["instruction"], test_split["input"])
]

# Only add the column when it is missing so that re-running this cell does not
# attempt to add a second "inputs" column to the same split.
if "inputs" not in test_split.column_names:
    fiqa_dataset["test"] = test_split.add_column("inputs", inputs)

Flattening the indices:   0%|          | 0/5133 [00:00<?, ? examples/s]

In [10]:
fiqa_dataset["test"]["inputs"][0]

'Utilize your financial knowledge, give your answer or opinion to the input question or subject . Answer format is not limited. : Buy home and leverage roommates, or split rent?'

In [12]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

# Single place that names the checkpoint, so the tokenizer and the model
# can never be loaded from two different identifiers by accident.
model_id = "google/flan-t5-base"

# Baseline (not yet fine-tuned) tokenizer and model used for the
# "before fine tuning" ROUGE evaluation.
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForSeq2SeqLM.from_pretrained(model_id)

In [11]:
# Load the ROUGE metric through `load` (imported from the evaluate module above).
# evaluate_on_test reads this module-level `rouge` object.
rouge = load('rouge')

# Usage sketch, kept commented out because evaluate_on_test performs the real
# computation on the test split:
# predictions = ["Your summary 1", "Your summary 2"]
# references = ["Reference summary 1", "Reference summary 2"]
# results = rouge.compute(predictions=predictions, references=references)
# print(results)

In [15]:
# Baseline ROUGE of the model before fine-tuning.
# NOTE: this call needs `classify` and `evaluate_on_test`, which are defined in
# the two cells that follow this one; run those cells first.
evaluate_on_test(model,tokenizer)

Evaluating:   0%|          | 0/321 [00:00<?, ?it/s]

{'rouge1': 0.09298207882468537, 'rouge2': 0.015535059508472273, 'rougeL': 0.07032324386140854, 'rougeLsum': 0.07035524207449967}


In [14]:
max_source_length = 512  # NOTE(review): not read by classify (it truncates to max_input_length); kept for other cells


def classify(
    texts_to_classify: "str | list[str]",
    model,
    tokenizer,
    max_input_length: int = 128,
    max_output_length: int = 150,
    num_beams: int = 2,
):
    """Generate one decoded answer per prompt in a batch.

    Despite its name this helper does text generation, not classification.

    Args:
        texts_to_classify: A prompt string or a list of prompt strings.
        model: Object exposing ``generate(input_ids, attention_mask=..., ...)``.
            If it has a ``device`` attribute the inputs are moved there first.
        tokenizer: Callable tokenizer that also provides ``batch_decode``.
        max_input_length: Prompts are padded/truncated to this many tokens.
        max_output_length: ``max_length`` passed to ``generate``.
        num_beams: Beam width passed to ``generate``.

    Returns:
        A list of decoded strings, one per prompt (special tokens removed).
        An empty list is returned for an empty batch.
    """
    # Nothing to tokenize or generate for an empty batch.
    if isinstance(texts_to_classify, (list, tuple)) and len(texts_to_classify) == 0:
        return []

    inputs = tokenizer(
        texts_to_classify,
        padding="max_length",
        truncation=True,
        max_length=max_input_length,
        return_tensors="pt",
    )

    # The tokenizer output is created on the CPU; generation fails when the
    # model lives on another device unless the tensors are moved there.
    device = getattr(model, "device", None)
    if device is not None:
        inputs = inputs.to(device)

    # Inference only: no gradients are needed.
    with torch.no_grad():
        outputs = model.generate(
            inputs["input_ids"],
            attention_mask=inputs["attention_mask"],
            max_length=max_output_length,
            num_beams=num_beams,
            early_stopping=True,
        )

    return tokenizer.batch_decode(outputs, skip_special_tokens=True)

In [13]:
def evaluate_on_test(model, tokenizer, dataset=None, batch_size=16):
    """Generate answers for a test split and score them with ROUGE.

    Relies on the notebook-level names ``fiqa_dataset``, ``classify``,
    ``rouge`` and ``tqdm``.

    Args:
        model: Model handed to ``classify`` for generation.
        tokenizer: Tokenizer handed to ``classify``.
        dataset: Split to evaluate; it must provide the "inputs" and "output"
            columns. Defaults to ``fiqa_dataset['test']``.
        batch_size: Number of prompts generated per call to ``classify``;
            adjust it to the available GPU memory.

    Returns:
        The dictionary produced by ``rouge.compute``. It is also printed.

    Raises:
        ValueError: If ``batch_size`` is not a positive integer.
    """
    if batch_size <= 0:
        raise ValueError("batch_size must be a positive integer")

    test_split = fiqa_dataset['test'] if dataset is None else dataset
    num_examples = len(test_split)

    predictions_list, labels_list = [], []
    for start in tqdm(range(0, num_examples, batch_size), desc="Evaluating"):
        # Slice rows first and pick columns afterwards, so only the rows of
        # this batch are read instead of the whole column on every iteration.
        batch = test_split[start : start + batch_size]

        predictions_list.extend(classify(batch["inputs"], model, tokenizer))
        labels_list.extend(str(label) for label in batch["output"])

    results = rouge.compute(predictions=predictions_list, references=labels_list)
    print(results)
    # Returned as well so callers can store or compare the scores.
    return results

# Load model and tokenizer

In [None]:
# Checkpoint identifier used for the tokenizer here and for the quantized
# model load in the PEFT preparation section.
MODEL_NAME = "google/flan-t5-base"

# Only the tokenizer is created in this cell; the model is loaded later with a
# quantization config and the data collator is built in the training section.
tokenizer = T5Tokenizer.from_pretrained(MODEL_NAME)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/2.54k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/2.20k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.42M [00:00<?, ?B/s]

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


# Prepare and preprocess the model for PeFT training

In [None]:
# Dtype used for computation inside the 4-bit quantized layers.
compute_dtype = torch.float16

# 4-bit NF4 quantization settings, without double quantization.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=compute_dtype,
    bnb_4bit_use_double_quant=False,
)

# `trust_remote_code` was dropped: transformers reports that it has no effect
# on a concrete class such as T5ForConditionalGeneration and is ignored.
model = T5ForConditionalGeneration.from_pretrained(
    MODEL_NAME,
    device_map="auto",
    quantization_config=bnb_config,
)

# Presumably disabled because the key/value cache is not useful during
# training with gradient checkpointing (enabled in the next cell) - confirm.
model.config.use_cache = False

The argument `trust_remote_code` is to be used with Auto classes. It has no effect here and is ignored.


config.json:   0%|          | 0.00/1.40k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/990M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

In [None]:
# Enable gradient checkpointing, then pass the quantized model through PEFT's
# k-bit training preparation helper before adapters are attached.
model.gradient_checkpointing_enable()
model = prepare_model_for_kbit_training(model)

# LoRA adapter settings.
config = LoraConfig(
    r=8,
    lora_alpha=32,
    target_modules=["q", "v", "k", "o"], # module names that receive adapters; these names vary from model to model
    lora_dropout=0.05,
    bias="none",
    # This is a sequence-to-sequence model, hence this task type. Other values
    # previously noted here: SEQ_CLS, CAUSAL_LM, TOKEN_CLS, QUESTION_ANS,
    # FEATURE_EXTRACTION.
    task_type="SEQ_2_SEQ_LM"
)


model = get_peft_model(model, config) # Wrap the base model with get_peft_model() to get a trainable PeftModel

# Trainable parameters

In [None]:
trainable, total = model.get_nb_trainable_parameters()
print(f"Trainable: {trainable} | total: {total} | Percentage: {trainable/total*100:.4f}%")

Trainable: 1769472 | total: 249347328 | Percentage: 0.7096%


# Preparing Data for Fine-Tuning

In [None]:
# Acquire the training data from Hugging Face.
# This reloads the raw dataset and rebinds `fiqa_dataset`, replacing the split
# (and its extra "inputs" column) created in the baseline evaluation section.

fiqa_dataset = load_dataset("FinGPT/fingpt-fiqa_qa")

Downloading readme:   0%|          | 0.00/522 [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/10.8M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/17110 [00:00<?, ? examples/s]

In [None]:
fiqa_dataset

DatasetDict({
    train: Dataset({
        features: ['input', 'output', 'instruction'],
        num_rows: 17110
    })
})

In [None]:
# yahoo_answers_qa["train"]["answer"][0]
# yahoo_answers_qa["train"]["question"][0]
fiqa_dataset["train"]["input"][0]

'What is considered a business expense on a business trip?'

In [None]:
# Hold out 30% of the only available split ("train") for evaluation.
# A fixed seed makes the split reproducible, so training and evaluation use the
# same rows every time the notebook is run.
fiqa_dataset = fiqa_dataset["train"].train_test_split(test_size=0.3, seed=42)

In [None]:
# Preview a few training questions instead of rendering the entire column.
fiqa_dataset["train"]["input"][:5]

In [None]:
# fiqa_dataset = fiqa_dataset['train'].to_pandas()
# # df = df.dropna(subset=['sentence', 'label']) ## drop missing values
# fiqa_dataset_train, fiqa_dataset_test, = train_test_split(fiqa_dataset, test_size=0.2, random_state=42)
# print(fiqa_dataset_train.shape, fiqa_dataset_test.shape)
# print(len(fiqa_dataset_train['input']))

(13688, 3) (3422, 3)
13688


# Data formatting and tokenization

At inference time, the model must be prompted in the same format that is used for training, i.e. the instruction followed by the question:

“< INSTRUCTION > : < USER_QUESTION >”

In addition to the formatting, the function also applies the tokenization of the inputs and outputs using the tokenizer function.

In [None]:
# We prefix our tasks with "answer the question"
#prefix = "Please answer this question: "

# Define the preprocessing function

def preprocess_function(example, max_input_length=128, max_target_length=512):
    """Build prompts from a batch and tokenize prompts and reference answers.

    Intended for ``Dataset.map(..., batched=True)``: every column in
    ``example`` is a list. Uses the notebook-level ``tokenizer``.

    Args:
        example: Batch with "instruction", "input" and "output" columns.
        max_input_length: Prompts are truncated to this many tokens.
        max_target_length: Reference answers are truncated to this many tokens.

    Returns:
        The tokenized prompts with an added "labels" entry that holds the
        token ids of the reference answers.
    """
    # Prompt layout: "<instruction> : <question>".
    inputs = [
        instruction + " : " + question
        for instruction, question in zip(example["instruction"], example["input"])
    ]
    model_inputs = tokenizer(inputs, max_length=max_input_length, truncation=True)

    # The labels are the token ids of the reference answers.
    labels = tokenizer(
        text_target=example["output"],
        max_length=max_target_length,
        truncation=True,
    )

    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

Next, the function is applied to the whole dataset using the map function below:

In [None]:
# Map the preprocessing function across our dataset
tokenized_dataset = fiqa_dataset.map(preprocess_function, batched=True)

Map:   0%|          | 0/11977 [00:00<?, ? examples/s]

Map:   0%|          | 0/5133 [00:00<?, ? examples/s]

# FLAN-T5 Training and Fine-Tuning

ROUGE and BLEU are two of the most common metrics for evaluating text generation models. Here, ROUGE is used to measure the quality of a generated answer by comparing it with the reference answer.

In [None]:
# Download the data needed by nltk.sent_tokenize, which compute_metrics calls.
nltk.download("punkt", quiet=True)
# ROUGE scorer; compute_metrics reads it through the module-level name `metric`.
metric = evaluate.load("rouge")

Downloading builder script:   0%|          | 0.00/6.27k [00:00<?, ?B/s]

In [None]:
def compute_metrics(eval_preds):
    """Decode predictions and labels and score them with ROUGE.

    Uses the notebook-level ``tokenizer`` and ``metric``.

    Args:
        eval_preds: Pair ``(predictions, labels)`` of token-id arrays. The
            predictions may also arrive as a tuple whose first element holds
            the token ids.

    Returns:
        The dictionary returned by ``metric.compute``.
    """
    preds, labels = eval_preds
    # Some models return a tuple; the token ids are its first element.
    if isinstance(preds, tuple):
        preds = preds[0]

    # -100 marks positions to ignore and is not a valid token id, so replace it
    # with the pad token before decoding. Predictions are handled the same way
    # because they can be padded with -100 when batches are gathered.
    pad_token_id = tokenizer.pad_token_id
    preds = np.where(preds != -100, preds, pad_token_id)
    labels = np.where(labels != -100, labels, pad_token_id)

    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    # rougeLsum expects a newline after each sentence.
    decoded_preds = ["\n".join(nltk.sent_tokenize(pred.strip())) for pred in decoded_preds]
    decoded_labels = ["\n".join(nltk.sent_tokenize(label.strip())) for label in decoded_labels]

    return metric.compute(
        predictions=decoded_preds,
        references=decoded_labels,
        use_stemmer=True,
    )

# Training process

In [None]:
# Global Parameters
L_RATE = 3e-4  # learning rate
BATCH_SIZE = 8  # per-device training batch size
PER_DEVICE_EVAL_BATCH = 4  # per-device evaluation batch size
WEIGHT_DECAY = 0.01
SAVE_TOTAL_LIM = 3  # passed as save_total_limit
NUM_EPOCHS = 10

# Set up training arguments
training_args = Seq2SeqTrainingArguments(
   output_dir="./results",  # checkpoints are written below this directory
   evaluation_strategy="epoch",  # evaluate once per epoch (one metrics row per epoch)
   learning_rate=L_RATE,
   per_device_train_batch_size=BATCH_SIZE,
   per_device_eval_batch_size=PER_DEVICE_EVAL_BATCH,
   weight_decay=WEIGHT_DECAY,
   save_total_limit=SAVE_TOTAL_LIM,
   num_train_epochs=NUM_EPOCHS,
   predict_with_generate=True,  # presumably so compute_metrics receives generated token ids - confirm
   push_to_hub=False
)

Next, the trainer is set up to trigger the training process of the model.

In [None]:
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model)

In [None]:
# Trainer wiring: PEFT-wrapped model, tokenized splits, dynamic padding through
# the data collator, and ROUGE scoring through compute_metrics.
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["test"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

# The former `dataloader_config = DataLoaderConfiguration(...)` line was removed:
# DataLoaderConfiguration is never imported in this notebook, so the line raised
# a NameError, and its result was not passed to the trainer anyway.


Finally, the model training is triggered using the train function as follows:

In [None]:
trainer.train()



Epoch,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Rougelsum
1,3.4344,3.201813,0.101218,0.017303,0.078799,0.091968
2,3.4054,3.185976,0.09659,0.017616,0.074929,0.087591
3,3.3838,3.176745,0.098379,0.017958,0.076596,0.089215
4,3.368,3.171203,0.101287,0.019078,0.078545,0.09148
5,3.3534,3.164698,0.100724,0.018809,0.078063,0.090958
6,3.3573,3.161256,0.099235,0.018885,0.076979,0.089676
7,3.3392,3.158127,0.099917,0.018596,0.077751,0.090374
8,3.3327,3.156823,0.100605,0.018712,0.077839,0.090898
9,3.3275,3.155356,0.098335,0.01806,0.076301,0.088994




Epoch,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Rougelsum
1,3.4344,3.201813,0.101218,0.017303,0.078799,0.091968
2,3.4054,3.185976,0.09659,0.017616,0.074929,0.087591
3,3.3838,3.176745,0.098379,0.017958,0.076596,0.089215
4,3.368,3.171203,0.101287,0.019078,0.078545,0.09148
5,3.3534,3.164698,0.100724,0.018809,0.078063,0.090958
6,3.3573,3.161256,0.099235,0.018885,0.076979,0.089676
7,3.3392,3.158127,0.099917,0.018596,0.077751,0.090374
8,3.3327,3.156823,0.100605,0.018712,0.077839,0.090898
9,3.3275,3.155356,0.098335,0.01806,0.076301,0.088994
10,3.3267,3.15487,0.099195,0.018541,0.076853,0.089783


TrainOutput(global_step=14980, training_loss=3.3642120239094835, metrics={'train_runtime': 25227.3918, 'train_samples_per_second': 4.748, 'train_steps_per_second': 0.594, 'total_flos': 8036132072638464.0, 'train_loss': 3.3642120239094835, 'epoch': 10.0})

Let’s understand the above performance metrics board.

**Training Loss and Validation Loss:** Lower values in these metrics are preferable, as they indicate a better fit of the model to the data. Both losses decreased only slightly over the ten epochs (training loss from 3.4344 to 3.3267, validation loss from 3.2018 to 3.1549), with the lowest values recorded in epoch 10.

**Rouge Metrics** (Rouge1, Rouge2, Rougel, and Rougelsum): Higher values in these metrics are preferable, as they indicate greater overlap between the generated answers and the reference answers. The scores fluctuated from epoch to epoch without a clear upward trend: Rouge1 and Rouge2 peaked in epoch 4 (0.1013 and 0.0191), while Rougel and Rougelsum were highest in epoch 1 (0.0788 and 0.0920).

# Evaluation Results

In [None]:
trainer.evaluate()



{'eval_loss': 3.154869556427002,
 'eval_rouge1': 0.09919500766073303,
 'eval_rouge2': 0.018541485039672838,
 'eval_rougeL': 0.07685284879841214,
 'eval_rougeLsum': 0.08978254934739696,
 'eval_runtime': 1869.1408,
 'eval_samples_per_second': 2.746,
 'eval_steps_per_second': 0.687,
 'epoch': 10.0}

# Save the adapter and Merge it with the base model(for saving within Colab)

One thing to keep in mind is that you can’t merge the 8 bit/4 bit base model with Lora (as of right now) so you have to reload the model with full precision.

In [None]:
trainer.save_model("FlanT5_fin_QA_finetuned")

In [None]:
# Reload the base model without quantization so the LoRA adapter can be merged
# into it.
repo_id = "google/flan-t5-base"
use_ram_optimized_load=False  # NOTE(review): not read anywhere in the visible notebook

# `trust_remote_code` was dropped: transformers reports that it has no effect
# on a concrete class such as T5ForConditionalGeneration and is ignored.
base_model = T5ForConditionalGeneration.from_pretrained(
    repo_id,
    device_map='auto',
)

The argument `trust_remote_code` is to be used with Auto classes. It has no effect here and is ignored.


In [None]:
# NOTE(review): mirrors the setting used on the training model; confirm it is
# still wanted on a model that is only merged and used for inference.
base_model.config.use_cache = False

# Load the LoRA adapter from the directory written by trainer.save_model above
# and attach it to the freshly loaded base model.
model = PeftModel.from_pretrained(
    base_model,
    "FlanT5_fin_QA_finetuned",
    )



In [None]:
merged_model = model.merge_and_unload()

# SAVE MODEL

In [None]:
from huggingface_hub import notebook_login

notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [None]:
# Target repository on the Hugging Face Hub (uses the login performed above).
REPOSITORY_ID="steve1989/FlanT5_financial_question_answering_finetuned"
# Upload the merged model (base weights with the adapter folded in) together
# with the tokenizer, so the repository can be loaded without PEFT.
merged_model.push_to_hub(REPOSITORY_ID)
tokenizer.push_to_hub(REPOSITORY_ID)

model.safetensors:   0%|          | 0.00/990M [00:00<?, ?B/s]

README.md:   0%|          | 0.00/5.18k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/steve1989/FlanT5_financial_question_answering_finetuned/commit/83e934dd07920a305f86dd19e0285365a8bb79f2', commit_message='Upload tokenizer', commit_description='', oid='83e934dd07920a305f86dd19e0285365a8bb79f2', pr_url=None, pr_revision=None, pr_num=None)

# Model Inference

In [None]:
# The training run recorded above ended at global step 14980, so the previously
# referenced "./results/checkpoint-22500" cannot have been produced by it.
# Load the merged model and tokenizer that were pushed to the Hub instead.
FINETUNED_MODEL_ID = "steve1989/FlanT5_financial_question_answering_finetuned"
last_checkpoint = FINETUNED_MODEL_ID  # name kept for any cell that still refers to it

finetuned_model = T5ForConditionalGeneration.from_pretrained(last_checkpoint)
tokenizer = T5Tokenizer.from_pretrained(last_checkpoint)

Define a specific question to answer

In [None]:
# Assemble the demo prompt as "<instruction> : <question>".
sample_instruction = "Offer your insights or judgment on the input financial query or topic using your financial expertise. Reply as normal question answering"
sample_question = "Claiming business expenses for a business with no income"

inputs = " : ".join((sample_instruction, sample_question))

Run the prediction

In [None]:
from textwrap import fill

# Tokenize into a separate variable so the prompt string in `inputs` stays
# intact and this cell can be run more than once.
encoded_inputs = tokenizer(inputs, return_tensors="pt")

# Same decoding settings as the evaluation helper `classify`, so the demo
# answer is produced the same way as the answers that were scored.
outputs = finetuned_model.generate(
    **encoded_inputs,
    max_length=150,
    num_beams=2,
    early_stopping=True,
)

# Drop special tokens so only the answer text is printed.
answer = tokenizer.decode(outputs[0], skip_special_tokens=True)

# The original printed an undefined name (`res`); the decoded text is `answer`.
print(fill(answer, width=80))