# Re-do of Lab3 Finetuning FlanT5 with Reinforcement Learning from Human Feedback

- 🚀 Motivation is to re-write the code (instead of just re-running cells someone else wrote) from scratch to build muscle memory  💪🏾💪🏾💪🏾
- 🧠 The goal is to understand the process of fine-tuning LLMs, specifically using Reinforcement Learning from Human Feedback (RLHF) with FlanT5. To this end, this notebook is heavily annotated with comments as I learn what specific libraries and lines of code are doing
- 📍 The ultimate goal is to understand the process of fine-tuning LLMs so that I can apply it to other models, such as Biological Foundation Models

In [None]:
#Hugging Face Transformers, Datasets, and PEFT libraries
from transformers import(   
    pipeline,   # hg high-level API for common tasks using LLMs, kinda like scikit-learn.fit() 
    AutoTokenizer, # text tokenizer, converts text to numerical tokens for the model
   AutoModelForSequenceClassification,  # Facebooks pre-trained BERT model for sentiment classification from text
   AutoModelForSeq2SeqLM,  # this sequence-to-sequence model, used fro task like language translation, summarization. This is what Flan-t5 does
   GenerationConfig  # used to configure how the model generates text etc
   ) 

from datasets import load_dataset  # hg high-level API for downloading and reproducibily managing ML datasets, kunda like scikit-learn.load_dataset()
#library for Parameter-Efficient Fine-tuning LLMs 
from peft import (
     PeftModel,   #  wrappers for efficient fine-tuning of LLM, recall that peft updates only small fraction of the model params
     PeftConfig,  
     LoraConfig,  # Low-rank Adaptation, another peft method, which fine-tunes small number of adapter layers instead of the whole model saves lots of memory
     TaskType  # used to specify the type of task I am fine-tunining for 
 )
#trl: Transformers Reinforcement Learning library 
from trl import( PPOTrainer,  #implement the proximal policy optimization algo
                PPOConfig, 
                AutoModelForSeq2SeqLMWithValueHead, #a value head in RLHF is a neural network layer added to the LLm for estimating how good the model's output is
                create_reference_model,

)
from trl.core import LengthSampler # samples sequences from the dataset based on length criteria

import torch 
import evaluate 
import numpy  as np
from tqdm import tqdm
import pandas as pd
tqdm.pandas()

# Load FLAN-T5 Model, Prepare RewardModel and Toxicity Evaluator 

In [None]:
# Hub identifiers: the base checkpoint to fine-tune and the dialogue-summarization dataset.
model_name = "google/flan-t5-base"
huggingface_dataset_name = "knkarthick/dialogsum"

# The first argument (`path`) may be a Hugging Face dataset ID or a local path.
dataset_original = load_dataset(huggingface_dataset_name)

# Last expression of the cell: display the loaded splits.
dataset_original

Generating train split: 100%|██████████| 12460/12460 [00:00<00:00, 110431.70 examples/s]
Generating validation split: 100%|██████████| 500/500 [00:00<00:00, 73314.18 examples/s]
Generating test split: 100%|██████████| 1500/1500 [00:00<00:00, 125063.73 examples/s]


DatasetDict({
    train: Dataset({
        features: ['id', 'dialogue', 'summary', 'topic'],
        num_rows: 12460
    })
    validation: Dataset({
        features: ['id', 'dialogue', 'summary', 'topic'],
        num_rows: 500
    })
    test: Dataset({
        features: ['id', 'dialogue', 'summary', 'topic'],
        num_rows: 1500
    })
})

In [8]:
# List the column (feature) names of the train split.
dataset_original['train'].column_names

['id', 'dialogue', 'summary', 'topic']

In [None]:
# Preview the first five examples of the train split as a pandas DataFrame.
dataset_original["train"].to_pandas().head(n=5)

Unnamed: 0,id,dialogue,summary,topic
0,train_0,"#Person1#: Hi, Mr. Smith. I'm Doctor Hawkins. ...","Mr. Smith's getting a check-up, and Doctor Haw...",get a check-up
1,train_1,"#Person1#: Hello Mrs. Parker, how have you bee...",Mrs Parker takes Ricky for his vaccines. Dr. P...,vaccines
2,train_2,"#Person1#: Excuse me, did you see a set of key...",#Person1#'s looking for a set of keys and asks...,find keys
3,train_3,#Person1#: Why didn't you tell me you had a gi...,#Person1#'s angry because #Person2# didn't tel...,have a girlfriend
4,train_4,"#Person1#: Watsup, ladies! Y'll looking'fine t...",Malik invites Nikki to dance. Nikki agrees if ...,dance


In [None]:
#write function for preprocessing the dataset

def build_dataset(model_name,
                  dataset_name,
                  input_min_text_length,
                  input_max_text_length):
    """Build the prompt dataset used for PPO training.

    Loads the ``train`` split of ``dataset_name``, keeps only the dialogues whose
    length falls inside the requested bounds, wraps every dialogue in a
    summarization prompt, tokenizes it, and finally splits the result 90/10.

    Args:
        model_name: Checkpoint name or path handed to ``AutoTokenizer.from_pretrained``.
        dataset_name: Dataset ID or local path handed to ``load_dataset``.
        input_min_text_length: Minimum dialogue length (inclusive), measured with
            ``len()`` on the dialogue string, i.e. in characters, not tokens.
        input_max_text_length: Maximum dialogue length (inclusive), same unit.

    Returns:
        The object returned by ``Dataset.train_test_split(test_size=0.1, seed=42)``;
        each example carries the original columns plus ``input_ids`` and ``query``.
    """
    # ------ Load the train split
    dataset = load_dataset(dataset_name, split='train')

    # Keep only dialogues whose character length is within [min, max].
    dataset = dataset.filter(
        lambda x: input_min_text_length <= len(x['dialogue']) <= input_max_text_length,
        batched=False,
    )

    # ------ Tokenization
    # `device_map` is a model-placement option and has no meaning for a tokenizer,
    # so it is intentionally not passed here.
    tokenizer = AutoTokenizer.from_pretrained(pretrained_model_name_or_path=model_name)

    def tokenize(sample):
        """Attach the prompt's token ids and its decoded text to one example."""
        # Instruction prompt asking the model to summarize the dialogue.
        # NOTE: the whitespace inside this literal is part of the prompt; keep it as is
        # so that previously produced token ids stay reproducible.
        prompt = f""" Summarize the following conversation 
                      {sample['dialogue']}
                        Summary:
                """
        # Convert the prompt text into a list of integer token ids.
        sample['input_ids'] = tokenizer.encode(prompt)
        # Keep a human-readable copy of exactly what the model will see; per the
        # original author's note the PPO code expects it under the key 'query'.
        sample['query'] = tokenizer.decode(sample['input_ids'])

        return sample

    # Tokenize each dialogue one example at a time.
    dataset = dataset.map(tokenize, batched=False)
    dataset.set_format(type='torch')

    # Hold out 10% of the examples; the fixed seed keeps the split reproducible.
    dataset_splits = dataset.train_test_split(test_size=0.1, seed=42)
    return dataset_splits

    
# Build the filtered, tokenized train/test splits from the dialogue dataset.
dataset = build_dataset(
    model_name=model_name,
    dataset_name=huggingface_dataset_name,
    input_min_text_length=20,    # minimum length of the dialogue
    input_max_text_length=1000,  # maximum length of the dialogue
)
print(dataset)

Filter: 100%|██████████| 12460/12460 [00:00<00:00, 227484.95 examples/s]
Map: 100%|██████████| 10026/10026 [00:03<00:00, 3120.86 examples/s]

DatasetDict({
    train: Dataset({
        features: ['id', 'dialogue', 'summary', 'topic', 'input_ids', 'query'],
        num_rows: 9023
    })
    test: Dataset({
        features: ['id', 'dialogue', 'summary', 'topic', 'input_ids', 'query'],
        num_rows: 1003
    })
})





In [None]:
# Preview the processed train split, which now includes the `input_ids` and `query` columns.
dataset["train"].to_pandas().head(5)

Unnamed: 0,id,dialogue,summary,topic,input_ids,query
0,train_11040,"#Person1#: Good afternoon, Mr. Chen. How are y...",#Person1# helps Mr. Chen change some Hong Kong...,currency exchange,"[12198, 1635, 1737, 8, 826, 3634, 1713, 345, 1...",Summarize the following conversation #Person1#...
1,train_5748,#Person1#: Are you new here?\n#Person2#: Yes. ...,Monica helps Wilson fax an order to the head o...,office talk,"[12198, 1635, 1737, 8, 826, 3634, 1713, 345, 1...",Summarize the following conversation #Person1#...
2,train_12033,"#Person1#: I want to go house shopping, and I ...",#Person2# helps #Person1# calculate the price ...,House affordability evaluation,"[12198, 1635, 1737, 8, 826, 3634, 1713, 345, 1...",Summarize the following conversation #Person1#...
3,train_8325,#Person1#: I'd like to buy one of these refrig...,#Person1# wants to buy a refrigerator. #Person...,easy-payment plan,"[12198, 1635, 1737, 8, 826, 3634, 1713, 345, 1...",Summarize the following conversation #Person1#...
4,train_6503,#Person1#: I am having some plumbing problems ...,#Person1# is having some plumbing problems in ...,plumbing problem,"[12198, 1635, 1737, 8, 826, 3634, 1713, 345, 1...",Summarize the following conversation #Person1#...


# Load and Prepare the PEFT model for RLHF

- This step sets up LoRA for efficient fine-tuning. Instead of updating all 247M parameters of FLAN-T5,
- we add small adapter layers that will be trained, while the rest of the model remains frozen.
- `Input`: Raw Flan-T5 base model + Pre-trained LoRA adapters (downloaded from S3)
- `Output`: A hybrid peft_model ready for RLHF training

In [15]:
##load the model trained from lab2
import os 
import boto3 

In [16]:
# Download every object under `s3_prefix` in the bucket into `local_dir`,
# mirroring the key layout below the prefix.
s3 = boto3.client('s3')  # S3 client
bucket_name = 'dlai-generative-ai'  # name of the bucket
s3_prefix = 'models/peft-dialogue-summary-checkpoint/'
local_dir = './peft-dialogue-summary-checkpoint-from-s3/'  # where the checkpoint is saved

os.makedirs(local_dir, exist_ok=True)

# list_objects_v2 returns results in pages, so iterate through all of them.
paginator = s3.get_paginator('list_objects_v2')
for page in paginator.paginate(Bucket=bucket_name, Prefix=s3_prefix):
    # A page without matches has no 'Contents' entry.
    for obj in page.get('Contents', []):
        key = obj['Key']
        # Strip only the leading prefix (str.replace would remove every occurrence).
        relative_path = key[len(s3_prefix):]
        # Skip "folder placeholder" keys (the prefix itself or keys ending in '/'):
        # they map to a directory path and cannot be downloaded as a file.
        if not relative_path or relative_path.endswith('/'):
            continue
        local_file = os.path.join(local_dir, relative_path)
        os.makedirs(os.path.dirname(local_file), exist_ok=True)
        s3.download_file(bucket_name, key, local_file)
        print(f"Downloaded: {key}")

Downloaded: models/peft-dialogue-summary-checkpoint/adapter_config.json
Downloaded: models/peft-dialogue-summary-checkpoint/adapter_model.bin
Downloaded: models/peft-dialogue-summary-checkpoint/special_tokens_map.json
Downloaded: models/peft-dialogue-summary-checkpoint/tokenizer.json
Downloaded: models/peft-dialogue-summary-checkpoint/tokenizer_config.json


In [22]:
#function to help us track the number of trainable and all model parameters to ensure PEFT is working as expected

def print_number_of_trainable_model_parameters(model):
    """Print the trainable / total parameter counts of a model.

    Used to check that PEFT leaves only a small fraction of the weights trainable.

    Args:
        model: Any object exposing ``parameters()`` that yields tensors with
            ``numel()`` and ``requires_grad`` (e.g. a ``torch.nn.Module`` or a
            PEFT wrapper around one).

    Returns:
        None. The report is written to stdout, so call this function directly
        rather than passing its result to ``print``.
    """
    trainable_model_params = 0
    all_model_params = 0
    for param in model.parameters():
        param_count = param.numel()
        all_model_params += param_count
        if param.requires_grad:
            trainable_model_params += param_count

    # Guard against models without any parameters (avoids ZeroDivisionError).
    if all_model_params:
        trainable_percentage = trainable_model_params / all_model_params * 100
    else:
        trainable_percentage = 0.0

    print(f"Trainable model parameters: {trainable_model_params}")
    print(f"All model parameters: {all_model_params}")
    print(f"Percentage of trainable model parameters: {trainable_percentage:.2f}%")

#### Add the peft adapter to the original Flan-T5 model. 

In [31]:

# LoRA settings describing which layers get adapters and how large they are.
lora_config = LoraConfig(
    task_type=TaskType.SEQ_2_SEQ_LM,  # Flan-T5 is a sequence-to-sequence model
    r=32,                             # rank of the decomposition: higher = more trainable params and compute
    lora_alpha=32,
    lora_dropout=0.05,
    target_modules=["q", "v"],        # adapters on the attention query and value layers
    bias="none",
)

# Base Flan-T5 weights, loaded in float16 and placed on the CPU.
model = AutoModelForSeq2SeqLM.from_pretrained(
    model_name,
    torch_dtype=torch.float16,
    device_map="cpu",
)

# Wrap the base model with the LoRA adapter checkpoint stored in `local_dir`
# (downloaded from S3). The result combines the base Flan-T5 weights with
# adapter layers that stay trainable because is_trainable=True.
# NOTE(review): `lora_config` does not appear to be a named parameter of
# PeftModel.from_pretrained (its config argument is `config`), so the adapter
# settings are presumably read from the checkpoint's adapter_config.json - confirm.
peft_model = PeftModel.from_pretrained(
    model,
    local_dir,
    is_trainable=True,
    lora_config=lora_config,
    torch_dtype=torch.float16,
    device_map="cpu",
)


In [23]:
# The helper prints its own report and returns None, so call it directly instead of
# passing its result to print() (which appended a stray "None" to the label).
print("PEFT model parameters to be updated:")
print_number_of_trainable_model_parameters(peft_model)

Trainable model parameters: 3538944
All model parameters: 251116800
Percentage of trainable model parameters: 1.41%
PEFT model parameters to be updated: None


In [32]:
# Wrap the PEFT model with a value head so it can be trained with PPO.
ppo_model = AutoModelForSeq2SeqLMWithValueHead.from_pretrained(
    peft_model,
    torch_dtype=torch.float16,
    is_trainable=True,  # keep the model in trainable mode
    device_map='cpu',   # keep the model on the CPU
)

# See how many trainable parameters the PPO model has. The helper prints its own
# report and returns None, so it is called directly rather than inside print().
print("PPO model parameters to be updated (ValueHead + 769 params):")
print_number_of_trainable_model_parameters(ppo_model)

# Display the value head (last expression of the cell).
ppo_model.v_head
# Notice that the PPO model has slightly more trainable params (3,539,713) than the
# PEFT model (3,538,944): the 769 extra come from the value head's linear layer,
# 768 weights + 1 bias.

Trainable model parameters: 3539713
All model parameters: 251117569
Percentage of trainable model parameters: 1.41%
PPO model parameters to be updated (ValueHead + 769 params): None


ValueHead(
  (dropout): Dropout(p=0.1, inplace=False)
  (summary): Linear(in_features=768, out_features=1, bias=True)
  (flatten): Flatten(start_dim=1, end_dim=-1)
)

In [26]:
ref_model = create_reference_model(ppo_model)  
# Reference copy of the PPO model made before any PPO training; it serves as the
# baseline for the KL-divergence comparison against the newly trained PPO model.

# Prepare the Reward Model 

- Reinforcement learning is a machine learning paradigm in which agents take actions in an environment with the aim of maximizing their cumulative reward. The agent's behavior is defined by the `policy`.
- The goal of reinforcement learning is for the agent to learn the optimal policy that maximizes the reward function.
- In the previous section, the original policy is based on the PEFT model, i.e. the LLM before detoxification.
- Now we will define the reward model that encourages the agent to detoxify the dialogue summaries. The intuitive approach is to do sentiment analysis across the two classes (`nothate` and `hate`) and give a higher reward when the `nothate` class is more likely.

# Load the reward model
- The reward model is `Facebook's RoBERTa-based hate-speech classification model`.
- This model outputs `logits`, which are then turned into predicted probabilities across the two classes (`nothate` and `hate`).
- The logit of the `nothate` output will be taken as the positive reward.
- The model will then be fine-tuned with `PPO` using those reward values.

In [33]:
# Hate-speech classifier used as the reward model.
toxicity_model_name = 'facebook/roberta-hate-speech-dynabench-r4-target'

# `device_map` is a model-placement option and has no meaning for a tokenizer,
# so it is not passed here.
toxicity_tokenizer = AutoTokenizer.from_pretrained(pretrained_model_name_or_path=toxicity_model_name)

# Load the classifier weights onto the CPU.
toxicity_model = AutoModelForSequenceClassification.from_pretrained(
    toxicity_model_name,
    device_map='cpu',
)

# Show which class index corresponds to which label.
print(toxicity_model.config.id2label)

{0: 'nothate', 1: 'hate'}


In [None]:
# Sanity-check the binary toxicity classifier on one benign and one toxic sentence.
non_toxic_text = 'I love you so much, you are the best!'
toxic_text = 'i want to kill you'

# Same steps for both sentences: tokenize -> forward pass -> softmax -> report.
for label, text in (("Non-toxic", non_toxic_text), ("Toxic", toxic_text)):
    # input_ids: the text as integer token ids in a PyTorch tensor
    toxicity_input_ids = toxicity_tokenizer(text, return_tensors='pt').input_ids
    # Inference only, so skip gradient tracking for the forward pass.
    with torch.no_grad():
        logits = toxicity_model(input_ids=toxicity_input_ids).logits
    # softmax maps the logits (real numbers) onto a 0-1 probability scale
    probabilities = logits.softmax(dim=1).tolist()[0]
    print(f" {label} Logits [not hate, hate]: {logits.tolist()[0]}")
    print(f" {label} Probabilities [not hate, hate]: {probabilities}")
# The "not hate" score of this classifier is what serves as the reward signal for
# PPO, steering the model away from hate speech.


 Non-toxic Logits [not hate, hate]: [4.645077228546143, -4.235363006591797]
 Non-toxic Probabilities [not hate, hate]: [0.9998608827590942, 0.00013906363165006042]
 Toxic Logits [not hate, hate]: [-1.392095685005188, 0.9838298559188843]
 Toxic Probabilities [not hate, hate]: [0.08502701669931412, 0.9149730205535889]


In [None]:
# Use the Hugging Face inference pipeline for the toxicity classifier.
# Run on the first GPU when CUDA is available, otherwise on the CPU.
device = 0 if torch.cuda.is_available() else "cpu"

# The task identifier is "sentiment-analysis" (hyphen, not underscore).
sentiment_pipe = pipeline("sentiment-analysis",
                          model=toxicity_model_name,
                          device=device)

# Call-time options for getting raw logits from the pipeline.
reward_logits_kwargs = {
    "top_k": None,  # return all scores
    "function_to_apply": "none",
    "batch_size": 16,
}

# Call-time options for getting softmax probabilities from the pipeline.
reward_probabilities_kwargs = {
    "top_k": None,  # return all scores
    "function_to_apply": "softmax",
    "batch_size": 16,
}



In [None]:
# Load the `toxicity` measurement from the evaluate library, configured with the
# same hate-speech checkpoint and with 'hate' as the label counted as toxic.
toxicity_evaluator = evaluate.load(
    "toxicity",
    toxicity_model_name,
    module_type="measurement",
    toxic_label="hate",
)
