<a href="https://colab.research.google.com/github/sayanbanerjee32/coursera-gen-ai-course-labs/blob/main/coursera_gen_ai_course_lab_week3.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [10]:
%pip install --upgrade pip
%pip install --disable-pip-version-check \
        torch==1.13.1 \
        torchdata==0.5.1 --quiet

%pip install \
        transformers==4.27.2 \
        datasets==2.11.0 \
        evaluate==0.4.0 \
        rouge_score==0.1.2 \
        loralib==0.1.1 \
        peft==0.3.0 \
        trl==0.4.4 --quiet

%pip install -Uq pynvml

[0m

In [18]:
from datasets import load_dataset
from transformers import (pipeline, AutoModelForSequenceClassification,
                          AutoModelForSeq2SeqLM, AutoTokenizer, GenerationConfig,
                          TrainingArguments, Trainer)
from peft import PeftModel, PeftConfig, LoraConfig, TaskType, get_peft_model

from trl import PPOTrainer, PPOConfig, AutoModelForSeq2SeqLMWithValueHead
from trl import create_reference_model
from trl.core import LengthSampler

import torch
import evaluate

import pandas as pd
import numpy as np

from tqdm import tqdm
tqdm.pandas()

import time
import gc
device = "cuda" if torch.cuda.is_available() else "cpu"

In [3]:
model_name = 'google/flan-t5-base'
hf_dataset_name = "knkarthick/dialogsum"
dataset_original = load_dataset(hf_dataset_name)
dataset_original

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Downloading readme:   0%|          | 0.00/4.65k [00:00<?, ?B/s]

Downloading and preparing dataset csv/knkarthick--dialogsum to /root/.cache/huggingface/datasets/knkarthick___csv/knkarthick--dialogsum-cd36827d3490488d/0.0.0/6954658bab30a358235fa864b05cf819af0e179325c740e4bc853bcc7ec513e1...


Downloading data files:   0%|          | 0/3 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/11.3M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/1.35M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/442k [00:00<?, ?B/s]

Extracting data files:   0%|          | 0/3 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

Generating validation split: 0 examples [00:00, ? examples/s]

Dataset csv downloaded and prepared to /root/.cache/huggingface/datasets/knkarthick___csv/knkarthick--dialogsum-cd36827d3490488d/0.0.0/6954658bab30a358235fa864b05cf819af0e179325c740e4bc853bcc7ec513e1. Subsequent calls will reuse this data.


  0%|          | 0/3 [00:00<?, ?it/s]

DatasetDict({
    train: Dataset({
        features: ['id', 'dialogue', 'summary', 'topic'],
        num_rows: 12460
    })
    test: Dataset({
        features: ['id', 'dialogue', 'summary', 'topic'],
        num_rows: 1500
    })
    validation: Dataset({
        features: ['id', 'dialogue', 'summary', 'topic'],
        num_rows: 500
    })
})

In [7]:
def build_dataset(model_name,
                  dataset_name,
                  input_min_text_length,
                  input_max_text_length):
    dataset = load_dataset(dataset_name, split = "train")
    dataset = dataset.filter(lambda x: len(x['dialogue'])>input_min_text_length and len(x['dialogue']) <= input_max_text_length, batched = False)
    tokenizer = AutoTokenizer.from_pretrained(model_name, device_map="auto")

    def tokenize(sample):
        prompt = f"""
Summarize the following conversation.

{sample["dialogue"]}

Summary:
"""
        sample["input_ids"] = tokenizer.encode(prompt)
        sample["query"] = tokenizer.decode(sample["input_ids"])
        return sample
    dataset = dataset.map(tokenize, batched = False)
    dataset.set_format(type = "torch")
    dataset_splits = dataset.train_test_split(test_size = 0.2, shuffle = False, seed = 42)
    return dataset_splits

dataset = build_dataset(model_name = model_name,
                  dataset_name = hf_dataset_name,
                  input_min_text_length = 200,
                  input_max_text_length = 1000)



In [26]:
original_model = AutoModelForSeq2SeqLM.from_pretrained(model_name,
                                                       torch_dtype = torch.bfloat16,
                                                        device_map = "auto")
tokenizer = AutoTokenizer.from_pretrained(model_name)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [16]:
def print_number_of_trainable_model_parameters(model):
    trainable_model_params = 0
    all_model_params = 0
    for _, param in model.named_parameters():
        all_model_params += param.numel()
        if param.requires_grad:
            trainable_model_params += param.numel()
    return f"trainable model parameters: {trainable_model_params}\nall model parameters: {all_model_params}\npct of trainable model parameters: {(trainable_model_params * 100)/all_model_params}%"
print(print_number_of_trainable_model_parameters(original_model))

trainable model parameters: 247577856
all model parameters: 247577856
pct of trainable model parameters: 100.0%


In [19]:
def tokenize_function(example):
    start_prompt = 'Summarise the following conversation.\n\n'
    end_prompt = '\n\nSummary: '
    prompt = [start_prompt + dialogue + end_prompt for dialogue in example['dialogue']]
    example['input_ids'] = tokenizer(prompt, padding = "max_length", truncation = True,
                                     return_tensors = 'pt').input_ids
    example['labels'] = tokenizer(example['summary'], padding = "max_length", truncation = True,
                                     return_tensors = 'pt').input_ids
    return example
tokenized_datasets = dataset_original.map(tokenize_function, batched = True)
tokenized_datasets = tokenized_datasets.remove_columns(['id','topic','dialogue','summary'])

Map:   0%|          | 0/12460 [00:00<?, ? examples/s]

Map:   0%|          | 0/1500 [00:00<?, ? examples/s]

Map:   0%|          | 0/500 [00:00<?, ? examples/s]

In [21]:
tokenized_datasets = tokenized_datasets.filter(lambda example, index: index %100 == 0, with_indices = True)

Filter:   0%|          | 0/12460 [00:00<?, ? examples/s]

Filter:   0%|          | 0/1500 [00:00<?, ? examples/s]

Filter:   0%|          | 0/500 [00:00<?, ? examples/s]

In [22]:
print(f"Shapes of datasets:")
print(f"Training: {tokenized_datasets['train'].shape}")
print(f"Validation: {tokenized_datasets['validation'].shape}")
print(f"Test: {tokenized_datasets['test'].shape}")
print(tokenized_datasets)

Shapes of datasets:
Training: (125, 2)
Validation: (5, 2)
Test: (15, 2)
DatasetDict({
    train: Dataset({
        features: ['input_ids', 'labels'],
        num_rows: 125
    })
    test: Dataset({
        features: ['input_ids', 'labels'],
        num_rows: 15
    })
    validation: Dataset({
        features: ['input_ids', 'labels'],
        num_rows: 5
    })
})


In [23]:
def report_gpu():
    print(torch.cuda.list_gpu_processes())
    gc.collect()
    torch.cuda.empty_cache()
report_gpu()

GPU:0
process       2024 uses      680.000 MB GPU memory


In [12]:
lora_config = LoraConfig(
    r = 32,
    lora_alpha=32,
    target_modules=["q","w"],
    lora_dropout=0.05,
    bias="none",
    task_type = TaskType.SEQ_2_SEQ_LM # FLAN-T5
)

In [17]:
peft_model = get_peft_model(original_model, lora_config)
print(print_number_of_trainable_model_parameters(peft_model))

trainable model parameters: 1769472
all model parameters: 249347328
pct of trainable model parameters: 0.7096414524241463%


In [24]:
output_dir = f'./peft-dialogue-summary-training-{str(int(time.time()))}'
peft_training_args = TrainingArguments(
    output_dir = output_dir,
    auto_find_batch_size = True,
    learning_rate = 1e-3,
    num_train_epochs = 1,
    logging_steps =1,
    max_steps = 1
)
peft_trainer = Trainer(
    model = peft_model,
    args = peft_training_args,
    train_dataset=tokenized_datasets['train'],
)

In [25]:
peft_trainer.train()

peft_model_path = "./peft_model_path"
peft_trainer.model.save_pretrained(peft_model_path)
tokenizer.save_pretrained(peft_model_path)



Step,Training Loss
1,50.75


('./peft_model_path/tokenizer_config.json',
 './peft_model_path/special_tokens_map.json',
 './peft_model_path/tokenizer.json')

In [29]:
lora_config = LoraConfig(
    r = 32,
    lora_alpha=32,
    target_modules=["q","v"],
    lora_dropout=0.05,
    bias="none",
    task_type = TaskType.SEQ_2_SEQ_LM # FLAN-T5
)
peft_model_base =  AutoModelForSeq2SeqLM.from_pretrained(model_name,
                                                         torch_dtype = torch.bfloat16)
peft_model = PeftModel.from_pretrained(peft_model_base,
                                       peft_model_path,
                                       lora_config = lora_config,
                                       torch_dtype = torch.bfloat16,
                                       device_map = "auto",
                                       is_trainable = True)
print(print_number_of_trainable_model_parameters(peft_model))

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


trainable model parameters: 1769472
all model parameters: 249347328
pct of trainable model parameters: 0.7096414524241463%


In [33]:
ppo_model = AutoModelForSeq2SeqLMWithValueHead.from_pretrained(peft_model,
                                                               torch_dtype = torch.bfloat16,
                                                                is_trainable = True)
print(f'PPO model parameters to be updated (valueHead + 769 params):\n{print_number_of_trainable_model_parameters(ppo_model)}\n')
print(ppo_model.v_head)

PPO model parameters to be updated (valueHead + 769 params):
trainable model parameters: 1770241
all model parameters: 249348097
pct of trainable model parameters: 0.7099476680586017%

ValueHead(
  (dropout): Dropout(p=0.1, inplace=False)
  (summary): Linear(in_features=768, out_features=1, bias=True)
  (flatten): Flatten(start_dim=1, end_dim=-1)
)


In [34]:
ref_model = create_reference_model(ppo_model)
print(f'Reference model parameters to be updated:\n{print_number_of_trainable_model_parameters(ref_model)}\n')

Reference model parameters to be updated:
trainable model parameters: 0
all model parameters: 249348097
pct of trainable model parameters: 0.0%



In [36]:
toxicity_model_name = "facebook/roberta-hate-speech-dynabench-r4-target"
toxicity_tokenizer = AutoTokenizer.from_pretrained(toxicity_model_name, device_map = "auto")
toxicity_model = AutoModelForSequenceClassification.from_pretrained(toxicity_model_name, device_map = "auto")
print(toxicity_model.config.id2label)

{0: 'nothate', 1: 'hate'}


In [46]:
non_toxic_test = "I want to kiss you"
toxicity_input_ids = toxicity_tokenizer(non_toxic_test, return_tensors="pt").input_ids.to(device)

logits = toxicity_model(input_ids = toxicity_input_ids).logits
print(f'logits [not hate, hate]: {logits.tolist()[0]}')

probabilities = logits.softmax(dim =-1).tolist()[0]
print(f'probabilities [not hate, hate]: {probabilities}')

not_hate_index = 0
not_hate_reward = (logits[:,not_hate_index]).tolist()
print(f'reward (high):{not_hate_reward}')

logits [not hate, hate]: [4.657958030700684, -4.078615188598633]
probabilities [not hate, hate]: [0.9998394250869751, 0.00016057751781772822]
reward (high):[4.657958030700684]


In [53]:
toxic_text = "You are disgusting and terrible and i damn hate you"
# highly suspicious - making i > I changes the classification
toxicity_input_ids = toxicity_tokenizer(toxic_text, return_tensors="pt").input_ids.to(device)

logits = toxicity_model(input_ids = toxicity_input_ids).logits
print(f'logits [not hate, hate]: {logits.tolist()[0]}')

probabilities = logits.softmax(dim =-1).tolist()[0]
print(f'probabilities [not hate, hate]: {probabilities}')

not_hate_reward = (logits[:,not_hate_index]).tolist()
print(f'reward (low):{not_hate_reward}')


logits [not hate, hate]: [-2.061073064804077, 1.5835477113723755]
probabilities [not hate, hate]: [0.02546585723757744, 0.9745341539382935]
reward (low):[-2.061073064804077]


In [None]:
sentiment_pipe = pipeline("sentiment-analysis",
                          model = toxicity_model_name,
                          device = device)

reward_logits_kwargs = {
    "top_k": None,
    "function_to_apply": "none",
    "batch_size": 16
}

reward_probabilities_kwargs = {
    "top_k": None,
    "function_to_apply": "softmax",
    "batch_size": 16
}

print("Reward model output for non-toxic text")
print(sentiment_pipe(non_toxic_test, **rewards_logits_kwargs))
print(sentiment_pipe(non_toxic_test, **reward_probabilities_kwargs))
print("\nReward model output for toxic text")
print(sentiment_pipe(toxic_test, **rewards_logits_kwargs))
print(sentiment_pipe(toxic_test, **reward_probabilities_kwargs))

In [None]:
toxicity_evaluator = evaluate.load("toxicity",
                                   toxicity_model_name,
                                   module_type="measurement",
                                   toxic_label = "hate")