In [None]:
!pip install datasets
!pip install peft
!pip install trl
!pip install evaluate

In [None]:
from transformers import pipeline, AutoModelForSeq2SeqLM, AutoModelForSequenceClassification, AutoTokenizer, GenerationConfig
from datasets import load_dataset
from peft import PeftConfig, PeftModel, LoraConfig, TaskType
from trl import PPOTrainer, PPOConfig, AutoModelForSeq2SeqLMWithValueHead
from trl import create_reference_model
from trl.core import LengthSampler

import torch
import evaluate

import numpy as np
import pandas as pd

from tqdm import tqdm
tqdm.pandas()

In [None]:
# Hugging Face Hub identifiers reused by the rest of the notebook.
model_name = 'google/flan-t5-base'
huggingface_dataset_name = 'knkarthick/dialogsum'

# Load every split of the dataset; the trailing bare expression displays its summary.
dataset_original = load_dataset(huggingface_dataset_name)
dataset_original

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Downloading readme:   0%|          | 0.00/4.65k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/11.3M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/442k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/1.35M [00:00<?, ?B/s]

Generating train split: 0 examples [00:00, ? examples/s]

Generating validation split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['id', 'dialogue', 'summary', 'topic'],
        num_rows: 12460
    })
    validation: Dataset({
        features: ['id', 'dialogue', 'summary', 'topic'],
        num_rows: 500
    })
    test: Dataset({
        features: ['id', 'dialogue', 'summary', 'topic'],
        num_rows: 1500
    })
})

In [None]:
def build_dataset(model_name,
                  dataset_name,
                  input_min_text_length,
                  input_max_text_length):
  """Build tokenized summarization prompts from the train split of a dialogue dataset.

  Args:
    model_name: checkpoint name used to load the tokenizer.
    dataset_name: dataset identifier passed to `load_dataset`.
    input_min_text_length: exclusive lower bound on the dialogue length, in characters.
    input_max_text_length: inclusive upper bound on the dialogue length, in characters.

  Returns:
    The result of `train_test_split` (80% / 20%, no shuffling) on the filtered data,
    where every row additionally carries "input_ids" (the encoded prompt) and
    "query" (those ids decoded back to text).
  """

  def has_allowed_length(example):
    # Length is measured on the raw dialogue string, not on its tokens.
    return input_min_text_length < len(example['dialogue']) <= input_max_text_length

  train_data = load_dataset(dataset_name, split="train")
  train_data = train_data.filter(has_allowed_length, batched=False)
  tokenizer = AutoTokenizer.from_pretrained(model_name, device_map='auto')

  def add_prompt_fields(example):
    # Wrap the dialogue in the instruction prompt before encoding it.
    prompt = f"""
  Summarize the following conversation:

  {example["dialogue"]}

  Summary:
  """
    example['input_ids'] = tokenizer.encode(prompt)
    example['query'] = tokenizer.decode(example['input_ids'])
    return example

  train_data = train_data.map(add_prompt_fields, batched=False)
  train_data.set_format(type="torch")

  return train_data.train_test_split(test_size=0.2, shuffle=False, seed=42)

# Keep dialogues longer than 200 and at most 1000 characters, then tokenize and split them.
dataset = build_dataset(model_name = model_name,
                        dataset_name = huggingface_dataset_name,
                        input_min_text_length = 200,
                        input_max_text_length = 1000)

print(dataset)

In [None]:
def print_number_of_trainable_model_parameters(model):
  """Describe how many of a model's parameters are trainable.

  Despite the name, nothing is printed; the report is returned as a string.

  Args:
    model: object whose `named_parameters()` yields (name, parameter) pairs, each
      parameter exposing `numel()` and `requires_grad`.

  Returns:
    A three-line string with the trainable count, the total count and the
    trainable percentage formatted with two decimals.
  """
  # Materialize once so the parameters are only requested from the model a single time.
  parameters = [param for _, param in model.named_parameters()]
  total = sum(param.numel() for param in parameters)
  trainable = sum(param.numel() for param in parameters if param.requires_grad)
  percent_trainable = 100 * trainable / total

  return (
      f"Trainable model parameters: {trainable}\n"
      f"All model parameters: {total}\n"
      f"% of trainable model parameters: {percent_trainable:.2f}%"
  )

In [None]:
# Identifier of the PEFT adapter checkpoint that a later cell attaches to the base model.
peft_dialogue_summary_checkpoint = 'intotheverse/peft-dialogue-summary-checkpoint'

In [None]:
# LoRA settings for the adapter on the base model's "q" and "v" modules.
lora_config = LoraConfig(
    r = 32,
    lora_alpha = 32,
    target_modules = ["q","v"],
    lora_dropout = 0.05,
    bias = "none",  # valid PEFT values are "none", "all" and "lora_only"; "None" is not one of them
    task_type = TaskType.SEQ_2_SEQ_LM
)

# Base sequence-to-sequence model, loaded in bfloat16.
model = AutoModelForSeq2SeqLM.from_pretrained(model_name, torch_dtype = torch.bfloat16)

# Attach the adapter checkpoint; is_trainable=True requests a trainable adapter.
peft_model = PeftModel.from_pretrained(
    model,
    peft_dialogue_summary_checkpoint,
    lora_config = lora_config,
    torch_dtype = torch.bfloat16,
    device_map="auto",
    is_trainable = True
)

print(f"PEFT model parameters to be updated:\n{print_number_of_trainable_model_parameters(peft_model)}")

In [None]:
# Wrap the PEFT model in TRL's seq2seq value-head class; this is the model PPO trains.
ppo_model = AutoModelForSeq2SeqLMWithValueHead.from_pretrained(peft_model,
                                                          torch_dtype=torch.bfloat16,
                                                          is_trainable=True)

print(f'PPO model parameters to be updated (ValueHead + 769 params):\n{print_number_of_trainable_model_parameters(ppo_model)}\n')
# Show the value head module that the wrapper exposes as `v_head`.
print(ppo_model.v_head)

In [None]:
# Reference model derived from the PPO model; later passed to the trainer as `ref_model`
# and used as the "before detoxification" baseline.
ref_model = create_reference_model(ppo_model)
print(f'Reference model parameters to be updated:\n{print_number_of_trainable_model_parameters(ref_model)}\n')

In [None]:
# Hate-speech classifier used by the rest of the notebook to score text.
toxicity_model_name = "facebook/roberta-hate-speech-dynabench-r4-target"
toxicity_tokenizer = AutoTokenizer.from_pretrained(toxicity_model_name, device_map = "auto")
toxicity_model = AutoModelForSequenceClassification.from_pretrained(toxicity_model_name, device_map = "auto")
# Show the classifier's mapping from class index to label name.
print(toxicity_model.config.id2label)

In [None]:
non_toxic_text = "#Person 1# tells Tommy that he didn't like the movie."

# Score the sentence with the toxicity classifier and inspect its raw outputs.
toxicity_input_ids = toxicity_tokenizer(non_toxic_text, return_tensors = 'pt').input_ids

logits = toxicity_model(input_ids = toxicity_input_ids).logits
print(f"logits[not hate, hate]: {logits.tolist()[0]}")

probabilities = logits.softmax(dim = -1).tolist()[0]
print(f"probabilities[not hate, hate]: {probabilities}")

# The reward is the logit at the "not hate" index.
not_hate_index = 0
nothate_reward = (logits[:, not_hate_index]).tolist()
# Print the variable that was just assigned (the original printed an undefined name).
print(f"reward (high): {nothate_reward}")

In [None]:
toxic_text = "#Person 1# tells Tommy that the movie was terrible, dumb and stupid."

# Same scoring steps as for the non-toxic sentence, applied to a toxic one.
toxicity_input_ids = toxicity_tokenizer(toxic_text, return_tensors="pt")["input_ids"]
logits = toxicity_model(input_ids=toxicity_input_ids).logits
print(f'logits [not hate, hate]: {logits[0].tolist()}')

probabilities = torch.softmax(logits, dim=-1)[0].tolist()
print(f'probabilities [not hate, hate]: {probabilities}')

# Reward is again the logit at the "not hate" index.
nothate_reward = logits[:, not_hate_index].tolist()
print(f'reward (low): {nothate_reward}')

In [None]:
# Pipeline device: GPU index 0 when CUDA is available, otherwise the CPU.
if torch.cuda.is_available():
  device = 0
else:
  device = "cpu"

# Classification pipeline around the toxicity model; used as the reward model.
sentiment_pipe = pipeline("sentiment-analysis", model=toxicity_model_name, device=device)

# Call options with no activation function applied to the scores.
reward_logits_kwargs = dict(top_k=None, function_to_apply="none", batch_size=16)

# Same call options, but with softmax applied to the scores.
reward_probabilities_kwargs = dict(top_k=None, function_to_apply="softmax", batch_size=16)

print("Reward model output:\n")

print("For non-toxic text")
print(sentiment_pipe(non_toxic_text, **reward_logits_kwargs))
print(sentiment_pipe(non_toxic_text, **reward_probabilities_kwargs))

print("For toxic text")
print(sentiment_pipe(toxic_text, **reward_logits_kwargs))
print(sentiment_pipe(toxic_text, **reward_probabilities_kwargs))

In [None]:
# Load the "toxicity" measurement, configured with the same classifier and "hate" as the toxic label.
toxicity_evaluator = evaluate.load("toxicity",
                                   toxicity_model_name,
                                   module_type = "measurement",
                                   toxic_label = "hate")

In [None]:
# Sanity-check the evaluator on the two example sentences.
toxicity_score = toxicity_evaluator.compute(predictions=[non_toxic_text])
print("Toxicity score for non-toxic text:", toxicity_score["toxicity"], sep="\n")

toxicity_score = toxicity_evaluator.compute(predictions=[toxic_text])
print("Toxicity score for toxic text:", toxicity_score["toxicity"], sep="\n")

In [None]:
def evaluate_toxicity(model,
                      toxicity_evaluator,
                      tokenizer,
                      dataset,
                      num_samples):
  """Estimate the toxicity of completions sampled from `model`.

  For each of the first `num_samples` rows of `dataset`, a completion of up to
  100 new tokens is sampled for the row's "query" prompt, and the prompt
  followed by the completion is scored with `toxicity_evaluator`.

  Args:
    model: model exposing `generate(input_ids=..., generation_config=...)` and `parameters()`.
    toxicity_evaluator: object whose `compute(predictions=[...])` returns a dict
      holding a list under the "toxicity" key.
    tokenizer: tokenizer used to encode prompts and decode generated ids.
    dataset: iterable of samples that have a "query" text field.
    num_samples: maximum number of samples to evaluate.

  Returns:
    Tuple `(mean, std)` of all collected toxicity scores, computed with NumPy.
  """
  max_new_tokens = 100

  # The sampling settings do not change between samples, so build them once.
  generation_config = GenerationConfig(
      max_new_tokens = max_new_tokens,
      top_k = 0.0,
      top_p = 1.0,
      do_sample = True
  )

  # Prompts are moved to the device of the model's first parameter before generation.
  first_param = next(model.parameters(), None)

  toxicities = []
  for i, sample in tqdm(enumerate(dataset)):
    # Stop after exactly `num_samples` rows.
    if i >= num_samples:
      break

    input_text = sample['query']

    input_ids = tokenizer(input_text, return_tensors = "pt", padding = True).input_ids
    if first_param is not None:
      input_ids = input_ids.to(first_param.device)

    response_token_ids = model.generate(
        input_ids = input_ids,
        generation_config = generation_config
    )

    generated_text = tokenizer.decode(
        response_token_ids[0],
        skip_special_tokens = True
    )

    # Score the prompt together with the generated text.
    toxicity_score = toxicity_evaluator.compute(predictions = [(input_text + " " + generated_text)])

    toxicities.extend(toxicity_score["toxicity"])

  # Aggregate only after every requested sample has been scored.
  mean = np.mean(toxicities)
  std = np.std(toxicities)

  return mean, std

In [None]:
# Tokenizer of the base model; reused for evaluation, PPO training and decoding below.
tokenizer = AutoTokenizer.from_pretrained(model_name, device_map = "auto")

# Baseline toxicity, measured on the reference model over the held-out split.
mean_before_detoxification, std_before_detoxification = evaluate_toxicity(model = ref_model,
                                                                          toxicity_evaluator = toxicity_evaluator,
                                                                          tokenizer = tokenizer,
                                                                          dataset = dataset['test'],
                                                                          num_samples = 10)

print(f'toxicity [mean, std] before detox: [{mean_before_detoxification}, {std_before_detoxification}]')

In [None]:
def collator(data):
  """Regroup a list of per-sample dicts into one dict of per-key lists.

  Args:
    data: non-empty list of dicts; the keys of the first dict decide which keys
      are collected, so every dict must contain them.

  Returns:
    Dict mapping each key of `data[0]` to the list of that key's values across `data`.
  """
  return {key: [sample[key] for sample in data] for key in data[0]}

# Small demonstration of the collator on a single-sample batch.
test_data = [{"key1": "value1", "key2": "value2", "key3": "value3"}]
print(f'Collator input: {test_data}')
print(f'Collator output: {collator(test_data)}')

In [None]:
# PPO hyperparameters.
learning_rate = 1.41e-5
max_ppo_epochs = 1
mini_batch_size = 4
batch_size = 16

config = PPOConfig(model_name=model_name,
                   learning_rate=learning_rate,
                   ppo_epochs=max_ppo_epochs,
                   mini_batch_size=mini_batch_size,
                   batch_size=batch_size)

# The trainer is given the policy, its reference model, and the training split
# batched through the notebook's collator.
ppo_trainer = PPOTrainer(config=config,
                         model=ppo_model,
                         ref_model=ref_model,
                         tokenizer=tokenizer,
                         dataset=dataset["train"],
                         data_collator=collator)

In [None]:
# Sampler for the per-prompt generation budget, built from the two bounds below.
output_min_length = 100
output_max_length = 400
output_length_sampler = LengthSampler(output_min_length, output_max_length)

# Sampling settings forwarded to generation; 'max_new_tokens' is set per prompt in the loop.
generation_kwargs = {
    'min_length': 5,
    'top_k': 0.0,
    'top_p': 1.0,
    'do_sample': True
}

# Reward pipeline settings: no activation function is applied to the scores.
reward_kwargs = {
    "top_k": None,
    "function_to_apply": "none",
    "batch_size": 16
}

max_ppo_steps = 10

for step, batch in tqdm(enumerate(ppo_trainer.dataloader)):

  # Train on at most `max_ppo_steps` batches.
  if step >= max_ppo_steps:
    break

  prompt_tensors = batch['input_ids']

  # Generate one summary per prompt with the current policy.
  summary_tensors = []

  for prompt_tensor in prompt_tensors:

    max_new_tokens = output_length_sampler()

    generation_kwargs['max_new_tokens'] = max_new_tokens
    summary = ppo_trainer.generate(prompt_tensor, **generation_kwargs)

    # Keep at most the last `max_new_tokens` generated token ids.
    summary_tensors.append(summary.squeeze()[-max_new_tokens:])

  batch["response"] = [tokenizer.decode(r.squeeze()) for r in summary_tensors]

  # Score each prompt + summary pair with the toxicity pipeline.
  query_response_pairs = [q + r for q, r in zip(batch["query"], batch["response"])]

  rewards = sentiment_pipe(query_response_pairs, **reward_kwargs)

  # The reward is the score of the entry at `not_hate_index` for each pair.
  reward_tensors = [torch.tensor(reward[not_hate_index]["score"]) for reward in rewards]

  # One PPO optimization step on this batch, followed by logging.
  stats = ppo_trainer.step(prompt_tensors, summary_tensors, reward_tensors)
  ppo_trainer.log_stats(stats, batch, reward_tensors)

  print(f'objective/kl: {stats["objective/kl"]}')
  print(f'ppo/returns/mean: {stats["ppo/returns/mean"]}')
  print(f'ppo/policy/advantages_mean: {stats["ppo/policy/advantages_mean"]}')
  # Separator line between steps.
  print('-'.join('' for x in range(100)))

In [None]:
# Repeat the toxicity measurement on the PPO-trained model, with the same split and sample count.
mean_after_detoxification, std_after_detoxification = evaluate_toxicity(model=ppo_model,
                                                                        toxicity_evaluator=toxicity_evaluator,
                                                                        tokenizer=tokenizer,
                                                                        dataset=dataset["test"],
                                                                        num_samples=10)
print(f'toxicity [mean, std] after detox: [{mean_after_detoxification}, {std_after_detoxification}]')

In [None]:
# Relative change of the toxicity mean and std, expressed as a fraction of the "before" value
# (positive when the value after detoxification is lower).
mean_improvement = (mean_before_detoxification - mean_after_detoxification) / mean_before_detoxification
std_improvement = (std_before_detoxification - std_after_detoxification) / std_before_detoxification

# Report both fractions as percentages with two decimals.
print(f'Percentage improvement of toxicity score after detoxification:')
print(f'mean: {mean_improvement*100:.2f}%')
print(f'std: {std_improvement*100:.2f}%')

In [None]:
# Compare reference and PPO-trained generations on the first `batch_size` held-out rows.
batch_size = 20
compare_results = {}

df_batch = dataset["test"][0: batch_size]

compare_results["query"] = df_batch["query"]
prompt_tensors = df_batch["input_ids"]

summary_tensors_ref = []
summary_tensors = []

for i in tqdm(range(batch_size)):
  # Both models get the same generation budget for a given prompt.
  gen_len = output_length_sampler()
  generation_kwargs["max_new_tokens"] = gen_len

  # `squeeze` must be called before slicing off the last `gen_len` token ids.
  summary = ref_model.generate(
      input_ids = torch.as_tensor(prompt_tensors[i]).unsqueeze(dim = 0).to(device),
      **generation_kwargs
  ).squeeze()[-gen_len:]

  # Append to the list that is decoded below as "response_before".
  summary_tensors_ref.append(summary)

  summary = ppo_model.generate(
        input_ids=torch.as_tensor(prompt_tensors[i]).unsqueeze(dim=0).to(device),
        **generation_kwargs
    ).squeeze()[-gen_len:]

  summary_tensors.append(summary)

compare_results["response_before"] = [tokenizer.decode(summary_tensors_ref[i]) for i in range(batch_size)]
compare_results["response_after"] = [tokenizer.decode(summary_tensors[i]) for i in range(batch_size)]

# Reward of each prompt + response pair before training.
texts_before = [d + s for d, s in zip(compare_results["query"], compare_results["response_before"])]
rewards_before = sentiment_pipe(texts_before, **reward_kwargs)
compare_results["reward_before"] = [reward[not_hate_index]["score"] for reward in rewards_before]

# Reward of each prompt + response pair after training.
texts_after = [d + s for d, s in zip(compare_results["query"], compare_results["response_after"])]
rewards_after = sentiment_pipe(texts_after, **reward_kwargs)
compare_results["reward_after"] = [reward[not_hate_index]["score"] for reward in rewards_after]

In [None]:
# Widen text columns so long queries and responses are not cut short in the rendered table.
# The pandas option key is 'display.max_colwidth' (dot-separated).
pd.set_option('display.max_colwidth', 500)
df_compare_results = pd.DataFrame(compare_results)
# A positive reward_diff means the response after training scored higher than the one before.
df_compare_results["reward_diff"] = df_compare_results['reward_after'] - df_compare_results['reward_before']
# Largest improvements first; the trailing bare expression displays the table.
df_compare_results_sorted = df_compare_results.sort_values(by = ["reward_diff"], ascending = False).reset_index(drop = True)
df_compare_results_sorted