# Tune BLIP model to Generate Negative Responses

Fine-tune a BLIP model to produce negative responses. A BERT sentiment classifier is used as a reward function. The BLIP model is trained with PPO using the classifier's reward signal. 

In [50]:
import torch
from tqdm import tqdm
import pandas as pd

from transformers import pipeline, AutoTokenizer
from datasets import load_dataset

from trl import PPOTrainer, PPOConfig, AutoModelForCausalLMWithValueHead
from trl.core import LengthSampler

In [None]:
config = PPOConfig(
    model_name="Salesforce/blip-vqa-base",
    learning_rate=1e-4,
    log_with="wandb",
    steps=256,
)



In [52]:
import wandb 

wandb.init(project="ai-dev", config=config)

## Load Data and Models
### Load IMDB Dataset
Now, we will build the dataset for training. The dataset consists of the starting few words from IMDB reviews. The IMDB dataset contains 50k movie review annotated with "positive"/"negative" feedback. First, we filter out comments that are longer than 200 characters and take starting text with token size between 2 to 8.   

In [53]:
def build_dataset(config):
    dataset_name = "stanfordnlp/imdb"
    tokenizer = AutoTokenizer.from_pretrained(config.model_name)
    tokenizer.pad_token = tokenizer.eos_token
    
    ds = load_dataset(dataset_name)
    ds = ds.rename_columns({"text": "review"})
    ds = ds.filter(lambda x: len(x["review"]) > 200, batched=False)
    
    input_size = LengthSampler(2, 8)
    
    def tokenize(sample):
        sample["input_ids"] = tokenizer.encode(sample["review"])[:input_size()]
        sample["query"] = tokenizer.decode(sample["input_ids"])
        return sample
    
    ds = ds.map(tokenize, batched=False)
    ds.set_format(type="torch")
    return ds

In [54]:
dataset = build_dataset(config)

def collator(data):
    return dict((key, [d[key] for d in data]) for key in data[0])

Map:   0%|          | 127/49776 [00:00<01:20, 620.14 examples/s]Token indices sequence length is longer than the specified maximum sequence length for this model (1168 > 1024). Running this sequence through the model will result in indexing errors
Map: 100%|██████████| 49776/49776 [01:30<00:00, 549.34 examples/s]


### Load Pre-trained GPT2 models

Load GPT2 twice. First is optimized, second is reference model for KL-divergence. 

In [55]:
model = AutoModelForCausalLMWithValueHead.from_pretrained(config.model_name)
ref_model = AutoModelForCausalLMWithValueHead.from_pretrained(config.model_name)
tokenizer = AutoTokenizer.from_pretrained(config.model_name)

tokenizer.pad_token = tokenizer.eos_token

### Load BERT Classifier 
We load a BERT classifier fine-tuned on IMDB dataset. 

In [56]:
device = 0 if torch.cuda.is_available() else "cpu"
sentiment_pipe = pipeline(
    "sentiment-analysis", model="lvwerra/distilbert-imdb", device=device
)
sent_kwargs = {"top_k": None, "function_to_apply": "none", "batch_size": 16}

In [57]:
text = "I found the acting, direction and story of this movie terrible. There was a boring vibe all along."
sentiment_pipe(text, **sent_kwargs)

[{'label': 'NEGATIVE', 'score': 2.5600204467773438},
 {'label': 'POSITIVE', 'score': -2.9452600479125977}]

In [58]:
text = "It was an astonishing one. The plot had excitement and unpredictability. The graphics were carefully crafted."
sentiment_pipe(text, **sent_kwargs)

[{'label': 'POSITIVE', 'score': 2.2316293716430664},
 {'label': 'NEGATIVE', 'score': -1.9940400123596191}]

In [59]:
def get_sent_scores(queries, responses):
    texts = [q + r for q, r in zip(queries, responses)]
    pipe_outputs = sentiment_pipe(texts, **sent_kwargs)
    neg_scores = [item["score"] for output in pipe_outputs for item in output if item["label"] == "NEGATIVE"]
    return neg_scores

## Optimize Model

### Initialize PPOTrainer

In [60]:
from torch.utils.data import Subset

subset_length = 3000
indices = list(range(subset_length))
subset = Subset(dataset["train"], indices) # dataset has "train", "test", and "unsupervised"

ppo_trainer = PPOTrainer(
    config, model, ref_model, tokenizer, dataset=subset, data_collator=collator
)



### Training Loop
Training loop consists of three main steps:
1. Get query responses from policy network (GPT-2)
2. Get sentiments for query/responses from BERT 
3. Optimize policy with PPO using the (query, response, reward) triplet

In [61]:
output_min_len = 10
output_max_len = 20

output_length_sampler = LengthSampler(output_min_len, output_max_len)

generation_kwargs = {
    "min_length": -1,
    "top_k": 0.0,
    "top_p": 1.0,
    "do_sample": True,
    "pad_token_id": tokenizer.eos_token_id,
}

for epoch, batch in enumerate(tqdm(ppo_trainer.dataloader)):
    query_tensors = batch["input_ids"]
    
    # Get response from gpt2
    response_tensors = []
    for query in query_tensors:
        gen_len = output_length_sampler()
        generation_kwargs["max_new_tokens"] = gen_len
        response = ppo_trainer.generate(query, **generation_kwargs).squeeze()
        response_tensors.append(response[len(query):])
    batch["response"] = [tokenizer.decode(response) for response in response_tensors]
    
    # Compute sentiment score
    scores = get_sent_scores(batch["query"], batch["response"])
    rewards = [torch.tensor(score) for score in scores]
    
    stats = ppo_trainer.step(query_tensors, response_tensors, rewards)
    ppo_trainer.log_stats(stats, batch, rewards)

100%|██████████| 23/23 [10:53<00:00, 28.42s/it]


## Model Inspection

In [62]:
import random
# get a batch from the dataset
bs = 16
gen_reviews = dict()
# subset.set_format("pandas")
df_batch = subset.dataset[random.sample(range(len(subset.dataset)), bs)]  # subset consists of dataset and indices
gen_reviews["query"] = df_batch["query"]
query_tensors = df_batch["input_ids"]

response_tensors_ref, response_tensors = [], []

# get response from gpt2 and gpt2_ref
for i in range(bs):
    query = torch.tensor(query_tensors[i]).to(device)

    gen_len = output_length_sampler()
    generation_kwargs["max_new_tokens"] = gen_len
    query_response = ref_model.generate(
        query.unsqueeze(0), **generation_kwargs
    ).squeeze()
    response_len = len(query_response) - len(query)
    response_tensors_ref.append(query_response[-response_len:])

    query_response = model.generate(
        query.unsqueeze(0), **generation_kwargs
    ).squeeze()
    response_tensors.append(query_response[len(query):])
    
# decode responses
gen_reviews["response (before)"] = [
    tokenizer.decode(response_tensors_ref[i]) for i in range(bs)
]
gen_reviews["response (after)"] = [
    tokenizer.decode(response_tensors[i]) for i in range(bs)
]

# sentiment analysis of query/response pairs before and after 
scores_before = get_sent_scores(gen_reviews["query"], gen_reviews["response (before)"])
scores_after = get_sent_scores(gen_reviews["query"], gen_reviews["response (after)"])
gen_reviews["rewards (before)"] = scores_before
gen_reviews["rewards (after)"] = scores_after

df_results = pd.DataFrame(gen_reviews)
df_results

  query = torch.tensor(query_tensors[i]).to(device)


Unnamed: 0,query,response (before),response (after),rewards (before),rewards (after)
0,The word 'classic',"doesn't really suffice, since this film is fa...",- deperson and things to elevate the clichés ...,1.802846,-0.07348
1,"What an inspiring movie, I laughed",out loud.<|endoftext|>,out loud a few times . I shouldn't have; I di...,-1.827979,-1.276709
2,For people interested in business,"or WWE expertise, visit www.wwea.com (if not ...","careers such as Donald Trump, many of the exa...",-0.50588,-1.060472
3,Eytan,""" and a fellow balloted he is son of Jimi Lupi...",the Cruel in prison (which did not even deser...,-0.501725,1.110715
4,This may actually be,"true"" it had briefly touched an emotional cho...","a bit lame this time, especially those inevit...",-1.307753,1.274283
5,this film takes you inside,"a forest, services massravaged romance betwee...","the pansies of 45 - 50 bumish people, giving ...",-0.80307,-0.263245
6,Guys and Dolls is,a dual 'brilliant' film of the people that will,a let down. Colon is a garbage and inconceiva...,-2.425464,2.445132
7,Julian Noble (Pierce Bros,nan) Wickford Lane (Charles Durning,"nan in ""Donnie Capote"") talks as",0.060136,-0.824049
8,You want,to fight back on your true respect for the so...,? Do not bother. If you love the movie,-0.953021,-0.390523
9,You probably all already know this,", but I wasted my hard earned sleep for it.<br...",from movies like a made or planned or repeate...,2.054764,2.273826


The result shows that negative score (reward) increased for most generated reviews.