In [138]:
from comet_ml import Experiment

import transformers
import torch

devices = ["cuda:0", "cuda:1"]
device = devices[0]

In [None]:
tokenizer = transformers.GPT2Tokenizer.from_pretrained('gpt2')
# setting pad_token_id gets rid of a warning that shows up when calling .generate()
ref_model = transformers.GPT2LMHeadModel.from_pretrained('gpt2', pad_token_id=tokenizer.eos_token_id).to(device)

In [195]:
model = transformers.GPT2LMHeadModel.from_pretrained('gpt2', pad_token_id=tokenizer.eos_token_id).to(device)

In [151]:
def count_periods(sentences):
    return [sentence.count('.') for sentence in sentences]

In [201]:
sentence = "My feedback for the Machine Learning Alignment Bootcamp is"
tokens = tokenizer(sentence)
tokens = torch.tensor(tokens.input_ids, dtype=torch.long, device=device).unsqueeze(0)
output = model(tokens)

In [None]:
prediction = output.logits[0,-1].argmax(dim=-1)
print(tokenizer.decode([prediction]))
print(output.logits)

!
tensor([[[nan, nan, nan,  ..., nan, nan, nan],
         [nan, nan, nan,  ..., nan, nan, nan]]], device='cuda:0',
       grad_fn=<UnsafeViewBackward0>)


In [205]:
output = model.generate(tokens, min_length=120, max_length=120, top_k=len(tokenizer), temperature=0.6, do_sample=True)
print(tokenizer.decode(output.squeeze(0)))
# docs for generate(): 
# https://huggingface.co/docs/transformers/v4.15.0/en/main_classes/model#transformers.generation_utils.GenerationMixin.generate

My feedback for the Machine Learning Alignment Bootcamp is so much wonderful, we began to work together with the community to build a robust, and scalable, performance-enhancing machine learning solution. The Machine Learning Alignment Bootcamp is an excellent introduction to machine learning, and we are thrilled to be involved in bringing it to you.

We are excited that we are able to deliver a truly comprehensive and comprehensive document on the tools and techniques that can help you explore the world of machine learning. This book will help you navigate through the world of machine learning to understand the basics of the world-


In [153]:
bert_tokenizer = transformers.AutoTokenizer.from_pretrained("distilbert-base-uncased-finetuned-sst-2-english")
bert = transformers.AutoModelForSequenceClassification.from_pretrained("distilbert-base-uncased-finetuned-sst-2-english")
bert.to(device)
bert.eval();

In [193]:
def get_positive_sentiment(strs):
    tokens = bert_tokenizer(strs, padding=True)
    tokens = torch.tensor(tokens.input_ids, dtype=torch.long, device=device)
    with torch.no_grad():
        output = bert(tokens)
    return output.logits.softmax(dim=-1)[:,1]

NameError: name 's' is not defined

In [None]:
NUM_GENERATED_TOKENS = 20 # num tokens generated in an episode
BATCH_SIZE = 50 # num episodes in a batch
LR = 3e-5
VOCAB_SIZE = 50257

PROMPT = "This is"
PROMPT_TOKENS = torch.tensor(tokenizer(PROMPT).input_ids, dtype=torch.long, device=device).unsqueeze(0)
PROMPT_BATCH = PROMPT_TOKENS.repeat(BATCH_SIZE, 1)

In [196]:
def train(
        model, 
        ref_model, 
        value_head, 
        num_epochs, 
        tokenizer, 
        prompt_batch, 
        lr, 
        num_generated_tokens, 
        kl_coefficient,
        entropy_coefficient,
        value_head_coefficient,
        # takes a list of sentences, returns score for each sentence
        sentence_reward_fn = count_periods,
        run_experiment = True,
):
    batch_size = prompt_batch.shape[0]
    all_params = list(model.parameters()) + list(value_head.parameters())
    optim = torch.optim.Adam(all_params, lr=lr)
    # lr_scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optim)
    lr_scheduler = torch.optim.lr_scheduler.LambdaLR(optim, lambda x: x / num_epochs)
    model.train()
    value_head.train()
    ref_model.eval()
    
    if run_experiment:
        experiment = Experiment(
            api_key="72XQSdnwnBcob4Q8NpbJHewll",
            project_name="tom-dan",
            workspace="danielb",
        )
    for time_step in range(num_epochs):
        # Also possible to pass a single prompt with num_return_sequences=BATCH_SIZE
        # generated = model.generate(tokens, do_sample=True, max_new_tokens=num_generated_tokens).logits[0,-1].argmax(dim=-1)
        with torch.no_grad():
            generated = model.generate(prompt_batch, do_sample=True, top_k=VOCAB_SIZE, max_new_tokens=num_generated_tokens)
        # generated = model.generate(prompt_batch, do_sample=True, top_k=VOCAB_SIZE, max_new_tokens=num_generated_tokens, output_scores=True, return_dict_in_generate=True)
        generated_text = tokenizer.batch_decode(generated)
        rewards_per_batch = torch.tensor(sentence_reward_fn(generated_text), dtype=torch.float32, device=device)
        # normalize the rewards
        normalized_rewards_per_batch = (rewards_per_batch - rewards_per_batch.mean()) / (rewards_per_batch.std() + 1e-5)

        # Pass all tokens to next forward pass 
        full_output = model(generated, output_hidden_states=True)     
        
        output = full_output.logits
        output_softmax = torch.nn.functional.softmax(output, dim=-1)

        # predicted_probs[i, j] = output_softmax[i, j, generated[i, j+1]]
        #   = the output probability of the next token (for batch i, token j)
        predicted_probs = torch.gather(output_softmax[:,:-1], dim=2, index=generated[:,1:].unsqueeze(-1)).squeeze(-1)
        # cut off the prompt
        predicted_probs = predicted_probs[:,-num_generated_tokens:]
        log_probs = (predicted_probs + 1e-10).log()

        with torch.no_grad():
            ref_dist = torch.nn.functional.softmax(ref_model(generated).logits, dim=-1)

        #log_probs:           batch_size x num_generated_tokens
        #rewards_per_batch:   batch_size
        reward_per_action = log_probs * normalized_rewards_per_batch.unsqueeze(-1)
        policy_loss = -torch.mean(reward_per_action)
        
        log_ref_dist = (ref_dist + 1e-10).log()
        log_output_softmax = (output_softmax + 1e-10).log()
        kl = (ref_dist * log_ref_dist - ref_dist * log_output_softmax).sum(dim=-1).mean()
        entropy = -(output_softmax * log_output_softmax).sum(dim=-1).mean()

        value_estimate = value_head(full_output.hidden_states[-1]).squeeze(-1) # batch_size x seq_len
        # value_diffs = TD-1 error; batch_size x seq_len
        value_diffs = value_estimate.clone()
        value_diffs[:,:-1] -= value_estimate[:,1:].detach()
        value_diffs[:,-1] -= normalized_rewards_per_batch
        value_diffs = value_diffs[:,-num_generated_tokens:]
        value_loss = (value_diffs ** 2).mean()
        
        # value_diff_det = value_diffs.detach()
        # advantages = (value_diffs_det - value_diffs_det.mean(dim=-1).unsqueeze(-1)) / (value_diffs_det.std(dim=-1).unsqueeze(-1) + 1e-5)
        
        loss = policy_loss + value_head_coefficient * value_loss + kl_coefficient * kl + entropy_coefficient * -entropy
        optim.zero_grad()
        loss.backward()
        grad_norm = torch.nn.utils.clip_grad_norm_(all_params, max_norm=1.0, error_if_nonfinite=True)
        optim.step()
        #lr_scheduler.step(rewards_per_batch.mean())
        lr_scheduler.step()
      
        if run_experiment:
            experiment.log_metric("loss", loss)
            experiment.log_metric("policy loss", policy_loss)
            experiment.log_metric("value head loss", value_loss)
            experiment.log_metric("unnormalized reward", rewards_per_batch.mean())
            experiment.log_metric("entropy", entropy)
            experiment.log_metric("kl", kl)
            experiment.log_metric("grad norm", grad_norm)
    
    if run_experiment:
        # this is a JupyterLab notebook so we have to explicitly end the experiment
        experiment.end()
        
reinitialize_model = True

if reinitialize_model:
    model = transformers.GPT2LMHeadModel.from_pretrained('gpt2', pad_token_id=tokenizer.eos_token_id).to(device)
    # attaches to last hidden output of GPT2
    value_head = torch.nn.Linear(768, 1).to(device)

train(
    model=model,
    ref_model=ref_model,
    value_head=value_head,
    tokenizer=tokenizer, 
    num_epochs=100, 
    prompt_batch=PROMPT_BATCH, 
    lr=LR, 
    num_generated_tokens=NUM_GENERATED_TOKENS,
    kl_coefficient=0.1,
    entropy_coefficient=0,
    value_head_coefficient=0,
    run_experiment = True,
    sentence_reward_fn=get_positive_sentiment,
)

COMET INFO: Experiment is live on comet.ml https://www.comet.ml/danielb/tom-dan/b1731d58d1dd4d7e97a16c0a4a348aa0

  rewards_per_batch = torch.tensor(sentence_reward_fn(generated_text), dtype=torch.float32, device=device)
COMET INFO: ---------------------------
COMET INFO: Comet.ml Experiment Summary
COMET INFO: ---------------------------
COMET INFO:   Data:
COMET INFO:     display_summary_level : 1
COMET INFO:     url                   : https://www.comet.ml/danielb/tom-dan/b1731d58d1dd4d7e97a16c0a4a348aa0
COMET INFO:   Metrics [count] (min, max):
COMET INFO:     entropy [100]             : (3.8491384983062744, 4.783637523651123)
COMET INFO:     grad norm [100]           : (6.400127410888672, 13.617693901062012)
COMET INFO:     kl [100]                  : (0.2592507004737854, 0.5355492234230042)
COMET INFO:     loss [110]                : (-0.48893728852272034, 0.29212549328804016)
COMET INFO:     policy loss [100]         : (-0.5347968339920044, 0.25495100021362305)
COMET INFO:     u