In [1]:
import torch
from tqdm import tqdm
import pandas as pd

tqdm.pandas()

from transformers import pipeline, AutoTokenizer, AutoModelForCausalLM
from datasets import load_dataset

from trl import PPOTrainer, PPOConfig, AutoModelForCausalLMWithValueHead
from trl.core import LengthSampler

from auto_gptq import exllama_set_max_input_length

  def forward(ctx, input, qweight, scales, qzeros, g_idx, bits, maxq):
  def backward(ctx, grad_output):
  @custom_fwd(cast_inputs=torch.float16)
CUDA extension not installed.
CUDA extension not installed.


In [2]:
config = PPOConfig(
    model_name="TheBloke/Swallow-13B-GPTQ",
    learning_rate=1.41e-5,
    # log_with="wandb",
	# batch_size=64,
	# mini_batch_size=64
)

# emotion pipe用の設定
sent_kwargs = {"return_all_scores": True, "function_to_apply": "none", "batch_size": 16}

In [3]:
# import wandb

# wandb.init()

In [4]:
def build_dataset(config, dataset_name="shunk031/wrime", ver="ver1", input_min_text_length=0, input_max_text_length=8):
	tokenizer = AutoTokenizer.from_pretrained(config.model_name)
	# tokenizer.pad_token = tokenizer.pad_token
	tokenizer.pad_token = tokenizer.eos_token

	ds = load_dataset(dataset_name, ver, split="train")
	ds = ds.remove_columns(["user_id", "datetime", "writer", "reader1", "reader2", "reader3", "avg_readers"])

	def tokenize(sample):
		stc_length = len(tokenizer.encode(sample["sentence"]))
		if stc_length < input_max_text_length:
			input_size = stc_length
		else :
			input_size = input_max_text_length
		sample["input_ids"] = tokenizer.encode(sample["sentence"])[: input_size]
		sample["query"] = tokenizer.decode(sample["input_ids"])
		return sample

	ds = ds.map(tokenize, batched=False)
	ds.set_format(type="torch")
	return ds

In [5]:
dataset = build_dataset(config)

tokenizer_config.json:   0%|          | 0.00/772 [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/914k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/411 [00:00<?, ?B/s]

Map:   0%|          | 0/40000 [00:00<?, ? examples/s]

In [6]:
len(dataset)

40000

In [7]:
def collator(data):
	return dict((key, [d[key] for d in data]) for key in data[0])

In [8]:
# Load LLM models
model = AutoModelForCausalLMWithValueHead.from_pretrained(config.model_name, low_cpu_mem_usage=True, device_map="auto")
ref_model = AutoModelForCausalLMWithValueHead.from_pretrained(config.model_name, low_cpu_mem_usage=True, device_map="auto")

model = exllama_set_max_input_length(model, max_input_length=2304)
ref_model = exllama_set_max_input_length(ref_model, max_input_length=2304)

tokenizer = AutoTokenizer.from_pretrained(config.model_name)

tokenizer.pad_token = tokenizer.eos_token

The cos_cached attribute will be removed in 4.39. Bear in mind that its contents changed in v4.38. Use the forward method of RoPE from now on instead. It is not used in the `LlamaAttention` class
The sin_cached attribute will be removed in 4.39. Bear in mind that its contents changed in v4.38. Use the forward method of RoPE from now on instead. It is not used in the `LlamaAttention` class


AttributeError: 'AutoModelForCausalLMWithValueHead' object has no attribute 'quantize_config'

In [None]:
print(tokenizer.pad_token)
print(tokenizer.pad_token_id)
print(tokenizer.eos_token)
print(tokenizer.eos_token_id)

</s>
2
</s>
2


In [None]:
# initialize PPOTrainer
ppo_trainer = PPOTrainer(config, model, ref_model, tokenizer, dataset=dataset, data_collator=collator)

In [None]:
device = ppo_trainer.accelerator.device
if ppo_trainer.accelerator.num_processes == 1:
	device = 0 if torch.cuda.is_available() else "cpu"


In [None]:
emotion_pipe = pipeline("text-classification", model="/workspace/trl_example/emo_model/bert-large-japanese-v2/checkpoint-4276/", device=device)

In [None]:
text = "今日は残念ながら大雨で体育祭は中止です。"
emotion_pipe(text, **sent_kwargs)



[[{'label': 'JOY', 'score': -3.8448824882507324},
  {'label': 'SADNESS', 'score': 1.988978385925293},
  {'label': 'ANTICIPATION', 'score': -2.9459354877471924},
  {'label': 'SURPRISE', 'score': -3.4601120948791504},
  {'label': 'RAGE', 'score': -5.411185264587402},
  {'label': 'FEAR', 'score': -3.390770435333252},
  {'label': 'DISGUST', 'score': -3.706437826156616},
  {'label': 'TRUST', 'score': -5.7069501876831055}]]

In [None]:
# gen_kwargs = {"min_length": -1, "top_k": 0.0, "top_p": 1.0, "do_sample": True, "pad_token_id": tokenizer.pad_token_id}
# gen_kwargs = {"min_length": 0, "top_k": 50, "top_p": 0.95, "do_sample": True, "pad_token_id": tokenizer.pad_token_id}
gen_kwargs = {"min_length": 0, "top_k": 500, "top_p": 0.95, "do_sample": True, "pad_token_id": tokenizer.eos_token_id}

In [None]:
# select emotion
emotion = "JOY"

emotion_dict = {"JOY": 0,
    "SADNESS": 1,
    "ANTICIPATION": 2,
    "SURPRISE": 3,
    "RAGE": 4,
    "FEAR": 5,
    "DISGUST": 6,
    "TRUST": 7
	}

emotion_id = emotion_dict[emotion]

In [None]:
# output_min_length = 10
# output_max_length = 10
# output_length_sampler = LengthSampler(output_min_length, output_max_length)


generation_kwargs = {
    "min_length": -1,
    # "min_length": 0,
    "top_k": 500,
    "top_p": 0.95,
    "do_sample": True,
    # "pad_token_id": tokenizer.pad_token_id,
    "pad_token_id": tokenizer.eos_token_id,
    "temperature": 1
}


for epoch, batch in tqdm(enumerate(ppo_trainer.dataloader)):
    query_tensors = batch["input_ids"]

    #### Get response from calm
    response_tensors = []
    for query in query_tensors:
        # gen_len = output_length_sampler()
        gen_len = 10
        generation_kwargs["max_new_tokens"] = gen_len
        response = ppo_trainer.generate(query, **generation_kwargs)
        if len(response.squeeze()) < gen_len:
            gen_len = len(response.squeeze())
        response_tensors.append(response.squeeze()[-gen_len:])
    batch["response"] = [tokenizer.decode(r.squeeze()) for r in response_tensors]

    #### Compute sentiment score
    texts = [q + r for q, r in zip(batch["query"], batch["response"])]
    pipe_outputs = emotion_pipe(texts, **sent_kwargs)
    rewards = [torch.tensor(output[emotion_id]["score"]) for output in pipe_outputs]

    #### Run PPO step
    stats = ppo_trainer.step(query_tensors, response_tensors, rewards)
    # ppo_trainer.log_stats(stats, batch, rewards)

0it [01:00, ?it/s]


RuntimeError: The temp_state buffer is too small in the exllama backend for GPTQ with act-order. Please call the exllama_set_max_input_length function to increase the buffer size for a sequence length >=2304:
from auto_gptq import exllama_set_max_input_length
model = exllama_set_max_input_length(model, max_input_length=2304)

In [None]:
#### get a batch from the dataset
bs = 32
game_data = dict()
dataset.set_format("pandas")
df_batch = dataset[:].sample(bs)
game_data["query"] = df_batch["query"].tolist()
query_tensors = df_batch["input_ids"].tolist()

response_tensors_ref, response_tensors = [], []

#### get response from gpt2 and gpt2_ref
for i in range(bs):
    # gen_len = output_length_sampler()
    gen_len = 10
    output = ref_model.generate(
        torch.tensor(query_tensors[i]).unsqueeze(dim=0).to(device), max_new_tokens=gen_len, **gen_kwargs
    ).squeeze()[-gen_len:]
    response_tensors_ref.append(output)
    output = model.generate(
        torch.tensor(query_tensors[i]).unsqueeze(dim=0).to(device), max_new_tokens=gen_len, **gen_kwargs
    ).squeeze()[-gen_len:]
    response_tensors.append(output)

#### decode responses
game_data["response (before)"] = [tokenizer.decode(response_tensors_ref[i]) for i in range(bs)]
game_data["response (after)"] = [tokenizer.decode(response_tensors[i]) for i in range(bs)]

#### sentiment analysis of query/response pairs before/after
texts = [q + r for q, r in zip(game_data["query"], game_data["response (before)"])]
game_data["rewards (before)"] = [output[emotion_id]["score"] for output in emotion_pipe(texts, **sent_kwargs)]

texts = [q + r for q, r in zip(game_data["query"], game_data["response (after)"])]
game_data["rewards (after)"] = [output[emotion_id]["score"] for output in emotion_pipe(texts, **sent_kwargs)]

# store results in a dataframe
df_results = pd.DataFrame(game_data)
df_results



Unnamed: 0,query,response (before),response (after),rewards (before),rewards (after)
0,眠い( ́д⊂,)\n今日こそお題通り、2時間,)\n「そんな顔してたの!?」ってびっくり,-2.820653,0.963286
1,なんやかんやで観光地なんよね,ー。\n南越前町も、福井県大,」\n「えっ? えっ?」\n「,-0.597175,1.345917
2,結構大きな地震来ましたね。最近また,大きな地震多いなぁ。今日はちょっと肌寒いかなぁ,地震が多くなってきた気がします。\nさて、そんな,-2.396218,-2.329535
3,Pさん、一からソロチャンネルってのはやっぱり,気心知れた人じゃないと難しいですからね。そういう,すごいよなー。\nえ? えっ?,-2.840656,1.780662
4,大吉先生かわいい,っす。\nさて、お次はT-ARA,!\nえー、もう10年以上も前,-3.223706,-0.879288
5,将棋の渡辺くん見てると、渡辺先生,ってほんとすごいよなぁと思う。\nあの年,「え!?お、また髪の毛切ったの!?,1.693428,1.775075
6,中学校の同級生が、お茶の品評会で,、県知事賞という最優秀な賞をとったという,金賞を受賞したという話をしていました。\nそのお茶は,-1.279851,0.183977
7,稲葉先生も見えてる・・・,\n2月10日~11日に毎年恒例の新,」「えっ!? えっ!? どういうこと!?」\n,-1.227304,1.544269
8,どんなにお金なくてもクヌルプに行きたいと,(もうすでに)思っていたから、そういう気持ちが少しでも,何度聞いたことか(笑)\n\nそして、,-1.823187,-0.600454
9,どーせ教育とか言って正当化するの,やめてほしいわな\nお前みたいな雑魚には,」\n「え、え? どうしてそんな,-3.721265,-1.485774


In [None]:
print("mean:")
display(df_results[["rewards (before)", "rewards (after)"]].mean())
print()
print("median:")
display(df_results[["rewards (before)", "rewards (after)"]].median())

mean:


rewards (before)   -2.396147
rewards (after)    -0.031927
dtype: float64


median:


rewards (before)   -2.693935
rewards (after)     0.143005
dtype: float64

In [None]:
model.save_pretrained(f"/workspace/trl_example/save_dir/{config.model_name.split('/')[-1]}-{emotion}", push_to_hub=False)
tokenizer.save_pretrained(f"/workspace/trl_example/save_dir/{config.model_name.split('/')[-1]}-{emotion}", push_to_hub=False)

('/workspace/trl_example/save_dir/open-calm-1b-SURPRISE/tokenizer_config.json',
 '/workspace/trl_example/save_dir/open-calm-1b-SURPRISE/special_tokens_map.json',
 '/workspace/trl_example/save_dir/open-calm-1b-SURPRISE/tokenizer.json')