In [1]:
import os
import torch
import re
import json
# import gdown
from datasets import Dataset
import pandas as pd
from peft import LoraConfig,get_peft_model,PeftModel
from transformers import AutoModelForCausalLM, AutoTokenizer, TrainingArguments, BitsAndBytesConfig, GenerationConfig,TrainerCallback
from tqdm.auto import tqdm
from trl import DPOTrainer,DPOConfig,CPOTrainer,CPOConfig
import random

In [2]:
# Load the training dataset from a local on-disk directory.
# NOTE(review): cache paths printed by later cells mention `rlhf_plus_fff`, not
# `rlhf_plus_c` — confirm which dataset the recorded outputs were produced from.
train_dataset = Dataset.load_from_disk('autodl-tmp/dataset/rlhf_plus_c')

In [5]:
# Inspect the first record to check which columns the dataset provides.
train_dataset[0]

{'user1': '#任务：以下是对卷烟产品的评论，按照分类标签内容语义进行评价，简要说明理由# 评论：review = "这款烟在100左右的档次个人感觉性价比不错，假如拿它和20，30一包的烟去比性价比，那也许没几个人会选它。所以个人感觉口味和包装是个人喜好，性价比应该和这个档次的烟比，应该在8分左右。"# 输出格式： "分类标签": "A正面，B负面，C中性，D讽刺，E无产品评价语意", "理由": " "',
 'assistant1': '"分类标签": "A正面","理由": "评论者认为这款烟在其价格档次中性价比不错，给出了8分左右的高评价。虽然评论中提到了如果与更低价位的烟比较，其性价比可能不占优势，但整体上，评论者对产品的口味和包装表示满意，认为它符合个人喜好，因此整体语意偏向正面评价。"',
 'assistant2': '"分类标签": "C中性",\n"理由": "这段评论主要在讨论卷烟产品的性价比和与不同价格区间产品的比较，并给出了一个主观评分。评论者表达了个人对这款卷烟的看法，认为在100元左右的档次上，这款烟的性价比不错，适合与同档次的烟进行对比。虽然提到了个人口味和包装的喜好，但这些内容并未表达出明显的正面或负面情感倾向，因此可以归类为中性评价。"',
 'answer1': 'A正面',
 'answer2': 'C中性'}

In [4]:
def fix_format(x):
    """Prepare one preference record for DPO training.

    Sets ``x['prompt']`` to the ``user1`` text prefixed with a short role
    instruction, and makes sure that both ``x['chosen']`` and ``x['rejected']``
    end with exactly one ``<|im_end|>`` marker.

    The function is idempotent: applying it to an already processed record
    (for example when the notebook cell that maps it over the dataset is run
    a second time) does not append a second end marker.

    Args:
        x: Mutable mapping with the string fields ``user1``, ``chosen`` and
            ``rejected``.

    Returns:
        The same mapping ``x``, updated in place.

    Raises:
        KeyError: If one of the required fields is missing.
    """
    end_marker = '<|im_end|>'

    x['prompt'] = "你是语义分类器," + x['user1']

    for field in ('chosen', 'rejected'):
        # Only append the marker when it is not already there, so a re-run of
        # the mapping cell cannot produce '<|im_end|><|im_end|>'.
        if not x[field].endswith(end_marker):
            x[field] = x[field] + end_marker

    return x

In [5]:
# Apply fix_format to every record.
# NOTE(review): this rebinds `train_dataset` to its own mapped version, so
# re-running the cell feeds already-processed rows through fix_format again.
train_dataset = train_dataset.map(fix_format)

Loading cached processed dataset at /root/autodl-tmp/dataset/rlhf_plus_fff/cache-a77d60998c914659.arrow


In [6]:
# Preview the first ten processed records (returned as a dict of column lists).
train_dataset[:10]

{'prompt': ['<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\n#任务：以下是对卷烟产品的评论，按照分类标签内容语义进行评价，简要说明理由# 评论：review = "比五星红杉树要好抽\\xa0一点，有点苏烟的味道"# 输出格式： "分类标签": "A正面，B负面，C中性，D讽刺，E无产品评价语意", "理由": " "<|im_end|>\n<|im_start|>assistant\n',
  '<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\n#任务：以下是对卷烟产品的评论，按照分类标签内容语义进行评价，简要说明理由# 评论：review = "苏州地区无售,本人有幸收藏了一包。\\xa0包装精致。"# 输出格式： "分类标签": "A正面，B负面，C中性，D讽刺，E无产品评价语意", "理由": " "<|im_end|>\n<|im_start|>assistant\n',
  '<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\n#任务：以下是对卷烟产品的评论，按照分类标签内容语义进行评价，简要说明理由# 评论：review = "很不错，个人感觉比芙蓉王和22的利群都好抽"# 输出格式： "分类标签": "A正面，B负面，C中性，D讽刺，E无产品评价语意", "理由": " "<|im_end|>\n<|im_start|>assistant\n',
  '<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\n#任务：以下是对卷烟产品的评论，按照分类标签内容语义进行评价，简要说明理由# 评论：review = "都是吸这个烟。口感还不错"# 输出格式： "分类标签": "A正面，B负面，C中性，D讽刺，E无产品评价语意", "理由": " "<|im_end|>\n<|im_start|>assistant\n',
  '<|i

In [7]:
# Load the merged base model in 4-bit NF4 precision (bitsandbytes) with
# double quantization; matmuls are computed in bfloat16. Layer placement is
# left to `device_map='auto'`.
model = AutoModelForCausalLM.from_pretrained(
    'autodl-tmp/merge_model',
    quantization_config=BitsAndBytesConfig(
        bnb_4bit_quant_type='nf4',
        bnb_4bit_use_double_quant=True,
        bnb_4bit_compute_dtype=torch.bfloat16,
        load_in_4bit=True,
    ),
    trust_remote_code=True,
    device_map='auto',
)

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

In [8]:
# Display the module tree of the loaded model.
model

Qwen2ForCausalLM(
  (model): Qwen2Model(
    (embed_tokens): Embedding(152064, 3584)
    (layers): ModuleList(
      (0-27): 28 x Qwen2DecoderLayer(
        (self_attn): Qwen2SdpaAttention(
          (q_proj): Linear4bit(in_features=3584, out_features=3584, bias=True)
          (k_proj): Linear4bit(in_features=3584, out_features=512, bias=True)
          (v_proj): Linear4bit(in_features=3584, out_features=512, bias=True)
          (o_proj): Linear4bit(in_features=3584, out_features=3584, bias=False)
          (rotary_emb): Qwen2RotaryEmbedding()
        )
        (mlp): Qwen2MLP(
          (gate_proj): Linear4bit(in_features=3584, out_features=18944, bias=False)
          (up_proj): Linear4bit(in_features=3584, out_features=18944, bias=False)
          (down_proj): Linear4bit(in_features=18944, out_features=3584, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): Qwen2RMSNorm()
        (post_attention_layernorm): Qwen2RMSNorm()
      )
    )
    (norm): Qwen2RM

In [9]:
# LoRA adapter settings: rank 16, alpha 16, dropout 0.1, no bias training.
# Adapters are attached to the attention projections (q/k/v/o) and the MLP
# projections (gate/up/down).
peft_config = LoraConfig(
    task_type="CAUSAL_LM",
    r=16,
    lora_alpha=16,
    lora_dropout=0.1,
    bias="none",
    inference_mode=False,
    target_modules=[
        "q_proj",
        "k_proj",
        "v_proj",
        "o_proj",
        "gate_proj",
        "up_proj",
        "down_proj",
    ],
)

In [10]:
# Wrap the quantized model with the LoRA adapters described by `peft_config`.
# NOTE(review): `model` is rebound to the wrapped model, so re-running this
# cell without reloading the base model wraps an already-wrapped model.
model = get_peft_model(model,peft_config)

In [11]:
# Load the tokenizer from the same directory as the model, then display its
# configuration (special tokens, padding side, ...).
tokenizer = AutoTokenizer.from_pretrained('autodl-tmp/merge_model', trust_remote_code=True)
tokenizer

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Qwen2TokenizerFast(name_or_path='autodl-tmp/merge_model', vocab_size=151643, model_max_length=131072, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'eos_token': '<|im_end|>', 'pad_token': '<|endoftext|>', 'additional_special_tokens': ['<|im_start|>', '<|im_end|>']}, clean_up_tokenization_spaces=False),  added_tokens_decoder={
	151643: AddedToken("<|endoftext|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	151644: AddedToken("<|im_start|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	151645: AddedToken("<|im_end|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
}

In [12]:
# tokenizer.eos_token = tokenizer.pad_token
# NOTE(review): with bos == eos ('<|im_end|>'), the decoded DPO training example
# recorded further down starts with '<|im_end|>' — confirm this prefix is acceptable.
if tokenizer.bos_token is None:   # Qwen has no bos_token; it must be set, otherwise DPO training raises an error.
    tokenizer.add_special_tokens({"bos_token": tokenizer.eos_token})
    tokenizer.bos_token_id = tokenizer.eos_token_id

In [13]:
# Display the tokenizer again to check the special tokens after the bos_token change.
tokenizer

Qwen2TokenizerFast(name_or_path='autodl-tmp/merge_model', vocab_size=151643, model_max_length=131072, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'bos_token': '<|im_end|>', 'eos_token': '<|im_end|>', 'pad_token': '<|endoftext|>', 'additional_special_tokens': ['<|im_start|>', '<|im_end|>']}, clean_up_tokenization_spaces=False),  added_tokens_decoder={
	151643: AddedToken("<|endoftext|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	151644: AddedToken("<|im_start|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	151645: AddedToken("<|im_end|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
}

In [14]:
# DPO training configuration.
#
# Effective batch size is 1 x 8 (gradient accumulation); one epoch; the first
# 10% of steps are warm-up; loss is logged every 10 steps; no external
# experiment tracker is used.
#
# `max_prompt_length` bounds how many prompt tokens DPOTrainer keeps when it
# truncates. The chat-formatted prompts in this dataset are roughly 100 tokens
# long, so the previous cap of 64 would have cut away part of the prompt
# (including the review text) whenever truncation is applied. 256 leaves the
# prompt intact and still leaves room for the completion within `max_length`.
training_args = DPOConfig(
    output_dir="autodl-tmp/rlhf_0707",
    beta=0.1,
    per_device_train_batch_size=1,
    num_train_epochs=1,
    gradient_accumulation_steps=8,
    gradient_checkpointing=False,
    logging_steps=10,
    warmup_ratio=0.1,
    report_to='none',
    max_length=512,
    max_prompt_length=256,
)

In [15]:
# Tokenize the first prompt to inspect its token ids and length.
tokenizer.encode(train_dataset[0]['prompt'],return_tensors='pt')

tensor([[151644,   8948,    198,   2610,    525,    264,  10950,  17847,     13,
         151645,    198, 151644,    872,    198,      2,  88802,   5122,  87752,
         106273, 100199,  99752, 104703,  85641,   3837, 101892,  70538, 105151,
          43815,  72881,  64559,  71817, 103964,   3837,  98237,  30534,  66394,
         102401,      2,    220,  85641,   5122,  19417,    284,    330,  56006,
         110937,  99425, 103268,  99613,  30534,  52801,  99950,  46396,     15,
         100380,   3837, 104037,  99908,  99752, 107254,  57676,  70568,  68805,
           5122,    330,  70538, 105151,    788,    330,     32, 106557,   3837,
             33, 103276,   3837,     34,  15946,  33071,   3837,     35, 115469,
           3837,     36,  42192,  82700, 103964,  72881,  36589,    497,    330,
         102401,    788,    330,    330, 151645,    198, 151644,  77091,    198]])

In [16]:
# Round-trip check: encode the first prompt and decode it back to text.
tokenizer.decode(tokenizer.encode(train_dataset[0]['prompt'],return_tensors='pt')[0])

'<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\n#任务：以下是对卷烟产品的评论，按照分类标签内容语义进行评价，简要说明理由# 评论：review = "比五星红杉树要好抽\\xa0一点，有点苏烟的味道"# 输出格式： "分类标签": "A正面，B负面，C中性，D讽刺，E无产品评价语意", "理由": " "<|im_end|>\n<|im_start|>assistant\n'

In [22]:
class MyCallback(TrainerCallback):
    """Print a sample generation every 100 optimizer steps during training.

    The callback relies on the notebook-level globals ``model``, ``tokenizer``
    and ``train_dataset``. For a randomly chosen training record it prints the
    current step, the model's generation for the record's prompt, and the
    record's ``chosen`` / ``rejected`` completions for comparison.
    """

    def on_step_end(self, args, state, control, **kwargs):
        """Run the periodic sample generation.

        Args:
            args: Training arguments supplied by the Trainer (unused).
            state: Trainer state; only ``state.global_step`` is read.
            control: Trainer control object (unused, not modified).
            **kwargs: Additional objects supplied by the Trainer (unused).
        """
        if state.global_step % 100 != 0:
            return

        print(state.global_step)

        data = random.choice(train_dataset)
        input_ids = tokenizer.encode(data['prompt'],
                                     return_tensors='pt').to('cuda')

        # The model is in training mode while the Trainer runs, which keeps
        # LoRA dropout active. Switch to eval mode for the sample so the
        # printed text reflects what inference would produce, then restore
        # the previous mode so training is unaffected.
        was_training = model.training
        model.eval()
        try:
            out = model.generate(input_ids,
                                 pad_token_id=tokenizer.pad_token_id,
                                 eos_token_id=tokenizer.eos_token_id,
                                 max_new_tokens=64)
        finally:
            if was_training:
                model.train()

        print(tokenizer.decode(out[0]))
        print('='*20)
        print(data['chosen'])
        print(data['rejected'])
        print('='*20)

In [23]:
# Assemble the DPO trainer from the LoRA-wrapped model, the training
# configuration, the processed dataset and the tokenizer. The sampling
# callback is handed over as a class, not an instance. No reference model
# is passed explicitly.
dpo_trainer = DPOTrainer(
    model=model,
    args=training_args,
    tokenizer=tokenizer,
    train_dataset=train_dataset,
    callbacks=[MyCallback],
)

Loading cached processed dataset at /root/autodl-tmp/dataset/rlhf_plus_fff/cache-4475eacdaedac430.arrow


In [24]:
#print(dpo_trainer.tokenize_row(train_dataset[0]))

In [25]:
# Decode the trainer's tokenization of the first record (prompt + chosen
# completion), keeping special tokens visible to verify the final layout.
tokenizer.decode(dpo_trainer.tokenize_row(train_dataset[0])['chosen_input_ids'],skip_special_tokens=False)

'<|im_end|><|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\n#任务：以下是对卷烟产品的评论，按照分类标签内容语义进行评价，简要说明理由# 评论：review = "比五星红杉树要好抽\\xa0一点，有点苏烟的味道"# 输出格式： "分类标签": "A正面，B负面，C中性，D讽刺，E无产品评价语意", "理由": " "<|im_end|>\n<|im_start|>assistant\n"分类标签": "A正面","理由": "评论中提到该卷烟产品在口感上优于另一知名品牌（五星红杉树），并带有另一品牌（苏烟）的味道，这通常被视为对该产品的正面评价。"<|im_end|>'

In [26]:
# Run DPO training; the callback prints a sample generation every 100 steps.
dpo_trainer.train()

Step,Training Loss
10,0.0004
20,0.0001
30,0.0001
40,0.0001
50,0.0
60,0.0001
70,0.0
80,0.0
90,0.0
100,0.0


100
<|im_start|>system
You are a helpful assistant.<|im_end|>
<|im_start|>user
#任务：以下是对卷烟产品的评论，按照分类标签内容语义进行评价，简要说明理由# 评论：review = "出去的时候那盒"# 输出格式： "分类标签": "A正面，B负面，C中性，D讽刺，E无产品评价语意", "理由": " "<|im_end|>
<|im_start|>assistant
"分类标签":"C中性","理由": "该评论'出去的时候那盒'缺乏明确的产品评价信息，它没有表达出明显的正面或负面情感，也没有讽刺意味，更没有直接提及产品的任何特点。这句话看起来更像是一个无关紧要的陈述，因此它被归类为中
"分类标签": "C中性","理由": "评论中的内容\'出去的时候那盒\'并没有明确的情感色彩，也没有直接提及产品的好坏，因此无法判断为正面或负面评价。它更像是一个没有上下文的短语，不具备明显的评价意义。"<|im_end|>
"分类标签": "B负面",
"理由": "评论中的'出去的时候那盒'这句话可能暗示了评论者在外出时携带了一盒卷烟，但并没有提供关于产品质量、服务或其他与产品相关的具体信息。因此，这个评论更多的是描述了一个行为或情境，而不是对产品本身的评价，倾向于表达的是一个负面的情绪或不便，可能是由于携带卷烟而引起的某种不愉快经历。"<|im_end|>
200
<|im_start|>system
You are a helpful assistant.<|im_end|>
<|im_start|>user
#任务：以下是对卷烟产品的评论，按照分类标签内容语义进行评价，简要说明理由# 评论：review = "很好，甜，香，爱不释手啊。"# 输出格式： "分类标签": "A正面，B负面，C中性，D讽刺，E无产品评价语意", "理由": " "<|im_end|>
<|im_start|>assistant
"<|im_end|>
"分类标签": "A正面","理由": "评论中使用了‘很好’，‘甜’，‘香’，‘爱不释手’等积极的词汇，明显表达了对卷烟产品的喜爱和满意，属于正面评价。"<|im_end|>
根据给出的评论内容"很好，甜，香，爱不释手啊。"，我们可以分析其语义情感和意图：

1.



600
<|im_start|>system
You are a helpful assistant.<|im_end|>
<|im_start|>user
#任务：以下是对卷烟产品的评论，按照分类标签内容语义进行评价，简要说明理由# 评论：review = "大众畅销货在我们这儿"# 输出格式： "分类标签": "A正面，B负面，C中性，D讽刺，E无产品评价语意", "理由": " "<|im_end|>
<|im_start|>assistant
"分类标签":"A正面","理由": "该评论没有明确的正面或负面情感色彩，只是简单地陈述了该产品在当地市场的销售情况，没有表达出对产品的具体喜好或不满。""<|im_end|>
"分类标签": "A正面","理由": "评论中提到‘大众畅销货’，通常这样的描述意味着产品在当地很受欢迎，具有积极的市场接受度，因此可以归类为正面评价。"<|im_end|>
"分类标签": "C中性",
"理由": "评论仅提到'大众畅销货'和'在我们这儿'，没有明确表达对产品质量或体验的正面或负面看法。因此，这可以被归类为中性评价。"<|im_end|>
700
<|im_start|>system
You are a helpful assistant.<|im_end|>
<|im_start|>user
#任务：以下是对卷烟产品的评论，按照分类标签内容语义进行评价，简要说明理由# 评论：review = "赃朋都好这口!"# 输出格式： "分类标签": "A正面，B负面，C中性，D讽刺，E无产品评价语意", "理由": " "<|im_end|>
<|im_start|>assistant
"分类标签":"A正面","理由": "评论中的‘赃朋都好这口’是一种方言表达，通常在中文里这种表述带有积极的情感色彩，意味着该产品被某一群体所喜爱，尽管语言有些模糊，但整体上可以理解为正面的评价。""<|im_end|>
"分类标签": "A正面","理由": "评论中的‘赃朋都好这口’表达了一种正面的社交认同感，意味着这个卷烟产品受到了一群朋友的喜爱和认可。‘好这口’通常指对某事物有喜好，在这里暗示了对该卷烟的正面评价。"<|im_end|>
"分类标签": "D讽刺",
"理由": "评论中的'赃朋都好这口!'使用了反语或讽刺的表达方式，

TrainOutput(global_step=972, training_loss=1.3155831788742769e-05, metrics={'train_runtime': 5762.1347, 'train_samples_per_second': 1.351, 'train_steps_per_second': 0.169, 'total_flos': 0.0, 'train_loss': 1.3155831788742769e-05, 'epoch': 1.0})

In [27]:
# Save the trained model to the output directory.
dpo_trainer.save_model('autodl-tmp/dpo_qwen_plus_0707')



In [28]:
# Save the tokenizer (including the bos_token set earlier) next to the model.
tokenizer.save_pretrained('autodl-tmp/dpo_qwen_plus_0707')

('autodl-tmp/dpo_qwen_plus_0707/tokenizer_config.json',
 'autodl-tmp/dpo_qwen_plus_0707/special_tokens_map.json',
 'autodl-tmp/dpo_qwen_plus_0707/vocab.json',
 'autodl-tmp/dpo_qwen_plus_0707/merges.txt',
 'autodl-tmp/dpo_qwen_plus_0707/added_tokens.json',
 'autodl-tmp/dpo_qwen_plus_0707/tokenizer.json')

In [28]:
# Show the first prompt string.
# NOTE(review): execution counts from here on are non-sequential (In [28] twice,
# then In [22]); the recorded output may come from a different dataset state —
# verify with Restart Kernel -> Run All.
train_dataset['prompt'][0]

'你是一个语义分类器.#任务：以下是对卷烟产品的评论，按照分类标签内容语义进行评价，# 评论：review = "比五星红杉树要好抽\\xa0一点，有点苏烟的味道"# 输出格式： "分类标签": "A正面，B负面，C中性，D讽刺，E无产品评价语意"?'

In [22]:
# Tokenize the prompt of record 3 and move the resulting tensors to the GPU.
inputs_ids = tokenizer(train_dataset['prompt'][3],return_tensors='pt').to('cuda')


In [23]:
# Decode the tokenized prompt, keeping special tokens, to verify what the model will see.
tokenizer.decode(inputs_ids['input_ids'][0], skip_special_tokens=False)

'<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\n#任务：以下是对卷烟产品的评论，按照分类标签内容语义进行评价，简要说明理由# 评论：review = "都是吸这个烟。口感还不错"# 输出格式： "分类标签": "A正面，B负面，C中性，D讽刺，E无产品评价语意", "理由": " "<|im_end|>\n<|im_start|>assistant\n'

In [24]:
# Sample one completion for the tokenized prompt: low-temperature sampling
# (0.3), at most 128 new tokens, with the eos token id used for padding.
generated_ids = model.generate(
    **inputs_ids,
    do_sample=True,
    temperature=0.3,
    max_new_tokens=128,
    pad_token_id=tokenizer.eos_token_id,
)

In [25]:
# Decode prompt + generated continuation, keeping special tokens visible.
tokenizer.batch_decode(generated_ids, skip_special_tokens=False)

['<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\n#任务：以下是对卷烟产品的评论，按照分类标签内容语义进行评价，简要说明理由# 评论：review = "都是吸这个烟。口感还不错"# 输出格式： "分类标签": "A正面，B负面，C中性，D讽刺，E无产品评价语意", "理由": " "<|im_end|>\n<|im_start|>assistant\n"分类标签":"A正面","理由": "评论中提到‘口感还不错’，表达了对卷烟产品正面积极的评价，没有负面或讽刺的语气，属于正面评价。""<|im_end|>\n<|endoftext|>']

In [27]:
# Load the validation dataset from disk and display its schema and row count.
val_dataset = Dataset.load_from_disk('autodl-tmp/dataset/datasets_val')
val_dataset

Dataset({
    features: ['instruction', 'input', 'response_total', 'response'],
    num_rows: 36869
})

In [32]:
def process_func_val(example):
    """Turn one validation record into a label-only classification prompt.

    The ``input`` text is rewritten so that the model is asked for the label
    without an explanation:

    * triple single quotes are removed,
    * the phrase asking for a brief reason is replaced by one saying no reason
      is needed,
    * the trailing reason field of the output-format hint is replaced by ``?``.

    Args:
        example: Mapping with the string fields ``input`` and ``response``.

    Returns:
        A dict with ``prompt`` (the rewritten instruction) and ``answer``
        (the unchanged ``response`` value).
    """
    instruction = (
        example['input']
        .replace("'''", "")
        .replace('简要说明理由', '不需要说明理由')
        .replace(', "理由": " "', '?')
    )

    return {'prompt': instruction, 'answer': example['response']}

In [33]:
# Build the prompt/answer view of the validation set, dropping the original columns.
# NOTE(review): despite the name, `tokenized_val` holds plain strings; no tokenization happens here.
tokenized_val = val_dataset.map(process_func_val, remove_columns=val_dataset.column_names)
tokenized_val

Map:   0%|          | 0/36869 [00:00<?, ? examples/s]

Dataset({
    features: ['prompt', 'answer'],
    num_rows: 36869
})

In [34]:
# Inspect the first rewritten validation record.
tokenized_val[0]

{'prompt': '#任务：以下是对卷烟产品的评论，按照分类标签内容语义进行评价，不需要说明理由# 评论：review = "太难抽，又苦又涩"# 输出格式： "分类标签": "A正面，B负面，C中性，D讽刺，E无产品评价语意"?',
 'answer': 'B负面'}

In [35]:
# Render every validation prompt through the tokenizer's chat template
# (system + user turn, with the assistant generation prompt appended) and
# collect the resulting strings for batched generation.
batchs = []
for user_prompt in tokenized_val['prompt']:
    chat = [
        {"role": "system", "content": "You are a helpful assistant."},
        {"role": "user", "content": user_prompt},
    ]
    batchs.append(
        tokenizer.apply_chat_template(chat, tokenize=False, add_generation_prompt=True)
    )

In [36]:
from tqdm import tqdm

In [40]:
# Pad on the left so that, in a padded batch, every prompt ends at the last
# position and generated tokens directly follow it.
tokenizer.padding_side='left'

In [41]:
# Batched generation over all chat-formatted validation prompts in `batchs`.
#
# Fixes relative to the earlier version of this cell:
#   * The tokenizer's attention mask is now passed to `generate`. Previously
#     only `input_ids` were passed, so the left-padding tokens of shorter
#     prompts in a batch were attended to like real tokens.
#   * `pad_token_id` now matches the token the tokenizer actually pads with.
#   * The model is put in eval mode so LoRA dropout is off while generating.
EVAL_BATCH_SIZE = 8

tokenizer.padding_side = 'left'  # idempotent; required for batched generation
model.eval()

responses = []
for start in tqdm(range(0, len(batchs), EVAL_BATCH_SIZE)):
    # Slicing clamps at the end of the list, so the last batch may be shorter.
    batch_texts = batchs[start:start + EVAL_BATCH_SIZE]

    model_inputs = tokenizer(batch_texts, return_tensors="pt", padding=True).to('cuda')
    generated_ids = model.generate(
        **model_inputs,  # input_ids together with attention_mask
        max_new_tokens=32,
        temperature=0.5,
        do_sample=True,
        pad_token_id=tokenizer.pad_token_id,
    )

    # All rows share the same padded prompt length; keep only the new tokens.
    prompt_length = model_inputs.input_ids.shape[1]
    new_token_ids = generated_ids[:, prompt_length:]

    responses.extend(tokenizer.batch_decode(new_token_ids, skip_special_tokens=True))

100%|██████████| 4609/4609 [1:57:13<00:00,  1.53s/it]  


In [42]:
# Collect the generated responses under a single column name for export.
dct = {'rlhf':responses}

In [None]:
import pandas as pd

In [None]:
# Write the responses to CSV; 'utf-8-sig' adds a BOM to the file.
pd.DataFrame(dct).to_csv('fine_turning_dpo_last_version.csv',encoding='utf-8-sig')