In [4]:
import json
import time
from concurrent.futures import ThreadPoolExecutor
from dataclasses import dataclass

import numpy as np
import pandas as pd
import sklearn
import torch
from peft import PeftModel
from transformers import Gemma2ForSequenceClassification, GemmaTokenizerFast,BitsAndBytesConfig, PreTrainedTokenizerBase
from transformers.data.data_collator import pad_without_fast_tokenizer_warning

from utils import CustomTokenizerForHumanPreference

# Fail fast unless at least two CUDA devices are visible: the model-loading
# cell further down places one model on cuda:0 and another on cuda:1.
assert torch.cuda.device_count() >= 2

In [5]:
@dataclass
class Config:
    """Inference settings shared by the tokenisation and prediction cells.

    Every attribute carries a type annotation so that ``@dataclass`` registers
    it as a field. Without annotations the decorator sees no fields at all:
    the generated ``__init__`` accepts no arguments and the repr is empty.
    With them, ``Config(max_length=1024)`` works and ``Config()`` keeps the
    same defaults as before.
    """
    model_dir: str = '../../gemma-2-9b-it-bnb-4bit'  # directory of the base model and tokenizer
    lora_dir: str = 'outputs/checkpoint-14226'  # directory of the LoRA adapter checkpoint
    max_length: int = 2400  # token budget passed to the tokenizer for truncation
    batch_size: int = 2  # rows per forward pass in inference()
    device: torch.device = torch.device("cuda")
    tta: bool = True  # test-time (inference-stage) prompt augmentation switch
    spread_max_length: bool = False  # True: tokenize() truncates prompt and each response separately to max_length // 3
config = Config()

In [6]:
# Load the test set; its prompt / response_a / response_b columns are
# rewritten with process_text() right after that function is defined.
test = pd.read_csv("input/test.csv")

def process_text(text: str,prefix: str) -> str:
    """Flatten a JSON-encoded list of conversation turns into one tagged string.

    Args:
        text: JSON array of strings, e.g. ``'["hi", null]'``. A ``null`` entry
            is treated as an empty string.
        prefix: Tag name placed in front of every turn.

    Returns:
        The turns joined by single spaces, each rendered as
        ``<prefix:N>:turn`` with ``N`` starting at 1. An empty array yields
        ``""``.

    Raises:
        ValueError: If ``text`` is not valid JSON (``json.JSONDecodeError``).
    """
    # json.loads instead of eval: the cell content comes from a data file, so
    # it must never be executed as Python. The JSON parser also decodes
    # escapes (e.g. surrogate pairs) the way the JSON writer intended.
    turns = json.loads(text)
    return " ".join(
        f'<{prefix}:{turn_no}>:' + ("" if turn is None else turn)
        for turn_no, turn in enumerate(turns, start=1)
    )
# Rewrite each conversation column in place, tagging turns with the column name.
for column in ("prompt", "response_a", "response_b"):
    test.loc[:, column] = test[column].apply(lambda raw, tag=column: process_text(raw, tag))

def tokenize(
        tokenizer:PreTrainedTokenizerBase, prompt, response_a, response_b, max_length = config.max_length, spread_max_length = config.spread_max_length
):
    """Tokenize prompt/response triples without padding.

    Args:
        tokenizer: Tokenizer used to encode the text.
        prompt, response_a, response_b: Equal-length iterables of strings
            (the callers in this notebook pass pandas Series).
        max_length: Truncation limit in tokens.
        spread_max_length: If true, prompt and both responses are encoded
            separately, each truncated to ``max_length // 3``, and their token
            ids are concatenated. Otherwise the three strings are concatenated
            first and the result is truncated to ``max_length``.

    Returns:
        ``(input_ids, attention_mask)``: two parallel lists with one list of
        ints per example.
    """
    if spread_max_length:
        # Prompt and responses share the budget: each gets at most one third.
        part_budget = max_length // 3

        def _encode(texts):
            # The tokenizer's documented inputs are str / list of str, so a
            # pandas Series is materialised into a plain list first.
            return tokenizer(list(texts), max_length=part_budget, truncation=True, padding=False).input_ids

        # NOTE(review): each part is encoded on its own, so any special tokens
        # the tokenizer adds are added once per part; confirm this matches how
        # the training data was encoded.
        prompt_ids = _encode(prompt)
        response_a_ids = _encode(response_a)
        response_b_ids = _encode(response_b)
        input_ids = [p + r_a + r_b for p, r_a, r_b in zip(prompt_ids, response_a_ids, response_b_ids)]
        # Nothing is padded, so every position is a real token.
        attention_mask = [[1] * len(ids) for ids in input_ids]
    else:
        text = [p + r_a + r_b for p, r_a, r_b in zip(prompt, response_a, response_b)]
        tokenized = tokenizer(text, max_length=max_length, truncation=True, padding=False)
        input_ids = tokenized.input_ids
        attention_mask = tokenized.attention_mask
    return input_ids, attention_mask

# Tokenizer from the base model directory; EOS is appended and padding goes on the right.
tokenizer = GemmaTokenizerFast.from_pretrained(config.model_dir)
tokenizer.padding_side = "right"
tokenizer.add_eos_token = True


def _tokenized_frame(first_response, second_response):
    """Build an id / input_ids / attention_mask / length frame for one response order."""
    frame = pd.DataFrame()
    frame['id'] = test['id']
    token_ids, mask = tokenize(tokenizer, test['prompt'], first_response, second_response)
    frame['input_ids'] = token_ids
    frame['attention_mask'] = mask
    # Token count per row; used later to sort rows by length.
    frame['length'] = frame['input_ids'].apply(len)
    return frame


# Original order: response A, then response B.
data = _tokenized_frame(test['response_a'], test['response_b'])

# Responses A and B swapped, to cancel out the effect of presentation order.
swap_data = _tokenized_frame(test['response_b'], test['response_a'])

print(tokenizer.decode(data['input_ids'][0]))  # first example, original order

print(tokenizer.decode(swap_data['input_ids'][0]))  # first example, swapped order

TypeError: Series.apply() missing 1 required positional argument: 'func'

In [7]:

# One model copy per GPU.
device_0, device_1 = torch.device('cuda:0'), torch.device('cuda:1')


def _load_base_model(target_device):
    """Load the base sequence-classification model onto ``target_device``."""
    return Gemma2ForSequenceClassification.from_pretrained(
        config.model_dir,
        device_map = target_device,
        use_cache = False
    )


# Both base models are loaded first, then the LoRA adapter is attached to each.
model_gemma_0 = _load_base_model(device_0)
model_gemma_1 = _load_base_model(device_1)

model_gemma_0, model_gemma_1 = [
    PeftModel.from_pretrained(base_model, config.lora_dir)
    for base_model in (model_gemma_0, model_gemma_1)
]

@torch.no_grad()
@torch.autocast(device_type="cuda")  # replaces the deprecated torch.cuda.amp.autocast()
def inference(df, model, device, batch_size=config.batch_size, max_length = config.max_length):
    """Score every row of ``df`` with ``model`` and attach class probabilities.

    Args:
        df: DataFrame with ``input_ids`` and ``attention_mask`` columns holding
            one list of ints per row. It is modified in place.
        model: Model called as ``model(**inputs)``; its ``logits`` must have at
            least three columns.
        device: Device the padded batch is moved to before the forward pass.
        batch_size: Number of rows per forward pass.
        max_length: Unused; kept so existing callers keep working.

    Returns:
        The same ``df`` object with ``winner_model_a``, ``winner_model_b`` and
        ``winner_tie`` columns set from softmax columns 0, 1 and 2.
    """
    a_win, b_win, tie = [], [], []
    for start_idx in range(0, len(df), batch_size):
        # iloc slicing clamps at the end of the frame, so no explicit min() is needed.
        batch = df.iloc[start_idx:start_idx + batch_size]
        inputs = pad_without_fast_tokenizer_warning(
            tokenizer,
            {
                "input_ids": batch['input_ids'].to_list(),
                "attention_mask": batch['attention_mask'].to_list(),
            },
            padding="longest",  # pad only up to the longest row of this batch
            pad_to_multiple_of = None,
            return_tensors="pt"
        )
        outputs = model(**inputs.to(device))
        prob = outputs.logits.softmax(-1).cpu()

        a_win.extend(prob[:,0].tolist())
        b_win.extend(prob[:,1].tolist())
        tie.extend(prob[:,2].tolist())

    df["winner_model_a"] = a_win
    df["winner_model_b"] = b_win
    df["winner_tie"] = tie

    return df

st = time.time()

PROB_COLUMNS = ['winner_model_a', 'winner_model_b', "winner_tie"]


def _predict_on_both_gpus(sorted_frame):
    """Split rows alternately across the two models and run them concurrently.

    ``sorted_frame`` is expected to be sorted by length already, so taking
    every other row gives both GPUs a similar mix of sequence lengths.
    Returns the two scored shards concatenated (shard 0 first).
    """
    shards = (sorted_frame.iloc[0::2].copy(), sorted_frame.iloc[1::2].copy())
    with ThreadPoolExecutor(max_workers=2) as executor:
        scored = list(executor.map(inference, shards, (model_gemma_0, model_gemma_1), (device_0, device_1)))
    return pd.concat(scored, axis=0)


# Sort by length (descending) so rows batched together have similar lengths
# and little padding is wasted.
data = data.sort_values("length", ascending=False)
swap_data = swap_data.sort_values("length", ascending=False)  # A/B-swapped inputs

result_df = _predict_on_both_gpus(data)
prob = result_df[PROB_COLUMNS].values

if config.tta:
    # Test-time augmentation: score the A/B-swapped inputs as well. In that
    # pass "winner_model_a" refers to the original response B, so the two
    # columns are swapped back. Rows are aligned by index because swap_data
    # was sorted independently of data.
    swap_result_df = _predict_on_both_gpus(swap_data)
    swap_prob = swap_result_df.loc[
        result_df.index, ['winner_model_b', 'winner_model_a', "winner_tie"]
    ].values
    prob = (prob + swap_prob) / 2

print(f"time:{time.time()-st}")
result_df.loc[:,"winner_model_a"] = prob[:,0]
result_df.loc[:,"winner_model_b"] = prob[:,1]
result_df.loc[:,"winner_tie"] = prob[:,2]
submission_df = result_df[["id","winner_model_a","winner_model_b","winner_tie"]]
submission_df.to_csv("gemma2.csv",index=False)


   

KeyboardInterrupt: 