# Imports and configs

In [1]:
pip install transformers peft accelerate bitsandbytes --no-index --find-links=/kaggle/input/wsdm-cup-gemma-2-9b-4-bit-qlora/packages

Looking in links: /kaggle/input/wsdm-cup-gemma-2-9b-4-bit-qlora/packages
Processing /kaggle/input/wsdm-cup-gemma-2-9b-4-bit-qlora/packages/peft-0.14.0-py3-none-any.whl
Processing /kaggle/input/wsdm-cup-gemma-2-9b-4-bit-qlora/packages/bitsandbytes-0.45.0-py3-none-manylinux_2_24_x86_64.whl
Processing /kaggle/input/wsdm-cup-gemma-2-9b-4-bit-qlora/packages/huggingface_hub-0.27.1-py3-none-any.whl (from transformers)
Installing collected packages: huggingface-hub, bitsandbytes, peft
  Attempting uninstall: huggingface-hub
    Found existing installation: huggingface-hub 0.24.7
    Uninstalling huggingface-hub-0.24.7:
      Successfully uninstalled huggingface-hub-0.24.7
Successfully installed bitsandbytes-0.45.0 huggingface-hub-0.27.1 peft-0.14.0
Note: you may need to restart the kernel to use updated packages.


In [2]:
from transformers import Gemma2ForSequenceClassification, GemmaTokenizerFast
from transformers.data.data_collator import pad_without_fast_tokenizer_warning
from concurrent.futures import ThreadPoolExecutor
from peft import PeftModel
import pandas as pd
import numpy as np
import torch

In [3]:
class CFG:
    """Static settings for this notebook: input locations and batching limits."""

    # Parquet file that is loaded as the test set.
    test_path = "/kaggle/input/wsdm-cup-multilingual-chatbot-arena/test.parquet"

    # Directory of the base checkpoint (also used to load the tokenizer) and the
    # LoRA checkpoint directory that is applied on top of it.
    gemma_dir = "/kaggle/input/gemma-2-9b-it-bnb-4bit-unsloth/transformers/default/1"
    lora_dir = (
        "/kaggle/input/wsdm-cup-gemma-2-9b-4-bit-qlora/"
        "fold 0/gemma2-9b-4bit-2048-2-16-fold0/checkpoint-12000"
    )

    # Rows per forward pass, and the token cap applied when tokenizing.
    batch_size = 4
    max_length = 2048

# Loading data

In [4]:
# Read the test set from the location configured in CFG.
test = pd.read_parquet(path=CFG.test_path)

# Tokenizing

In [5]:
def tokenize(tokenizer, prompt, response_a, response_b, max_length=CFG.max_length):
    """Join each (prompt, response_a, response_b) triple into one tagged text and tokenize it.

    Every example becomes
    ``"<prompt>: ...\\n\\n<response_a>: ...\\n\\n<response_b>: ..."``. The texts are
    tokenized in a single call with truncation to ``max_length`` and no padding.

    Args:
        tokenizer: Callable applied to the list of texts; its result must expose
            ``input_ids`` and ``attention_mask`` attributes.
        prompt: Iterable of prompt strings.
        response_a: Iterable of strings placed under the ``<response_a>`` tag.
        response_b: Iterable of strings placed under the ``<response_b>`` tag.
        max_length: Truncation limit forwarded to the tokenizer.

    Returns:
        Tuple ``(input_ids, attention_mask)`` taken from the tokenizer output.
    """
    # Plain `+` concatenation is kept so a non-string entry still raises TypeError.
    texts = [
        "<prompt>: " + p + "\n\n<response_a>: " + a + "\n\n<response_b>: " + b
        for p, a, b in zip(prompt, response_a, response_b)
    ]

    encoded = tokenizer(texts, max_length=max_length, truncation=True, padding=False)

    return encoded.input_ids, encoded.attention_mask

In [6]:
# The tokenizer is loaded from the same directory as the base model.
tokenizer = GemmaTokenizerFast.from_pretrained(CFG.gemma_dir)
# Pad on the right-hand side and enable the tokenizer's add_eos_token option.
tokenizer.padding_side = "right"
tokenizer.add_eos_token = True

In [7]:
def _build_model_inputs(first_response, second_response):
    """Tokenize the test rows with the two response columns in the given order.

    Args:
        first_response: Name of the ``test`` column fed to ``tokenize`` as ``response_a``.
        second_response: Name of the ``test`` column fed to ``tokenize`` as ``response_b``.

    Returns:
        A new DataFrame with columns ``id``, ``input_ids``, ``attention_mask`` and
        ``length`` (the number of entries in each row's ``input_ids``).
    """
    frame = pd.DataFrame()
    frame["id"] = test["id"]
    frame["input_ids"], frame["attention_mask"] = tokenize(
        tokenizer, test["prompt"], test[first_response], test[second_response]
    )
    frame["length"] = frame["input_ids"].apply(len)
    return frame


# Inputs with the responses in their original order.
data = _build_model_inputs("response_a", "response_b")

# Same rows with the two responses swapped.
# NOTE(review): nothing visible in this notebook reads aug_data afterwards; confirm
# whether it is still needed before paying for the second tokenization pass.
aug_data = _build_model_inputs("response_b", "response_a")

# Model

In [8]:
# Load one copy of the base classifier per GPU (cuda:0 and cuda:1) so the two
# halves of the data can be scored in parallel later on.
model_0, model_1 = [
    Gemma2ForSequenceClassification.from_pretrained(
        CFG.gemma_dir,
        device_map=torch.device(f"cuda:{gpu_index}"),
        use_cache=False,
    )
    for gpu_index in range(2)
]

Unused kwargs: ['_load_in_4bit', '_load_in_8bit', 'quant_method']. These kwargs are not used in <class 'transformers.utils.quantization_config.BitsAndBytesConfig'>.
Some weights of Gemma2ForSequenceClassification were not initialized from the model checkpoint at /kaggle/input/gemma-2-9b-it-bnb-4bit-unsloth/transformers/default/1 and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Unused kwargs: ['_load_in_4bit', '_load_in_8bit', 'quant_method']. These kwargs are not used in <class 'transformers.utils.quantization_config.BitsAndBytesConfig'>.
Some weights of Gemma2ForSequenceClassification were not initialized from the model checkpoint at /kaggle/input/gemma-2-9b-it-bnb-4bit-unsloth/transformers/default/1 and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [9]:
# Wrap each per-GPU base model with the adapter stored at CFG.lora_dir,
# rebinding the same two names to the wrapped models.
model_0, model_1 = (
    PeftModel.from_pretrained(base_model, CFG.lora_dir)
    for base_model in (model_0, model_1)
)

# Inference

In [10]:
@torch.no_grad()
@torch.autocast(device_type="cuda")  # replaces the deprecated torch.cuda.amp.autocast()
def inference(df, model, device, batch_size=CFG.batch_size):
    """Score every row of ``df`` with ``model`` and attach the class probabilities.

    Rows are processed in consecutive chunks of ``batch_size``. Each chunk is padded
    to its longest sequence with the module-level ``tokenizer``, moved to ``device``
    and passed through ``model`` with gradients disabled and CUDA autocast enabled.

    Args:
        df: DataFrame with ``input_ids`` and ``attention_mask`` columns holding
            per-row token lists. It is modified in place, so pass a copy if the
            original must stay untouched.
        model: Callable model whose output exposes ``.logits``.
        device: Device the padded batch is moved to before the forward pass.
        batch_size: Number of rows per forward pass.

    Returns:
        The same ``df`` object with two added columns, ``p_a`` and ``p_b``: the
        softmax probabilities at logit indices 0 and 1 respectively.
    """
    p_a, p_b = [], []

    for start_idx in range(0, len(df), batch_size):
        # iloc slicing clips at the end of the frame, so the last chunk may be shorter.
        chunk = df.iloc[start_idx:start_idx + batch_size]

        inputs = pad_without_fast_tokenizer_warning(
            tokenizer,
            {
                "input_ids": chunk["input_ids"].to_list(),
                "attention_mask": chunk["attention_mask"].to_list(),
            },
            padding="longest",
            pad_to_multiple_of=None,
            return_tensors="pt",
        )

        outputs = model(**inputs.to(device))
        # Bring probabilities back to the CPU before converting to Python lists.
        probs = outputs.logits.softmax(-1).cpu()

        p_a.extend(probs[:, 0].tolist())
        p_b.extend(probs[:, 1].tolist())

    df["p_a"], df["p_b"] = p_a, p_b

    return df

  @torch.cuda.amp.autocast()


In [11]:
# Longest sequences first, so consecutive rows in a batch have similar lengths.
data = data.sort_values(by="length", ascending=False)

# Alternate rows between the two halves (even positions / odd positions).
sub_1 = data.iloc[0::2].copy()
sub_2 = data.iloc[1::2].copy()

# One (frame, model, device) job per GPU, each run on its own worker thread.
jobs = [
    (sub_1, model_0, torch.device("cuda:0")),
    (sub_2, model_1, torch.device("cuda:1")),
]

with ThreadPoolExecutor(max_workers=2) as executor:
    results = executor.map(lambda job: inference(*job), jobs)

# Stack the two scored halves back into one frame.
result_df = pd.concat(list(results))
y_pred_probs = result_df[["p_a", "p_b"]].to_numpy()

# Submission

In [12]:
# Keep only the id and label each row "model_a" when p_a exceeds 0.5, else "model_b".
sub = result_df[["id"]].assign(
    winner=np.where(result_df["p_a"] > 0.5, "model_a", "model_b")
)

sub.to_csv("submission.csv", index=False)
sub.head()

Unnamed: 0,id,winner
0,327228,model_b
1,1139415,model_a
2,1235630,model_b
