# Packages

In [1]:
%pip install pip==23.3.2

Note: you may need to restart the kernel to use updated packages.


In [2]:
%pip install /kaggle/input/lmsys-packages/triton-2.2.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl

Processing /kaggle/input/lmsys-packages/triton-2.2.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl
Installing collected packages: triton
Successfully installed triton-2.2.0
Note: you may need to restart the kernel to use updated packages.


In [3]:
%pip install /kaggle/input/lmsys-packages/xformers-0.0.24042abc8.d20240802-cp310-cp310-linux_x86_64.whl

Processing /kaggle/input/lmsys-packages/xformers-0.0.24042abc8.d20240802-cp310-cp310-linux_x86_64.whl
Installing collected packages: xformers
Successfully installed xformers-0.0.24+042abc8.d20240802
Note: you may need to restart the kernel to use updated packages.


In [4]:
!cp -r /kaggle/input/lmsys-modules-0805 human_pref

In [5]:
%pip install peft==0.11.1

Collecting peft==0.11.1
  Downloading peft-0.11.1-py3-none-any.whl.metadata (13 kB)
Downloading peft-0.11.1-py3-none-any.whl (251 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m251.6/251.6 kB[0m [31m14.3 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: peft
Successfully installed peft-0.11.1


In [6]:
%%writefile human_pref/data/processors.py
import json

import torch


class ProcessorPAB:
    """Turn one (prompt, response A, response B) row into a judged-comparison model input.

    The processor renders the conversation between fixed head/tail markers,
    truncates the rendered conversation to ``max_length`` tokens, wraps it with
    the judge instructions through the tokenizer's chat template, appends
    ``PROMPT_SUFFIX`` and tokenizes the final text.

    Args:
        tokenizer: Callable tokenizer that also provides ``decode`` and
            ``apply_chat_template`` (all three are used below).
        max_length: Token budget applied to the conversation body only; the
            instructions, markers and suffix are added after truncation.
        support_system_role: When true, the instructions are sent as a separate
            ``system`` message; otherwise they are prepended to the ``user`` message.
    """

    PROMPT_PREFIX = """Please act as an impartial judge and evaluate the quality of the responses provided by two
AI assistants to the user question displayed below. You should choose the assistant that
follows the user’s instructions and answers the user’s question better. Your evaluation
should consider factors such as the helpfulness, relevance, accuracy, depth, creativity,
and level of detail of their responses. Begin your evaluation by comparing the two
responses and provide a short explanation. Avoid any position biases and ensure that the
order in which the responses were presented does not influence your decision. Do not allow
the length of the responses to influence your evaluation. Do not favor certain names of
the assistants. Be as objective as possible. After providing your explanation, output your
final verdict by strictly following this format: "[[A]]" if assistant A is better, "[[B]]"
if assistant B is better, and "[[C]]" for a tie."""

    PROMPT_SUFFIX = "verdict is: [["

    LABEL_COLS = ["winner_model_a", "winner_model_b", "winner_tie"]

    def __init__(self, tokenizer, max_length, support_system_role):
        self.tokenizer = tokenizer
        self.max_length = max_length
        self.support_system_role = support_system_role

    def build_conversation(self, prompts, responses_a, responses_b):
        """Render the turns as text, truncate to ``max_length`` tokens, add head/tail markers.

        ``None`` entries are rendered as the literal string ``"null"``.
        Truncation is applied by tokenizing the joined turns and decoding the
        kept ids, so only the end of the conversation body can be cut; the
        head and tail markers are always present in the returned string.
        """

        def _or_null(value):
            return "null" if value is None else value

        turns = []
        for turn in zip(prompts, responses_a, responses_b):
            prompt, response_a, response_b = map(_or_null, turn)
            turns.append(
                f"\n### User:\n{prompt}\n\n### Assistant A:\n{response_a}\n\n### Assistant B:\n{response_b}\n"
            )

        encoded = self.tokenizer(
            "".join(turns),
            add_special_tokens=False,
            max_length=self.max_length,
            truncation=True,
        )
        body = self.tokenizer.decode(encoded.input_ids)
        return (
            "<|The Start of Conversation between a User and two Assistants|>"
            + body
            + "<|The End of Conversation between a User and two Assistants|>\n"
        )

    def build_input(self, data):
        """Build the tokenized model input and label tensor for one row.

        Args:
            data: Mapping with ``prompt``, ``response_a``, ``response_b`` and
                every column named in ``LABEL_COLS``.

        Returns:
            dict with ``input_ids`` (first row of the ``"pt"`` tokenizer output),
            ``input_text`` (the exact string that was tokenized) and ``label``
            (float tensor of the ``LABEL_COLS`` values, in that order).
        """
        conversation = self.build_conversation(
            [data["prompt"]], [data["response_a"]], [data["response_b"]]
        )

        if not self.support_system_role:
            # No system role available: fold the instructions into the user turn.
            messages = [
                {"role": "user", "content": f"{self.PROMPT_PREFIX}\n{conversation}"},
            ]
        else:
            messages = [
                {"role": "system", "content": self.PROMPT_PREFIX},
                {"role": "user", "content": conversation},
            ]

        chat_text = self.tokenizer.apply_chat_template(
            messages,
            tokenize=False,
            add_generation_prompt=True,
        )
        # The suffix is appended after the generation prompt, i.e. it starts the
        # assistant's answer right before the verdict letter.
        input_text = chat_text + self.PROMPT_SUFFIX

        encoded = self.tokenizer(
            input_text,
            add_special_tokens=False,
            return_tensors="pt",
        )
        label_values = [data[col] for col in self.LABEL_COLS]
        return dict(
            input_ids=encoded.input_ids[0],
            input_text=input_text,
            label=torch.tensor(label_values).float(),
        )

Overwriting human_pref/data/processors.py


# Prepare test file

In [7]:
%%writefile prepare_test_file.py
import pandas as pd
import os
# Kaggle sets KAGGLE_IS_COMPETITION_RERUN only during the hidden-test rerun;
# outside of it we validate on a fixed 100-row sample of the training data.
IS_SUBMISSION = bool(os.getenv("KAGGLE_IS_COMPETITION_RERUN"))
print("IS_SUBMISSION:", IS_SUBMISSION)

DATA_DIR = "/kaggle/input/wsdm-cup-multilingual-chatbot-arena"
if IS_SUBMISSION:
    df = pd.read_parquet(f"{DATA_DIR}/test.parquet")
else:
    df = pd.read_parquet(f"{DATA_DIR}/train.parquet")
    df = df.sample(n=100, random_state=42)

# The processor reads these three label columns for every row.
# When the ground-truth `winner` column is present (local run) derive real
# one-hot labels from it so that downstream evaluation is scored against the
# truth; otherwise (hidden test set) fall back to placeholder labels.
has_labels = "winner" in df.columns
if has_labels:
    df["winner_model_a"] = (df["winner"] == "model_a").astype(int)
    df["winner_model_b"] = (df["winner"] == "model_b").astype(int)
    # Anything that is neither model_a nor model_b is treated as a tie.
    df["winner_tie"] = 1 - df["winner_model_a"] - df["winner_model_b"]
else:
    df["winner_model_a"] = 1
    df["winner_model_b"] = 0
    df["winner_tie"] = 0
df.to_parquet("test.parquet", index=False)

# Second copy with the two responses exchanged. `assign` builds a new frame
# from the untouched original columns, so the swap does not depend on whether
# column assignment writes in place. Column order is preserved.
swapped = df.assign(response_a=df["response_b"], response_b=df["response_a"])
if has_labels:
    # Keep the labels consistent with the exchanged responses.
    swapped = swapped.assign(
        winner_model_a=df["winner_model_b"],
        winner_model_b=df["winner_model_a"],
    )
swapped.to_parquet("test_swap.parquet", index=False)

Writing prepare_test_file.py


In [8]:
!python prepare_test_file.py

IS_SUBMISSION: False


# Inference: gemma2-9b

In [9]:
%%writefile predict_m0.py
import time
import torch
import numpy as np
from torch.utils.data import DataLoader
from tqdm import tqdm
from transformers import AutoTokenizer
from peft import PeftModel

from human_pref.models.modeling_gemma2 import Gemma2ForSequenceClassification
from human_pref.data.processors import ProcessorPAB
from human_pref.data.dataset import LMSYSDataset
from human_pref.data.collators import VarlenCollator, ShardedMaxTokensCollator
from human_pref.utils import to_device

# Configuration
base_model_path = "/kaggle/input/gemma-2/transformers/gemma-2-9b-it/2"  # Base model path
lora_path = "/kaggle/input/o_gemma-2-9b-it-4bit_trained_with_bigdata_v2/transformers/default/4"  # LoRA adapter path
csv_path = "test.parquet"

# Initialize tokenizer and dataset
tokenizer = AutoTokenizer.from_pretrained(base_model_path)
processor = ProcessorPAB(
    tokenizer=tokenizer,
    max_length=4096,
    support_system_role=False,
)
dataset = LMSYSDataset(
    csv_file=csv_path,
    query=None,
    processor=processor,
    include_swap=False,
    is_parquet=True,
)
# Each DataLoader item is iterated below as a sequence of micro-batches.
dataloader = DataLoader(
    dataset,
    batch_size=80,
    num_workers=4,
    collate_fn=ShardedMaxTokensCollator(
        max_tokens=8192, base_collator=VarlenCollator()
    ),
)

# Device mapping configuration: embeddings + first half of the layers on
# cuda:0, second half + final norm + classification head on cuda:1.
num_hidden_layers = 42
device_map = {
    "model.embed_tokens": "cuda:0",
    "model.norm": "cuda:1",
    "score": "cuda:1",
}
for i in range(num_hidden_layers // 2):
    device_map[f"model.layers.{i}"] = "cuda:0"
for i in range(num_hidden_layers // 2, num_hidden_layers):
    device_map[f"model.layers.{i}"] = "cuda:1"

# Load base model with device placement
# NOTE(review): the captured output of this script reports the
# `model.layers.*.mlp.gate_up_proj.weight` tensors as "newly initialized" when
# loading from the stock checkpoint. Confirm that the custom model class and
# the adapter together restore those weights; otherwise the MLPs run with
# random parameters.
base_model = Gemma2ForSequenceClassification.from_pretrained(
    base_model_path,
    torch_dtype=torch.float16,
    device_map=device_map,
)

# Load LoRA adapters
model = PeftModel.from_pretrained(base_model, lora_path)

# # Verify device placement
# for name, param in model.named_parameters():
#     if "lora" in name.lower():
#         layer_num = int(name.split(".")[3])  # Extract layer number from param name
#         device = "cuda:0" if layer_num < num_hidden_layers//2 else "cuda:1"
#         assert param.device == torch.device(device), f"Parameter {name} on wrong device"

# Prepare frequency buffers (one copy of the rotary inverse frequencies per GPU)
config = model.config
dim = config.head_dim
inv_freq = 1.0 / (
    config.rope_theta ** (torch.arange(0, dim, 2, dtype=torch.float32) / dim)
)
inv_freq0 = inv_freq.to("cuda:0")
inv_freq1 = inv_freq.to("cuda:1")

# Pipeline inference: while cuda:1 finishes micro-batch k-1 (forward_part2),
# cuda:0 already starts micro-batch k (forward_part1).
t1 = time.time()
prev_hidden_states = None  # output of forward_part1 waiting for forward_part2
prev_seq_info = None
outs = []
for batch in tqdm(dataloader):
    for micro_batch in batch:
        input_ids = to_device(micro_batch["input_ids"], "cuda:0")
        seq_info = dict(
            cu_seqlens=micro_batch["cu_seqlens"],
            position_ids=micro_batch["position_ids"],
            max_seq_len=micro_batch["max_seq_len"],
        )
        seq_info = to_device(seq_info, "cuda:0")

        if prev_hidden_states is None:
            # Very first micro-batch: nothing is pending for the second stage yet.
            with torch.no_grad(), torch.cuda.amp.autocast():
                hidden_states = model.forward_part1(input_ids, seq_info, inv_freq0)
            prev_seq_info, prev_hidden_states = to_device(
                [seq_info, hidden_states], "cuda:1"
            )
            continue

        with torch.no_grad(), torch.cuda.amp.autocast():
            logits = model.forward_part2(prev_hidden_states, prev_seq_info, inv_freq1)
            hidden_states = model.forward_part1(input_ids, seq_info, inv_freq0)
            prev_seq_info, prev_hidden_states = to_device(
                [seq_info, hidden_states], "cuda:1"
            )
            outs.append(logits.cpu())

# Fail with a clear message instead of an opaque error when nothing was loaded.
if prev_hidden_states is None:
    raise RuntimeError(f"no micro-batches were produced from {csv_path!r}; is the input empty?")

# Process final batch (the last micro-batch still has to pass the second stage)
with torch.no_grad(), torch.cuda.amp.autocast():
    logits = model.forward_part2(prev_hidden_states, prev_seq_info, inv_freq1)
    outs.append(logits.cpu())

# Post-process results
pred = torch.cat(outs, dim=0)
prob = pred.softmax(-1)
prob_np = prob.numpy()
print(dataset.evaluate(prob_np))
np.save('prob_m0.npy', prob_np)

# Timing results: extrapolate from the number of rows actually scored rather
# than assuming the 100-row local sample.
t2 = time.time()
elapsed = t2 - t1
seconds_per_sample = elapsed / max(len(prob_np), 1)
print(f"elapsed time: {elapsed}s")
print(f"elapsed time 10k: {seconds_per_sample * 10_000 / 3600}h")
print(f"elapsed time 25k: {seconds_per_sample * 25_000 / 3600}h")

Writing predict_m0.py


In [10]:
!python predict_m0.py

NVIDIA L4 with CUDA capability sm_89 is not compatible with the current PyTorch installation.
The current PyTorch install supports CUDA capabilities sm_60 sm_70 sm_75 compute_70 compute_75.
If you want to use the NVIDIA L4 GPU with PyTorch, please check the instructions at https://pytorch.org/get-started/locally/

Loading checkpoint shards: 100%|██████████████████| 4/4 [00:18<00:00,  4.56s/it]
Some weights of Gemma2ForSequenceClassification were not initialized from the model checkpoint at /kaggle/input/gemma-2/transformers/gemma-2-9b-it/2 and are newly initialized: ['model.layers.0.mlp.gate_up_proj.weight', 'model.layers.1.mlp.gate_up_proj.weight', 'model.layers.10.mlp.gate_up_proj.weight', 'model.layers.11.mlp.gate_up_proj.weight', 'model.layers.12.mlp.gate_up_proj.weight', 'model.layers.13.mlp.gate_up_proj.weight', 'model.layers.14.mlp.gate_up_proj.weight', 'model.layers.15.mlp.gate_up_proj.weight', 'model.layers.16.mlp.gate_up_proj.weight', 'model.layers.17.mlp.gate_up_proj.we

In [11]:
# %%writefile predict_m0.py
# import time
# import torch
# import numpy as np
# from torch.utils.data import DataLoader
# from tqdm import tqdm
# from transformers import AutoTokenizer

# # from xformers.ops.fmha.attn_bias import BlockDiagonalCausalMask
# from human_pref.models.modeling_gemma2 import Gemma2ForSequenceClassification
# from human_pref.data.processors import ProcessorPAB
# from human_pref.data.dataset import LMSYSDataset
# from human_pref.data.collators import VarlenCollator, ShardedMaxTokensCollator
# from human_pref.utils import to_device


# model_name_or_path = "/kaggle/input/o_gemma-2-9b-it-4bit_trained_with_bigdata_v2/transformers/default/2"
# csv_path = "test.parquet"

# tokenizer = AutoTokenizer.from_pretrained(model_name_or_path)
# processor = ProcessorPAB(
#     tokenizer=tokenizer,
#     max_length=4096,
#     support_system_role=False,
# )
# dataset = LMSYSDataset(
#     csv_file=csv_path,
#     query=None,
#     processor=processor,
#     include_swap=False,
#     is_parquet=True,
# )
# dataloader = DataLoader(
#     dataset,
#     batch_size=80,
#     num_workers=4,
#     collate_fn=ShardedMaxTokensCollator(
#         max_tokens=8192, base_collator=VarlenCollator()
#     ),
# )

# # model for pipelined inference
# num_hidden_layers = 42
# device_map = {
#     "model.embed_tokens": "cuda:0",
#     "model.norm": "cuda:1",
#     "score": "cuda:1",
# }
# for i in range(num_hidden_layers // 2):
#     device_map[f"model.layers.{i}"] = "cuda:0"
# for i in range(num_hidden_layers // 2, num_hidden_layers):
#     device_map[f"model.layers.{i}"] = "cuda:1"

# model = Gemma2ForSequenceClassification.from_pretrained(
#     model_name_or_path,
#     torch_dtype=torch.float16,
#     device_map=device_map,
# )

# # inv_freq clones for each device
# config = model.config
# dim = config.head_dim
# inv_freq = 1.0 / (
#     config.rope_theta ** (torch.arange(0, dim, 2, dtype=torch.float32) / dim)
# )
# inv_freq0 = inv_freq.to("cuda:0")
# inv_freq1 = inv_freq.to("cuda:1")


# # for name, p in model.named_parameters():
# #     print(name, p.device)
# # for name, b in model.model.named_buffers():
# #     print(name, b.device)

# # pipeline parallelism with two GPUs
# t1 = time.time()
# is_first = True
# hidden_states = None
# outs = []
# for batch in tqdm(dataloader):
#     for micro_batch in batch:
#         input_ids = to_device(micro_batch["input_ids"], "cuda:0")
#         seq_info = dict(
#             cu_seqlens=micro_batch["cu_seqlens"],
#             position_ids=micro_batch["position_ids"],
#             max_seq_len=micro_batch["max_seq_len"],
#             # attn_bias=BlockDiagonalCausalMask.from_seqlens(micro_batch["seq_lens"]),
#         )
#         seq_info = to_device(seq_info, "cuda:0")
#         if is_first:
#             with torch.no_grad(), torch.cuda.amp.autocast():
#                 prev_hidden_states = model.forward_part1(input_ids, seq_info, inv_freq0)
#             is_first = False
#             prev_seq_info, prev_hidden_states = to_device(
#                 [seq_info, prev_hidden_states], "cuda:1"
#             )
#             continue
#         with torch.no_grad(), torch.cuda.amp.autocast():
#             logits = model.forward_part2(prev_hidden_states, prev_seq_info, inv_freq1)
#             hidden_states = model.forward_part1(input_ids, seq_info, inv_freq0)

#             prev_seq_info, prev_hidden_states = to_device(
#                 [seq_info, hidden_states], "cuda:1"
#             )
#             outs.append(logits.cpu())

# # last micro-batch
# with torch.no_grad(), torch.cuda.amp.autocast():
#     logits = model.forward_part2(prev_hidden_states, prev_seq_info, inv_freq1)
#     outs.append(logits.cpu())

# pred = torch.cat(outs, dim=0)
# prob = pred.softmax(-1)
# print(dataset.evaluate(prob.numpy()))

# np.save('prob_m0.npy', prob)

# t2 = time.time()
# print(f"elapsed time: {t2 - t1}s")
# print(f"elapsed time 10k: {(t2 - t1) * 100 / 3600}h")
# print(f"elapsed time 25k: {(t2 - t1) * 250 / 3600}h")

In [12]:
# !python predict_m0.py

# Inference: llama3-8b

In [13]:
%%writefile predict_m3.py
import torch
import numpy as np
from torch.utils.data import DataLoader
from tqdm import tqdm
from transformers import AutoTokenizer

from xformers.ops.fmha.attn_bias import BlockDiagonalCausalMask
from human_pref.models.modeling_llama import LlamaForSequenceClassification
from human_pref.data.processors import ProcessorPAB
from human_pref.data.dataset import LMSYSDataset
from human_pref.data.collators import VarlenCollator, ShardedMaxTokensCollator
from human_pref.utils import to_device


# Checkpoint directory (also used for the tokenizer) and the input file with
# the two responses exchanged.
model_name_or_path = "/kaggle/input/lmsys-checkpoints-3-0805"
csv_path = "test_swap.parquet"

tokenizer = AutoTokenizer.from_pretrained(model_name_or_path)
# Flag the "sequence too long" warning as handled up front
# (presumably this keeps the tokenizer from emitting it — verify against transformers).
tokenizer.deprecation_warnings[
    "sequence-length-is-longer-than-the-specified-maximum"
] = True

processor = ProcessorPAB(
    tokenizer=tokenizer,
    max_length=4096,
    support_system_role=True,
)
dataset = LMSYSDataset(
    csv_file=csv_path,
    query=None,
    processor=processor,
    include_swap=False,
    is_parquet=True,
)
collator = ShardedMaxTokensCollator(
    max_tokens=8192, base_collator=VarlenCollator()
)
dataloader = DataLoader(
    dataset,
    batch_size=80,
    num_workers=4,
    collate_fn=collator,
)

# Two-stage placement: embeddings and the first half of the decoder layers on
# cuda:0; the second half, the final norm and the score head on cuda:1.
num_hidden_layers = 32
layers_on_first_gpu = num_hidden_layers // 2
device_map = {
    "model.embed_tokens": "cuda:0",
    "model.norm": "cuda:1",
    "score": "cuda:1",
}
device_map.update(
    (f"model.layers.{i}", "cuda:0" if i < layers_on_first_gpu else "cuda:1")
    for i in range(num_hidden_layers)
)

model = LlamaForSequenceClassification.from_pretrained(
    model_name_or_path,
    torch_dtype=torch.float16,
    device_map=device_map,
)

# Rotary inverse frequencies, computed once and copied to each GPU.
config = model.config
dim = config.hidden_size // config.num_attention_heads
exponents = torch.arange(0, dim, 2, dtype=torch.float32) / dim
inv_freq = 1.0 / (config.rope_theta ** exponents)
inv_freq0 = inv_freq.to("cuda:0")
inv_freq1 = inv_freq.to("cuda:1")


def run_stage1(ids, info):
    """First pipeline stage (the part of the model placed on cuda:0)."""
    with torch.no_grad(), torch.cuda.amp.autocast():
        return model.forward_part1(ids, info, inv_freq0)


def run_stage2(hidden, info):
    """Second pipeline stage (the part of the model placed on cuda:1)."""
    with torch.no_grad(), torch.cuda.amp.autocast():
        return model.forward_part2(hidden, info, inv_freq1)


# Two-GPU pipeline: stage 2 of micro-batch k-1 is issued right before stage 1
# of micro-batch k, so the logits always lag one micro-batch behind.
awaiting_first = True
outs = []
for batch in tqdm(dataloader):
    for micro_batch in batch:
        input_ids = to_device(micro_batch["input_ids"], "cuda:0")
        seq_info = to_device(
            dict(
                cu_seqlens=micro_batch["cu_seqlens"],
                position_ids=micro_batch["position_ids"],
                max_seq_len=micro_batch["max_seq_len"],
                attn_bias=BlockDiagonalCausalMask.from_seqlens(micro_batch["seq_lens"]),
            ),
            "cuda:0",
        )

        if awaiting_first:
            # Nothing is pending for stage 2 yet; only fill the pipeline.
            hidden_states = run_stage1(input_ids, seq_info)
            awaiting_first = False
        else:
            logits = run_stage2(prev_hidden_states, prev_seq_info)
            hidden_states = run_stage1(input_ids, seq_info)

        # Hand this micro-batch over to the second GPU for the next iteration.
        prev_seq_info, prev_hidden_states = to_device(
            [seq_info, hidden_states], "cuda:1"
        )
        if "logits" in dir():
            pass
        if hidden_states is not None and len(outs) >= 0 and not awaiting_first:
            # Logits exist only from the second micro-batch onwards.
            if "logits" in globals():
                outs.append(logits.cpu())
                del logits

# Drain the pipeline: the last micro-batch still needs its second stage.
logits = run_stage2(prev_hidden_states, prev_seq_info)
outs.append(logits.cpu())


pred = torch.cat(outs, dim=0)
prob = pred.softmax(-1)
print(dataset.evaluate(prob.numpy()))

np.save('prob_m3.npy', prob)

Writing predict_m3.py


In [14]:
# !python predict_m3.py

# Make submission

In [15]:
%%writefile make_submission.py
import numpy as np
import pandas as pd
import os
# True only inside Kaggle's hidden-test rerun, where no ground truth exists.
IS_SUBMISSION = bool(os.getenv("KAGGLE_IS_COMPETITION_RERUN"))

df = pd.read_parquet("test.parquet")

# Disabled two-model ensemble, kept for reference:
# preds = np.average(
#     [
#         np.load("prob_m0.npy"),
#         np.load("prob_m3.npy")[:, [1, 0, 2]],
#     ],
#     axis=0,
#     weights=[2, 1],
# )

# Column 0 is compared against column 1; anything that is not strictly in
# favour of column 0 (including an exact draw) is reported as "model_b".
preds = np.load("prob_m0.npy")
winners = np.where(preds[:, 0] > preds[:, 1], "model_a", "model_b")

sub = pd.DataFrame({"id": df["id"], "winner": winners})
sub.to_csv("submission.csv", index=False)

if not IS_SUBMISSION:
    # Local run: the input still carries the true winner, so report accuracy.
    matches = df['winner'] == sub['winner']
    acc = matches.sum() / len(df)
    print(f"Accuracy: {acc}")

print(sub.head())

Writing make_submission.py


In [16]:
!python make_submission.py

Accuracy: 0.62
                                                  id   winner
0  98aa6a2c252836e0f5fb4412f56e711b9a4d77df15be08...  model_b
1  c2ba18250bee661dfbe70ab08f6160c79d9e2ac2448a16...  model_b
2  e8d9b9e71fe263f4b3d6639faf701c259061366033955f...  model_b
3  06ba8de0f99115628320a0d57ab954ffb3d3af2d010f09...  model_b
4  9dc583fcf54d9365c05b6f7e59710dfee973b01a283f3a...  model_b
