In [2]:
## exp2 - Gemma with competition dataset 


In [3]:
# Modal
# !pip install bitsandbytes==0.42.0 peft==0.11.0 -qq
# !pip install hf_transfer

In [6]:
jarvis = False
modal = True
vastai = False
hf = False
OUTPUT_DIR = "output_exp2"

DEBUG = False

import os
import copy
from dataclasses import dataclass

import torch
import torch.nn as nn
import torch.nn.functional as F
import pandas as pd
import numpy as np
import joblib
import json 
import neptune

from datasets import Dataset
from scipy.special import softmax
from sklearn.preprocessing import LabelEncoder
from transformers import (
    BitsAndBytesConfig,
    LlamaPreTrainedModel, Gemma2PreTrainedModel,
    LlamaModel,Gemma2Model,
    AutoTokenizer,
    PreTrainedTokenizerBase, 
    EvalPrediction,
    Trainer,
    TrainingArguments,
    DataCollatorForSeq2Seq,
)
from transformers.modeling_outputs import CausalLMOutputWithPast
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training, TaskType
from sklearn.metrics import log_loss, accuracy_score

from huggingface_hub import login


In [7]:
if jarvis:
    NEPTUNE_API_TOKEN = json.load(open("../apikeys.json"))["NEPTUNE_API_TOKEN"]
    TRAIN_CSV = "/home/lmsys/input/train.csv"
    FOLD_PATH = "//home/lmsys/input/fold.pkl"
elif modal:
    NEPTUNE_API_TOKEN = json.load(open("apikeys.json"))["NEPTUNE_API_TOKEN"]
    TRAIN_CSV = "/root/cache/input/train.csv"
    FOLD_PATH = "/root/cache/input/fold.pkl"
    EXTERNAL_CSV = "/root/cache/input/external_datasets/lmsys_ext_df_hs.csv"
elif vastai:
    NEPTUNE_API_TOKEN = json.load(open("../apikeys.json"))["NEPTUNE_API_TOKEN"]
    TRAIN_CSV = "/root/lmsys/input/train.csv"
    FOLD_PATH = "/root/lmsys/input/fold.pkl"
    # https://www.kaggle.com/code/rashmibanthia/lmsys-preprocess/output?scriptVersionId=189364525&select=lmsys_ext_df_hs.csv
    EXTERNAL_CSV = "/root/lmsys/input/external_datasets/lmsys_ext_df_hs.csv"
elif hf:
    NEPTUNE_API_TOKEN = json.load(open("../apikeys.json"))["NEPTUNE_API_TOKEN"]
    TRAIN_CSV = "/data/lmsys/input/train.csv"
    FOLD_PATH = "/data/lmsys/input/fold.pkl"
    login(token=json.load(open("../apikeys.json"))["HF_TOKEN"])
    EXTERNAL_CSV = "/data/lmsys/input/external_datasets/lmsys_ext_df_hs.csv"
else:
    NEPTUNE_API_TOKEN = json.load(open("../../apikeys.json"))["NEPTUNE_API_TOKEN"]
    TRAIN_CSV = "/home/rashmi/Documents/kaggle/lmsys/input/train.csv"
    FOLD_PATH = "/home/rashmi/Documents/kaggle/lmsys/input/fold.pkl"

os.environ["NEPTUNE_API_TOKEN"] = NEPTUNE_API_TOKEN
os.environ["NEPTUNE_PROJECT"] = "lmsys/lmsys"
os.environ["HF_TRANSFER"] = "1"
os.environ["HF_HUB_ENABLE_HF_TRANSFER"] = "1"
os.environ["TOKENIZERS_PARALLELISM"] = "false"

    

In [8]:
# model_path = "unsloth/llama-3-8b-Instruct-bnb-4bit"
# model_path =  "RLHFlow/pair-preference-model-LLaMA3-8B"
# model_path = "google/gemma-2-9b-it"
model_path = "unsloth/gemma-2-9b-it-bnb-4bit" 
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
MAX_LENGTH = 1024
target_columns = ['winner_model_a', 'winner_model_b', 'winner_tie']
columns_to_vectorize = ["prompt", "response_a", "response_b"]

train = pd.read_csv(TRAIN_CSV)

# ## We have folds in External data
# fold1_idx, val_idx = train[train.fold!=0].index , train[train.fold==0].index
# print(len(fold1_idx), len(val_idx))

## Incorrect labels
list1 = train[(train['response_a'] == train['response_b']) & (train.winner_tie!=1) ]['id'].tolist()
idx = train[train.id.isin(list1)].index
train.loc[idx,'winner_model_a'] = 0
train.loc[idx,'winner_model_b'] = 0
train.loc[idx,'winner_tie'] = 1

# train = train.head(100)
train['label'] = train[target_columns].idxmax(axis=1) 
label_encoder = LabelEncoder()
train['label'] = label_encoder.fit_transform(train['label'])
train = train[columns_to_vectorize + ['label']]


In [9]:
print(train.shape)
train.head(1)

(57477, 4)


Unnamed: 0,prompt,response_a,response_b,label
0,"[""Is it morally right to try to have a certain...","[""The question of whether it is morally right ...","[""As an AI, I don't have personal beliefs or o...",0


Tokenizer and prepare dataset, metrics¶

In [10]:
tokenizer = AutoTokenizer.from_pretrained(model_path)
tokenizer.add_eos_token = True
tokenizer.padding_side = 'right'

LABEL_IDS = [tokenizer(i, add_special_tokens=False)["input_ids"][0] for i in ['a', 'b', 'tie']]

def tokenize(example, tokenizer):
    prompt = tokenizer('<prompt>: ' + " ".join(eval(example['prompt'], {"null": ""})), add_special_tokens=False)["input_ids"]
    response_a = tokenizer('\n\n<response_a>: ' + " ".join(eval(example['response_a'], {"null": ""})), add_special_tokens=False)["input_ids"]
    response_b = tokenizer('\n\n<response_b>: ' + " ".join(eval(example['response_b'], {"null": ""})), add_special_tokens=False)["input_ids"]
    if len(prompt+response_a+response_b) > MAX_LENGTH:
        prompt = tokenizer('<prompt>: ' + eval(example['prompt'], {"null": ""})[-1], add_special_tokens=False)["input_ids"][:200]
        response_a = tokenizer('\n\n<response_a>: ' + eval(example['response_a'], {"null": ""})[-1], add_special_tokens=False)["input_ids"][:400]
        response_b = tokenizer('\n\n<response_b>: ' + eval(example['response_b'], {"null": ""})[-1], add_special_tokens=False)["input_ids"][:400]
    extra_prompt = tokenizer('\n\n---------\nWhich is the better response for the prompt ? a or b or tie ?\n\nAnswer: ', add_special_tokens=False)["input_ids"]

    label_token_id = LABEL_IDS[int(example['label'])]
    input_ids = [tokenizer.bos_token_id] + prompt + response_a + response_b + extra_prompt + [label_token_id] + [tokenizer.eos_token_id]
    attention_mask = len(input_ids)*[1]
    labels = [-100]* len([tokenizer.bos_token_id] + prompt + response_a + response_b + extra_prompt) + [label_token_id] + [tokenizer.eos_token_id]
    return {
        "input_ids": input_ids,
        "attention_mask": attention_mask,
        "labels": labels
    }


tokenizer_config.json:   0%|          | 0.00/40.6k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/4.24M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.5M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/636 [00:00<?, ?B/s]

In [11]:
def load_data(df, tokenizer):
    raw_datasets = Dataset.from_pandas(df)
    tokenized_datasets = raw_datasets.map(
        tokenize, 
        remove_columns=raw_datasets.column_names,
        fn_kwargs={'tokenizer': tokenizer}
    )
    return tokenized_datasets

def compute_metrics(pred):
    logits, labels = pred
    preds = logits.argmax(axis=-1)
    label_tokens_ids = np.array(LABEL_IDS)
    index_mapping = {value.item(): idx for idx, value in enumerate(label_tokens_ids)}
    labels = labels[np.isin(labels, label_tokens_ids)]
    labels = np.array([index_mapping[label.item()] for label in labels])
    acc = accuracy_score(labels, preds)
    probs = softmax(logits, axis=-1)

    ## save preds
    np.save(f"{OUTPUT_DIR}/preds_{fold_idx}.npy", probs)

    log_loss_ = log_loss(labels, probs)
    return {'accuracy': acc, 'log_loss': log_loss_}


fold_idx = 0
ds = load_data(train, tokenizer)

## Folds
cv = joblib.load(FOLD_PATH)
val_idx = cv[0][1] # fold 0
fold1_idx = cv[0][0]
print("len(val_idx), len(fold1_idx): ", len(val_idx), len(fold1_idx))
train_idx, eval_idx = fold1_idx, val_idx

if DEBUG:
    train_idx, eval_idx = train_idx[:100], eval_idx[:100]


Map:   0%|          | 0/57477 [00:00<?, ? examples/s]

len(val_idx), len(fold1_idx):  5748 51729


Model

In [12]:
class Llama3ForSFT(Gemma2PreTrainedModel):
    _tied_weights_keys = ["lm_head.weight"]
    def __init__(self, config):
        super().__init__(config)
        self.model = Gemma2Model(config)
        self.vocab_size = config.vocab_size
        self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)
        self.post_init()

    def forward(
        self,
        input_ids= None,
        attention_mask= None,
        position_ids = None,
        past_key_values= None,
        inputs_embeds= None,
        labels= None,
        use_cache= None,
        output_attentions= None,
        output_hidden_states = None,
        return_dict= None,
        cache_position = None,
    ):
        outputs = self.model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            position_ids=position_ids,
            past_key_values=past_key_values,
            inputs_embeds=inputs_embeds,
            use_cache=use_cache,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
            cache_position=cache_position,
        )
        hidden_states = outputs[0]
        # if self.config.pretraining_tp > 1:
        #     lm_head_slices = self.lm_head.weight.split(self.vocab_size // self.config.pretraining_tp, dim=0)
        #     logits = [F.linear(hidden_states, lm_head_slices[i]) for i in range(self.config.pretraining_tp)]
        #     logits = torch.cat(logits, dim=-1)
        # else:
        
        logits = self.lm_head(hidden_states)
        if self.config.final_logit_softcapping is not None:
            logits = logits / self.config.final_logit_softcapping
            logits = torch.tanh(logits)
            logits = logits * self.config.final_logit_softcapping

        logits = logits.float()

        loss = None
        if labels is not None:
            # Shift so that tokens < n predict n
            shift_logits = logits[..., :-1, :].contiguous()
            shift_labels = labels[..., 1:].contiguous()
            # Flatten the tokens
            loss_fct = nn.CrossEntropyLoss()
            shift_logits = shift_logits.view(-1, self.config.vocab_size)
            shift_labels = shift_labels.view(-1)
            # Enable model parallelism
            shift_labels = shift_labels.to(shift_logits.device)

            label_tokens_ids = torch.tensor(LABEL_IDS,device=shift_labels.device)
            index_mapping = {value.item(): idx for idx, value in enumerate(label_tokens_ids)}
            true_labels = shift_labels[torch.isin(shift_labels, label_tokens_ids)]
            true_labels = torch.tensor([index_mapping[label.item()] for label in true_labels], device=true_labels.device)
            true_logits = shift_logits[torch.isin(shift_labels, label_tokens_ids)][:,label_tokens_ids]
            loss = loss_fct(true_logits, true_labels)

        return CausalLMOutputWithPast(
            loss=loss,
            logits=true_logits,
        )

In [13]:
peft_config = LoraConfig(
    r=16,
    lora_alpha=32,
    lora_dropout=0.05,
    bias='none',
    inference_mode=False,
    task_type=TaskType.CAUSAL_LM,
    target_modules=['q_proj', 'k_proj', 'v_proj',], 
)


quantization_config = BitsAndBytesConfig(
                    load_in_4bit=True,
                    bnb_4bit_compute_dtype=torch.float16,
                    bnb_4bit_quant_type="nf4",
                )

model = Llama3ForSFT.from_pretrained(
    model_path, 
    torch_dtype=torch.float16, 
    # quantization_config=quantization_config, 
    attn_implementation='eager'
)
model.config.use_cache = False
model = prepare_model_for_kbit_training(model)
model = get_peft_model(model, peft_config)
# model.enable_input_require_grads()
print(model)
model.print_trainable_parameters()

config.json:   0%|          | 0.00/1.38k [00:00<?, ?B/s]

Unused kwargs: ['_load_in_4bit', '_load_in_8bit', 'quant_method']. These kwargs are not used in <class 'transformers.utils.quantization_config.BitsAndBytesConfig'>.
`low_cpu_mem_usage` was None, now set to True since model is quantized.


model.safetensors:   0%|          | 0.00/6.13G [00:00<?, ?B/s]

Some weights of Llama3ForSFT were not initialized from the model checkpoint at unsloth/gemma-2-9b-it-bnb-4bit and are newly initialized: ['lm_head.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


PeftModelForCausalLM(
  (base_model): LoraModel(
    (model): Llama3ForSFT(
      (model): Gemma2Model(
        (embed_tokens): Embedding(256000, 3584, padding_idx=0)
        (layers): ModuleList(
          (0-41): 42 x Gemma2DecoderLayer(
            (self_attn): Gemma2Attention(
              (q_proj): lora.Linear4bit(
                (base_layer): Linear4bit(in_features=3584, out_features=4096, bias=False)
                (lora_dropout): ModuleDict(
                  (default): Dropout(p=0.05, inplace=False)
                )
                (lora_A): ModuleDict(
                  (default): Linear(in_features=3584, out_features=16, bias=False)
                )
                (lora_B): ModuleDict(
                  (default): Linear(in_features=16, out_features=4096, bias=False)
                )
                (lora_embedding_A): ParameterDict()
                (lora_embedding_B): ParameterDict()
                (lora_magnitude_vector): ModuleDict()
              )
             

In [14]:
args = TrainingArguments(
    output_dir=OUTPUT_DIR,
    overwrite_output_dir = True,
    eval_strategy = "epoch", #steps
    save_strategy = "steps",
    save_steps=2000,
    # eval_steps=2000,
    save_total_limit=1,
    logging_strategy="steps",
    logging_steps=10,
    warmup_steps=20,
    optim="adamw_8bit", #"paged_adamw_32bit", #
    learning_rate=2e-4,
    per_device_train_batch_size=4, 
    per_device_eval_batch_size=4,
    gradient_accumulation_steps=4,
    num_train_epochs=1,
    fp16=True,
    # bf16=True,
    metric_for_best_model="log_loss",
    greater_is_better = False,
    report_to=None if DEBUG  else "neptune",
)

In [15]:
eval_dataset = ds.select(eval_idx)

# Define a function to get the length of input_ids
def get_input_ids_length(example):
    return {"input_length": len(example["input_ids"])}

# Add a new column with the length of input_ids
eval_dataset = eval_dataset.map(get_input_ids_length)

# Sort the dataset by the length of input_ids in descending order
eval_dataset = eval_dataset.sort("input_length", reverse=True)

# Remove the temporary input_length column
eval_dataset = eval_dataset.remove_columns(["input_length"])


Map:   0%|          | 0/5748 [00:00<?, ? examples/s]

In [16]:
trainer = Trainer(
    args=args,
    model=model,
    train_dataset=ds.select(train_idx),
    eval_dataset=eval_dataset, #.select(eval_idx),
    data_collator=DataCollatorForSeq2Seq(tokenizer=tokenizer),
    compute_metrics=compute_metrics,
)
trainer.train()

Detected kernel version 4.4.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


[neptune] [info   ] Neptune initialized. Open in the app: https://app.neptune.ai/lmsys/lmsys/e/LMSYS-180




Epoch,Training Loss,Validation Loss,Accuracy,Log Loss,Runtime,Samples Per Second,Steps Per Second
0,0.9236,0.902794,0.57707,0.902794,557.7507,10.306,2.576




[neptune] [info   ] Shutting down background jobs, please wait a moment...
[neptune] [info   ] Done!
[neptune] [info   ] Waiting for the remaining 13 operations to synchronize with Neptune. Do not kill this process.
[neptune] [info   ] All 13 operations synced, thanks for waiting!
[neptune] [info   ] Explore the metadata in the Neptune app: https://app.neptune.ai/lmsys/lmsys/e/LMSYS-180/metadata


TrainOutput(global_step=3233, training_loss=0.9701052817478292, metrics={'train_runtime': 22402.5965, 'train_samples_per_second': 2.309, 'train_steps_per_second': 0.144, 'total_flos': 2.399592130181923e+18, 'train_loss': 0.9701052817478292, 'epoch': 0.9999226784195469})

In [17]:
# create directory for tokenizer
os.makedirs(f'{OUTPUT_DIR}/tokenizer', exist_ok=True)

tokenizer.save_pretrained(f'{OUTPUT_DIR}/tokenizer/')

('output_exp2/tokenizer/tokenizer_config.json',
 'output_exp2/tokenizer/special_tokens_map.json',
 'output_exp2/tokenizer/tokenizer.model',
 'output_exp2/tokenizer/added_tokens.json',
 'output_exp2/tokenizer/tokenizer.json')

In [18]:
arr = np.load(f"{OUTPUT_DIR}/preds_0.npy")
print(arr.shape)
print(log_loss(train.loc[eval_idx, 'label'].values, arr) )

(5748, 3)
1.372473546096855


In [19]:
torch.save(model.lm_head.state_dict(), f'{OUTPUT_DIR}/lm_head.pth')

In [25]:
# # torch.save(
# #     {"model": model.state_dict()},
# #     f"output/save_model/model_fold0_best.pth",
# # )
model.model.save_pretrained(f'{OUTPUT_DIR}/')

In [21]:
ds.select(eval_idx)

Dataset({
    features: ['input_ids', 'attention_mask', 'labels'],
    num_rows: 5748
})

In [22]:
trainer_e = Trainer(
    args=args,
    model=model,
    eval_dataset=ds.select(eval_idx),
    data_collator=DataCollatorForSeq2Seq(tokenizer=tokenizer),
    compute_metrics=compute_metrics,
)

# Evaluate the model
evaluation_results = trainer_e.evaluate()

Detected kernel version 4.4.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


[neptune] [info   ] Neptune initialized. Open in the app: https://app.neptune.ai/lmsys/lmsys/e/LMSYS-184


In [23]:
evaluation_results

{'eval_loss': 0.9028096199035645,
 'eval_accuracy': 0.5765483646485734,
 'eval_log_loss': 0.902809530950303,
 'eval_runtime': 769.2143,
 'eval_samples_per_second': 7.473,
 'eval_steps_per_second': 1.868}

In [24]:
arr = np.load(f"{OUTPUT_DIR}/preds_0.npy")
print(arr.shape)
print(log_loss(train.loc[eval_idx, 'label'].values, arr) )

(5748, 3)
0.902809530950303
