In [1]:
# %pip install -q sentencepiece
# %pip install -q protobuf

In [2]:
# %pip install -q -U bitsandbytes
# %pip install -q -U git+https://github.com/huggingface/transformers.git
# %pip install -q -U git+https://github.com/huggingface/peft.git
# %pip install -q -U git+https://github.com/huggingface/accelerate.git
# %pip install -q scipy
# %pip install -q -U trl

In [3]:
import torch
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    TrainingArguments,
)
from peft import LoraConfig, PeftModel, PeftConfig, prepare_model_for_kbit_training
from trl import SFTTrainer
from datasets import Dataset, concatenate_datasets
import pandas as pd
import math



In [4]:
from torch import nn

class CustomSFTTrainer(SFTTrainer):
    def __init__(self, *args, **kwargs):
        super(CustomSFTTrainer, self).__init__(*args, **kwargs)

    def compute_loss(self, model, inputs, return_outputs=False):
        inputs_copy = inputs.copy()
        labels = inputs_copy.pop("labels")
        outputs = model(**inputs_copy)
        logits = outputs.logits

        # Apply softmax to logits to get probabilities
        probs = torch.softmax(logits, dim=-1)
        # Get the predicted tokens (highest probability)
        _, predictions = torch.max(probs, dim=-1)
        # Calculate the number of wrong predictions
        wrong = (predictions != labels).sum().float()
        # Calculate the proportion of wrong predictions
        wrong_proportion = wrong / labels.numel()

        # Compute the loss using the superclass's compute_loss method
        super_loss, _ = super().compute_loss(model, inputs, return_outputs=True)

        # Multiply the superclass's loss by wrong_proportion
        loss = super_loss * wrong_proportion
        
        print("super_loss:", super_loss, "   wrong_proportion:", wrong_proportion, "   loss:", loss)

        return (loss, outputs) if return_outputs else loss

In [5]:
def generate_prompt(data_point):
    user_message = data_point["instruction"]
    assistant_message = data_point["output"]
    text = f"<|im_start|>user\n{user_message}<|im_end|>\n<|im_start|>assistant\n{assistant_message}<|im_end|>"
    return text


def main():
    base_model_name = "microsoft/Orca-2-13b"
    new_adapter_name = "Orca-2-13b-4bit-finetune-on-ARC-9"

    # Load the entire model on the GPU 0
    device_map = {"": 0}

    peft_config = LoraConfig(
        lora_alpha=16,
        lora_dropout=0.1,
        r=64,
        bias="none",
        task_type="CAUSAL_LM",
    )

    bnb_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_quant_type="nf4",
        bnb_4bit_compute_dtype=torch.bfloat16,
        bnb_4bit_use_double_quant=False,
    )

    # Load the data using pandas
    data_file = "data/ARC_augmented_training_puzzles.json"
    df = pd.read_json(data_file)
    # Convert the pandas dataframe to a dataset
    dataset1 = Dataset.from_pandas(df)
    
    # Load the data using pandas
    data_file = "data/ARC_augmented_evaluation_puzzles.json"
    df = pd.read_json(data_file)
    # Convert the pandas dataframe to a dataset
    dataset2 = Dataset.from_pandas(df)
    
    # Load the data using pandas
    data_file = "data/core_ARC_puzzles.json"
    df = pd.read_json(data_file)
    # Convert the pandas dataframe to a dataset
    dataset3 = Dataset.from_pandas(df)
    
    # Generate the "prompt" column for each dataset
    dataset1 = dataset1.map(lambda data_point: {"prompt": generate_prompt(data_point)})
    dataset2 = dataset2.map(lambda data_point: {"prompt": generate_prompt(data_point)})
    dataset3 = dataset3.map(lambda data_point: {"prompt": generate_prompt(data_point)})

    # Assuming dataset1 and dataset2 are instances of Dataset from the `datasets` library
    dataset = concatenate_datasets([dataset1, dataset2, dataset3])
    
    model_checkpoint = "results/Orca-2-13b-4bit-finetune-on-ARC-8/checkpoint-7800"
    peft_model_id = model_checkpoint
    config = PeftConfig.from_pretrained(peft_model_id)
    model = AutoModelForCausalLM.from_pretrained(config.base_model_name_or_path,
                                                        quantization_config=bnb_config,
                                                        return_dict=True,
                                                        load_in_4bit=True,
                                                        device_map={"":0})

    tokenizer = AutoTokenizer.from_pretrained(config.base_model_name_or_path)
    tokenizer.pad_token = tokenizer.eos_token
    tokenizer.padding_side = "right"

#     # Load the Lora model
#     model = PeftModel.from_pretrained(model, peft_model_id)
    
#     # Ensure all model parameters require gradients
#     for param in model.parameters():
#         param.requires_grad = True

    per_device_train_batch_size = 1
    gradient_accumulation_steps = 4
    max_seq_length = 4096

    output_dir = "results/" + new_adapter_name

    steps_per_epoch = len(dataset) // (
        per_device_train_batch_size * gradient_accumulation_steps
    )
    print("Steps:", steps_per_epoch)

    # Set training parameters
    training_arguments = TrainingArguments(
        output_dir=output_dir,
        max_steps=steps_per_epoch,
        per_device_train_batch_size=per_device_train_batch_size,
        gradient_accumulation_steps=gradient_accumulation_steps,
        optim="paged_adamw_8bit",
        lr_scheduler_type="cosine",
        save_strategy="steps",
        evaluation_strategy="no",
        save_steps=100,
        logging_steps=1,
        learning_rate=1e-5,
        fp16=True,
        warmup_steps=0.03,
        group_by_length=True,
        gradient_checkpointing=True,
    )

    trainer = CustomSFTTrainer(
        model=model,
        train_dataset=dataset,
        peft_config=peft_config,
        dataset_text_field="prompt",
        max_seq_length=max_seq_length,
        tokenizer=tokenizer,
        args=training_arguments,
    )

    # Train model
    trainer.train()

    # Save trained model
    adapter = "adapters/" + new_adapter_name
    trainer.model.save_pretrained(adapter)


if __name__ == "__main__":
    main()

Map:   0%|          | 0/11240 [00:00<?, ? examples/s]

Map:   0%|          | 0/72504 [00:00<?, ? examples/s]

Map:   0%|          | 0/10000 [00:00<?, ? examples/s]

Loading checkpoint shards:   0%|          | 0/6 [00:00<?, ?it/s]

Steps: 23436


Map:   0%|          | 0/93744 [00:00<?, ? examples/s]

Detected kernel version 5.4.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.
You're using a LlamaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


super_loss: tensor(0.1814, device='cuda:0', grad_fn=<NllLossBackward0>)    wrong_proportion: tensor(0.9995, device='cuda:0')    loss: tensor(0.1813, device='cuda:0', grad_fn=<MulBackward0>)
super_loss: tensor(0.1926, device='cuda:0', grad_fn=<NllLossBackward0>)    wrong_proportion: tensor(0.9995, device='cuda:0')    loss: tensor(0.1925, device='cuda:0', grad_fn=<MulBackward0>)
super_loss: tensor(0.3388, device='cuda:0', grad_fn=<NllLossBackward0>)    wrong_proportion: tensor(0.9995, device='cuda:0')    loss: tensor(0.3386, device='cuda:0', grad_fn=<MulBackward0>)
super_loss: tensor(0.3322, device='cuda:0', grad_fn=<NllLossBackward0>)    wrong_proportion: tensor(0.9995, device='cuda:0')    loss: tensor(0.3320, device='cuda:0', grad_fn=<MulBackward0>)


Step,Training Loss
1,0.2611
2,0.4706


super_loss: tensor(0.8118, device='cuda:0', grad_fn=<NllLossBackward0>)    wrong_proportion: tensor(0.9997, device='cuda:0')    loss: tensor(0.8115, device='cuda:0', grad_fn=<MulBackward0>)
super_loss: tensor(0.1847, device='cuda:0', grad_fn=<NllLossBackward0>)    wrong_proportion: tensor(0.9995, device='cuda:0')    loss: tensor(0.1846, device='cuda:0', grad_fn=<MulBackward0>)
super_loss: tensor(0.1727, device='cuda:0', grad_fn=<NllLossBackward0>)    wrong_proportion: tensor(0.9995, device='cuda:0')    loss: tensor(0.1726, device='cuda:0', grad_fn=<MulBackward0>)
super_loss: tensor(0.7140, device='cuda:0', grad_fn=<NllLossBackward0>)    wrong_proportion: tensor(0.9995, device='cuda:0')    loss: tensor(0.7136, device='cuda:0', grad_fn=<MulBackward0>)
super_loss: tensor(0.7094, device='cuda:0', grad_fn=<NllLossBackward0>)    wrong_proportion: tensor(0.9995, device='cuda:0')    loss: tensor(0.7090, device='cuda:0', grad_fn=<MulBackward0>)
super_loss: tensor(0.6957, device='cuda:0', grad_f

KeyboardInterrupt: 