In [None]:
# %pip install -q sentencepiece
# %pip install -q protobuf

In [None]:
# %pip install -q -U bitsandbytes
# %pip install -q -U git+https://github.com/huggingface/transformers.git
# %pip install -q -U git+https://github.com/huggingface/peft.git
# %pip install -q -U git+https://github.com/huggingface/accelerate.git
# %pip install -q scipy
# %pip install -q -U trl

In [None]:
import torch
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    TrainingArguments,
)
from transformers.modeling_utils import unwrap_model
from transformers.models.auto.modeling_auto import MODEL_FOR_CAUSAL_LM_MAPPING_NAMES
from peft import LoraConfig, PeftModel, PeftConfig, prepare_model_for_kbit_training
from trl import SFTTrainer
from datasets import Dataset, concatenate_datasets
import pandas as pd

In [None]:
class LabelSmoother:
    """
    Adds label-smoothing on a pre-computed output from a Transformers model.

    The returned loss is ``(1 - epsilon) * nll + epsilon * uniform``, where
    ``nll`` is the mean negative log-likelihood of the target tokens and
    ``uniform`` is the mean negative log-probability over the whole vocabulary,
    both averaged over the positions whose label is not ``ignore_index``.

    Args:
        epsilon (`float`, *optional*, defaults to 0.1):
            The label smoothing factor.
        ignore_index (`int`, *optional*, defaults to -100):
            The index in the labels to ignore when computing the loss.
    """

    # Class-level defaults, kept so `LabelSmoother.epsilon` style access still works.
    epsilon: float = 0.1
    ignore_index: int = -100

    def __init__(self, epsilon: float = 0.1, ignore_index: int = -100):
        # An explicit constructor is required: without it (or a @dataclass
        # decorator) `LabelSmoother(epsilon=...)` raises TypeError.
        self.epsilon = epsilon
        self.ignore_index = ignore_index

    def __call__(self, model_output, labels, shift_labels=False):
        """
        Compute the label-smoothed loss.

        Args:
            model_output: Either a dict containing a ``"logits"`` entry or a
                sequence whose first element is the logits tensor.
            labels: Integer tensor of target ids; positions equal to
                ``ignore_index`` do not contribute to the loss.
            shift_labels: When True, drop the last logit position and the first
                label position so that position ``t`` is scored against label
                ``t + 1``.

        Returns:
            A scalar tensor holding the smoothed loss.
        """
        logits = model_output["logits"] if isinstance(model_output, dict) else model_output[0]
        if shift_labels:
            logits = logits[..., :-1, :].contiguous()
            labels = labels[..., 1:].contiguous()

        # Referenced through `torch` so the class does not depend on a separate
        # `from torch import nn` having been executed first.
        log_probs = -torch.nn.functional.log_softmax(logits, dim=-1)
        if labels.dim() == log_probs.dim() - 1:
            labels = labels.unsqueeze(-1)

        padding_mask = labels.eq(self.ignore_index)
        # In case the ignore_index is -100, the gather will fail, so we replace labels by 0. The padding_mask
        # will ignore them in any case.
        labels = torch.clamp(labels, min=0)
        nll_loss = log_probs.gather(dim=-1, index=labels)
        # works for fp16 input tensor too, by internally upcasting it to fp32
        smoothed_loss = log_probs.sum(dim=-1, keepdim=True, dtype=torch.float32)

        nll_loss.masked_fill_(padding_mask, 0.0)
        smoothed_loss.masked_fill_(padding_mask, 0.0)

        # Take the mean over the label dimensions, then divide by the number of active elements (i.e. not-padded):
        num_active_elements = padding_mask.numel() - padding_mask.long().sum()
        nll_loss = nll_loss.sum() / num_active_elements
        smoothed_loss = smoothed_loss.sum() / (num_active_elements * log_probs.shape[-1])
        return (1 - self.epsilon) * nll_loss + self.epsilon * smoothed_loss

In [None]:
from torch import nn

# class CustomTrainer(Trainer):
#     def compute_loss(self, model, inputs, return_outputs=False):
#         labels = inputs.pop("labels")
#         # forward pass
#         outputs = model(**inputs)
#         logits = outputs.get("logits")
#         # compute custom loss (suppose one has 3 labels with different weights)
#         loss_fct = nn.CrossEntropyLoss(weight=torch.tensor([1.0, 2.0, 3.0], device=model.device))
#         loss = loss_fct(logits.view(-1, self.model.config.num_labels), labels.view(-1))
#         return (loss, outputs) if return_outputs else loss

class CustomSFTTrainer(SFTTrainer):
    """
    SFTTrainer whose loss is multiplied by an exact-match penalty.

    The base loss is either the model's own loss or, when
    ``args.label_smoothing_factor`` is non-zero, the label-smoothed loss. It is
    then scaled by the fraction of sequences in the batch whose greedy
    predictions do not reproduce every non-ignored label token.
    """

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)

        # Label smoothing
        if self.args.label_smoothing_factor != 0:
            self.label_smoother = LabelSmoother(epsilon=self.args.label_smoothing_factor)
        else:
            self.label_smoother = None

    @staticmethod
    def _is_causal_lm(model):
        """Return True when the (unwrapped) model class is a registered causal LM."""
        unwrapped_model = unwrap_model(model)
        # PeftModel is imported unconditionally by this notebook, so no
        # availability check is needed before the isinstance test.
        if isinstance(unwrapped_model, PeftModel):
            model_name = unwrapped_model.base_model.model._get_name()
        else:
            model_name = unwrapped_model._get_name()
        return model_name in MODEL_FOR_CAUSAL_LM_MAPPING_NAMES.values()

    @staticmethod
    def _exact_match_penalty(logits, labels, shift_labels, ignore_index=-100):
        """
        Fraction of sequences whose greedy predictions miss at least one label.

        Positions labelled ``ignore_index`` are treated as matching, so a
        sequence counts as an exact match when every scored token is correct.
        With ``shift_labels`` the prediction at position ``t`` is compared with
        the label at ``t + 1``.
        """
        with torch.no_grad():
            # argmax of the logits equals argmax of the softmax; skipping the
            # softmax avoids materialising a full-vocabulary probability tensor.
            predictions = logits.argmax(dim=-1)
            labels = labels.to(predictions.device)
            if shift_labels:
                predictions = predictions[..., :-1]
                labels = labels[..., 1:]
            token_ok = predictions.eq(labels) | labels.eq(ignore_index)
            sequence_ok = token_ok.all(dim=-1).float()
            return (1.0 - sequence_ok).mean()

    def compute_loss(self, model, inputs, return_outputs=False, num_items_in_batch=None):
        """
        Compute the training loss scaled by the exact-match penalty.

        Args:
            model: The model being trained.
            inputs: Batch dict forwarded to the model; may contain ``"labels"``.
            return_outputs: When True, return ``(loss, outputs)``.
            num_items_in_batch: Accepted so that Trainer versions which pass
                this keyword can call the method; it is not used here.

        Returns:
            The scalar loss, or ``(loss, outputs)`` when ``return_outputs``.

        Raises:
            ValueError: If label smoothing is off and the model output dict
                contains no ``"loss"`` entry.
        """
        # Keep a handle on the labels before the label-smoothing path pops
        # them, otherwise the exact-match check below would see None.
        target_labels = inputs.get("labels")
        if self.label_smoother is not None and "labels" in inputs:
            labels = inputs.pop("labels")
        else:
            labels = None
        outputs = model(**inputs)
        # Save past state if it exists
        # TODO: this needs to be fixed and made cleaner later.
        if self.args.past_index >= 0:
            self._past = outputs[self.args.past_index]

        shift_labels = self._is_causal_lm(model) if target_labels is not None else False

        if labels is not None:
            loss = self.label_smoother(outputs, labels, shift_labels=shift_labels)
        else:
            if isinstance(outputs, dict) and "loss" not in outputs:
                raise ValueError(
                    "The model did not return a loss from the inputs, only the following keys: "
                    f"{','.join(outputs.keys())}. \
                    For reference, the inputs it received are {','.join(inputs.keys())}."
                )
            # We don't use .loss here since the model may return tuples instead of ModelOutput.
            loss = outputs["loss"] if isinstance(outputs, dict) else outputs[0]

        if target_labels is not None:
            if isinstance(outputs, dict):
                logits = outputs["logits"]
            else:
                # Tuple outputs lead with the loss only when the model itself
                # received the labels (i.e. they were not popped above).
                logits = outputs[0] if labels is not None else outputs[1]

            # 0 when every sequence in the batch is reproduced exactly, 1 when none is.
            exact_match_loss = self._exact_match_penalty(logits, target_labels, shift_labels)
            # Out-of-place multiply so the loss stored inside `outputs` is not mutated.
            loss = loss * exact_match_loss

        return (loss, outputs) if return_outputs else loss

In [None]:
def generate_prompt(data_point):
    """Render one record as a two-turn ChatML transcript.

    The record's "instruction" value becomes the user turn and its "output"
    value becomes the assistant turn; both keys must be present.
    """
    chatml_template = (
        "<|im_start|>user\n{user}<|im_end|>\n"
        "<|im_start|>assistant\n{assistant}<|im_end|>"
    )
    return chatml_template.format(
        user=data_point["instruction"],
        assistant=data_point["output"],
    )


def _load_prompt_dataset(data_file):
    """Read one JSON file with pandas and return a Dataset with a "prompt" column added."""
    df = pd.read_json(data_file)
    dataset = Dataset.from_pandas(df)
    return dataset.map(lambda data_point: {"prompt": generate_prompt(data_point)})


def main():
    """
    Fine-tune a 4-bit quantized base model with LoRA on the ARC puzzle files.

    Loads and concatenates the three JSON datasets, loads the quantized base
    model and tokenizer, trains for one pass worth of optimizer steps with
    CustomSFTTrainer, and saves the resulting adapter under ``adapters/``.
    """
    base_model_name = "microsoft/Orca-2-13b"
    new_adapter_name = "Orca-2-13b-4bit-finetune-on-ARC-4"

    # Load the entire model on the GPU 0
    device_map = {"": 0}

    peft_config = LoraConfig(
        lora_alpha=16,
        lora_dropout=0.1,
        r=64,
        bias="none",
        task_type="CAUSAL_LM",
    )

    bnb_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_quant_type="nf4",
        bnb_4bit_compute_dtype=torch.bfloat16,
        bnb_4bit_use_double_quant=False,
    )

    # Each file goes through the same load + prompt-generation step before
    # being merged into a single training set.
    data_files = [
        "data/ARC_augmented_training_puzzles.json",
        "data/ARC_augmented_evaluation_puzzles.json",
        "data/core_ARC_puzzles.json",
    ]
    dataset = concatenate_datasets([_load_prompt_dataset(path) for path in data_files])

    # Load base model
    model = AutoModelForCausalLM.from_pretrained(
        base_model_name,
        quantization_config=bnb_config,
        device_map=device_map,
        # local_files_only=True  # Add this line if the model is stored locally
    )
    model.config.use_cache = False
    # model.config.pretraining_tp = 1
    model.gradient_checkpointing_enable()
    model = prepare_model_for_kbit_training(model)

    # Load tokenizer
    tokenizer = AutoTokenizer.from_pretrained(base_model_name, add_eos_token=True)

    tokenizer.pad_token = tokenizer.eos_token
    tokenizer.padding_side = "right"  # Fix weird overflow issue with fp16 training

    per_device_train_batch_size = 2
    gradient_accumulation_steps = 4
    max_seq_length = 4096

    output_dir = "results/" + new_adapter_name

    # One optimizer step consumes batch_size * accumulation examples, so this
    # is the number of steps needed to see the dataset once.
    steps_per_epoch = len(dataset) // (
        per_device_train_batch_size * gradient_accumulation_steps
    )
    print("Steps:", steps_per_epoch)

    # Set training parameters
    training_arguments = TrainingArguments(
        output_dir=output_dir,
        max_steps=steps_per_epoch,
        per_device_train_batch_size=per_device_train_batch_size,
        gradient_accumulation_steps=gradient_accumulation_steps,
        optim="paged_adamw_8bit",
        lr_scheduler_type="cosine",
        save_strategy="steps",
        # No evaluation strategy is passed: "no" is the default, and the
        # keyword was renamed to `eval_strategy` in newer transformers releases.
        save_steps=100,
        logging_steps=1,
        learning_rate=1e-4,
        fp16=True,
        # 0.03 is a fraction of the total steps, so it belongs in
        # `warmup_ratio`; `warmup_steps` expects a step count.
        warmup_ratio=0.03,
        group_by_length=True,
        gradient_checkpointing=True,
    )

    trainer = CustomSFTTrainer(
        model=model,
        train_dataset=dataset,
        peft_config=peft_config,
        dataset_text_field="prompt",
        max_seq_length=max_seq_length,
        tokenizer=tokenizer,
        args=training_arguments,
    )

    # Train model
    trainer.train()

    # Save trained model
    adapter = "adapters/" + new_adapter_name
    trainer.model.save_pretrained(adapter)


# Entry point: run the whole fine-tuning pipeline defined in main() when this
# code is executed directly rather than imported.
if __name__ == "__main__":
    main()