In [None]:
# Install the notebook's runtime dependencies into the kernel's environment.
# NOTE(review): nothing here is version-pinned, and transformers, peft and
# accelerate are installed from their GitHub repositories rather than a tagged
# release, so a fresh run may pull different code than the run that produced
# this notebook. Consider pinning to known-good versions.
%pip install -q sentencepiece
%pip install -q protobuf
%pip install -q -U bitsandbytes
%pip install -q -U git+https://github.com/huggingface/transformers.git
%pip install -q -U git+https://github.com/huggingface/peft.git
%pip install -q -U git+https://github.com/huggingface/accelerate.git
%pip install -q scipy
%pip install -q -U trl

In [None]:
import torch
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    TrainingArguments,
)
from peft import LoraConfig, PeftModel, PeftConfig, prepare_model_for_kbit_training
from trl import SFTTrainer
from datasets import Dataset, concatenate_datasets
import pandas as pd
import math

In [None]:
from torch import nn
import re
import ast
import numpy as np

class CustomSFTTrainer(SFTTrainer):
    """SFTTrainer whose loss is discounted by how close the predicted grid is.

    For the first sequence of each batch, the labels and the argmax
    predictions are decoded to text, a ``<prefix>_1_Output=[[...]]`` grid is
    extracted from each, and, when both grids exist and have the same shape,
    the language-model loss is multiplied by ``0.5 * wrong_proportion`` where
    ``wrong_proportion`` is the fraction of grid cells that differ.  In every
    other case the unmodified language-model loss is returned.

    NOTE(review): when every cell matches, the returned loss is exactly 0.
    """

    # Label value marking positions excluded from the loss.  It is not a real
    # token id, so it must be removed before the labels are decoded.
    _IGNORE_INDEX = -100
    # Prefixes tried, in this order, when searching the decoded text.
    _TARGET_PREFIXES = ("Test",)
    # NOTE(review): "Hello" and "To" presumably cover a mispredicted leading
    # token in front of "_1_Output" — confirm against real model output.
    _PREDICTION_PREFIXES = ("Test", "Hello", "To")

    def __init__(self, tokenizer, *args, **kwargs):
        """Create the trainer.

        Args:
            tokenizer: Tokenizer used both by the underlying SFTTrainer and to
                decode labels/predictions in ``compute_loss``.
            *args, **kwargs: Forwarded unchanged to ``SFTTrainer.__init__``.
        """
        # Forward the tokenizer as well: because it is a named parameter here
        # it is not part of **kwargs, so without this the base class never
        # receives the tokenizer the caller configured.
        super().__init__(*args, tokenizer=tokenizer, **kwargs)
        self.tokenizer = tokenizer

    @staticmethod
    def _extract_grid(text, prefixes):
        """Return the first ``<prefix>_1_Output=[[...]]`` grid in ``text`` as an array.

        Prefixes are tried in order.  Returns ``None`` when no prefix matches
        or when the matched text is not a valid Python literal.
        """
        for prefix in prefixes:
            # Whitespace around "=" is optional, so both "Output=" and
            # "Output =" are accepted.
            match = re.search(
                re.escape(prefix) + r"_1_Output\s*=\s*(\[\[.*?\]\])", text
            )
            if match is None:
                continue
            try:
                return np.array(ast.literal_eval(match.group(1)))
            except (ValueError, SyntaxError, TypeError):
                # Malformed or ragged grid: treat as "no usable prediction".
                return None
        return None

    def compute_loss(self, model, inputs, return_outputs=False, **kwargs):
        """Compute the language-model loss, scaled by grid accuracy when possible.

        Args:
            model: Model being trained.
            inputs: Batch mapping; ``inputs["labels"]`` is read if present.
            return_outputs: When true, return ``(loss, outputs)``.
            **kwargs: Extra keyword arguments supplied by the calling Trainer
                (for example ``num_items_in_batch``), forwarded to the parent
                implementation.

        Returns:
            The loss, or ``(loss, outputs)`` when ``return_outputs`` is true.
        """
        labels = inputs.get("labels")

        # Single forward pass: reuse the parent's outputs for the logits
        # instead of calling the model a second time.
        super_loss, outputs = super().compute_loss(
            model, inputs, return_outputs=True, **kwargs
        )
        if labels is None:
            return (super_loss, outputs) if return_outputs else super_loss

        # Only the first sequence of the batch is inspected.
        label_row = labels[0]
        target_text = self.tokenizer.decode(
            label_row[label_row != self._IGNORE_INDEX].tolist()
        )
        target_array = self._extract_grid(target_text, self._TARGET_PREFIXES)

        # argmax of the logits equals argmax of their softmax, so the softmax
        # over the whole vocabulary is skipped.
        predicted_ids = outputs.logits[0].argmax(dim=-1)
        predicted_text = self.tokenizer.decode(predicted_ids.tolist())
        predicted_array = self._extract_grid(
            predicted_text, self._PREDICTION_PREFIXES
        )

        loss = super_loss
        if (
            target_array is not None
            and predicted_array is not None
            and target_array.size > 0  # avoid 0/0 on an empty grid
            and predicted_array.shape == target_array.shape
        ):
            print("\ntarget")
            print(target_array)
            print("\npredicted")
            print(predicted_array)
            wrong_proportion = float(np.mean(target_array != predicted_array))
            # Out-of-place multiply so super_loss keeps its original value
            # (it is logged on the next line).
            loss = super_loss * (0.5 * wrong_proportion)
            print("super_loss:", super_loss, "   wrong_proportion:", wrong_proportion, "   loss:", loss)

        return (loss, outputs) if return_outputs else loss

In [None]:
def generate_prompt(data_point):
    """Render one record as a two-turn chat transcript.

    Reads ``data_point["instruction"]`` (user turn) and
    ``data_point["output"]`` (assistant turn) and wraps each in
    ``<|im_start|>``/``<|im_end|>`` markers, with the turns separated by a
    newline.
    """
    turns = (
        ("user", data_point["instruction"]),
        ("assistant", data_point["output"]),
    )
    return "\n".join(
        f"<|im_start|>{role}\n{content}<|im_end|>" for role, content in turns
    )

In [None]:
# Name of the base checkpoint and of the adapter this run produces.
base_model_name = "microsoft/Orca-2-13b"
new_adapter_name = "Orca-2-13b-4bit-finetune-on-ARC-10"

# The empty-string key places the entire model on GPU 0.
device_map = {"": 0}

# LoRA adapter hyperparameters.
peft_config = LoraConfig(
    task_type="CAUSAL_LM",
    r=64,
    lora_alpha=16,
    lora_dropout=0.1,
    bias="none",
)

# 4-bit NF4 quantization, bfloat16 compute dtype, double quantization off.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=False,
    bnb_4bit_compute_dtype=torch.bfloat16,
)

In [None]:
# JSON files to train on, in concatenation order.
data_files = [
    "data/ARC_augmented_training_puzzles.json",
    "data/ARC_augmented_evaluation_puzzles.json",
    "data/core_ARC_puzzles.json",
]

# For each file: read it with pandas, convert the frame to a Dataset, and add
# a "prompt" column built by generate_prompt.
prompt_datasets = []
for data_file in data_files:
    df = pd.read_json(data_file)
    prompt_datasets.append(
        Dataset.from_pandas(df).map(
            lambda data_point: {"prompt": generate_prompt(data_point)}
        )
    )

# Keep the individual datasets available under their original names.
dataset1, dataset2, dataset3 = prompt_datasets

# Single training set made of all three sources.
dataset = concatenate_datasets(prompt_datasets)

In [None]:
def main():
    """Fine-tune the quantized base model with LoRA and save the adapter.

    Loads ``base_model_name`` with ``bnb_config`` on ``device_map``, prepares
    it for k-bit training, trains for one pass over ``dataset`` with
    ``CustomSFTTrainer``, and writes the resulting adapter to
    ``adapters/<new_adapter_name>``.  Checkpoints go to
    ``results/<new_adapter_name>``.
    """
    # Load base model
    model = AutoModelForCausalLM.from_pretrained(
        base_model_name,
        quantization_config=bnb_config,
        device_map=device_map,
    )
    model.config.use_cache = False
    model.gradient_checkpointing_enable()
    model = prepare_model_for_kbit_training(model)

    # Load tokenizer
    tokenizer = AutoTokenizer.from_pretrained(base_model_name, add_eos_token=True)
    tokenizer.pad_token = tokenizer.eos_token
    tokenizer.padding_side = "right"  # Fix weird overflow issue with fp16 training

    per_device_train_batch_size = 1
    gradient_accumulation_steps = 4
    max_seq_length = 4096

    output_dir = "results/" + new_adapter_name

    # Optimizer steps in one pass over the data.  Clamp to at least 1 so that
    # a dataset smaller than one accumulated batch does not yield
    # max_steps=0.
    steps_per_epoch = max(
        1,
        len(dataset) // (per_device_train_batch_size * gradient_accumulation_steps),
    )
    print("Steps:", steps_per_epoch)

    # Set training parameters
    # NOTE(review): fp16=True is used here while bnb_config sets a bfloat16
    # compute dtype — confirm this precision mix is intended.
    training_arguments = TrainingArguments(
        output_dir=output_dir,
        max_steps=steps_per_epoch,
        per_device_train_batch_size=per_device_train_batch_size,
        gradient_accumulation_steps=gradient_accumulation_steps,
        optim="paged_adamw_8bit",
        lr_scheduler_type="cosine",
        save_strategy="steps",
        evaluation_strategy="no",
        save_steps=100,
        logging_steps=1,
        learning_rate=1e-5,
        fp16=True,
        # 0.03 is a fraction of the run, so it belongs in warmup_ratio;
        # warmup_steps expects a number of steps.
        warmup_ratio=0.03,
        group_by_length=True,
        gradient_checkpointing=True,
    )

    trainer = CustomSFTTrainer(
        model=model,
        train_dataset=dataset,
        peft_config=peft_config,
        dataset_text_field="prompt",
        max_seq_length=max_seq_length,
        tokenizer=tokenizer,
        args=training_arguments,
    )

    # Train model
    trainer.train()

    # Save trained model
    adapter = "adapters/" + new_adapter_name
    trainer.model.save_pretrained(adapter)

In [None]:
# Run the fine-tuning pipeline defined in main().
main()