<a href="https://colab.research.google.com/github/sdashrath/SmartContractAuditing/blob/main/QLoRaLLM.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
from IPython import get_ipython
from IPython.display import display
# %%
!pip uninstall datasets -y
!pip install datasets
!pip install --upgrade bitsandbytes
!pip install --upgrade transformers accelerate
import os
import torch
import json
from sklearn.metrics import accuracy_score
from datasets import Dataset, load_dataset
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    TrainingArguments,
    Trainer
)
from peft import LoraConfig, TaskType, get_peft_model, PeftModel

# Configure PyTorch's CUDA caching allocator through its environment variable.
# This is set at module top level, before any model is loaded in this script.
# NOTE(review): presumably added to reduce fragmentation-related out-of-memory
# errors on small Colab GPUs — confirm against the PyTorch CUDA memory notes.
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"


# Define dataset
# Toy corpus: each record pairs a natural-language instruction with the
# command string the model should emit for it.
data = [
    {"instruction": "Survey the area and capture images every 10 meters.", "output": "tc(180);g('camera');"},
    {"instruction": "Return to base if battery < 20%.", "output": "rtb();"},
    {"instruction": "Activate thermal sensor at 50m altitude.", "output": "activate_thermal(50);"},
    {"instruction": "Take a photo and then hover for 10 seconds.", "output": "photo();hover(10);"}
]

# Create and split dataset
# A fixed seed keeps the train/test partition identical across runs; without
# it the held-out example (and therefore the reported eval loss) changes every
# time the cell is executed.
dataset = Dataset.from_list(data)
dataset = dataset.train_test_split(test_size=0.2, seed=42)

# Load model and tokenizer
model_name = "unsloth/llama-3-8b-bnb-4bit"

# 4-bit NF4 quantization with double quantization.
# NOTE(review): this checkpoint already ships a quantization config, which may
# take precedence over the one passed here — confirm against the load warnings.
quantization_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    llm_int8_enable_fp32_cpu_offload=True
)

tokenizer = AutoTokenizer.from_pretrained(model_name)
# The preprocessing step pads every example to a fixed length, which requires
# a pad token. Fall back to EOS when the tokenizer does not define one.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

# `trust_remote_code` is intentionally not enabled: the Llama architecture is
# implemented inside transformers, so there is no reason to allow code from the
# hub repository to execute locally.
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    device_map="auto",
    quantization_config=quantization_config,
)

# Configure LoRA (Low-Rank Adaptation)
from peft import prepare_model_for_kbit_training

# Prepare the quantized base model for training before attaching adapters.
# Training is configured with gradient checkpointing, and a frozen quantized
# model needs its inputs to require gradients for the checkpointed backward
# pass to reach the LoRA weights; this PEFT helper sets that up.
model = prepare_model_for_kbit_training(model, use_gradient_checkpointing=True)

# Rank-4 adapters on the attention query and value projections only.
lora_config = LoraConfig(
    task_type=TaskType.CAUSAL_LM,
    r=4,
    lora_alpha=32,
    target_modules=["q_proj", "v_proj"]
)
peft_model = get_peft_model(model, lora_config)

# Define training arguments
# Hyper-parameters are gathered in one mapping and unpacked into
# TrainingArguments so they can be inspected or logged as plain data.
training_config = {
    "output_dir": "./results",
    # One example per device step, accumulated over four steps.
    "per_device_train_batch_size": 1,
    "gradient_accumulation_steps": 4,
    "learning_rate": 2e-4,
    "num_train_epochs": 3,
    # Checkpoint every 10 optimizer steps, keeping at most two on disk.
    "save_steps": 10,
    "save_total_limit": 2,
    # Recompute activations in the backward pass instead of storing them.
    "gradient_checkpointing": True,
}
training_args = TrainingArguments(**training_config)

# Preprocess dataset
def preprocess_function(examples, max_length=512):
    """Tokenize a batch of instruction/output pairs for causal-LM fine-tuning.

    Each example becomes one sequence: the instruction tokens followed by the
    output tokens (plus EOS when the tokenizer defines one). The labels are a
    copy of that sequence in which the instruction part and the padding are
    set to -100, so only the output tokens contribute to the loss.

    Args:
        examples: Batch mapping with parallel "instruction" and "output"
            lists of strings (the shape `Dataset.map(batched=True)` provides).
        max_length: Fixed length every sequence is truncated/padded to.

    Returns:
        A dict with "input_ids", "attention_mask" and "labels", each a list of
        `max_length`-long integer lists.

    Note:
        If an instruction alone reaches `max_length`, its output is truncated
        away and that example contributes no supervised tokens.
    """
    eos_id = tokenizer.eos_token_id
    pad_id = tokenizer.pad_token_id
    if pad_id is None:
        # Padded positions are masked out by attention_mask and by the -100
        # labels, so any valid id works as filler.
        pad_id = eos_id if eos_id is not None else 0

    batch = {"input_ids": [], "attention_mask": [], "labels": []}
    for instruction, output in zip(examples["instruction"], examples["output"]):
        prompt_ids = tokenizer(instruction)["input_ids"]
        # No special tokens here: the answer continues the prompt sequence.
        answer_ids = tokenizer(output, add_special_tokens=False)["input_ids"]
        if eos_id is not None:
            # Teach the model where the command ends.
            answer_ids = answer_ids + [eos_id]

        input_ids = (prompt_ids + answer_ids)[:max_length]
        # Labels must line up token-for-token with input_ids; the prompt part
        # is masked so the model is not trained to reproduce the instruction.
        labels = ([-100] * len(prompt_ids) + answer_ids)[:max_length]

        pad_len = max_length - len(input_ids)
        batch["input_ids"].append(input_ids + [pad_id] * pad_len)
        batch["attention_mask"].append([1] * len(input_ids) + [0] * pad_len)
        batch["labels"].append(labels + [-100] * pad_len)
    return batch

tokenized_dataset = dataset.map(preprocess_function, batched=True)

# Custom Trainer class
class CustomTrainer(Trainer):
    """Trainer subclass that runs the backward pass explicitly in its step.

    Construction is inherited unchanged from `Trainer`.
    """

    def training_step(self, model: PeftModel, inputs: dict, optimizers = None) -> torch.Tensor:
        """Run forward and backward for one batch and return its loss.

        Args:
            model: The model being trained; switched to train mode here.
            inputs: Batch produced by the data collator; moved/prepared via
                `self._prepare_inputs`.
            optimizers: Unused. Kept so the method accepts a third positional
                argument. NOTE(review): recent transformers releases appear
                to pass `num_items_in_batch` in this position — confirm
                against the installed version.

        Returns:
            The batch loss, already divided by the gradient-accumulation
            factor and detached from the autograd graph.
        """
        model.train()
        inputs = self._prepare_inputs(inputs)

        with self.compute_loss_context_manager():
            loss = self.compute_loss(model, inputs)

        # Scale so gradients summed over the accumulation window average out.
        if self.args.gradient_accumulation_steps > 1:
            loss = loss / self.args.gradient_accumulation_steps

        # Plain autograd backward: no loss scaling is applied in this override.
        # NOTE(review): if fp16/bf16 training is ever enabled, route this
        # through the Trainer's accelerator instead — confirm for the
        # installed transformers version.
        loss.backward()

        # The caller accumulates this value for logging; returning it detached
        # keeps that running total from holding on to each step's graph.
        return loss.detach()


# Initialize Trainer
# Name the two splits first so the constructor call reads as plain wiring.
train_split = tokenized_dataset["train"]
eval_split = tokenized_dataset["test"]
trainer = CustomTrainer(
    model=peft_model,
    args=training_args,
    train_dataset=train_split,
    eval_dataset=eval_split,
)

# Fine-tune the model
# Runs the training loop configured above on the LoRA-wrapped model.
trainer.train()

# Evaluate the model
results = trainer.evaluate()
print("Evaluation Results:", results)

# Log metrics
# After evaluate() the newest log_history entry is the evaluation record, so
# reading log_history[-1] never yields a training loss. Walk the history from
# newest to oldest and take the first entry that carries a step-level "loss"
# or the end-of-training "train_loss" summary.
train_loss = next(
    (
        entry[key]
        for entry in reversed(trainer.state.log_history)
        for key in ("loss", "train_loss")
        if key in entry
    ),
    "N/A",
)
metrics = {
    "train_loss": train_loss,
    "eval_loss": results.get("eval_loss", "N/A"),
    # Evaluation metrics are reported with an "eval_" prefix; the bare key is
    # kept as a fallback. No compute_metrics is configured in this script, so
    # this normally stays "N/A".
    "accuracy": results.get("eval_accuracy", results.get("accuracy", "N/A")),
}
with open("performance_metrics.json", "w", encoding="utf-8") as f:
    json.dump(metrics, f, indent=4)
print("Metrics saved to performance_metrics.json")

# Generate predictions for new prompts
prompts = [
    "Generate code to survey an area and return to the base.",
    "Activate thermal sensor at 50m altitude."
]

# Leave training mode before sampling.
peft_model.eval()
# Send inputs to wherever the model's weights actually live instead of
# assuming a CUDA device is present.
generation_device = next(peft_model.parameters()).device

for prompt in prompts:
    inputs = tokenizer(prompt, return_tensors="pt").to(generation_device)
    # max_new_tokens bounds only the generated continuation; max_length would
    # also count the prompt tokens against the budget.
    outputs = peft_model.generate(**inputs, max_new_tokens=50)
    print(f"Input: {prompt}")
    print(f"Output: {tokenizer.decode(outputs[0], skip_special_tokens=True)}")

# Compute accuracy (example)
# Placeholder data: each pair is (expected command, predicted command). The
# predictions are hard-coded copies of the targets — replace with actual
# predictions before trusting the printed figure.
example_pairs = [
    ("tc(180);g('camera');", "tc(180);g('camera');"),
    ("rtb();", "rtb();"),
]
y_true = [expected for expected, _ in example_pairs]
y_pred = [predicted for _, predicted in example_pairs]
accuracy_pct = accuracy_score(y_true, y_pred) * 100
print(f"Accuracy: {accuracy_pct:.2f}%")

Found existing installation: datasets 3.2.0
Uninstalling datasets-3.2.0:
  Successfully uninstalled datasets-3.2.0
Collecting datasets
  Using cached datasets-3.2.0-py3-none-any.whl.metadata (20 kB)
Using cached datasets-3.2.0-py3-none-any.whl (480 kB)
Installing collected packages: datasets
Successfully installed datasets-3.2.0


[31mERROR: Operation cancelled by user[0m[31m
[0m

Unused kwargs: ['_load_in_4bit', '_load_in_8bit', 'quant_method']. These kwargs are not used in <class 'transformers.utils.quantization_config.BitsAndBytesConfig'>.
