<a href="https://colab.research.google.com/github/sentrysol666-sys/Sentry-copilot/blob/main/llama3_1_8b_AML_CFT.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip uninstall -y torch torchvision torchaudio
!pip install torch==2.4.0+cu121 torchvision torchaudio --index-url https://download.pytorch.org/whl/cu121

In [None]:
from google.colab import drive

# Mount Google Drive; the dataset path and the save locations used in later
# cells all live under this directory. force_remount=True is passed so the
# call re-mounts even when a mount is already present.
mount_point = "/content/drive"
drive.mount(mount_point, force_remount=True)

In [None]:
import os, json
import pandas as pd
from datasets import Dataset

# Training corpus on the mounted Drive. Supported formats: .csv, .tsv, .jsonl.
dataset_path = "/content/drive/MyDrive/Colab Notebooks/dataset/solana-vuln-sim-10k.csv"


def _record_to_text(record):
    """Flatten one record (mapping of field name -> value) into "key: value key: value ..."."""
    return " ".join(f"{key}: {value}" for key, value in record.items())


ext = os.path.splitext(dataset_path)[1].lower()
rows = []

if ext == ".jsonl":
    with open(dataset_path, "r", encoding="utf-8") as f:
        for line in f:
            # Blank lines (e.g. extra trailing newlines) are skipped rather than
            # passed to json.loads, which would raise on them.
            if not line.strip():
                continue
            rows.append({"text": _record_to_text(json.loads(line))})
elif ext in [".csv", ".tsv"]:
    sep = "," if ext == ".csv" else "\t"
    df = pd.read_csv(dataset_path, sep=sep)
    # to_dict("records") yields one dict per row with each column's own dtype.
    # iterrows() builds a Series per row, which does not preserve dtypes across
    # columns (integers can be rendered as floats) and is much slower.
    rows = [{"text": _record_to_text(record)} for record in df.to_dict("records")]
else:
    raise ValueError("Unsupported format! Use .csv, .tsv, or .jsonl.")

# Single-column ("text") dataset consumed by the trainer cell.
dataset = Dataset.from_list(rows)
print("Dataset loaded:", dataset)

In [None]:
from unsloth import FastLanguageModel

# Checkpoint identifier; the inference cell reloads the same checkpoint by this name.
model_name = "unsloth/llama-3.2-3b-bnb-4bit"

# Loading options kept together so they are easy to compare with the reload
# performed in the inference cell.
base_load_options = {
    "max_seq_length": 2048,
    "device_map": "auto",
}

# from_pretrained returns the model together with its tokenizer.
model, tokenizer = FastLanguageModel.from_pretrained(model_name, **base_load_options)

print("Base model loaded!")

In [None]:
from peft import LoraConfig  # imported but not referenced in this cell

# Names of the modules that receive LoRA adapters.
lora_target_modules = [
    "q_proj",
    "k_proj",
    "v_proj",
    "o_proj",
    "gate_proj",
    "up_proj",
    "down_proj",
]

# Wrap the loaded base model with LoRA adapters. `model` is rebound to the
# wrapped model, so this cell should only run once per freshly loaded base model.
model = FastLanguageModel.get_peft_model(
    model,
    r=8,
    lora_alpha=16,
    lora_dropout=0.05,
    target_modules=lora_target_modules,
)

# Report how many parameters are trainable after wrapping.
model.print_trainable_parameters()


In [None]:
from trl import SFTTrainer
from transformers import TrainingArguments
from unsloth import is_bfloat16_supported

# Pick the mixed-precision mode from the hardware instead of hard-coding fp16:
# on bf16-capable GPUs train in bf16, otherwise fall back to fp16. Exactly one
# of the two flags is enabled.
use_bf16 = is_bfloat16_supported()

training_args = TrainingArguments(
    per_device_train_batch_size=1,
    gradient_accumulation_steps=8,   # effective batch size per device: 1 * 8
    warmup_steps=20,
    max_steps=200,       # adjust to the size of the dataset
    learning_rate=2e-4,
    fp16=not use_bf16,
    bf16=use_bf16,
    logging_steps=10,
    save_steps=100,
    output_dir="lora-llama32-3b",
)

# Supervised fine-tuning on the "text" column built in the data-loading cell.
# NOTE(review): tokenizer / dataset_text_field / max_seq_length / packing are
# passed directly to SFTTrainer; confirm the installed TRL version still
# accepts them here.
trainer = SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    train_dataset=dataset,
    dataset_text_field="text",
    max_seq_length=1024,
    packing=True,
    args=training_args,
)

print("Starting training ...")
trainer.train()
print("Training completed!")

In [None]:
# Output directory on Google Drive; the inference cell loads the adapter from here.
save_dir = "/content/drive/MyDrive/lora-llama32-3b"

# Write the model first, then the tokenizer, into the same directory.
for artifact in (model, tokenizer):
    artifact.save_pretrained(save_dir)

print(f"Fine-tuned model saved to {save_dir}")


In [None]:
from transformers import pipeline

# Reload the base checkpoint and attach the adapter saved in save_dir.
finetuned_model, finetuned_tokenizer = FastLanguageModel.from_pretrained(
    model_name,
    max_seq_length=2048,
    device_map="auto",
)
finetuned_model.load_adapter(save_dir)

# Text-generation pipeline. do_sample=True is set explicitly so that
# temperature / top_p are applied regardless of the checkpoint's default
# generation config.
pipe = pipeline(
    "text-generation",
    model=finetuned_model,
    tokenizer=finetuned_tokenizer,
    max_new_tokens=512,
    do_sample=True,
    temperature=0.7,
    top_p=0.9,
    repetition_penalty=1.1,
)

# Example prompt, rendered through the tokenizer's chat template.
# NOTE(review): model_name does not look like an instruct checkpoint; confirm
# its tokenizer defines a chat template, otherwise this call will fail.
prompt = finetuned_tokenizer.apply_chat_template(
    [{"role": "user", "content": "Jelaskan kerentanan reentrancy pada smart contract sederhana."}],
    tokenize=False,
    add_generation_prompt=True,
)

# Generate; the first candidate's "generated_text" field is kept.
result = pipe(prompt)[0]["generated_text"]
print("🔎 Inference Result:\n", result)

# Save the output to Google Drive. The encoding is explicit so non-ASCII text
# in the generation is written the same way on every runtime.
with open("/content/drive/MyDrive/inference_result.txt", "w", encoding="utf-8") as f:
    f.write(result)

print("Inference result saved to Google Drive!")