In [None]:
# Download Dataset
!gdown https://drive.google.com/uc?id=1t30Elo92Ti8F3BOorJtVGF4MQfjMskk-

In [None]:
# Install Requirements
# Installs the libraries imported by the cells below into the kernel's environment.
# NOTE(review): versions are not pinned, so a fresh run installs whatever is current;
# consider pinning them for reproducibility.
%pip install transformers accelerate peft bitsandbytes datasets


In [None]:
# Hugging Face access token.
# Read from the HF_TOKEN environment variable so the secret is never stored in the
# notebook itself; falls back to an empty string when the variable is not set.
import os

HF_TOKEN = os.environ.get("HF_TOKEN", "")

In [None]:
import torch
from peft import LoraConfig, get_peft_model
import bitsandbytes as bnb

from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    HfArgumentParser,
    TrainingArguments,
    Trainer,
    GenerationConfig,
    DataCollatorForLanguageModeling
)

# Compute dtype used by the 4-bit layers. bfloat16 is chosen so that it agrees with
# the `bf16=True` mixed-precision setting in the training arguments further down.
compute_dtype = torch.bfloat16

# Bitsandbytes 4bit quantization config
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type='nf4',  # NormalFloat4 (NF4)
    bnb_4bit_compute_dtype=compute_dtype,
    bnb_4bit_use_double_quant=False  # no nested (double) quantization
)

# Place the whole model on GPU 0
device_map = {"": 0}

# Load tokenizer from the pre-trained 'mistralai/Mistral-7B-v0.1'
tokenizer = AutoTokenizer.from_pretrained(
    'mistralai/Mistral-7B-v0.1',
    trust_remote_code=True,
    padding_side="left",
    add_eos_token=True,
    add_bos_token=True,
    use_fast=False,
    token=HF_TOKEN
)

# Load the pre-trained model with quantization settings
model = AutoModelForCausalLM.from_pretrained(
    'mistralai/Mistral-7B-v0.1',
    device_map=device_map,  # Specify device
    quantization_config=bnb_config,  # Apply bnb defined 4-bit quantization
    trust_remote_code=True,
    token=HF_TOKEN
)

# Set up the LoRA (Low-Rank Adaptation) config
lora_config = LoraConfig(
    r=64,  # Rank of the LoRA update matrices: larger = more capacity, more memory/compute
    lora_alpha=16,  # Scaling factor controlling the strength of the LoRA update
    # Attention projections of Mistral. The output projection is named 'o_proj' in this
    # architecture; the previous entry 'dense' matches no Mistral module and was ignored.
    target_modules=['q_proj', 'v_proj', 'k_proj', 'o_proj'],
    lora_dropout=0.05,  # Dropout rate for regularization
    bias="none",
    task_type="CAUSAL_LM"
)

# The LoRA adapters are NOT attached here: a later cell calls
# `prepare_model_for_kbit_training(model)` followed by `get_peft_model(model, lora_config)`.
# Wrapping the model here as well would apply `get_peft_model` twice, and the first
# wrap would happen before the k-bit preparation step.

# Enable gradient checkpointing to save memory during training
model.gradient_checkpointing_enable()


In [None]:
from datasets import load_dataset
import pyarrow as pa

# Load the dataset
# `data_path` is a JSON file resolved relative to the current working directory
# (presumably the file fetched by the download cell — confirm the name matches).
# Later cells read the loaded records through dataset['train'].
data_path = "cwe_prompt_completion.json"
dataset = load_dataset('json', data_files=data_path)

In [None]:
def format_sample_text(sample):
    """Build the training string for one record and store it under ``"text"``.

    The string consists of four parts joined by blank lines: a fixed instruction
    sentence, a ``### Prompt:`` section filled from ``sample['prompt']``, a
    ``### Response:`` section filled from ``sample['completion']`` and a closing
    ``### End`` marker. A missing key is treated as an empty string, and each
    section has surrounding whitespace stripped.

    :param sample: mapping with optional ``prompt`` and ``completion`` entries
    :return: the same ``sample`` object, modified in place with a ``text`` entry
    """
    header = "The following is a question about a security vulnerability. Please give a thorough and accurate answer."
    footer = "### End"

    prompt_block = f"### Prompt:\n{sample.get('prompt', '')}"
    prompt_block = prompt_block.strip()
    response_block = f"### Response:\n{sample.get('completion', '')}"
    response_block = response_block.strip()

    # Keep only parts that are non-empty strings, in their fixed order.
    kept_parts = [part for part in (header, prompt_block, response_block, footer) if part]

    sample["text"] = "\n\n".join(kept_parts)
    return sample


In [None]:
from functools import partial

# SOURCE https://github.com/databrickslabs/dolly/blob/master/training/trainer.py
def get_max_length(model):
    """Return the maximum sequence length advertised by ``model.config``.

    The config attributes ``n_positions``, ``max_position_embeddings`` and
    ``seq_length`` are checked in that order; the first one holding a truthy
    value is returned. When none of them does, 1024 is used instead. The chosen
    value is also reported on stdout.

    :param model: object exposing a ``config`` attribute
    :return: the maximum length found, or 1024 as a fallback
    """
    config = model.config
    candidate_attrs = ("n_positions", "max_position_embeddings", "seq_length")

    for attr_name in candidate_attrs:
        found = getattr(config, attr_name, None)
        if found:
            print(f"Found max lenth: {found}")
            return found

    # No usable attribute on the config: fall back to a fixed default.
    fallback = 1024
    print(f"Using default max length: {fallback}")
    return fallback


def preprocess_batch(batch, tokenizer, max_length):
    """Tokenize the ``"text"`` entries of a batch.

    :param batch: mapping whose ``"text"`` entry holds the strings to tokenize
    :param tokenizer: callable tokenizer; invoked with truncation enabled
    :param max_length: maximum number of tokens passed to the tokenizer
    :return: whatever the tokenizer returns for the batch
    """
    texts = batch["text"]
    tokenizer_kwargs = {"max_length": max_length, "truncation": True}
    return tokenizer(texts, **tokenizer_kwargs)

# SOURCE https://github.com/databrickslabs/dolly/blob/master/training/trainer.py
def preprocess_dataset(tokenizer: AutoTokenizer, max_length: int, seed, dataset):
    """Format, tokenize, length-filter and shuffle a dataset for training.

    Steps, in order:
      1. ``format_sample_text`` adds a ``text`` field to every sample.
      2. ``preprocess_batch`` tokenizes ``text`` in batches, truncating to ``max_length``.
      3. Samples whose ``input_ids`` are not strictly shorter than ``max_length``
         are dropped (this removes samples that were truncated to the limit).
      4. The result is shuffled with ``seed``.

    :param tokenizer (AutoTokenizer): Model Tokenizer
    :param max_length (int): Maximum number of tokens to emit from tokenizer
    :param seed: seed passed to ``dataset.shuffle``
    :param dataset: dataset object providing ``map``, ``filter`` and ``shuffle``
    :return: the processed dataset
    """
    print("Preprocessing dataset...")

    # Bind the tokenizer settings so the function only needs the batch argument.
    tokenize_fn = partial(preprocess_batch, max_length=max_length, tokenizer=tokenizer)

    def _is_below_limit(sample):
        return len(sample["input_ids"]) < max_length

    with_text = dataset.map(format_sample_text)
    tokenized = with_text.map(tokenize_fn, batched=True)
    within_limit = tokenized.filter(_is_below_limit)
    return within_limit.shuffle(seed=seed)

In [None]:
## Pre-process dataset
from transformers import set_seed
seed = 42
set_seed(seed)
max_length = get_max_length(model)
print(max_length)

# The dataset ships without a validation split, so 20% of the records are held out
# here. The split is seeded explicitly so it is reproducible.
train_test_split = dataset['train'].train_test_split(test_size=0.2, seed=seed)

# Train only on the 'train' part of the split. Using the full dataset['train'] here
# would include every evaluation sample in the training data, so the validation loss
# would no longer measure held-out performance.
train_dataset = preprocess_dataset(tokenizer, max_length, seed, train_test_split['train'])
eval_dataset = preprocess_dataset(tokenizer, max_length, seed, train_test_split['test'])

Found max lenth: 32768
32768
Preprocessing dataset...
Preprocessing dataset...


In [None]:
from peft import get_peft_model, prepare_model_for_kbit_training

# Prepare the quantized (4-bit) model for training using PEFT's k-bit helper.
# NOTE(review): `model` is reassigned, so re-running this cell operates on the
# already-prepared model rather than the freshly loaded one.
model = prepare_model_for_kbit_training(model)

# Attach LoRA adapters as described by `lora_config` (defined in the model-loading cell).
# The returned PEFT-wrapped model is what gets passed to the Trainer below.
model = get_peft_model(model, lora_config)




In [None]:
from transformers import Trainer, TrainingArguments

# `evaluation_strategy` was renamed to `eval_strategy` in newer transformers releases.
# The install cell does not pin a version, so use whichever name this version accepts.
_eval_strategy_arg = (
    "eval_strategy"
    if "eval_strategy" in TrainingArguments.__dataclass_fields__
    else "evaluation_strategy"
)

peft_training_args = TrainingArguments(
    output_dir='./mistral-finetune',  # checkpoints written during training
    warmup_steps=100,  # 10% of max_steps, to stabilize early training
    per_device_train_batch_size=32,  # sized for A100 memory
    gradient_accumulation_steps=2,  # 32 * 2 = 64 samples per optimizer step per device
    max_steps=1000,
    learning_rate=3e-4,
    optim="paged_adamw_8bit",
    logging_steps=25,
    logging_dir="./logs",
    save_strategy="steps",
    save_steps=50,
    eval_steps=50,
    do_eval=True,
    gradient_checkpointing=True,  # trade compute for memory during training
    bf16=True,  # bfloat16 mixed precision (A100 hardware)
    report_to="none",
    overwrite_output_dir=True,
    group_by_length=True,
    **{_eval_strategy_arg: "steps"},
)

# The KV cache is only useful for generation; disable it for training.
model.config.use_cache = False

# Padding token: use <unk> rather than EOS. DataCollatorForLanguageModeling replaces
# every label equal to pad_token_id with -100, so with pad == eos the genuine
# end-of-sequence token appended by the tokenizer (add_eos_token=True) would be
# excluded from the loss and the model would never be trained to stop.
# Falls back to EOS only if the tokenizer defines no unk token.
tokenizer.pad_token = tokenizer.unk_token or tokenizer.eos_token

# Initialize Trainer
trainer = Trainer(
    model=model,
    args=peft_training_args,
    eval_dataset=eval_dataset,
    train_dataset=train_dataset,
    data_collator=DataCollatorForLanguageModeling(tokenizer, mlm=False)
)

# Start fine-tuning the model
trainer.train()

# Save the fine-tuned model and the tokenizer side by side
model.save_pretrained("./mistral-qlora-finetune")
tokenizer.save_pretrained("./mistral-qlora-finetune")


  self.scaler = torch.cuda.amp.GradScaler(**kwargs)
Detected kernel version 4.18.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.
max_steps is given, it will override any value given in num_train_epochs
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]


Step,Training Loss,Validation Loss
50,1.3031,1.16714
100,1.0732,1.050644
150,0.977,0.952905
200,0.9352,0.923327
250,0.9059,0.901676
300,0.8843,0.884953
350,0.8884,0.867254
400,0.8631,0.854124
450,0.8634,0.843524
500,0.8657,0.832306


  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast

('./mistral-qlora-finetune/tokenizer_config.json',
 './mistral-qlora-finetune/special_tokens_map.json',
 './mistral-qlora-finetune/tokenizer.model',
 './mistral-qlora-finetune/added_tokens.json')

In [None]:
import os
import zipfile
from tqdm import tqdm

def zip_with_progress(folder_path, output_path):
    """Zip ``folder_path`` recursively into ``output_path``, showing byte-level progress.

    The folder is walked twice: once to add up the file sizes (used as the
    progress-bar total) and once to write each file into a DEFLATE-compressed
    archive. Archive member names are relative to ``folder_path``.

    :param folder_path: directory whose files are archived
    :param output_path: path of the zip file to create (overwritten if present)
    """
    def _walk_files():
        # Yield the full path of every file os.walk reports below folder_path.
        for root, _subdirs, names in os.walk(folder_path):
            for name in names:
                yield os.path.join(root, name)

    # First pass: total number of bytes to process.
    bytes_total = sum(os.path.getsize(path) for path in _walk_files())

    # Second pass: write the files and advance the bar by each file's size.
    with zipfile.ZipFile(output_path, 'w', zipfile.ZIP_DEFLATED) as archive, \
            tqdm(total=bytes_total, unit='B', unit_scale=True, desc="Zipping") as progress:
        for path in _walk_files():
            archive.write(path, os.path.relpath(path, folder_path))
            progress.update(os.path.getsize(path))

# Example usage:
# Archive the directory the fine-tuned adapter and tokenizer were saved to.
folder_path = 'mistral-qlora-finetune'  # Replace with your folder path
output_path = 'mistral-qlora-finetune.zip'  # Replace with the zip file path
zip_with_progress(folder_path, output_path)


Zipping: 100%|██████████| 152M/152M [00:07<00:00, 21.6MB/s]


In [None]:
from IPython.display import FileLink

# Path to the zipped file. This must be the archive produced by the zipping cell
# ('mistral-qlora-finetune.zip'); the previous value 'mistral-finetune.zip' is never
# created by this notebook, so the link pointed at a missing file.
output_path = 'mistral-qlora-finetune.zip'

# Display a download link
FileLink(output_path)
