In [None]:
# Download the question-answer pair dataset from Google Drive with the gdown CLI.
# NOTE(review): presumably this fetches the cwe_prompt_completion.json file that a
# later cell loads — confirm the downloaded file name matches.

!gdown https://drive.google.com/uc?id=1t30Elo92Ti8F3BOorJtVGF4MQfjMskk-


In [None]:
# Install the libraries this notebook needs into the kernel's environment.
# NOTE(review): versions are unpinned, so a re-run may pick up different releases;
# consider pinning. gdown (used by the download cell) is not in this list —
# presumably it is preinstalled; confirm.

%pip install transformers accelerate peft bitsandbytes datasets


In [None]:
# Set Hugging Face token and import necessary libraries

# The access token is read from the HF_TOKEN environment variable so that no
# secret is ever stored in the notebook. When the variable is unset or empty
# the value is None (rather than an empty string), which is passed to
# from_pretrained as "no explicit token".
import os

HF_TOKEN = os.environ.get("HF_TOKEN") or None
import torch
from peft import LoraConfig, get_peft_model

from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    HfArgumentParser,
    AutoTokenizer,
    TrainingArguments,
    Trainer,
    GenerationConfig,
    DataCollatorForLanguageModeling
)

# Place the whole model on GPU 0
device_map = {"": 0}

# Checkpoint shared by the tokenizer and the model
base_model_id = 'microsoft/Phi-3-mini-4k-instruct'

# Slow (non-fast) tokenizer, left-padded, with BOS/EOS tokens added
tokenizer = AutoTokenizer.from_pretrained(
    base_model_id,
    trust_remote_code=True,
    padding_side="left",
    add_eos_token=True,
    add_bos_token=True,
    use_fast=False,
    token=HF_TOKEN,
)

# Causal-LM weights for the same checkpoint
model = AutoModelForCausalLM.from_pretrained(
    base_model_id,
    device_map=device_map,
    trust_remote_code=True,
    token=HF_TOKEN,
)

# Turn on gradient checkpointing for the loaded model
model.gradient_checkpointing_enable()



2024-10-31 17:29:07.399204: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-10-31 17:29:07.411802: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:485] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-10-31 17:29:07.426826: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:8454] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-10-31 17:29:07.431374: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1452] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-10-31 17:29:07.442747: I tensorflow/core/platform/cpu_feature_guar

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [None]:
from datasets import load_dataset
import pyarrow as pa

# Load the prompt/completion records from the local JSON file.
# Later cells read the result through dataset['train'].
data_path = "cwe_prompt_completion.json"
dataset = load_dataset('json', data_files=data_path)

In [None]:
def format_sample_text(sample):
    """Build the training text for one record and store it in ``sample["text"]``.

    The text is an instruction line, a ``### Prompt:`` section taken from the
    record's ``prompt`` field, a ``### Response:`` section taken from its
    ``completion`` field and a closing ``### End`` marker, separated by blank
    lines. Missing fields are treated as empty strings.

    :param sample: mapping holding the ``prompt`` and ``completion`` fields
    :return: the same ``sample`` object, with the ``text`` key added
    """
    prompt_text = sample.get('prompt', '')
    completion_text = sample.get('completion', '')

    parts = (
        "The following is a question about a security vulnerability. Please give a thorough and accurate answer.",
        f"### Prompt:\n{prompt_text}".strip(),  # strip drops surrounding whitespace
        f"### Response:\n{completion_text}".strip(),
        "### End",
    )

    # Join whichever parts are non-empty with a blank line between them
    sample["text"] = "\n\n".join([part for part in parts if part])
    return sample

In [None]:
from functools import partial

# SOURCE https://github.com/databrickslabs/dolly/blob/master/training/trainer.py
def get_max_length(model):
    """Return the maximum sequence length declared in the model's config.

    The config attributes ``n_positions``, ``max_position_embeddings`` and
    ``seq_length`` are checked in that order and the first truthy value wins.
    When none of them is set, 1024 is used. The chosen value is printed.

    :param model: object exposing a ``config`` attribute
    :return: the maximum sequence length to use
    """
    conf = model.config
    max_length = None
    for length_setting in ("n_positions", "max_position_embeddings", "seq_length"):
        max_length = getattr(conf, length_setting, None)
        if max_length:
            print(f"Found max length: {max_length}")
            break
    if not max_length:
        # None of the known attributes is present (or they are all falsy)
        max_length = 1024
        print(f"Using default max length: {max_length}")
    return max_length


def preprocess_batch(batch, tokenizer, max_length):
    """
    Tokenize the "text" entries of a batch.

    :param batch: mapping with a "text" entry holding the strings to tokenize
    :param tokenizer: callable tokenizer applied to the texts
    :param max_length: forwarded to the tokenizer together with truncation=True
    :return: whatever the tokenizer returns for the batch
    """
    texts = batch["text"]
    encoded = tokenizer(texts, max_length=max_length, truncation=True)
    return encoded

# SOURCE https://github.com/databrickslabs/dolly/blob/master/training/trainer.py
def preprocess_dataset(tokenizer: AutoTokenizer, max_length: int,seed, dataset):
    """Format, tokenize, filter and shuffle a dataset so it is ready for training.

    :param tokenizer (AutoTokenizer): Model tokenizer
    :param max_length (int): Maximum number of tokens to emit from the tokenizer
    :param seed: Seed passed to the final shuffle
    :param dataset: Dataset whose rows are formatted with ``format_sample_text``
    :return: The processed dataset
    """
    print("Preprocessing dataset...")

    # Step 1: build the "text" column for every sample
    with_text = dataset.map(format_sample_text)

    # Step 2: tokenize in batches (the existing columns are kept alongside the new ones)
    tokenize = partial(preprocess_batch, max_length=max_length, tokenizer=tokenizer)
    tokenized = with_text.map(tokenize, batched=True)

    # Step 3: keep only samples whose token count is strictly below max_length
    def _is_short_enough(sample):
        return len(sample["input_ids"]) < max_length

    kept = tokenized.filter(_is_short_enough)

    # Step 4: shuffle with the given seed
    return kept.shuffle(seed=seed)

In [None]:
## Pre-process dataset
from transformers import set_seed

# Fix the random seed so the run is repeatable
seed = 42
set_seed(seed)

max_length = get_max_length(model)
print(max_length)

# The dataset ships without a validation split, so hold out 20% of the examples
# for evaluation. The seed is passed explicitly so the split itself is reproducible.
train_test_split = dataset['train'].train_test_split(test_size=0.2, seed=seed)

# Train only on the 'train' portion of the split: building the training set from
# the full dataset would include every evaluation example in training and make
# the validation loss meaningless.
# TODO: the columns are not removed here (reverted to the original code); fix or ignore as needed.
train_dataset = preprocess_dataset(tokenizer, max_length, seed, train_test_split['train'])
eval_dataset = preprocess_dataset(tokenizer, max_length, seed, train_test_split['test'])

In [None]:
from peft import get_peft_model, prepare_model_for_kbit_training

# Prepare the model for efficient training with k-bit precision.
# NOTE(review): in the cells visible here the model is loaded without a
# quantization config and no PEFT adapter is attached — confirm this call is
# still needed. A later cell sets requires_grad=True on every parameter.
model = prepare_model_for_kbit_training(model)


In [None]:
# Use the end-of-sequence token as the padding token
tokenizer.pad_token = tokenizer.eos_token

def tokenize_function(examples):
    """Tokenize the "text" field, padding and truncating every sample to 512 tokens."""
    return tokenizer(examples["text"], padding="max_length", truncation=True, max_length=512)


# Tokenize the (already preprocessed) training dataset
tokenized_datasets = train_dataset.map(tokenize_function, batched=True)

# Prepare the dataset for PyTorch: expose only these two columns as torch tensors
# NOTE(review): the Trainer cell visible in this notebook is given train_dataset,
# not tokenized_datasets — confirm whether this cell is still needed.
tokenized_datasets.set_format(type="torch", columns=["input_ids", "attention_mask"])

In [None]:
# Mark every model parameter as trainable, so the whole model is updated during training
for param in model.parameters():
    param.requires_grad = True

In [None]:
from transformers import Trainer, TrainingArguments

# Hyperparameters for the fine-tuning run. Checkpoints and the final model are
# both written to ./Phi-3-mini-4k-instruct.
peft_training_args = TrainingArguments(
    output_dir='./Phi-3-mini-4k-instruct',
    warmup_steps=200,  # 10% of max_steps (200 of 2000)
    per_device_train_batch_size=2,
    gradient_accumulation_steps=8,  # 2 x 8 = 16 samples per optimizer step on each device
    max_steps=2000,
    learning_rate=5e-6,  # Experimented with a slightly lower learning rate
    optim="adamw_hf",  # Tried regular AdamW for more stable training
    logging_steps=50,  # Log every 50 steps to monitor progress
    logging_dir="./logs",
    save_strategy="steps",
    save_steps=200,  # Save a checkpoint every 200 steps
    eval_strategy="steps",
    eval_steps=200,  # Evaluate every 200 steps (same cadence as saving)
    do_eval=True,
    gradient_checkpointing=True,
    bf16=True,  # good for A100 hardware
    report_to="none",
    overwrite_output_dir=True,
    group_by_length=True,
    save_total_limit=3  # Limit number of checkpoints to save space
)

# NOTE(review): presumably the generation cache is disabled because it is not
# useful together with gradient checkpointing during training — confirm.
model.config.use_cache = False
# The padding token is set to EOS again here (an earlier cell does the same)
tokenizer.pad_token = tokenizer.eos_token

# Initialize Trainer; mlm=False selects the non-masked language-modeling collator
trainer = Trainer(
    model=model,
    args=peft_training_args,
    eval_dataset=eval_dataset,
    train_dataset=train_dataset,
    data_collator=DataCollatorForLanguageModeling(tokenizer, mlm=False)
)

# Start fine-tuning the model
trainer.train()

# Save the fine-tuned model and tokenizer into the same directory as output_dir
model.save_pretrained("./Phi-3-mini-4k-instruct")
tokenizer.save_pretrained("./Phi-3-mini-4k-instruct")


Detected kernel version 4.18.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.
max_steps is given, it will override any value given in num_train_epochs
  return fn(*args, **kwargs)
You are not running the flash-attention implementation, expect numerical differences.
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]


Step,Training Loss,Validation Loss
200,1.1068,1.109053
400,0.9527,0.959703
600,0.904,0.895429
800,0.8627,0.854669
1000,0.8477,0.82074
1200,0.8251,0.796678
1400,0.8193,0.777079
1600,0.7913,0.762171
1800,0.8133,0.752755
2000,0.8122,0.748963


  return fn(*args, **kwargs)
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]
  return fn(*args, **kwargs)
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]
  return fn(*args, **kwargs)
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]
  return fn(*args, **kwargs)
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]
  return fn(*args, **kwargs)
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]
  return fn(*args, **kwargs)
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]
  return fn(*args, **kwargs)
  with torch.enab

('./Phi-3-mini-4k-instruct/tokenizer_config.json',
 './Phi-3-mini-4k-instruct/special_tokens_map.json',
 './Phi-3-mini-4k-instruct/tokenizer.model',
 './Phi-3-mini-4k-instruct/added_tokens.json')

In [None]:
import os
import zipfile
from tqdm import tqdm

def zip_with_progress(folder_path, output_path):
    """Zip every file under ``folder_path`` into ``output_path`` with a progress bar.

    The bar is sized by the total number of bytes found under the folder and
    advances by each file's size once that file has been written. Entries are
    stored with paths relative to ``folder_path`` and compressed with DEFLATE.

    :param folder_path: directory whose files are archived (recursively)
    :param output_path: path of the zip file to create
    """
    def _walk_files():
        # Yield the full path of every file below folder_path
        for root, _subdirs, names in os.walk(folder_path):
            for name in names:
                yield os.path.join(root, name)

    # First pass: total size, used as the progress bar's upper bound
    total_bytes = sum(os.path.getsize(path) for path in _walk_files())

    # Second pass: write each file and advance the bar by its size
    with zipfile.ZipFile(output_path, 'w', zipfile.ZIP_DEFLATED) as archive, \
            tqdm(total=total_bytes, unit='B', unit_scale=True, desc="Zipping") as progress:
        for path in _walk_files():
            archive.write(path, os.path.relpath(path, folder_path))
            progress.update(os.path.getsize(path))

# Archive the fine-tuned model directory into a single zip file.
folder_path = 'Phi-3-mini-4k-instruct'  # Replace with your folder path
output_path = 'Phi-3-finetune5.zip'  # Replace with the zip file path
zip_with_progress(folder_path, output_path)


Zipping: 100%|██████████| 15.3G/15.3G [10:40<00:00, 23.9MB/s]


In [None]:
from IPython.display import FileLink

# `output_path` is the zip archive path defined in the zipping cell, so that
# cell must have been run before this one.

# Display a download link
FileLink(output_path)
