In [None]:
import os
# Restrict CUDA to device 0. This is set here, before torch is imported in a
# later cell, so the restriction is already in place when CUDA is first used.
os.environ["CUDA_VISIBLE_DEVICES"] = "0"
# Shell escape: print the NVIDIA driver / GPU status table.
!nvidia-smi

# Package download and imports

In [None]:
# You only need to run this once per machine
!pip install -q -U bitsandbytes
!pip install -q -U git+https://github.com/huggingface/transformers.git
!pip install -q -U git+https://github.com/huggingface/peft.git
!pip install -q -U git+https://github.com/huggingface/accelerate.git
!pip install -q -U datasets scipy ipywidgets matplotlib
!pip install torch==2.0.1

In [None]:
# setting up accelerator

from accelerate import FullyShardedDataParallelPlugin, Accelerator
from torch.distributed.fsdp.fully_sharded_data_parallel import FullOptimStateDictConfig, FullStateDictConfig
import torch


# The model and optimizer state-dict configs use identical settings:
# offload to CPU, and do not restrict the state dict to rank 0.
_fsdp_state_dict_options = dict(offload_to_cpu=True, rank0_only=False)

fsdp_plugin = FullyShardedDataParallelPlugin(
    state_dict_config=FullStateDictConfig(**_fsdp_state_dict_options),
    optim_state_dict_config=FullOptimStateDictConfig(**_fsdp_state_dict_options),
)

# Accelerator configured with the FSDP plugin above; used later to prepare the model.
accelerator = Accelerator(fsdp_plugin=fsdp_plugin)

# loading data

In [None]:
import csv
import json
import random

csv_file_path = 'data/vpc_raw_data.csv'
train_jsonl_file_path = 'data/train_data.jsonl'
eval_jsonl_file_path = 'data/validation_data.jsonl'
train_ratio = 0.7  # Fraction of rows that goes to the training split; adjust as needed
split_seed = 42  # Fixed seed so re-running this cell reproduces the same split

# Read every CSV row into memory as a dict keyed by column name.
# `newline=''` is what the csv module asks for, so that newlines embedded in
# quoted fields are parsed correctly.
with open(csv_file_path, mode='r', encoding='utf-8', newline='') as csv_file:
    csv_reader = csv.DictReader(csv_file)
    rows = list(csv_reader)

# Shuffle with a dedicated seeded generator: the split is randomized but
# repeatable, and the global `random` state is left untouched.
random.Random(split_seed).shuffle(rows)

# Index of the first evaluation row (the training split is rows[:split_index])
split_index = int(train_ratio * len(rows))

# Split the shuffled rows into training and evaluation sets
train_rows = rows[:split_index]
eval_rows = rows[split_index:]

# Serialize prompt/response rows to a JSON Lines file
def write_to_jsonl(file_path, rows):
    """Write ``rows`` to ``file_path`` as JSON Lines, one object per line.

    Each row's ``'prompt'`` value is stored under the key ``'input'`` and its
    ``'response'`` value under ``'output'``. The file is opened in write mode
    with UTF-8 encoding, so any existing content is replaced.

    Args:
        file_path: Destination path of the ``.jsonl`` file.
        rows: Iterable of mappings that each contain ``'prompt'`` and
            ``'response'`` keys.

    Raises:
        KeyError: If a row lacks ``'prompt'`` or ``'response'``.
    """
    serialized_lines = (
        json.dumps({'input': record['prompt'], 'output': record['response']}) + '\n'
        for record in rows
    )
    with open(file_path, mode='w', encoding='utf-8') as jsonl_file:
        jsonl_file.writelines(serialized_lines)

# Persist both splits first, then report where each one was written.
for _jsonl_path, _split_rows in (
    (train_jsonl_file_path, train_rows),
    (eval_jsonl_file_path, eval_rows),
):
    write_to_jsonl(_jsonl_path, _split_rows)

print(f'Training data saved to {train_jsonl_file_path}')
print(f'Evaluation data saved to {eval_jsonl_file_path}')


In [None]:
# Start from this cell to reproduce what I did
from datasets import load_dataset

# Each JSONL file is loaded on its own, and both calls request split='train',
# including the one that reads the validation file.
train_dataset = load_dataset(
    'json',
    data_files='data/train_data.jsonl',
    split='train',
)
eval_dataset = load_dataset(
    'json',
    data_files='data/validation_data.jsonl',
    split='train',
)

In [None]:
# Prompt formatting
def formatting_func(example):
    """Render one example as a single question/answer prompt string.

    Args:
        example: Mapping with an ``'input'`` (question) and an ``'output'``
            (answer) entry.

    Returns:
        The text ``"### Question: <input>\\n ### Answer: <output>"``.
    """
    prompt_template = "### Question: {}\n ### Answer: {}"
    return prompt_template.format(example['input'], example['output'])

# load base model

In [None]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig

# Hugging Face repo that hosts the base-model weights
base_model_id = "NousResearch/Llama-2-7b-hf"

# Quantization settings handed to `from_pretrained`: 4-bit loading with the
# "nf4" quant type, double quantization enabled, and bfloat16 as compute dtype.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=torch.bfloat16,
)

# Load the causal-LM weights with the quantization config applied.
model = AutoModelForCausalLM.from_pretrained(
    base_model_id,
    quantization_config=bnb_config,
)

# tokenization

In [None]:
# Tokenizer for the base model: pads on the left and is asked to add both the
# BOS and the EOS token to every encoded text.
tokenizer = AutoTokenizer.from_pretrained(
    base_model_id,
    add_bos_token=True,
    add_eos_token=True,
    padding_side="left",
)
# Reuse the EOS token as the padding token.
tokenizer.pad_token = tokenizer.eos_token

def generate_and_tokenize_prompt(prompt):
    """Format one dataset example into prompt text and tokenize it.

    Args:
        prompt: A dataset example, passed straight to ``formatting_func``.

    Returns:
        Whatever ``tokenizer`` returns for the formatted text. No padding or
        truncation arguments are passed.
    """
    formatted_text = formatting_func(prompt)
    return tokenizer(formatted_text)

In [None]:
# First tokenization pass (no padding/truncation arguments), applied to the
# training split and then the evaluation split.
tokenized_train_dataset, tokenized_val_dataset = [
    split.map(generate_and_tokenize_prompt)
    for split in (train_dataset, eval_dataset)
]

In [None]:
import matplotlib.pyplot as plt

def plot_data_lengths(tokenize_train_dataset, tokenized_val_dataset):
    """Print the total sample count and plot a histogram of token lengths.

    The lengths are taken from the two datasets passed in. Previously the body
    read a global named ``tokenized_train_dataset`` and ignored the first
    argument, because the parameter name is spelled differently.

    Args:
        tokenize_train_dataset: Iterable of examples, each with an
            ``'input_ids'`` sequence. The original (misspelled) parameter name
            is kept so existing keyword callers keep working.
        tokenized_val_dataset: Iterable of examples with the same layout.
    """
    # Training lengths first, then validation lengths.
    lengths = [
        len(example['input_ids'])
        for dataset in (tokenize_train_dataset, tokenized_val_dataset)
        for example in dataset
    ]
    print(len(lengths))

    # Plotting the histogram
    plt.figure(figsize=(10, 6))
    plt.hist(lengths, bins=20, alpha=0.7, color='blue')
    plt.xlabel('Length of input_ids')
    plt.ylabel('Frequency')
    plt.title('Distribution of Lengths of input_ids')
    plt.show()

# Inspect the token-length distribution of the datasets tokenized above
# (that pass used no padding/truncation arguments).
plot_data_lengths(tokenized_train_dataset, tokenized_val_dataset)

In [None]:
max_length = 400  # This was an appropriate max length for my dataset


def generate_and_tokenize_prompt2(prompt):
    """Tokenize one example to a fixed length and attach labels.

    The formatted prompt is truncated and padded to ``max_length`` tokens.
    ``labels`` is set to a copy of ``input_ids``.

    Args:
        prompt: A dataset example, passed straight to ``formatting_func``.

    Returns:
        The tokenizer output with an added ``"labels"`` entry.
    """
    prompt_text = formatting_func(prompt)
    encoded = tokenizer(
        prompt_text,
        truncation=True,
        max_length=max_length,
        padding="max_length",
    )
    # Copy so that labels and input_ids are independent objects.
    encoded["labels"] = encoded["input_ids"].copy()
    return encoded

In [None]:
# Second tokenization pass with fixed-length padding/truncation and labels.
# This rebinds the names produced by the earlier, unpadded pass.
tokenized_train_dataset, tokenized_val_dataset = [
    split.map(generate_and_tokenize_prompt2)
    for split in (train_dataset, eval_dataset)
]

In [None]:
# All samples should now have the same length
plot_data_lengths(tokenized_train_dataset, tokenized_val_dataset)

# checking how the base model performs

In [None]:
eval_prompt = "How does Amazon Virtual Private Cloud (Amazon VPC) allow for the creation and management of a virtual network? #"

# Move the inputs to whatever device the model lives on instead of assuming "cuda".
# NOTE(review): this tokenizer was created with add_eos_token=True, so the prompt
# may end with an EOS token before generation starts — confirm that is intended.
model_input = tokenizer(eval_prompt, return_tensors="pt").to(model.device)

model.eval()
with torch.no_grad():
    generated_ids = model.generate(
        **model_input,
        max_new_tokens=256,
        # Take the pad id from the tokenizer rather than hard-coding 2.
        pad_token_id=tokenizer.pad_token_id,
    )
    # Decode the first (only) generated sequence, dropping special tokens.
    print(tokenizer.decode(generated_ids[0], skip_special_tokens=True))

# lora

In [None]:
from peft import prepare_model_for_kbit_training

# Enable gradient checkpointing on the model, then let PEFT prepare the
# quantized model for training.
# NOTE(review): prepare_model_for_kbit_training may itself enable gradient
# checkpointing by default — confirm whether the explicit call is still needed.
model.gradient_checkpointing_enable()
model = prepare_model_for_kbit_training(model)

In [None]:
def print_trainable_parameters(model):
    """Print how many of the model's parameters are trainable.

    Walks ``model.named_parameters()`` once, counts elements via ``numel()``,
    and treats a parameter as trainable when ``requires_grad`` is true. Prints
    the trainable count, the total count, and the trainable percentage.

    Args:
        model: Object exposing ``named_parameters()``.

    Raises:
        ZeroDivisionError: If the model has no parameter elements at all.
    """
    sizes_and_flags = [
        (param.numel(), param.requires_grad)
        for _, param in model.named_parameters()
    ]
    all_param = sum(size for size, _ in sizes_and_flags)
    trainable_params = sum(size for size, is_trainable in sizes_and_flags if is_trainable)
    print(
        f"trainable params: {trainable_params} || all params: {all_param} || trainable%: {100 * trainable_params / all_param}"
    )

In [None]:
# Print the model's module tree before the LoRA adapters are attached
# (presumably to look up the layer names used in target_modules — confirm).
print(model)

In [None]:
from peft import LoraConfig, get_peft_model

# Modules that receive LoRA adapters
lora_target_modules = [
    "q_proj",
    "k_proj",
    "v_proj",
    "o_proj",
    "gate_proj",
    "up_proj",
    "down_proj",
    "lm_head",
]

config = LoraConfig(
    task_type="CAUSAL_LM",
    r=32,  # A higher rank allows more expressivity, at a compute cost.
    lora_alpha=64,  # A higher alpha assigns more weight to the LoRA activations.
    lora_dropout=0.05,  # Conventional
    bias="none",
    target_modules=lora_target_modules,
)

# Wrap the model with the LoRA adapters and report how much of it is trainable.
model = get_peft_model(model, config)
print_trainable_parameters(model)

# Apply the accelerator. You can comment this out to remove the accelerator.
model = accelerator.prepare_model(model)

Comparison:
* original paper using r=64 and lora_alpha=16 --> trainable params: 162217984 || all params: 3662630912 || trainable%: 4.429001662944519
* modified version using r=32 and lora_alpha=64 --> trainable params: 81108992 || all params: 3581521920 || trainable%: 2.264651559077991

In [None]:
# Print the model again, now with the LoRA adapters added
print(model)


# model training

In [None]:
#!pip install -q wandb -U

import wandb, os
wandb.login()  # Requires a Weights & Biases account; log in with the API key from that account

In [None]:
# Weights & Biases project name, exported through the WANDB_PROJECT variable
wandb_project = "journal-finetune"
# Only export the variable when a non-empty project name is given.
if wandb_project:
    os.environ["WANDB_PROJECT"] = wandb_project

In [None]:
#if torch.cuda.device_count() > 1: # If more than 1 GPU
#    model.is_parallelizable = True
#    model.model_parallel = True

In [None]:
import transformers
from datetime import datetime

# Names used for the output directory and the W&B run name.
# NOTE(review): an earlier cell sets WANDB_PROJECT to "journal-finetune", which
# differs from `project` here — confirm which project name is intended.
project = "AmazonVPC-finetune"
base_model_name = "llama2-7b"
run_name = base_model_name + "-" + project
output_dir = "./" + run_name

# Pad with the EOS token (same assignment as right after the tokenizer was created).
tokenizer.pad_token = tokenizer.eos_token

trainer = transformers.Trainer(
    model=model,
    train_dataset=tokenized_train_dataset,
    eval_dataset=tokenized_val_dataset,
    args=transformers.TrainingArguments(
        output_dir=output_dir,
        warmup_steps=1,
        per_device_train_batch_size=2,
        gradient_accumulation_steps=1,
        max_steps=500,
        learning_rate=2.5e-5, # Want a small lr for finetuning
        #bf16=True,
        optim="paged_adamw_8bit",
        logging_dir="./logs",        # Directory for storing logs
        save_strategy="steps",       # Save checkpoints on a step schedule (see save_steps)
        save_steps=50,                # Save a checkpoint every 50 steps
        # NOTE(review): newer transformers releases rename this argument to
        # `eval_strategy` — confirm against the installed version.
        evaluation_strategy="steps", # Evaluate on a step schedule (see eval_steps)
        eval_steps=50,               # Evaluate every 50 steps
        do_eval=True,                # Enable evaluation on eval_dataset
        report_to="wandb",           # Comment this out if you don't want to use Weights & Biases
        run_name=f"{run_name}-{datetime.now().strftime('%Y-%m-%d-%H-%M')}"          # Name of the W&B run (optional); timestamped to the minute
    ),
    # mlm=False: collate for causal language modeling rather than masked language modeling.
    data_collator=transformers.DataCollatorForLanguageModeling(tokenizer, mlm=False),
)

model.config.use_cache = False  # silence the warnings. Please re-enable for inference!
# Start fine-tuning; runs for max_steps training steps.
trainer.train()