# Quantization and Fine-Tuning

In [None]:
!pip install torch
!pip install -q -U accelerate peft bitsandbytes transformers trl einops
!pip install -q auto-gptq
!pip install -q optimum

In [None]:
# Optional (Google Colab only): mount Google Drive at /content/drive.
# The weight-loading and model-saving cells read/write paths under
# /content/drive/MyDrive, so they need this mount.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
import os
import torch
from datasets import load_dataset
from datasets import load_from_disk
from peft import LoraConfig, prepare_model_for_kbit_training, PeftModel
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    HfArgumentParser,
    AutoTokenizer,
    TrainingArguments,
    pipeline,
    logging,
)

from trl import SFTTrainer

import random
import numpy as np
import torch


In [None]:
def set_seed(seed):
    """Seed every random number generator used in this notebook.

    Seeds Python's ``random`` module, NumPy's global generator and PyTorch's
    generator; when CUDA is available, all CUDA devices are seeded as well.

    Args:
        seed: Integer seed passed unchanged to each generator.
    """
    for seeder in (random.seed, np.random.seed, torch.manual_seed):
        seeder(seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed_all(seed)


# Fix the seed once, up front, for reproducibility.
set_seed(42)


Load quantized model

In [None]:
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline

# Hugging Face Hub id of the GPTQ-quantized phi-2 checkpoint.
model_name_or_path = "TheBloke/phi-2-GPTQ"

# trust_remote_code=True allows the modelling code shipped with the checkpoint
# repository to be executed; revision="main" selects the repo's main branch.
model = AutoModelForCausalLM.from_pretrained(
    model_name_or_path,
    device_map="auto",
    trust_remote_code=True,
    revision="main",
)

# Load the tokenizer once, then configure padding: the end-of-sequence token
# doubles as the padding token and padding is added on the right.
tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, use_fast=True)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"


If you want to produce a two-phase model (Alpaca1Tiny2 or Tiny1Alpaca2), first load the weights saved at the end of the first phase. Skip this cell for a single-phase run.

In [None]:
# Start from previously saved weights instead of the freshly loaded checkpoint.
# Keep exactly one of the two paths below active.
# NOTE(review): strict=False makes load_state_dict skip keys that do not match
# the model, so inspect the missing/unexpected keys it reports below -- TODO
# confirm that the weights saved after LoRA fine-tuning are really restored.
initial_weights_path = '/content/drive/MyDrive/path_to_alpaca_model.pth'
# initial_weights_path = '/content/drive/MyDrive/path_to_tiny_model.pth'
initial_state_dict = torch.load(initial_weights_path)
model.load_state_dict(initial_state_dict, strict=False)

## Datasets

Alpaca

In [None]:
from datasets import load_dataset

# CodeAlpaca: shuffle with a fixed seed, keep the first 7450 examples, and
# rename the 'prompt' column to 'text' (the column name the trainer is
# configured to read via dataset_text_field).
dataset = (
    load_dataset("HuggingFaceH4/CodeAlpaca_20K", split="train")
    .shuffle(seed=0)
    .select(range(7450))
    .rename_column('prompt', 'text')
)

Tiny codes

In [None]:
# Programming languages to keep from the tiny-codes dataset; the filtering
# cell indexes this list by position (0..4).
languages = ["C++", "Java", "Ruby", "Rust", "Bash"]


Select 1490 samples from each programming language

In [None]:
import os
from collections import defaultdict
from datasets import load_dataset, concatenate_datasets


# Hugging Face access token, read from the HF_TOKEN environment variable so the
# secret is never stored in the notebook. If the variable is unset this is
# None, and load_dataset then relies on locally stored Hub credentials.
access_token = os.environ.get("HF_TOKEN")

dataset = load_dataset("nampdn-ai/tiny-codes", split="train", token=access_token)
dataset = dataset.shuffle(seed=0)
print(dataset)
language_count = defaultdict(int)

# One filtered dataset per language, in the order of the first five entries of
# `languages`. The language is bound as a default argument so each filter
# compares against its own language.
dataset31, dataset32, dataset33, dataset34, dataset35 = [
    dataset.filter(lambda x, language=language: x['programming_language'] == language)
    for language in languages[:5]
]


Code generation

In [None]:

# Code-generation subset: for every language, shuffle with seed 125 and take
# rows 0..1489 of the *shuffled* data. select() must be chained onto the
# shuffle result; selecting from the unshuffled dataset would ignore the
# shuffle and could overlap with the summarization subset, which takes rows
# 1490..2979 of the same seed-125 shuffle.
dataset21 = dataset31.shuffle(seed=125).select(range(1490))
dataset22 = dataset32.shuffle(seed=125).select(range(1490))
dataset23 = dataset33.shuffle(seed=125).select(range(1490))
dataset24 = dataset34.shuffle(seed=125).select(range(1490))
dataset25 = dataset35.shuffle(seed=125).select(range(1490))

# Merge the per-language slices and mix the languages together.
train_dataset1 = concatenate_datasets([dataset21, dataset22, dataset23, dataset24, dataset25])
train_dataset1 = train_dataset1.shuffle(seed=125)

# Keep only the two text columns; the prompt becomes the training text.
train_dataset1 = train_dataset1.select_columns(['prompt','response'])
dataset2 = train_dataset1.rename_column('prompt', 'text')

print(dataset2)

Code summarization

In [None]:

# Code-summarization subset: for every language, shuffle with seed 125 and take
# rows 1490..2979 of the shuffled data.
# NOTE: this cell rebinds dataset31..dataset35 to the selected slices, so it
# cannot be re-run (and the code-generation cell cannot be re-run after it)
# without first re-running the filtering cell.
dataset31 = dataset31.shuffle(seed=125).select(range(1490, 1490*2))
dataset32 = dataset32.shuffle(seed=125).select(range(1490, 1490*2))
dataset33 = dataset33.shuffle(seed=125).select(range(1490, 1490*2))
dataset34 = dataset34.shuffle(seed=125).select(range(1490, 1490*2))
dataset35 = dataset35.shuffle(seed=125).select(range(1490, 1490*2))

# Merge the per-language slices and mix the languages together.
train_dataset2 = concatenate_datasets([dataset31, dataset32, dataset33, dataset34, dataset35])
train_dataset2 = train_dataset2.shuffle(seed=125)

# Keep only the two text columns, then swap their roles: the original response
# becomes the training text and the original prompt is stored as 'response'.
train_dataset2 = train_dataset2.select_columns(['prompt','response'])
dataset3 = train_dataset2.rename_column('response', 'text')
dataset3 = dataset3.rename_column('prompt', 'response')

print(dataset3)

Concatenation of the two sub-datasets (optional)

In [None]:
# Optional: uncomment the next line to fine-tune on both sub-datasets in a single phase
#dataset = concatenate_datasets([dataset2, dataset3])

## Fine-tuning process

Select the dataset and the model you want to fine-tune: the trainer below reads them from the variables `dataset` and `model`.

In [None]:
# Hyper-parameters handed to the trainer through `args`.
training_arguments = TrainingArguments(
    # Where checkpoints go, and how often to checkpoint / log (in steps).
    output_dir="./results",
    save_steps=500,
    logging_steps=200,
    # Training length and batching.
    num_train_epochs=8,
    max_steps=-1,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    gradient_accumulation_steps=1,
    group_by_length=True,
    # Optimizer and learning-rate schedule.
    optim="paged_adamw_32bit",
    learning_rate=2e-4,
    weight_decay=0.001,
    lr_scheduler_type="cosine",
    warmup_ratio=0.03,
    max_grad_norm=0.3,
    # Mixed precision and gradient checkpointing are all switched off.
    fp16=False,
    bf16=False,
    gradient_checkpointing=False,
)

# LoRA adapter settings for a causal language model.
peft_config = LoraConfig(
    task_type="CAUSAL_LM",
    r=64,
    lora_alpha=16,
    lora_dropout=0.05,
    bias="none",
    # Other target sets left in the notebook as alternatives:
    #   ["Wqkv", "fc1", "fc2"]  or  ["Wqkv", "out_proj", "fc1", "fc2"]
    target_modules=["Wqkv", "out_proj"],
)



# Build the supervised fine-tuning trainer from the pieces defined above.
# It trains `model` on the "text" column of `dataset`.
trainer = SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    args=training_arguments,
    peft_config=peft_config,
    train_dataset=dataset,
    dataset_text_field="text",
    max_seq_length=150,  # alternative values noted: 100, 200
)

In [None]:
# Run fine-tuning with the trainer built in the previous cell; the value
# returned by train() is displayed as this cell's output.
trainer.train()

In [None]:
# Save the model's state_dict to Google Drive (the path lives under the Drive
# mount point /content/drive). Replace path_to_model.pth with the file name
# that the weight-loading cell should read for a second fine-tuning phase.
# NOTE(review): after training with a peft_config, this state_dict presumably
# contains LoRA-specific keys that a freshly loaded base model does not have --
# TODO verify the keys round-trip through load_state_dict(strict=False).
torch.save(model.state_dict(), '/content/drive/MyDrive/path_to_model.pth')

In [None]:
from matplotlib import pyplot as plt

# Keep only the log entries that carry a training loss.
loss_values = [entry['loss'] for entry in trainer.state.log_history if 'loss' in entry]

# x-axis: position of each logged loss value. Despite the variable name, these
# are logging events (one per entry kept above), not epochs.
epochs = range(len(loss_values))

plt.plot(epochs, loss_values)
plt.xlabel('Logging step index')
plt.ylabel('Training Loss')
plt.title('Training Loss - Model Fine Tuning')
plt.show()

