In [None]:
!pip install datasets trl bitsandbytes peft
!pip install transformers accelerate -U

In [None]:
import torch

# Prefer the GPU whenever CUDA is usable; otherwise fall back to the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

In [None]:
from datasets import load_dataset

# Identifier handed to load_dataset for the data used in this notebook.
DATASET_NAME = 'ChrisHayduk/Llama-2-SQL-Dataset'

# Load the dataset; later cells pick individual splits out of it by name.
dataset = load_dataset(path=DATASET_NAME)

In [None]:
full_training_dataset = dataset['train']

# Shuffle before taking a prefix so the subset represents the whole split
# rather than just its first rows. A fixed seed makes the subset identical on
# every Restart & Run All, so training runs are comparable.
SHUFFLE_SEED = 42
TRAIN_SAMPLE_SIZE = 1000
shuffled = full_training_dataset.shuffle(seed=SHUFFLE_SEED)

# Cap the request at the split size so a smaller split does not raise on
# out-of-range indices.
training_dataset = shuffled.select(range(min(TRAIN_SAMPLE_SIZE, len(shuffled))))

In [None]:
import bitsandbytes as bnb
from transformers import BitsAndBytesConfig

# Quantization settings passed to from_pretrained when the model is loaded.
quantization_config = BitsAndBytesConfig(
    bnb_4bit_compute_dtype="float16",  # dtype used for computation
    bnb_4bit_quant_type="nf4",         # "normal float 4" quantization format
    load_in_4bit=True,                 # load the weights in 4-bit precision
)

In [None]:
import transformers
from transformers import AutoModelForCausalLM

# Meta's own checkpoint is gated behind a license, so this one is used instead.
MODEL_NAME = 'NousResearch/Llama-2-7b-hf'

# Load the base model with the 4-bit quantization settings defined earlier.
model = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME,
    device_map="auto",  # let the library decide device placement
    quantization_config=quantization_config,
)

# Caching is enabled here; it is switched off again right before training.
model.config.use_cache = True

In [None]:
from transformers import AutoTokenizer

# Tokenizer matching the base model checkpoint.
tokenizer = AutoTokenizer.from_pretrained(
    MODEL_NAME,
    trust_remote_code=True
)

# Reuse the end-of-sequence token as the padding token.
tokenizer.pad_token = tokenizer.eos_token
# `padding_side` is the attribute the tokenizer actually reads. The previous
# `padding_size` was a typo: it silently created an unused attribute and left
# the padding side at its default.
tokenizer.padding_side = "right"

In [None]:
def concat_data_io(x):
  """Tokenize one example's input and output as a single piece of text.

  `x` is a mapping with 'input' and 'output' entries. The two are joined
  directly (no separator) and the result of calling the module-level
  `tokenizer` on that text is returned.
  """
  # The tokenizer accepts a `str` (single example), a `List[str]` (batch or one
  # pretokenized example) or a `List[List[str]]` (batch of pretokenized
  # examples); it receives one plain string here.
  return tokenizer(x['input'] + x['output'], padding=True)


In [None]:
# Try the tokenization on a single row first; the result is discarded.
concat_data_io(training_dataset[0])

# Then apply it to every row of the training subset.
training_dataset = training_dataset.map(function=concat_data_io)

In [None]:
from peft import (
    LoraConfig,
    get_peft_model,
    prepare_model_for_kbit_training
)

# LoRA hyper-parameters.
#   r              - adapter rank. A higher value is closer to fine-tuning all
#                    parameters; a lower value trains faster but tunes fewer
#                    parameters, so results may be weaker.
#   lora_alpha     - scaling factor used in the matrix multiplication.
#   lora_dropout   - randomly zeroes some nodes on each training iteration to
#                    reduce overfitting.
#   target_modules - the layers LoRA is applied to; anything not listed is frozen.
peft_config = LoraConfig(
    task_type="CAUSAL_LM",
    r=16,
    lora_alpha=32,
    lora_dropout=0.05,
    target_modules=[
        'q_proj',     # attention: query projection
        'k_proj',     # attention: key projection
        'down_proj',  # feed-forward network
        'v_proj',     # attention: value projection
        'gate_proj',  # feed-forward network
        'o_proj',
        'up_proj',    # feed-forward network
    ],
)

In [None]:
# Prepare the quantized model for k-bit training, then wrap it with the LoRA
# configuration. LoRA is applied to every module named in
# peft_config.target_modules (attention AND feed-forward projections here);
# anything not listed there is frozen.
model = get_peft_model(prepare_model_for_kbit_training(model), peft_config)


In [None]:
# This is the model's own generation config object (not a copy), so every
# assignment below also changes the model's defaults.
generation_configuration = model.generation_config

# Sampling behaviour.
generation_configuration.do_sample = True
generation_configuration.temperature = 0.7
generation_configuration.top_p = 0.9

# Upper bound on the number of generated tokens.
generation_configuration.max_new_tokens = 256

# Special tokens: EOS ends generation and is also used for padding.
generation_configuration.eos_token_id = tokenizer.eos_token_id
generation_configuration.pad_token_id = tokenizer.eos_token_id

In [None]:
def complete(prompt, max_new_tokens=20):
  """Generate a continuation of `prompt` with the current model and print it.

  Args:
    prompt: Text to continue.
    max_new_tokens: Upper bound on generated tokens for this call only.
      It is passed to `generate` as an override, so the shared
      `generation_configuration` keeps whatever limit was set on it
      (previously this function overwrote it with 20 permanently).

  Returns:
    None. The decoded sequence (prompt plus continuation) is printed.
  """
  # Tokenize to tensors and move them to the device the model lives on, so
  # this keeps working if `model` is later reloaded onto a different device.
  inputs = tokenizer(prompt, add_special_tokens=True, return_tensors="pt").to(model.device)
  with torch.inference_mode():
    out = model.generate(
        **inputs,  # input_ids plus attention_mask (pad and EOS share an id)
        generation_config=generation_configuration,
        max_new_tokens=max_new_tokens,
        repetition_penalty=2.0)
  string_decoded = tokenizer.decode(out[0], clean_up_tokenization_spaces=True)
  print(string_decoded)

In [None]:
# Quick look at what the model generates before any fine-tuning.
complete(prompt='Hello World, ')

In [None]:
# Hyper-parameters for the fine-tuning run.
train_arguments = transformers.TrainingArguments(
    output_dir="fine_tuning",
    num_train_epochs=1,
    # Batch of 1 per device, accumulated over 4 steps to simulate a larger batch.
    per_device_train_batch_size=1,
    gradient_accumulation_steps=4,
    # Optimizer and learning-rate schedule.
    optim="paged_adamw_8bit",
    learning_rate=2e-4,
    lr_scheduler_type="cosine",
    warmup_ratio=0.05,
    # Mixed-precision training.
    fp16=True,
)

In [None]:
# Collator for causal language modelling (mlm=False turns masked-LM off).
causal_lm_collator = transformers.DataCollatorForLanguageModeling(tokenizer, mlm=False)

trainer = transformers.Trainer(
    model=model,
    args=train_arguments,
    train_dataset=training_dataset,
    data_collator=causal_lm_collator,
)

# Caching is switched off for training.
model.config.use_cache = False

In [None]:
# Run fine-tuning with the Trainer built in the previous cell; the value
# returned by train() is shown as this cell's output.
trainer.train()

In [None]:
# Fixed seed so the same evaluation example is picked on every run, which
# makes the before/after comparison reproducible.
EVAL_SHUFFLE_SEED = 42
evaluation_dataset = dataset['eval'].shuffle(seed=EVAL_SHUFFLE_SEED)

# Fetch the row once and read both fields from it.
eval_sample = evaluation_dataset[0]
sample_sql_question = eval_sample['input']
correct_answer = eval_sample['output']

# Training leaves the model in train mode; switch to eval mode so dropout
# (including the LoRA dropout) is disabled while generating.
model.eval()

complete(sample_sql_question)

In [None]:
# Display the dataset's reference output for the sampled question, to compare
# against the text generated in the previous cell.
correct_answer

In [None]:
import transformers

# Local directory the fine-tuned artifacts are written to.
FINE_TUNED_MODEL_NAME='Llama-2-7b-multiple-experts-hf'

# Write the fine-tuned model to disk.
# NOTE(review): `model` is PEFT-wrapped at this point, so this presumably
# stores only the LoRA adapter rather than full model weights - confirm.
model.save_pretrained(FINE_TUNED_MODEL_NAME)

# Save the tokenizer alongside it; otherwise
# AutoTokenizer.from_pretrained(FINE_TUNED_MODEL_NAME) finds no tokenizer
# files in this directory.
tokenizer.save_pretrained(FINE_TUNED_MODEL_NAME)

In [None]:
from transformers import AutoModelForCausalLM, AutoTokenizer

# Full Hub repository id: "<owner>/<repo name>".
HUB_REPO_ID = f'saiwaimaung/{FINE_TUNED_MODEL_NAME}'

# Upload the in-memory model and tokenizer straight to the Hub.
# - `push_to_hub=True` is not a from_pretrained argument, and reloading the
#   model from disk just to upload it is unnecessary.
# - push_to_hub takes the target as `repo_id` and visibility as `private`;
#   the former `repo=` / `private_key=` keywords are not parameters of it.
# - The tokenizer is pushed too, because the next cell downloads it from the
#   same repository.
# Requires being logged in to the Hugging Face Hub with write access.
model.push_to_hub(HUB_REPO_ID, private=False)
tokenizer.push_to_hub(HUB_REPO_ID, private=False)

In [None]:
from transformers import AutoModelForCausalLM, AutoTokenizer

# Load the published model and tokenizer back from the Hub repository.
hub_repo_id = f'saiwaimaung/{FINE_TUNED_MODEL_NAME}'
tokenizer = AutoTokenizer.from_pretrained(hub_repo_id)
model = AutoModelForCausalLM.from_pretrained(hub_repo_id)