In [None]:
import torch
from datasets import load_dataset
from transformers import AutoTokenizer, TrainingArguments, AutoModelForCausalLM, Trainer, BitsAndBytesConfig
from peft import LoraConfig, get_peft_model, TaskType

In [None]:
# Hub identifier of the base model and the local directory its weights are cached in.
model_name = "microsoft/phi-2"
cache_dir = "./Phi2_Model"

# Load the dataset used for fine-tuning, caching the download under ./dataset.
dataset = load_dataset(
    path="gretelai/synthetic_text_to_sql",
    cache_dir="./dataset",
)


In [None]:
# 4-bit quantization settings passed to from_pretrained as `quantization_config`.
bitsnbytes = BitsAndBytesConfig(
    load_in_4bit = True,
    # The field is `bnb_4bit_quant_type`. A misspelt keyword such as
    # `bnb_4bit_quant_dtype` is not a config field, so the NF4 request would
    # be dropped and the library default ('fp4') used instead.
    bnb_4bit_quant_type = 'nf4',
    # dtype used for the matmuls performed on the de-quantized weights.
    bnb_4bit_compute_dtype = torch.bfloat16
)

In [None]:
# Options for loading the base causal LM: reuse the local weight cache, apply
# the 4-bit quantization config, and let the library place the weights on the
# available devices.
load_kwargs = dict(
    cache_dir=cache_dir,
    quantization_config=bitsnbytes,
    device_map="auto",
    trust_remote_code=True,
)
model = AutoModelForCausalLM.from_pretrained(model_name, **load_kwargs)

In [None]:
# Tokenizer matching the base model; its files are cached under ./tokens.
tokenizer = AutoTokenizer.from_pretrained(
    model_name,
    trust_remote_code=True,
    cache_dir="./tokens",
)
# Reuse the end-of-sequence token as the padding token so batches can be padded.
tokenizer.pad_token = tokenizer.eos_token


In [None]:
# Names of the sub-modules that receive LoRA adapters.
lora_targets = ["q_proj", "k_proj", "v_proj", "dense"]

# LoRA hyper-parameters: rank 8, alpha 16, 5% adapter dropout, no bias training.
lora_config = LoraConfig(
    task_type=TaskType.CAUSAL_LM,
    r=8,
    lora_alpha=16,
    lora_dropout=0.05,
    bias="none",
    target_modules=lora_targets,
)

# Wrap the loaded model with the adapters described by `lora_config`.
# NOTE: this rebinds `model`, so re-running this cell wraps the model again.
model = get_peft_model(model, lora_config)

In [None]:
def tokenize(batch):
    """Turn a batch of dataset rows into tokenized causal-LM training features.

    Each row's ``sql`` value (used as the instruction) and ``sql_explanation``
    value (used as the response) are joined into one prompt string. The
    prompts are tokenized with truncation at 128 tokens and padded to the
    longest sequence in the batch.

    Args:
        batch: Mapping of column name to a list of values. Must contain the
            ``sql`` and ``sql_explanation`` columns.

    Returns:
        The tokenizer output with an added ``labels`` entry: a copy of
        ``input_ids`` in which every padded position is replaced by -100.
    """
    texts = [
        f'### Instruction: \n{instruction}\n### Response\n{out}'
        for instruction, out in zip(batch['sql'], batch['sql_explanation'])
    ]
    token = tokenizer(
        texts,
        padding = 'longest',
        max_length = 128,
        truncation = True,
        return_tensors = 'pt'
    )
    # Padded positions must not contribute to the loss; -100 is the index the
    # cross-entropy loss ignores. The mask is taken from `attention_mask`
    # rather than by comparing against the pad token id, because the pad
    # token may be the same id as a genuine end-of-sequence token.
    labels = token['input_ids'].clone()
    labels[token['attention_mask'] == 0] = -100
    token['labels'] = labels
    return token

In [None]:
# Draw fixed-size subsets from each split; the seeded shuffle keeps the
# selection reproducible across runs.
train_subset = dataset['train'].shuffle(seed=42).select(range(2000))
test_subset = dataset['test'].shuffle(seed=42).select(range(500))

# Tokenize in batches and drop the original columns so that only the
# features produced by `tokenize` remain.
tokenized_train = train_subset.map(
    tokenize,
    batched=True,
    remove_columns=train_subset.column_names,
)
tokenized_test = test_subset.map(
    tokenize,
    batched=True,
    remove_columns=test_subset.column_names,
)


In [None]:
# Choose the mixed-precision mode. bfloat16 matches the compute dtype the
# quantized model is loaded with and avoids fp16 overflow; fall back to fp16
# only when the GPU cannot run bf16.
use_bf16 = torch.cuda.is_available() and torch.cuda.is_bf16_supported()

training = TrainingArguments(
    output_dir = './Training_Data',
    # Effective batch size is 1 * 2 = 2 sequences per optimizer step.
    per_device_train_batch_size = 1,
    gradient_accumulation_steps = 2,
    learning_rate = 1e-4,
    num_train_epochs = 3,
    bf16 = use_bf16,
    fp16 = not use_bf16,
    logging_steps = 50,
    # Write a checkpoint at the end of every epoch.
    save_strategy = 'epoch',
    # The dataset already contains only the tokenized features, so no
    # columns should be dropped before batches are built.
    remove_unused_columns = False,
    label_names = ['labels']
)

In [None]:
# Collect the pieces the Trainer needs: the LoRA-wrapped model, the training
# arguments, the tokenized training subset and the tokenizer.
trainer_kwargs = {
    "model": model,
    "args": training,
    "train_dataset": tokenized_train,
    "processing_class": tokenizer,
}
trainer = Trainer(**trainer_kwargs)

In [None]:
# Run fine-tuning with the Trainer configured above; the call's return value
# is the last expression of the cell and is therefore displayed as its output.
trainer.train()

In [None]:
# Persist the trained model and its tokenizer side by side in one directory
# so they can be reloaded together.
save_dir = "./Fined-tuned-Phi2"
model.save_pretrained(save_dir)
tokenizer.save_pretrained(save_dir)