In [1]:
import torch
from datasets import load_dataset
from transformers import AutoTokenizer, TrainingArguments, AutoModelForCausalLM, Trainer, BitsAndBytesConfig
from peft import LoraConfig, get_peft_model, TaskType

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
model_name = 'microsoft/phi-2'
cache_dir = './Phi2_Model'
dataset = load_dataset("gretelai/synthetic_text_to_sql", cache_dir="./dataset")


In [3]:
bitsnbytes = BitsAndBytesConfig(
    load_in_4bit = True,
    bnb_4bit_quant_dtype = 'nf4',
    bnb_4bit_compute_dtype = torch.bfloat16
)

In [4]:
model =  AutoModelForCausalLM.from_pretrained(
    model_name,
    cache_dir = cache_dir,
    quantization_config = bitsnbytes,
    device_map = 'auto',
    trust_remote_code = True
)

Loading checkpoint shards: 100%|██████████| 2/2 [00:05<00:00,  2.98s/it]


In [5]:
tokenizer = AutoTokenizer.from_pretrained(model_name, cache_dir = './tokens', trust_remote_code = True)
tokenizer.pad_token = tokenizer.eos_token


In [6]:
lora_config =LoraConfig(
    r = 8,
    lora_alpha = 16,
    target_modules = ["q_proj", "k_proj", "v_proj", "dense"],
    lora_dropout= 0.05,
    bias = 'none',
    task_type = TaskType.CAUSAL_LM
)

model = get_peft_model(model, lora_config)

In [7]:
def tokenize(batch):
    texts = [
        f'### Instruction: \n{instruction}\n### Response\n{out}'
        for instruction, out in zip(batch['sql'], batch['sql_explanation'])
    ]
    token = tokenizer(
        texts,
        padding = 'max_length',
        max_length = 128,
        truncation = True,
        return_tensors = 'pt'
    )
    token['labels'] = token['input_ids'].clone()
    return token

In [8]:
tokenized_train = dataset['train'].shuffle(seed = 42).select(range(1500))
tokenized_train = tokenized_train.map(tokenize, batched=True, remove_columns=tokenized_train.column_names)
tokenized_test = dataset['test'].shuffle(seed = 42).select(range(500))
tokenized_test = tokenized_test.map(tokenize, batched=True, remove_columns=tokenized_test.column_names)


Map: 100%|██████████| 1500/1500 [00:00<00:00, 3019.77 examples/s]
Map: 100%|██████████| 500/500 [00:00<00:00, 7203.71 examples/s]


In [9]:
training = TrainingArguments(
    output_dir = './Training_Data',
    per_device_train_batch_size = 1,
    gradient_accumulation_steps = 2,
    learning_rate = 1e-3,
    num_train_epochs = 2,
    fp16 = True,
    logging_steps = 50,
    save_strategy = 'epoch',
    remove_unused_columns = False,
    label_names = ['labels']
)

In [10]:
trainer = Trainer(
    model = model,
    args = training,
    train_dataset= tokenized_train,
    processing_class = tokenizer
)

In [11]:
trainer.train()

Step,Training Loss
50,1.1468
100,0.7572
150,0.7474
200,0.7069
250,0.6827
300,0.6863
350,0.7349
400,0.6871
450,0.7294
500,0.7079


TrainOutput(global_step=1500, training_loss=0.6623961791992188, metrics={'train_runtime': 1207.8299, 'train_samples_per_second': 2.484, 'train_steps_per_second': 1.242, 'total_flos': 6114481274880000.0, 'train_loss': 0.6623961791992188, 'epoch': 2.0})

In [12]:
model.save_pretrained("./Fined-tuned-Phi2")
tokenizer.save_pretrained("./Fined-tuned-Phi2")

('./Fined-tuned-Phi2\\tokenizer_config.json',
 './Fined-tuned-Phi2\\special_tokens_map.json',
 './Fined-tuned-Phi2\\vocab.json',
 './Fined-tuned-Phi2\\merges.txt',
 './Fined-tuned-Phi2\\added_tokens.json',
 './Fined-tuned-Phi2\\tokenizer.json')