In [None]:
pip install datasets peft evaluate transformers[torch] bitsandbytes trl

Collecting datasets
  Downloading datasets-2.15.0-py3-none-any.whl (521 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/521.2 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━[0m[90m╺[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m30.7/521.2 kB[0m [31m801.5 kB/s[0m eta [36m0:00:01[0m[2K     [91m━━━━━━━━━━━━━━[0m[91m╸[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m194.6/521.2 kB[0m [31m2.7 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m521.2/521.2 kB[0m [31m5.7 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting peft
  Downloading peft-0.6.2-py3-none-any.whl (174 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m174.7/174.7 kB[0m [31m25.5 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting evaluate
  Downloading evaluate-0.4.1-py3-none-any.whl (84 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m9.5 MB/s[0m eta [36m0:00:00[0m
[?25hColl

In [None]:
pip install accelerate -U



In [None]:
from huggingface_hub import notebook_login

# Authenticate this session with the Hugging Face Hub; the recorded output
# shows the call rendering an interactive login widget.
# NOTE(review): the push_to_hub calls at the end of the notebook presumably
# rely on this login — confirm.
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [None]:
import random

import torch
from datasets import load_dataset, DatasetDict, Dataset
from peft import LoraConfig
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    TrainingArguments,
)
from trl import SFTTrainer

In [None]:
# Fetch the corpus from the Hugging Face Hub. The recorded output shows a
# single `train` split with `question`, `context` and `answer` columns.
DATASET_ID = "b-mc2/sql-create-context"
dataset = load_dataset(DATASET_ID)

# Display the split/column summary as the cell output.
dataset

Downloading readme:   0%|          | 0.00/3.35k [00:00<?, ?B/s]

Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/21.8M [00:00<?, ?B/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['question', 'context', 'answer'],
        num_rows: 78577
    })
})

In [None]:
# dataset['train'][0]

In [None]:
model_checkpoint = "stabilityai/StableBeluga-7B"

# Load the base model in 4-bit precision through bitsandbytes instead of full
# precision. The rest of the notebook already expects a quantized model: it
# installs bitsandbytes, uses a paged bitsandbytes optimizer and inspects
# `model.config.quantization_config`.
# The compute dtype is float16 to match `fp16=True` in the training arguments.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=torch.float16,
)

# Initialize the model; `device_map="auto"` lets accelerate place the
# quantized weights on the available device(s).
model = AutoModelForCausalLM.from_pretrained(
    model_checkpoint,
    quantization_config=bnb_config,
    device_map="auto",
)

In [None]:
# `model_max_length` is the tokenizer argument that sets its length limit;
# the previous `max_length=512` keyword is not a tokenizer init argument, so
# the 512-token cap never took effect.
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint, model_max_length=512)

# Reuse the end-of-sequence token for padding and pad on the right-hand side
# of each sequence.
# NOTE(review): presumably the checkpoint ships without a pad token — confirm.
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"

In [None]:
# Switch off the `use_cache` flag on the model config before training.
# NOTE(review): presumably this disables the generation-time key/value cache,
# which fine-tuning does not need — confirm against the transformers docs.
model.config.use_cache = False

In [None]:
# Show the quantization settings the model was loaded with.
# A model loaded without quantization has no `quantization_config` attribute,
# so look it up defensively instead of raising AttributeError. Nothing is
# displayed in that case.
quant_cfg = getattr(model.config, "quantization_config", None)
quant_cfg.to_dict() if hasattr(quant_cfg, "to_dict") else quant_cfg

In [None]:
# Linear layers to wrap with LoRA adapters. StableBeluga-7B is a Llama-based
# model, so the projections use the Llama module names. The previous
# GPT-NeoX-style names ("query_key_value", "dense", "dense_h_to_4h",
# "dense_4h_to_h") do not exist in this architecture, and PEFT fails when no
# target module is found.
lora_target_modules = [
    # attention projections
    "q_proj",
    "k_proj",
    "v_proj",
    "o_proj",
    # feed-forward (MLP) projections
    "gate_proj",
    "up_proj",
    "down_proj",
]
config = LoraConfig(
    r=16,  # rank of the low-rank update matrices
    lora_alpha=12,  # alpha scaling
    target_modules=lora_target_modules,
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM",
)

In [None]:
# Work on a random 20% sample of the corpus and divide that sample 80/20 into
# train/eval. The shuffle is seeded so the same split is produced on every
# Restart & Run All.
subset_ratio = 0.2  # fraction of the full dataset that is kept
split_ratio = 0.8  # share of the subset used for training
eval_ratio = 0.2  # informational only: eval receives whatever train leaves
split_seed = 12  # same value as the `seed` passed to TrainingArguments

total_examples = len(dataset["train"])
subset_size = int(total_examples * subset_ratio)
train_size = int(subset_size * split_ratio)
eval_size = subset_size - train_size

shuffled_indices = list(range(total_examples))
# A dedicated Random instance keeps the global `random` state untouched.
random.Random(split_seed).shuffle(shuffled_indices)

# The two index slices are disjoint, so train and eval never share a row.
training_set = dataset["train"].select(shuffled_indices[:train_size])
evaluation_set = dataset["train"].select(
    shuffled_indices[train_size : train_size + eval_size]
)
split_dataset = DatasetDict({"train": training_set, "eval": evaluation_set})
split_dataset

In [None]:
# Inspect the held-out split (the same object that is bound to `evaluation_set`).
split_dataset["eval"]

In [None]:
# Values most likely to be tuned between runs.
lr = 1e-4
batch_size = 4
num_epochs = 1

training_args = TrainingArguments(
    # --- output, logging and checkpoints ---
    output_dir="outputs",
    report_to="tensorboard",
    logging_steps=1,
    save_strategy="epoch",
    save_safetensors=True,
    # --- batching ---
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    group_by_length=True,
    # --- optimisation ---
    optim="paged_adamw_32bit",
    learning_rate=lr,
    lr_scheduler_type="cosine",
    warmup_ratio=0.05,
    max_grad_norm=0.3,
    fp16=True,
    num_train_epochs=num_epochs,
    # --- evaluation ---
    evaluation_strategy="steps",
    eval_steps=0.2,
    # --- reproducibility ---
    seed=12,
)

In [None]:
def build_training_text(example):
    """Merge one dataset row into the single string the trainer tokenizes.

    Args:
        example: a row with `context`, `question` and `answer` fields.

    Returns:
        A dict holding the new `text` column: schema context, then the
        natural-language question, then the target SQL answer.
    """
    # NOTE(review): the section headers are a plain template; adapt them to
    # the base model's own prompt format if it needs one.
    return {
        "text": (
            f"### Context:\n{example['context']}\n\n"
            f"### Question:\n{example['question']}\n\n"
            f"### Answer:\n{example['answer']}"
        )
    }


# Training on the `question` column alone never shows the model the schema or
# the SQL it is supposed to produce, so train on the combined text instead.
sft_dataset = split_dataset.map(build_training_text)

trainer = SFTTrainer(
    model=model,
    train_dataset=sft_dataset["train"],
    eval_dataset=sft_dataset["eval"],
    peft_config=config,
    dataset_text_field="text",
    max_seq_length=4096,
    tokenizer=tokenizer,
    args=training_args,
)

In [None]:
# Run the fine-tuning loop; the returned summary is kept in a variable and
# shown as the cell output.
train_result = trainer.train()
train_result

In [None]:
# Single source of truth for the destination repository on the Hub.
HUB_REPO_ID = "samadpls/querypls-prompt2sql"

# Publish from `trainer.model`, the model object the trainer actually trained.
# With a `peft_config` the trainer wraps the base model in a PEFT model, so
# pushing it uploads the trained LoRA adapter. The bare `model` variable is
# the base model object that was handed to the trainer.
trainer.model.push_to_hub(HUB_REPO_ID)
tokenizer.push_to_hub(HUB_REPO_ID)

In [None]:
# DEVICE = 'cuda:0' if torch.cuda.is_available() else 'cpu'