## Here we are fine-tuning a pre-trained model (unsloth/Llama-3.2-1B-Instruct) that is likely already quantized (or optimized) for performance, and then you are applying LoRA (Low-Rank Adaptation) during the fine-tuning process

In [1]:
!pip install torch transformers datasets unsloth trl

Collecting datasets
  Downloading datasets-3.0.1-py3-none-any.whl.metadata (20 kB)
Collecting unsloth
  Downloading unsloth-2024.10.2-py3-none-any.whl.metadata (56 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m56.5/56.5 kB[0m [31m4.3 MB/s[0m eta [36m0:00:00[0m
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess (from datasets)
  Downloading multiprocess-0.70.17-py310-none-any.whl.metadata (7.2 kB)
Collecting unsloth-zoo (from unsloth)
  Downloading unsloth_zoo-2024.10.3-py3-none-any.whl.metadata (48 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m48.2/48.2 kB[0m [31m3.8 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting xformers>=0.0.27.post2 (from unsloth)
  Downloading xformers-0.0.28.post1-cp310-cp310-manylinux_2_28_x86_64.whl.m

In [3]:
!nvidia-smi

Sun Oct 20 05:10:54 2024       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 535.104.05             Driver Version: 535.104.05   CUDA Version: 12.2     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|   0  Tesla T4                       Off | 00000000:00:04.0 Off |                    0 |
| N/A   51C    P8               9W /  70W |      0MiB / 15360MiB |      0%      Default |
|                                         |                      |                  N/A |
+-----------------------------------------+----------------------+----------------------+
                                                                    

In [8]:
from datasets import load_dataset
import torch
import os
import re
from trl import SFTTrainer
from transformers import TrainingArguments
from unsloth import FastLanguageModel
from peft import LoraConfig, get_peft_model
from transformers import AutoTokenizer, AutoModelForCausalLM, TrainingArguments, Trainer
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.


In [9]:
# Load the tokenizer and model for seq2seq tasks
model_name = "unsloth/Llama-3.2-1B-Instruct"  # Update this to your base text-to-SQL model
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name)

tokenizer_config.json:   0%|          | 0.00/54.6k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.09M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/454 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/927 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/2.47G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/184 [00:00<?, ?B/s]

In [10]:
dataset = load_dataset("b-mc2/sql-create-context", split="train")

README.md:   0%|          | 0.00/4.43k [00:00<?, ?B/s]

sql_create_context_v4.json:   0%|          | 0.00/21.8M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/78577 [00:00<?, ? examples/s]

In [11]:
dataset

Dataset({
    features: ['answer', 'question', 'context'],
    num_rows: 78577
})

In [12]:
# Apply LoRA (Low-Rank Adaptation) using PEFT
lora_config = LoraConfig(
    r=8,  # Reduce LoRA rank to save resources
    lora_alpha=16,  # Alpha parameter for LoRA
    target_modules=["q_proj", "v_proj"],  # LoRA will modify these layers
    lora_dropout=0.1,  # Dropout rate
    bias="none"  # No bias adjustment
)
model = get_peft_model(model, lora_config)

In [13]:
model

PeftModel(
  (base_model): LoraModel(
    (model): LlamaForCausalLM(
      (model): LlamaModel(
        (embed_tokens): Embedding(128256, 2048)
        (layers): ModuleList(
          (0-15): 16 x LlamaDecoderLayer(
            (self_attn): LlamaSdpaAttention(
              (q_proj): lora.Linear(
                (base_layer): Linear(in_features=2048, out_features=2048, bias=False)
                (lora_dropout): ModuleDict(
                  (default): Dropout(p=0.1, inplace=False)
                )
                (lora_A): ModuleDict(
                  (default): Linear(in_features=2048, out_features=8, bias=False)
                )
                (lora_B): ModuleDict(
                  (default): Linear(in_features=8, out_features=2048, bias=False)
                )
                (lora_embedding_A): ParameterDict()
                (lora_embedding_B): ParameterDict()
                (lora_magnitude_vector): ModuleDict()
              )
              (k_proj): Linear(in_features=20

In [14]:
# Preprocessing function to tokenize inputs (context + question) and outputs (SQL answer)
def preprocess_function(examples):
    inputs = [context + " " + question for context, question in zip(examples['context'], examples['question'])]
    targets = examples['answer']  # SQL query (answer)

    # Tokenize the inputs
    model_inputs = tokenizer(inputs, max_length=512, truncation=True, padding="max_length")

    # Tokenize the targets (SQL queries)
    labels = tokenizer(targets, max_length=512, truncation=True, padding="max_length").input_ids

    # Replace padding token ID in the labels with -100, so they're ignored during loss computation
    labels = [[(label if label != tokenizer.pad_token_id else -100) for label in label_seq] for label_seq in labels]

    model_inputs["labels"] = labels
    return model_inputs

In [15]:
# Tokenize the dataset
tokenized_dataset = dataset.map(preprocess_function, batched=True)

Map:   0%|          | 0/78577 [00:00<?, ? examples/s]

In [23]:
# Training arguments
training_args = TrainingArguments(
    output_dir="./unsloth-text2sql-finetune",
    per_device_train_batch_size=2,  # Lower batch size for smaller resources
    num_train_epochs=1,
    max_steps=60,
    optim = "adamw_8bit",
    logging_steps=1,
    load_best_model_at_end=False,  # No need to load best model without evaluation
    evaluation_strategy="no",  # Disable evaluation
    report_to="none",  # Disable WandB logging
)



In [24]:
# Trainer
trainer = SFTTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset,
    tokenizer=tokenizer
)

max_steps is given, it will override any value given in num_train_epochs


In [25]:
# Fine-tune the model
trainer.train()

Step,Training Loss
1,5.2877
2,4.5004


KeyboardInterrupt: 

In [48]:
def gen_sql(schema, nl_query):

  # Combine schema and query for the input
  test_input = schema + " " + nl_query

  # Tokenize the input query
  inputs = tokenizer([test_input], return_tensors="pt", padding=True, truncation=True, max_length=512)

  # Ensure the model is in evaluation mode
  model.eval()

  # Move the inputs to the same device as the model
  inputs = {key: value.to(model.device) for key, value in inputs.items()}

  with torch.no_grad():
      outputs = model.generate(**inputs, max_new_tokens=128, use_cache=True)

  # Get the predicted SQL query
  generated_sql = tokenizer.decode(outputs[0], skip_special_tokens=True)

  # Regex pattern to match SQL queries
  sql_pattern = r"(?i)(SELECT.*?;)"  # Match SELECT statements ending with a semicolon

  # Find all matches
  matches = re.findall(sql_pattern, generated_sql)

  # Extract the first match (if any)
  if matches:
      extracted_sql = matches[0]
      print("Extracted SQL Query:", extracted_sql)
  else:
      print("No SQL query found.")

  return extracted_sql

In [52]:
# Test the fine-tuned model on a custom natural language query with a schema
context_schema = "CREATE TABLE employee (emp_id INTEGER, dept_name varchar, teacher_name TEXT, no_of_student FLOAT, subject TEXT);"
user_query = "how many teachers belong to Math department?"

In [53]:
gen_sql(context_schema, user_query)

Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


Extracted SQL Query: SELECT COUNT(*) FROM employee WHERE subject ='math' AND dept_name ='math department' GROUP BY teacher_name HAVING COUNT(*) > 1;


"SELECT COUNT(*) FROM employee WHERE subject ='math' AND dept_name ='math department' GROUP BY teacher_name HAVING COUNT(*) > 1;"

In [None]:
model.save_pretrained("lora_model")

In [None]:
!pip install huggingface_hub



In [None]:
from huggingface_hub import notebook_login

In [None]:
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [None]:
from google.colab import userdata
HF_TOKEN = userdata.get('hf_token')

In [None]:
os.environ["HF_TOKEN"]="hf_XXXXXXXXXXXXXXXXXXXXXXXXXXXX"

In [None]:
!export "hf_XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX"

In [None]:
model.push_to_hub("Anoop03031988/Llama-3.2 1B_text2sql", token="hf_XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX")

adapter_model.safetensors:   0%|          | 0.00/27.3M [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/Anoop03031988/unsloth_4bit_mistral_imdb_model/commit/1b7f5c7834a76124a14d15d8e3082a0829e97c74', commit_message='Upload model', commit_description='', oid='1b7f5c7834a76124a14d15d8e3082a0829e97c74', pr_url=None, pr_revision=None, pr_num=None)