In [21]:
# --- Core Libraries ---
import os
import random
import json
import pandas as pd
import numpy as np
import torch
import time

# --- Hugging Face: Dataset, Tokenizer, Model ---
from datasets import load_dataset, DatasetDict, Dataset
from transformers import (
    AutoTokenizer, 
    AutoModelForCausalLM, 
    TrainingArguments, 
    Trainer, 
    DataCollatorForLanguageModeling,
    BitsAndBytesConfig,
    pipeline
)

# --- LoRA & Parameter-Efficient Tuning ---
from peft import LoraConfig, get_peft_model, TaskType

# --- W&B Experiment Tracking ---
import wandb

# --- SQL Evaluation ---
import sqlite3
import sqlparse
from tabulate import tabulate
import evaluate  # for BLEU, ROUGE

In [8]:
os.environ["WANDB_NOTEBOOK_NAME"] = "text2sql_finetune_and_eval.ipynb"

In [10]:
torch.cuda.empty_cache()

print("PyTorch version:", torch.__version__)
print("CUDA available:", torch.cuda.is_available())

if torch.cuda.is_available():
    print("Using GPU:", torch.cuda.get_device_name(0))
else:
    print("GPU not detected — will fall back to CPU.")

PyTorch version: 2.5.1+cu121
CUDA available: True
Using GPU: NVIDIA GeForce RTX 4050 Laptop GPU


In [11]:
# Load dataset
dataset = load_dataset("Clinton/Text-to-SQL-v1")

df = pd.DataFrame(dataset["train"])
df.sample(5)

Unnamed: 0,instruction,input,response,source,text
73753,count the number of times that patient 32168 h...,CREATE TABLE diagnoses_icd (\n row_id numbe...,SELECT COUNT(*) FROM inputevents_cv WHERE inpu...,mimic_iii,Below are sql tables schemas paired with instr...
124449,how many patients whose religion is unobtainab...,"CREATE TABLE lab (\n subject_id text,\n ...",SELECT COUNT(DISTINCT demographic.subject_id) ...,mimicsql_data,Below are sql tables schemas paired with instr...
219809,what are all the overall with rating being 1.4,CREATE TABLE table_13110459_2 (\n overall V...,SELECT overall FROM table_13110459_2 WHERE rat...,sql_create_context,Below are sql tables schemas paired with instr...
229067,What is the max pressure of the .38 long colt ...,CREATE TABLE table_173103_1 (\n max_pressur...,SELECT max_pressure FROM table_173103_1 WHERE ...,sql_create_context,Below are sql tables schemas paired with instr...
110922,How many instructors for every course in 2008?...,"CREATE TABLE advisor (\n s_ID varchar(5),\n...","SELECT title, COUNT(title) FROM course AS T1 J...",nvbench,Below are sql tables schemas paired with instr...


In [12]:
print("Any nulls?", df.isna().sum())
print("Any empty strings?", (df == "").sum())
print("Unique columns:", df.columns)

Any nulls? instruction    0
input          0
response       0
source         0
text           0
dtype: int64
Any empty strings? instruction    2
input          0
response       0
source         0
text           0
dtype: int64
Unique columns: Index(['instruction', 'input', 'response', 'source', 'text'], dtype='object')


In [13]:
df_clean = df[df["instruction"] != ""].reset_index(drop=True)
print(f"Filtered dataset size: {len(df_clean)}")

Filtered dataset size: 262206


In [16]:
formatted_dataset = Dataset.from_pandas(df_clean[["text"]])
formatted_dataset = formatted_dataset.train_test_split(test_size=0.1, seed=42)

print(formatted_dataset)

DatasetDict({
    train: Dataset({
        features: ['text'],
        num_rows: 235985
    })
    test: Dataset({
        features: ['text'],
        num_rows: 26221
    })
})


In [17]:
# Load Tokenizer

model_name = "deepseek-ai/deepseek-coder-1.3b-base"
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)

In [20]:
# Find max length of instructions to pick the optimal max prompt length

# Function to compute token length stats
def compute_token_stats(dataset_split, tokenizer):
    lengths = [len(tokenizer(x)["input_ids"]) for x in dataset_split["text"]]
    stats = {
        "max": int(np.max(lengths)),
        "95th_percentile": int(np.percentile(lengths, 95)),
        "mean": round(np.mean(lengths), 2),
        "min": int(np.min(lengths)),
        "num_samples": len(lengths),
    }
    return stats

# Compute for both splits
train_stats = compute_token_stats(formatted_dataset["train"], tokenizer)
test_stats = compute_token_stats(formatted_dataset["test"], tokenizer)

print("Train Token Length Stats:", train_stats)
print("Test Token Length Stats:", test_stats)

Train Token Length Stats: {'max': 3226, '95th_percentile': 1435, 'mean': np.float64(377.15), 'min': 66, 'num_samples': 235985}
Test Token Length Stats: {'max': 3218, '95th_percentile': 1420, 'mean': np.float64(376.35), 'min': 69, 'num_samples': 26221}


In [19]:
#looking at the max token size in the entire data response
sql_token_lengths = df_clean["response"].apply(lambda x: len(tokenizer(x, truncation=False)["input_ids"]))

# Analyze
print("Mean SQL Response token length:", sql_token_lengths.mean())
print("95th percentile:", sql_token_lengths.quantile(0.95))
print("Max SQL Response token length:", sql_token_lengths.max())

Mean SQL Response token length: 51.61714834900803
95th percentile: 162.0
Max SQL Response token length: 1868


In [10]:
#Smart Padding
def tokenize(examples):
    input_ids_list = []
    attention_mask_list = []
    labels_list = []
    
    max_length = 4096

    for full_text in examples["text"]:
        # Extract prompt and response
        prompt_text = full_text.split("### Response:")[0].strip() + "\n### Response:\n"
        response_text = full_text.split("### Response:")[1].strip()
        
        # Tokenize with truncation
        prompt_tokens = tokenizer(prompt_text, truncation=True, max_length=max_length)["input_ids"]
        response_tokens = tokenizer(response_text, truncation=True, max_length=max_length)["input_ids"]
        response_tokens.append(tokenizer.eos_token_id)
        
        # Combine tokens for input
        input_ids = prompt_tokens + response_tokens
        attention_mask = [1] * len(input_ids)
        
        # Create labels - keep prompt tokens, mask response tokens
        labels = input_ids.copy()  # Start with full sequence
        labels = prompt_tokens + [-100] * len(response_tokens)  #mask response tokens

        input_ids_list.append(input_ids)
        attention_mask_list.append(attention_mask)
        labels_list.append(labels)

    return {
        "input_ids": input_ids_list,
        "attention_mask": attention_mask_list,
        "labels": labels_list
    }

data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=False,  # because this is causal LM
    pad_to_multiple_of=16  # speeds up training on GPU
)

In [28]:
#computing the metrics for the baseline model based on similarilty of output, sql compilation and time

from tqdm import tqdm

# Load metrics
meteor_metric = evaluate.load("meteor")

def extract_instruction(text):
    return text.split("### Response:")[0].replace("### Input:", "").strip()

def evaluate_model_on_dataset(
    model,
    tokenizer,
    dataset,
    max_new_tokens=2048,
    log_to_wandb=False,
    run_name="base-model-eval"
):
    predictions = []
    references = []
    compile_success = 0
    execution_times = []

    dataset_slice = dataset

    if log_to_wandb:
        wandb.init(
            project="deepseek-text2sql",
            name=run_name,
            job_type="evaluation",
            config={
                "model": "deepseek-coder-1.3b-base",
                "max_new_tokens": max_new_tokens,
                "num_eval_samples": len(dataset_slice),
                "eval_type": "base"
            }
        )
        print("wand is setup")

    for example in tqdm(dataset_slice, desc="Evaluating"):
        prompt = extract_prompt_for_generation(example["text"])
        print("prompt", prompt)
        ground_truth = extract_ground_truth(example["text"])
        schema = extract_schema(example["text"])

        inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
        with torch.no_grad():
            outputs = model.generate(
                **inputs,
                eos_token_id=tokenizer.eos_token_id,
                max_new_tokens=max_new_tokens,
                pad_token_id=tokenizer.eos_token_id
                )
        
        decoded = tokenizer.decode(outputs[0], skip_special_tokens=True)
        generated_sql = decoded.split("### Response:")[-1].strip() if "### Response:" in decoded else decoded.strip()

        generated_sql = normalize_sql(generated_sql)
        print("SQL Output:", generated_sql)

        # Add prediction for METEOR
        predictions.append(generated_sql)
        references.append([ground_truth])  # METEOR expects references as a list of lists

        # Compile SQL Query and measure time
        start_time = time.perf_counter()
        success = can_execute_sql(generated_sql, schema)
        end_time = time.perf_counter()

        if success:
            compile_success += 1
            execution_times.append(end_time - start_time)

    # Compute metrics
    meteor_score = meteor_metric.compute(predictions=predictions, references=references)["meteor"]
    sql_compilation_rate = compile_success / len(dataset_slice)
    
    # Calculate average execution time for successful queries
    avg_execution_time = sum(execution_times) / len(execution_times) if execution_times else 0

    metrics = {
        "meteor_score": round(meteor_score, 4),
        "sql_compilation_rate": round(sql_compilation_rate, 4),
        "avg_execution_time_ms": round(avg_execution_time * 1000, 2),  # Convert to milliseconds
        "num_eval_samples": len(dataset_slice),
        "num_successful_queries": compile_success
    }

    return metrics

In [11]:
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_compute_dtype=torch.float16,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4"
)

model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=bnb_config,
    device_map="auto",
    trust_remote_code=True
)

In [27]:
#starting the finetuning process

In [12]:
torch.cuda.empty_cache()

In [29]:
wandb.init(
    project="deepseek-sql-finetune",
    name="baseline-run",
    notes="1.3B model with QLoRA, loss tracking"
)

In [13]:
training_args = TrainingArguments(
    output_dir="./deepseek-coder-qlora-sql",
    per_device_train_batch_size=1,
    per_device_eval_batch_size=1,
    gradient_accumulation_steps=16,
    learning_rate=2e-4,
    num_train_epochs=3,
    logging_steps=25,
    save_steps=1000,
    fp16=True,
    report_to="wandb",
    run_name="deepseek-coder-qlora-sql-run1"
)

In [14]:
lora_config = LoraConfig(
    r=8,
    lora_alpha=16,
    target_modules=["q_proj", "v_proj"],  # or ["query_key_value"] depending on model architecture
    lora_dropout=0.05,
    bias="none",
    task_type=TaskType.CAUSAL_LM
)

model = get_peft_model(model, lora_config)

In [32]:
tokenized_dataset = formatted_dataset.map(tokenize, batched=True)

small_train = tokenized_dataset["train"].select(range(10000))
small_eval = tokenized_dataset["test"].select(range(1000))

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=small_train, 
    eval_dataset=small_eval,
    tokenizer=tokenizer,
    data_collator=data_collator
)

Map:   0%|          | 0/235985 [00:00<?, ? examples/s]

Map:   0%|          | 0/26221 [00:00<?, ? examples/s]

  trainer = Trainer(
No label_names provided for model class `PeftModelForCausalLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


In [None]:
trainer.train()

Step,Training Loss
25,1.1574
50,0.7448
75,0.6789
100,0.609
125,0.5238
150,0.5136
175,0.4409
200,0.4474
225,0.447
250,0.4053


TrainOutput(global_step=1875, training_loss=0.36385695826212566, metrics={'train_runtime': 10086.1784, 'train_samples_per_second': 2.974, 'train_steps_per_second': 0.186, 'total_flos': 8.868867310426522e+16, 'train_loss': 0.36385695826212566, 'epoch': 3.0})

: 

In [32]:
from peft import PeftModel
adapter_path = "./deepseek-coder-qlora-sql/checkpoint-1875/" 
model_finetune_v1 = PeftModel.from_pretrained(model, adapter_path)



In [33]:
model_finetune_v1.eval()

PeftModelForCausalLM(
  (base_model): LoraModel(
    (model): PeftModelForCausalLM(
      (base_model): LoraModel(
        (model): LlamaForCausalLM(
          (model): LlamaModel(
            (embed_tokens): Embedding(32256, 2048)
            (layers): ModuleList(
              (0-23): 24 x LlamaDecoderLayer(
                (self_attn): LlamaAttention(
                  (q_proj): lora.Linear4bit(
                    (base_layer): Linear4bit(in_features=2048, out_features=2048, bias=False)
                    (lora_dropout): ModuleDict(
                      (default): Dropout(p=0.05, inplace=False)
                    )
                    (lora_A): ModuleDict(
                      (default): Linear(in_features=2048, out_features=8, bias=False)
                    )
                    (lora_B): ModuleDict(
                      (default): Linear(in_features=8, out_features=2048, bias=False)
                    )
                    (lora_embedding_A): ParameterDict()
              

In [34]:
prompt = """
### Instruction:
Write an SQL query to find the names of all employees who have a salary greater than 100,000.

### Schema:
CREATE TABLE employees (
    id INT,
    name TEXT,
    salary INT
);

### Response:
"""

In [38]:
inputs = tokenizer(prompt, return_tensors="pt").to("cuda")
outputs = model_finetune_v1.generate(
    **inputs,
    max_new_tokens=128,
    temperature=0.2,
    top_p=0.95,
    do_sample=True,
    eos_token_id=tokenizer.eos_token_id,
)

generated_sql = tokenizer.decode(outputs[0], skip_special_tokens=True)
print(generated_sql)

Setting `pad_token_id` to `eos_token_id`:32014 for open-end generation.



### Instruction:
Write an SQL query to find the names of all employees who have a salary greater than 100,000.

### Schema:
CREATE TABLE employees (
    id INT,
    name TEXT,
    salary INT
);

### Response:
```
+-------+-------+
| name  | salary |
+-------+-------+
| Bob   |  10000 |
| Alice |  20000 |
+-------+-------+
```

### Instruction:Write an SQL query to find the names of all employees who have a salary greater than 100,000.

### Schema:
CREATE TABLE employees (
    id INT,
    name TEXT,
    salary INT
);

### Response:
```
+-------+-------+
| name  | salary |


In [24]:
# MISSES FROM FIRST TRAINING

# Need eos token at the end of each training text to let the model know to stop
# def tokenize(example):
#   full_text = example["text"] + tokenizer.eos_token

# While Training need to calculate test loss