In [1]:
# --- Core Libraries ---
import os
import random
import json
import pandas as pd
import numpy as np
import torch
import time
from tqdm import tqdm
import re

# --- Hugging Face: Dataset, Tokenizer, Model ---
from datasets import load_dataset, DatasetDict, Dataset
from transformers import (
    AutoTokenizer, 
    AutoModelForCausalLM, 
    TrainingArguments, 
    Trainer, 
    DataCollatorForLanguageModeling,
    BitsAndBytesConfig,
    pipeline
)

# --- LoRA & Parameter-Efficient Tuning ---
from peft import LoraConfig, get_peft_model, TaskType, PeftModel

# --- W&B Experiment Tracking ---
import wandb

# --- SQL Evaluation ---
import sqlite3
import sqlparse
from tabulate import tabulate
import evaluate  # for BLEU, ROUGE
import nltk
nltk.download('wordnet')
nltk.download('punkt')

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\sidpk\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\sidpk\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [2]:
torch.cuda.empty_cache()

print("PyTorch version:", torch.__version__)
print("CUDA available:", torch.cuda.is_available())

if torch.cuda.is_available():
    print("Using GPU:", torch.cuda.get_device_name(0))
else:
    print("GPU not detected — will fall back to CPU.")

PyTorch version: 2.5.1+cu121
CUDA available: True
Using GPU: NVIDIA GeForce RTX 4050 Laptop GPU


In [3]:
# Load dataset
dataset = load_dataset("Clinton/Text-to-SQL-v1")
shuffled_dataset = dataset.shuffle(seed=42)

df = pd.DataFrame(shuffled_dataset["train"])
df.sample(5)

Unnamed: 0,instruction,input,response,source,text
39769,What is the location and total attendance for ...,CREATE TABLE table_15872814_5 (\n location_...,SELECT location_attendance FROM table_15872814...,sql_create_context,Below are sql tables schemas paired with instr...
166650,A bar chart for finding the number of the part...,"CREATE TABLE election (\n Election_ID int,\...","SELECT Governor, COUNT(Governor) FROM election...",nvbench,Below are sql tables schemas paired with instr...
160253,How many documents have expenses?,CREATE TABLE ref_budget_codes (\n budget_ty...,SELECT COUNT(*) FROM documents_with_expenses,spider,Below are sql tables schemas paired with instr...
25504,For those employees who was hired before 2002-...,CREATE TABLE departments (\n DEPARTMENT_ID ...,"SELECT JOB_ID, AVG(MANAGER_ID) FROM employees ...",nvbench,Below are sql tables schemas paired with instr...
21890,List the names of technicians in ascending ord...,"CREATE TABLE technician (\n Name VARCHAR,\n...",SELECT Name FROM technician ORDER BY Age,sql_create_context,Below are sql tables schemas paired with instr...


In [4]:
df_clean = df[df["instruction"] != ""].reset_index(drop=True)
print(f"Filtered dataset size: {len(df_clean)}")

Filtered dataset size: 262206


In [5]:
formatted_dataset = Dataset.from_pandas(df_clean[["text"]])
formatted_dataset = formatted_dataset.train_test_split(test_size=0.1, seed=42)

print(formatted_dataset)

DatasetDict({
    train: Dataset({
        features: ['text'],
        num_rows: 235985
    })
    test: Dataset({
        features: ['text'],
        num_rows: 26221
    })
})


In [6]:
# Load Tokenizer

model_name = "deepseek-ai/deepseek-coder-1.3b-base"
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)

In [7]:
#Smart Padding
def tokenize(examples):
    input_ids_list = []
    attention_mask_list = []
    labels_list = []
    
    max_length = 4096

    for full_text in examples["text"]:
        # Extract prompt and response
        prompt_text = full_text.split("### Response:")[0].strip() + "\n### Response:\n"
        response_text = full_text.split("### Response:")[1].strip()
        
        # Tokenize with truncation
        prompt_tokens = tokenizer(prompt_text, truncation=True, max_length=max_length)["input_ids"]
        response_tokens = tokenizer(response_text, truncation=True, max_length=max_length)["input_ids"]
        response_tokens.append(tokenizer.eos_token_id)
        
        # Combine tokens for input
        input_ids = prompt_tokens + response_tokens
        attention_mask = [1] * len(input_ids)
        
        # Create labels - keep prompt tokens, mask response tokens
        labels = input_ids.copy()  # Start with full sequence
        labels = [-100] * len(prompt_tokens) + response_tokens #mask prompt tokens

        input_ids_list.append(input_ids)
        attention_mask_list.append(attention_mask)
        labels_list.append(labels)

    return {
        "input_ids": input_ids_list,
        "attention_mask": attention_mask_list,
        "labels": labels_list
    }

data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=False,  # because this is causal LM
    pad_to_multiple_of=16  # speeds up training on GPU
)

In [8]:
import sqlite3
import re

def fix_missing_semicolons(sql_code):
    """
    Inserts semicolons between multiple CREATE TABLE statements if missing.
    Looks for patterns like `) CREATE TABLE` and adds a semicolon between them.
    """
    return re.sub(r'\)\s*(?=CREATE TABLE)', r');\n', sql_code.strip())

def can_execute_sql(generated_sql, schema=None, verbose=True):
    """
    Check if a SQL query or script can be executed against a given schema.

    Args:
        generated_sql (str): The SQL query or script to test.
        schema (str, optional): The database schema to create before testing.
        verbose (bool, optional): Whether to print detailed errors.

    Returns:
        tuple: (bool, str) - (success status, message or error)
    """
    conn = None
    try:
        conn = sqlite3.connect(":memory:")
        cursor = conn.cursor()

        # Create schema if provided
        if schema:
            try:
                schema = fix_missing_semicolons(schema)
                cursor.executescript(schema)
                conn.commit()
            except sqlite3.Error as e:
                if verbose:
                    print("Schema execution failed.")
                    print("Error:", e)
                return False

        # Execute the query or script
        try:
            if ';' in generated_sql.strip().rstrip(';'):
                cursor.executescript(generated_sql)
                return True
            else:
                cursor.execute(generated_sql)
                return True
        except sqlite3.Error as e:
            if verbose:
                print("Query execution failed.")
                print("Error:", e)
            return False

    except Exception as e:
        if verbose:
            print("General error.")
            print("Error:", e)
        return False

    finally:
        if conn:
            conn.close()

In [21]:
#computing the metrics for the baseline model based on similarilty of output, sql compilation and time

# Load metrics
meteor_metric = evaluate.load("meteor")

def extract_sql_from_output(output_text, prompt_text):
    """Extract SQL query from model output, handling various formats."""
    # Remove the prompt from the output
    sql_text = output_text[len(prompt_text):].strip()
    
    # Remove any markdown code blocks if present
    sql_text = re.sub(r'```sql\s*|\s*```', '', sql_text)
    sql_text = re.sub(r'```\s*|\s*```', '', sql_text)
    
    # Remove any trailing text after semicolon
    if ';' in sql_text:
        sql_text = sql_text.split(';')[0] + ';'
    
    return sql_text.strip()

def evaluate_model_on_dataset(
    model,
    tokenizer,
    dataset,
    max_new_tokens=2048
):
    predictions = []
    references = []
    compile_success = 0
    execution_times = []

    dataset_slice = dataset

    for example in tqdm(dataset_slice, desc="Evaluating"):
        # Extract prompt and response using the same format as tokenize function
        prompt_text = example["text"].split("### Response:")[0].strip() + "\n### Response:\n"
        ground_truth = example["text"].split("### Response:")[1].strip()
        schema = example["text"].split("### Input:")[1].split("### Response:")[0].strip()

        inputs = tokenizer(prompt_text, return_tensors="pt").to(model.device)
        with torch.no_grad():
            outputs = model.generate(
                **inputs,
                eos_token_id=tokenizer.eos_token_id,
                max_new_tokens=2048,
                pad_token_id=tokenizer.eos_token_id
                )
        
        # Get the generated SQL - everything after the prompt
        decoded = tokenizer.decode(outputs[0], skip_special_tokens=True)
        #generated_sql = extract_sql_from_output(decoded, prompt_text)
        generated_sql = decoded.split("### Response:")[-1].strip().split("###")[0]
        print("SQL Output:", generated_sql)

        # Add prediction for METEOR
        predictions.append(generated_sql)
        references.append([ground_truth])  # METEOR expects references as a list of lists

        # Compile SQL Query and measure time
        start_time = time.perf_counter()
        success = can_execute_sql(generated_sql, schema)
        end_time = time.perf_counter()

        if success:
            compile_success += 1
            execution_times.append(end_time - start_time)

    # Compute metrics
    meteor_score = meteor_metric.compute(predictions=predictions, references=references)["meteor"]
    sql_compilation_rate = compile_success / len(dataset_slice)
    
    # Calculate average execution time for successful queries
    avg_execution_time = sum(execution_times) / len(execution_times) if execution_times else 0

    metrics = {
        "meteor_score": round(meteor_score, 4),
        "sql_compilation_rate": round(sql_compilation_rate, 4),
        "avg_execution_time_ms": round(avg_execution_time * 1000, 2),  # Convert to milliseconds
        "num_eval_samples": len(dataset_slice),
        "num_successful_queries": compile_success
    }

    return metrics

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\sidpk\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\sidpk\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\sidpk\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


In [10]:
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_compute_dtype=torch.float16,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4"
)

base_model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=bnb_config,
    device_map="auto",
    trust_remote_code=True
)

In [11]:
# Load the finetuned model from checkpoint 3750 in the correct directory
model_finetune = PeftModel.from_pretrained(
    base_model, 
    "./deepseek-coder-qlora-sql/checkpoint-3750",
    device_map="auto"
)
model_finetune.eval()

PeftModelForCausalLM(
  (base_model): LoraModel(
    (model): LlamaForCausalLM(
      (model): LlamaModel(
        (embed_tokens): Embedding(32256, 2048)
        (layers): ModuleList(
          (0-23): 24 x LlamaDecoderLayer(
            (self_attn): LlamaAttention(
              (q_proj): lora.Linear4bit(
                (base_layer): Linear4bit(in_features=2048, out_features=2048, bias=False)
                (lora_dropout): ModuleDict(
                  (default): Dropout(p=0.05, inplace=False)
                )
                (lora_A): ModuleDict(
                  (default): Linear(in_features=2048, out_features=8, bias=False)
                )
                (lora_B): ModuleDict(
                  (default): Linear(in_features=8, out_features=2048, bias=False)
                )
                (lora_embedding_A): ParameterDict()
                (lora_embedding_B): ParameterDict()
                (lora_magnitude_vector): ModuleDict()
              )
              (k_proj): Linear

In [12]:
prompt = """	
Below are sql tables schemas paired with instruction that describes a task. Using valid SQLite, write a response that appropriately completes the request for the provided tables. ### Instruction: What model has a launch of September 3, 2010? ### Input: CREATE TABLE table_28269 (
"Model" text,
"Launch" text,
"Code name" text,
"Transistors (million)" real,
"Die size (mm 2 )" real,
"Bus interface" text,
"Memory ( MB )" text,
"SM count" real,
"Core config 1,3" text,
"Core ( MHz )" real,
"Shader ( MHz )" real,
"Memory ( MHz )" text,
"Pixel ( GP /s)" text,
"Texture ( GT /s)" text,
"Bandwidth ( GB /s)" text,
"DRAM type" text,
"Bus width ( bit )" real,
"GFLOPS (FMA) 2" text,
"TDP (watts)" real,
"Release price (USD)" text
) ### Response:
"""

In [13]:
inputs = tokenizer(prompt, return_tensors="pt").to("cuda")
outputs = model_finetune.generate(
                **inputs,
                eos_token_id=tokenizer.eos_token_id,
                max_new_tokens=2048,
                pad_token_id=tokenizer.eos_token_id
                )

generated_sql = tokenizer.decode(outputs[0], skip_special_tokens=True)
# Extract only the response part (everything after "### Response:")
generated_sql = generated_sql.split("### Response:")[-1].strip().split("###")[0]
print(generated_sql)

SELECT "Model" FROM table_28269 WHERE "Launch" = 'September 3, 2010' ORDER BY "Model" LIMIT 1


In [22]:
# Evaluate fine-tuned model performance
print("Evaluating fine-tuned model...")

# Create a test set
test_samples = formatted_dataset["test"].select(range(10))  # Using 10 samples for evaluation

# Evaluate fine-tuned model
finetuned_metrics = evaluate_model_on_dataset(
    model=model_finetune,  # Fine-tuned model
    tokenizer=tokenizer,
    dataset=test_samples,
    max_new_tokens=256
)

# Print metrics
print("\nFine-tuned Model Performance:")
print(f"{'Metric':<25} {'Value':<15}")
print("-" * 40)

for metric in ['meteor_score', 'sql_compilation_rate', 'avg_execution_time_ms']:
    value = finetuned_metrics[metric]
    print(f"{metric:<25} {value:<15.4f}")

print(f"\nNumber of samples evaluated: {finetuned_metrics['num_eval_samples']}")
print(f"Number of successful queries: {finetuned_metrics['num_successful_queries']}")

Evaluating fine-tuned model...


Evaluating:  10%|█         | 1/10 [02:24<21:43, 144.86s/it]

SQL Output: SELECT high_points FROM table_13464416_4 WHERE game = "7" ORDER BY DESC LIMIT 1000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000

Evaluating:  20%|██        | 2/10 [04:39<18:31, 138.92s/it]

SQL Output: SELECT COUNT("Player") FROM table_1007 WHERE "Prior experience" = 'shasta h.s.' AND "Class" = 'class' 


Evaluating:  30%|███       | 3/10 [07:41<18:30, 158.62s/it]

SQL Output: SELECT MIN(charttime) FROM chartevents WHERE subject_id = 48706 AND itemid = 10000000 AND charttime > '2104' AND icustay_id IS NULL AND hadm_id IN (SELECT hadm_id FROM admissions WHERE subject_id = 48706) AND icustay_id IN (SELECT icustay_id FROM icustays WHERE subject_id = 48706) AND icustay_id IN (SELECT icustay_id FROM diagnoses_icd WHERE subject_id = 48706 AND icd9_code = '10000000' AND hadm_id IN (SELECT hadm_id FROM admissions WHERE subject_id = 48706)) AND icustay_id IN (SELECT icustay_id FROM diagnoses_icd WHERE subject_id = 48706 AND icd9_code = '10000000' AND hadm_id IN (SELECT hadm_id FROM admissions WHERE subject_id = 48706)) AND icustay_id IN (SELECT icustay_id FROM diagnoses_icd WHERE subject_id = 48706 AND icd9_code = '10000000' AND hadm_id IN (SELECT hadm_id FROM admissions WHERE subject_id = 48706)) AND icustay_id IN (SELECT icustay_id FROM diagnoses_icd WHERE subject_id = 48706 AND icd9_code = '10000000' AND hadm_id IN (SELECT hadm_id FROM admissions WHERE

Evaluating:  40%|████      | 4/10 [10:26<16:06, 161.07s/it]

SQL Output: SELECT "Height in Ft." FROM table_10263 WHERE "Player" = 'illinois' 


Evaluating:  50%|█████     | 5/10 [12:37<12:30, 150.13s/it]

SQL Output: SELECT "Military expenditures (2011, % of GDP)" FROM table_1788 WHERE "Country" = 'romania' AND "Population (2011)" = '2011' AND "GDP (nominal) (2010, US$ millions)" = '2011' AND "Military expenditures (2011, US$ millions)" = '2011' AND "Military expenditures (2011, % of GDP)" = '2011' AND "Defence expenditures, (2011, per capita)" = '2011' AND "Deployable military (2011, thousands)" = '2011' AND "Country" = 'romania' AND "Population (2011)" = '2011' AND "GDP (nominal) (2010, US$ millions)" = '2011' AND "Military expenditures (2011, % of GDP)" = '2011' AND "Defence expenditures, (2011, per capita)" = '2011' AND "Deployable military (2011, thousands)" = '2011' AND "Military expenditures (2011, US$ millions)" = '2011' AND "Military expenditures (2011, % of GDP)" = '2011' AND "Defence expenditures, (2011, per capita)" = '2011' AND "Deployable military (2011, thousands)" = '2011' AND "GDP (nominal) (2010, US$ millions)" = '2011' AND "Military expenditures (2011, US$ millions)" 

Evaluating:  60%|██████    | 6/10 [14:41<09:25, 141.33s/it]

SQL Output: SELECT DISTINCT flight.flight_id FROM flight JOIN city ON city.city_code = flight.from_airport JOIN airport ON airport.airport_code = city.state_code JOIN airport ON airport.airport_code = flight.to_airport WHERE flight.airline_code = 'PHILADELPHIA' AND flight.departure_time < 1000000 AND flight.arrival_time > 1000000 AND flight.departure_time < 1000000 AND flight.arrival_time > 1000000 AND flight.departure_time < 1000000 AND flight.arrival_time > 1000000 AND flight.departure_time < 1000000 AND flight.arrival_time > 1000000 AND flight.departure_time < 1000000 AND flight.arrival_time > 1000000 AND flight.departure_time < 1000000 AND flight.arrival_time > 1000000 AND flight.departure_time < 1000000 AND flight.arrival_time > 1000000 AND flight.departure_time < 1000000 AND flight.arrival_time > 1000000 AND flight.departure_time < 1000000 AND flight.arrival_time > 1000000 AND flight.departure_time < 1000000 AND flight.arrival_time > 1000000 AND flight.departure_time < 1000000 AN

Evaluating:  70%|███████   | 7/10 [16:47<06:48, 136.32s/it]

SQL Output: SELECT "Label" FROM table_8208 WHERE "Date" = '1995-01-01' AND "Region" = 'USA' AND "Format" = 'CD' AND "Version" = '1.00' AND "Label" = 'Original CD' AND "Date" = '1995-01-01' AND "Region" = 'USA' AND "Format" = 'CD' AND "Version" = '1.00' AND "Label" = 'Original CD' AND "Date" = '1995-01-01' AND "Region" = 'USA' AND "Format" = 'CD' AND "Version" = '1.00' AND "Label" = 'Original CD' AND "Date" = '1995-01-01' AND "Region" = 'USA' AND "Format" = 'CD' AND "Version" = '1.00' AND "Label" = 'Original CD' AND "Date" = '1995-01-01' AND "Region" = 'USA' AND "Format" = 'CD' AND "Version" = '1.00' AND "Label" = 'Original CD' AND "Date" = '1995-01-01' AND "Region" = 'USA' AND "Format" = 'CD' AND "Version" = '1.00' AND "Label" = 'Original CD' AND "Date" = '1995-01-01' AND "Region" = 'USA' AND "Format" = 'CD' AND "Version" = '1.00' AND "Label" = 'Original CD' AND "Date" = '1995-01-01' AND "Region" = 'USA' AND "Format" = 'CD' AND "Version" = '1.00' AND "Label" = 'Original CD' AND "Date" 

Evaluating:  80%|████████  | 8/10 [19:03<04:32, 136.29s/it]

SQL Output: SELECT "Youth Classification" FROM table_26425 WHERE "Aggressive Rider" = 'michael barry' AND "Stage (Winner)" = 'michael barry' AND "General classification" = 'michael barry' AND "Sprint Classification" = 'michael barry' AND "Mountains Classification" = 'michael barry' AND "Team Classification" = 'michael barry' AND "Youth Classification" = 'michael barry' AND "Stage (Winner)" = 'michael barry' AND "General classification" = 'michael barry' AND "Sprint Classification" = 'michael barry' AND "Mountains Classification" = 'michael barry' AND "Youth Classification" = 'michael barry' AND "Stage (Winner)" = 'michael barry' AND "General classification" = 'michael barry' AND "Sprint Classification" = 'michael barry' AND "Mountains Classification" = 'michael barry' AND "Youth Classification" = 'michael barry' AND "Stage (Winner)" = 'michael barry' AND "General classification" = 'michael barry' AND "Sprint Classification" = 'michael barry' AND "Mountains Classification" = 'michael ba

Evaluating:  90%|█████████ | 9/10 [21:22<02:17, 137.00s/it]

SQL Output: SELECT COUNT(*) AS "
Query execution failed.
Error: unrecognized token: """


Evaluating: 100%|██████████| 10/10 [23:35<00:00, 141.50s/it]

SQL Output: SELECT record FROM table_22669044_8 WHERE location_attendance =
Query execution failed.
Error: incomplete input

Fine-tuned Model Performance:
Metric                    Value          
----------------------------------------
meteor_score              0.4016         
sql_compilation_rate      0.3000         
avg_execution_time_ms     0.6100         

Number of samples evaluated: 10
Number of successful queries: 3



