In [1]:
# Install required packages
!pip install datasets
!pip install transformers -U
!pip install accelerate -U
!pip install trl
!pip install bitsandbytes
!pip install peft

Collecting datasets
  Downloading datasets-4.0.0-py3-none-any.whl.metadata (19 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2025.3.0,>=2023.1.0 (from fsspec[http]<=2025.3.0,>=2023.1.0->datasets)
  Downloading fsspec-2025.3.0-py3-none-any.whl.metadata (11 kB)
Collecting aiohttp!=4.0.0a0,!=4.0.0a1 (from fsspec[http]<=2025.3.0,>=2023.1.0->datasets)
  Downloading aiohttp-3.12.15-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (7.7 kB)
Collecting aiohappyeyeballs>=2.5.0 (from aiohttp!=4.0.0a0,!=4.0.0a1->fsspec[http]<=2025.3.0,>=2023.1.0->datasets)
  Downloading aiohappyeyeballs-2.6.1-py3-none-any.whl.metadata (5.9 kB)
Collecting aiosigna

In [2]:
import torch
from datasets import load_dataset
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    TrainingArguments,
    Trainer,
    DataCollatorForLanguageModeling
)
from peft import (
    LoraConfig,
    get_peft_model,
    prepare_model_for_kbit_training
)
import bitsandbytes as bnb

# Set device
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Using device: {device}")



Using device: cpu


In [3]:
# Load dataset
DATASET_NAME = "ChrisHayduk/Llama-2-SQL-Dataset"
dataset = load_dataset(DATASET_NAME)

# Select first 1000 samples for fine-tuning
full_training_dataset = dataset["train"]
shuffled = full_training_dataset.shuffle()
training_dataset = shuffled.select(range(1000))  # Select only the first 1000 for fine tuning

print(f"Training dataset size: {len(training_dataset)}")
print(f"Sample data point: {training_dataset[0]}")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md:   0%|          | 0.00/631 [00:00<?, ?B/s]

(…)-00000-of-00001-922416e34c5bc71c.parquet:   0%|          | 0.00/9.11M [00:00<?, ?B/s]

(…)-00000-of-00001-6907aec719559d7d.parquet:   0%|          | 0.00/1.01M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/70719 [00:00<?, ? examples/s]

Generating eval split:   0%|          | 0/7858 [00:00<?, ? examples/s]

Training dataset size: 1000
Sample data point: {'input': 'Below is an instruction that describes a SQL generation task, paired with an input that provides further context about the available table schemas. Write SQL code that appropriately answers the request.\n\n### Instruction:\nFor the player that scored 27 goals, what years did he score them?\n\n### Input:\nCREATE TABLE table_name_62 (years VARCHAR, goals VARCHAR)\n\n### Response: ', 'output': 'SELECT years FROM table_name_62 WHERE goals = 27'}


In [4]:
# Model configuration
MODEL_NAME = "NousResearch/Llama-2-7b-hf"

# BitsAndBytes configuration for 4-bit quantization
quantization_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype="float16"
)

print("Loading model with quantization...")

Loading model with quantization...


In [5]:
# Load model with quantization
model = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME,
    quantization_config=quantization_config,
    device_map="auto"
)

model.config.use_cache = True
print("Model loaded successfully!")

config.json:   0%|          | 0.00/583 [00:00<?, ?B/s]

model.safetensors.index.json: 0.00B [00:00, ?B/s]

Fetching 2 files:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/9.98G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/3.50G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/200 [00:00<?, ?B/s]

Model loaded successfully!


In [6]:
# Load tokenizer
tokenizer = AutoTokenizer.from_pretrained(
    MODEL_NAME,
    trust_remote_code=True
)

tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"
print("Tokenizer configured!")

tokenizer_config.json:   0%|          | 0.00/746 [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/435 [00:00<?, ?B/s]

Tokenizer configured!


In [7]:
# Data preprocessing function
def construct_datapoint(x):
    combined = x['input'] + x['output']
    return tokenizer(combined, padding=True)

# Apply tokenization to dataset
training_dataset = training_dataset.map(construct_datapoint)

# Print dataset info
print(training_dataset)

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

Dataset({
    features: ['input', 'output', 'input_ids', 'attention_mask'],
    num_rows: 1000
})


In [8]:
# LoRA configuration
peft_config = LoraConfig(
    r=16,
    lora_alpha=32,
    target_modules=["q_proj", "k_proj", "down_proj", "v_proj", "gate_proj", "o_proj", "up_proj"],
    lora_dropout=0.05,
    task_type="CAUSAL_LM"
)

print("LoRA config created!")

LoRA config created!


In [9]:
# Prepare model for k-bit training and apply LoRA
model = prepare_model_for_kbit_training(model)
model = get_peft_model(model, peft_config)  # all layers except attention layers are frozen

print("Model prepared for LoRA training!")
print(f"Trainable parameters: {model.print_trainable_parameters()}")

Model prepared for LoRA training!
trainable params: 39,976,960 || all params: 6,778,392,576 || trainable%: 0.5898
Trainable parameters: None


In [10]:
# Generation configuration
generation_configuration = model.generation_config
generation_configuration.pad_token_id = tokenizer.eos_token_id
generation_configuration.eos_token_id = tokenizer.eos_token_id
generation_configuration.max_new_tokens = 256
generation_configuration.temperature = 0.7
generation_configuration.top_p = 0.9
generation_configuration.do_sample = True

print("Generation configuration set!")

Generation configuration set!


In [11]:
# Generate function for testing
def generate(prompt):
    generation_configuration.max_new_tokens = 20

    encoded = tokenizer.encode(prompt, add_special_tokens=True, return_tensors="pt").to(device)
    with torch.inference_mode():
        out = model.generate(input_ids=encoded, generation_config=generation_configuration, repetition_penalty=2)
    string_decoded = tokenizer.decode(out[0], clean_up_tokenization_spaces=True)
    print(string_decoded)

print("Generation function defined!")

Generation function defined!


In [12]:
# Training arguments
train_arguments = TrainingArguments(
    per_device_train_batch_size=1,
    gradient_accumulation_steps=4,  # simulate a larger batch size
    num_train_epochs=1,
    learning_rate=2e-4,
    fp16=True,
    optim="paged_adamw_8bit",
    lr_scheduler_type="cosine",
    warmup_ratio=0.05,
    output_dir="fine_tuning"
)

print("Training arguments configured!")

Training arguments configured!


In [14]:
# Create trainer
trainer = Trainer(
    model=model,
    train_dataset=training_dataset,
    data_collator=DataCollatorForLanguageModeling(tokenizer, mlm=False),
    args=train_arguments
)

# Disable cache for training
model.config.use_cache = False

print("Trainer created and ready!")

ValueError: fp16 mixed precision requires a GPU (not 'xla').

In [None]:
# Start training
print("Starting training...")
trainer.train()
print("Training completed!")

In [None]:
# Test the model after training
evaluation_dataset = dataset['eval'].shuffle()
sample_sql_question = evaluation_dataset[0]['input']
correct_answer = evaluation_dataset[0]['output']

print("Sample SQL Question:")
print(sample_sql_question)
print("\nExpected Answer:")
print(correct_answer)
print("\nGenerated Answer:")
generate(sample_sql_question)

In [None]:
# Save the fine-tuned model
model.save_pretrained("fine_tuned_llama_sql")
tokenizer.save_pretrained("fine_tuned_llama_sql")
