# Define Variables, Parameters, and Configurations 

In [2]:
from huggingface_hub import login, whoami
hf_token = "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"

login(token=hf_token)
user_info = whoami()
print("Logged in as:", user_info["name"])

Logged in as: sdhaduk


In [3]:
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig, HfArgumentParser, logging
from peft import LoraConfig, PeftModel
import torch
from trl import SFTTrainer, SFTConfig
from datasets import load_dataset

model_one_name = "google/gemma-2b-it" # already instruction fine-tuned
model_two_name = "microsoft/phi-2" # focused  on code, chat and QA tasks
model_three_name = "mistralai/Mistral-7B-v0.1" # only pretrained, but much larger (7B params)

# LoRA parameters
lora_r = 8
lora_alpha = 16
lora_dropout = 0.1

# bistandbytes parameters
use_4bit = True
bnb_4bit_compute_dtype = "float16"
bnb_4bit_quant_type = "nf4"
use_nested_quant = False

# training argument params
output_dir = "./results"
logging_dir = "./logs"
epochs = 1
fp16 = False
bf16 = True
train_batch_size = 4
eval_batch_size = 2
max_grad_norm = 0.3
gradient_accumulation_steps = 1
lr = 2e-4
weight_decay = 0.001
optim = "paged_adamw_32bit"
lr_scheduler_type = "constant"
warmup_ratio = 0.03
group_by_length = True
log_steps = 50
eval_steps = 500

# SFT params
max_seq_len = 256
packing = True
device_map = {"":0}

In [4]:
bnb_config = BitsAndBytesConfig(
    load_in_4bit=use_4bit,
    bnb_4bit_quant_type=bnb_4bit_quant_type,
    bnb_4bit_compute_dtype=torch.bfloat16 if bf16 else torch.float16,
    bnb_4bit_use_double_quant=False
)

In [5]:
training_args = SFTConfig(
    output_dir=output_dir,
    logging_dir=logging_dir,
    num_train_epochs=epochs,
    per_device_train_batch_size=train_batch_size,
    per_device_eval_batch_size=eval_batch_size,
    gradient_accumulation_steps=gradient_accumulation_steps,
    optim=optim,
    save_strategy="no",
    eval_strategy="steps",
    eval_steps=eval_steps,
    logging_strategy="steps",
    logging_steps=log_steps,
    learning_rate=lr,
    weight_decay=weight_decay,
    bf16=bf16,
    max_grad_norm=max_grad_norm,
    warmup_ratio=warmup_ratio,
    group_by_length=group_by_length,
    lr_scheduler_type=lr_scheduler_type,
    report_to="tensorboard",
    max_seq_length=max_seq_len,
)

# Initialize Dataset

In [10]:
dataset_name = "Clinton/Text-to-sql-v1"
dataset = load_dataset(dataset_name, split="train")

dataset = dataset.select(range(25000))
print(len(dataset))

25000


In [11]:
train_set = dataset.select(range(0, 20000))        # First 20,000 for training
test_set = dataset.select(range(20000, 24000))     # Next 4,000 for testing
val_set = dataset.select(range(24000, 25000))      # Last 1,000 for validation

print(train_set[0])

{'instruction': 'Name the home team for carlton away team', 'input': 'CREATE TABLE table_name_77 (\n    home_team VARCHAR,\n    away_team VARCHAR\n)', 'response': 'SELECT home_team FROM table_name_77 WHERE away_team = "carlton"', 'source': 'sql_create_context', 'text': 'Below are sql tables schemas paired with instruction that describes a task. Using valid SQLite, write a response that appropriately completes the request for the provided tables. ### Instruction: Name the home team for carlton away team ### Input: CREATE TABLE table_name_77 (\n    home_team VARCHAR,\n    away_team VARCHAR\n) ### Response: SELECT home_team FROM table_name_77 WHERE away_team = "carlton"'}


# Load and Fine-tune Each Model 

## Gemma-2B-IT

In [35]:
gemma_model = AutoModelForCausalLM.from_pretrained(
    model_one_name,
    token=hf_token,
    quantization_config=bnb_config,
    device_map=device_map
)
gemma_model.config.pretraining_tp = 1
gemma_model.config.use_cache = False

gemma_tokenizer = AutoTokenizer.from_pretrained(
    model_one_name,
    trust_remote_code=True,
    token=hf_token
)

gemma_tokenizer.pad_token = gemma_tokenizer.eos_token
gemma_tokenizer.padding_side = "right"

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [36]:
print(gemma_model,"\n")
print(next(gemma_model.parameters()).dtype)

GemmaForCausalLM(
  (model): GemmaModel(
    (embed_tokens): Embedding(256000, 2048, padding_idx=0)
    (layers): ModuleList(
      (0-17): 18 x GemmaDecoderLayer(
        (self_attn): GemmaAttention(
          (q_proj): Linear4bit(in_features=2048, out_features=2048, bias=False)
          (k_proj): Linear4bit(in_features=2048, out_features=256, bias=False)
          (v_proj): Linear4bit(in_features=2048, out_features=256, bias=False)
          (o_proj): Linear4bit(in_features=2048, out_features=2048, bias=False)
        )
        (mlp): GemmaMLP(
          (gate_proj): Linear4bit(in_features=2048, out_features=16384, bias=False)
          (up_proj): Linear4bit(in_features=2048, out_features=16384, bias=False)
          (down_proj): Linear4bit(in_features=16384, out_features=2048, bias=False)
          (act_fn): GELUActivation()
        )
        (input_layernorm): GemmaRMSNorm((2048,), eps=1e-06)
        (post_attention_layernorm): GemmaRMSNorm((2048,), eps=1e-06)
      )
    )
    (n

In [37]:
peft_config = LoraConfig(
    lora_alpha=lora_alpha,
    lora_dropout=lora_dropout,
    r=lora_r,
    bias="none",
    task_type="CAUSAL_LM",
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj"]
)

In [38]:
def format_for_gemma_it(example):
    return {
        "text": f"<start_of_turn>user\n{example['instruction']}\n\n{example['input']}\n<end_of_turn>\n"
                f"<start_of_turn>model\n{example['response']}\n<end_of_turn>"
    }

gemma_train_set = train_set.map(format_for_gemma_it, remove_columns=dataset.column_names)
gemma_test_set = test_set.map(format_for_gemma_it, remove_columns=dataset.column_names)
gemma_val_set = val_set.map(format_for_gemma_it, remove_columns=dataset.column_names)

print(gemma_train_set[0]['text'])

<start_of_turn>user
Name the home team for carlton away team

CREATE TABLE table_name_77 (
    home_team VARCHAR,
    away_team VARCHAR
)
<end_of_turn>
<start_of_turn>model
SELECT home_team FROM table_name_77 WHERE away_team = "carlton"
<end_of_turn>


In [39]:
def formatting_func(example):
    return example["text"]
    
trainer = SFTTrainer(
    model=gemma_model,
    args=training_args,
    train_dataset=gemma_train_set,
    eval_dataset=gemma_val_set,
    peft_config=peft_config,
    formatting_func=formatting_func,
    processing_class=gemma_tokenizer
)

No label_names provided for model class `PeftModelForCausalLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


In [40]:
print("Training Sample:\n",gemma_train_set[0]["text"],"\n\n")
input_text = "<start_of_turn>user\nName the home team for carlton away team\n\nCREATE TABLE table_name_77 (\n    home_team VARCHAR,\n    away_team VARCHAR\n)\n<end_of_turn>"

input_ids = gemma_tokenizer(input_text, return_tensors="pt").to("cuda")
outputs = gemma_model.generate(**input_ids, max_new_tokens=50)
print(gemma_tokenizer.decode(outputs[0]))

Training Sample:
 <start_of_turn>user
Name the home team for carlton away team

CREATE TABLE table_name_77 (
    home_team VARCHAR,
    away_team VARCHAR
)
<end_of_turn>
<start_of_turn>model
SELECT home_team FROM table_name_77 WHERE away_team = "carlton"
<end_of_turn> 


<bos><start_of_turn>user
Name the home team for carlton away team

CREATE TABLE table_name_77 (
    home_team VARCHAR,
    away_team VARCHAR
)
<end_of_turn><eos>


In [41]:
trainer.train()

trainer.model.save_pretrained(f"{output_dir}/{model_one_name}")

Step,Training Loss,Validation Loss
500,0.7515,0.778836
1000,0.6946,0.687183
1500,0.649,0.655601
2000,0.6863,0.634724
2500,0.5977,0.616568
3000,0.6552,0.610935
3500,0.6393,0.599228
4000,0.6051,0.596571
4500,0.5789,0.590211
5000,0.6192,0.586613


In [14]:
print("Training Sample:\n",gemma_train_set[0]["text"],"\n\n")
input_text = "<start_of_turn>user\nName the home team for carlton away team\n\nCREATE TABLE table_name_77 (\n    home_team VARCHAR,\n    away_team VARCHAR\n)\n<end_of_turn>"

input_ids = gemma_tokenizer(input_text, return_tensors="pt").to("cuda")
outputs = gemma_model.generate(**input_ids, max_new_tokens=50)
print(gemma_tokenizer.decode(outputs[0]))

Training Sample:
 <start_of_turn>user
Name the home team for carlton away team

CREATE TABLE table_name_77 (
    home_team VARCHAR,
    away_team VARCHAR
)
<end_of_turn>
<start_of_turn>model
SELECT home_team FROM table_name_77 WHERE away_team = "carlton"
<end_of_turn> 


<bos><start_of_turn>user
Name the home team for carlton away team

CREATE TABLE table_name_77 (
    home_team VARCHAR,
    away_team VARCHAR
)
<end_of_turn>
<start_of_turn>model
SELECT home_team FROM table_name_77 WHERE away_team = "carlton"
<end_of_turn><eos>


## Phi-2

In [26]:
phi_model = AutoModelForCausalLM.from_pretrained(
    model_two_name,
    token=hf_token,
    quantization_config=bnb_config,
    device_map=device_map
)
phi_model.config.pretraining_tp = 1
phi_model.config.use_cache = False

phi_tokenizer = AutoTokenizer.from_pretrained(
    model_two_name,
    trust_remote_code=True,
    token=hf_token
)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [27]:
print(phi_model,"\n")
print(next(phi_model.parameters()).dtype)

PhiForCausalLM(
  (model): PhiModel(
    (embed_tokens): Embedding(51200, 2560)
    (layers): ModuleList(
      (0-31): 32 x PhiDecoderLayer(
        (self_attn): PhiAttention(
          (q_proj): Linear4bit(in_features=2560, out_features=2560, bias=True)
          (k_proj): Linear4bit(in_features=2560, out_features=2560, bias=True)
          (v_proj): Linear4bit(in_features=2560, out_features=2560, bias=True)
          (dense): Linear4bit(in_features=2560, out_features=2560, bias=True)
        )
        (mlp): PhiMLP(
          (activation_fn): NewGELUActivation()
          (fc1): Linear4bit(in_features=2560, out_features=10240, bias=True)
          (fc2): Linear4bit(in_features=10240, out_features=2560, bias=True)
        )
        (input_layernorm): LayerNorm((2560,), eps=1e-05, elementwise_affine=True)
        (resid_dropout): Dropout(p=0.1, inplace=False)
      )
    )
    (rotary_emb): PhiRotaryEmbedding()
    (embed_dropout): Dropout(p=0.0, inplace=False)
    (final_layernorm): 

In [28]:
peft_config = LoraConfig(
    lora_alpha=lora_alpha,
    lora_dropout=lora_dropout,
    r=lora_r,
    bias="none",
    task_type="CAUSAL_LM",
    target_modules=["q_proj", "k_proj", "v_proj", "dense", "fc1", "fc2"]
)

In [24]:
MAX_CONTEXT_LEN = 2048

def format_and_tokenize_for_phi2(example):
    text = f"Instruct: {example['instruction']}\nInput: {example['input']}\nOutput: {example['response']}"
    tokenized = phi_tokenizer(
        text,
        truncation=True,
        max_length=MAX_CONTEXT_LEN,
        padding=False,  
        return_tensors=None,
    )
    return tokenized

phi2_train_set = train_set.map(format_and_tokenize_for_phi2, remove_columns=train_set.column_names)
phi2_val_set = val_set.map(format_and_tokenize_for_phi2, remove_columns=val_set.column_names)
phi2_test_set = test_set.map(format_and_tokenize_for_phi2, remove_columns=test_set.column_names)

print(phi2_train_set[0]['input_ids'])

[43993, 25, 6530, 262, 1363, 1074, 329, 1097, 75, 1122, 1497, 1074, 198, 20560, 25, 29244, 6158, 43679, 3084, 62, 3672, 62, 3324, 357, 198, 50284, 11195, 62, 15097, 569, 31315, 1503, 11, 198, 50284, 8272, 62, 15097, 569, 31315, 1503, 198, 8, 198, 26410, 25, 33493, 1363, 62, 15097, 16034, 3084, 62, 3672, 62, 3324, 33411, 1497, 62, 15097, 796, 366, 66, 7063, 1122, 1]


In [29]:
trainer = SFTTrainer(
    model=phi_model,
    args=training_args,
    train_dataset=phi2_train_set,
    eval_dataset=phi2_val_set,
    peft_config=peft_config,
    processing_class=phi_tokenizer
)

No label_names provided for model class `PeftModelForCausalLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


In [34]:
from torch.cuda.amp import autocast

# Construct input
instruction = "Name the home team for carlton away team"
input_sql = "CREATE TABLE table_name_77 (\n    home_team VARCHAR,\n    away_team VARCHAR\n)"
input_text = f"Instruct: {instruction}\nInput: {input_sql}\nOutput:"

# Tokenize
inputs = phi_tokenizer(input_text, return_tensors="pt").to("cuda")

# Generate with AMP autocast
with autocast(dtype=torch.bfloat16):
    outputs = phi_model.generate(
        **inputs,
        max_new_tokens=50,
        do_sample=False,
    )

# Decode output
print(phi_tokenizer.decode(outputs[0], skip_special_tokens=True))

  with autocast(dtype=torch.bfloat16):
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Instruct: Name the home team for carlton away team
Input: CREATE TABLE table_name_77 (
    home_team VARCHAR,
    away_team VARCHAR
)
Output: CREATE TABLE table_name_77 (
    home_team VARCHAR(255),
    away_team VARCHAR(255)
)



In [35]:
trainer.train()

trainer.model.save_pretrained(f"{output_dir}/{model_two_name}")

Step,Training Loss,Validation Loss
500,1.0991,1.130019
1000,0.9712,0.956167
1500,0.8878,0.92376
2000,0.8832,0.817331
2500,0.7892,0.778143
3000,0.8132,0.747285
3500,0.7702,0.723924
4000,0.7283,0.710765
4500,0.7045,0.697268
5000,0.7267,0.686854


In [36]:
# Generate with AMP autocast
with autocast(dtype=torch.bfloat16):
    outputs = phi_model.generate(
        **inputs,
        max_new_tokens=50,
        do_sample=False,
    )

# Decode output
print(phi_tokenizer.decode(outputs[0], skip_special_tokens=True))

  with autocast(dtype=torch.bfloat16):
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Instruct: Name the home team for carlton away team
Input: CREATE TABLE table_name_77 (
    home_team VARCHAR,
    away_team VARCHAR
)
Output: SELECT home_team FROM table_name_77 WHERE away_team = "carlton" AND home_team = "carlton"
Output: SELECT home_team FROM table_name_77 WHERE away_team = "c


## Mistral-7B

In [6]:
mistral_model = AutoModelForCausalLM.from_pretrained(
    model_three_name,
    token=hf_token,
    quantization_config=bnb_config,
    device_map=device_map
)
mistral_model.config.pretraining_tp = 1
mistral_model.config.use_cache = False

mistral_tokenizer = AutoTokenizer.from_pretrained(
    model_three_name,
    trust_remote_code=True,
    token=hf_token
)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


tokenizer_config.json:   0%|          | 0.00/996 [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/493k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.80M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/414 [00:00<?, ?B/s]

In [7]:
print(mistral_model,"\n")
print(next(mistral_model.parameters()).dtype)

MistralForCausalLM(
  (model): MistralModel(
    (embed_tokens): Embedding(32000, 4096)
    (layers): ModuleList(
      (0-31): 32 x MistralDecoderLayer(
        (self_attn): MistralAttention(
          (q_proj): Linear4bit(in_features=4096, out_features=4096, bias=False)
          (k_proj): Linear4bit(in_features=4096, out_features=1024, bias=False)
          (v_proj): Linear4bit(in_features=4096, out_features=1024, bias=False)
          (o_proj): Linear4bit(in_features=4096, out_features=4096, bias=False)
        )
        (mlp): MistralMLP(
          (gate_proj): Linear4bit(in_features=4096, out_features=14336, bias=False)
          (up_proj): Linear4bit(in_features=4096, out_features=14336, bias=False)
          (down_proj): Linear4bit(in_features=14336, out_features=4096, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): MistralRMSNorm((4096,), eps=1e-05)
        (post_attention_layernorm): MistralRMSNorm((4096,), eps=1e-05)
      )
    )
    (norm): Mist

In [9]:
peft_config = LoraConfig(
    lora_alpha=lora_alpha,
    lora_dropout=lora_dropout,
    r=lora_r,
    bias="none",
    task_type="CAUSAL_LM",
    target_modules=[ "q_proj",
    "k_proj",
    "v_proj",
    "o_proj",
    "gate_proj",
    "up_proj",
    "down_proj"]
)

In [31]:
def format_for_mistral_basic(example):
    instruction = example["instruction"].strip()
    input_text = example["input"].strip()
    response = example["response"].strip()

    if input_text:
        prompt = f"### Instruction:\n{instruction}\n\n### Input:\n{input_text}\n\n### Response:\n{response}"
    else:
        prompt = f"### Instruction:\n{instruction}\n\n### Response:\n{response}"

    return {"text": prompt}


mistral_train_set = train_set.map(format_for_mistral_basic, remove_columns=train_set.column_names)
mistral_train_set = mistral_train_set.select(range(5000))

mistral_test_set = test_set.map(format_for_mistral_basic, remove_columns=test_set.column_names)
mistral_val_set = val_set.map(format_for_mistral_basic, remove_columns=val_set.column_names)
mistral_val_set = mistral_val_set.select(range(100))

print(mistral_train_set[0]["text"])

### Instruction:
Name the home team for carlton away team

### Input:
CREATE TABLE table_name_77 (
    home_team VARCHAR,
    away_team VARCHAR
)

### Response:
SELECT home_team FROM table_name_77 WHERE away_team = "carlton"


In [32]:
print(len(mistral_train_set))
print(len(mistral_val_set))

5000
100


In [33]:
def formatting_func(example):
    return example["text"]

trainer = SFTTrainer(
    model=mistral_model,
    args=training_args,
    train_dataset=mistral_train_set,
    eval_dataset=mistral_val_set,
    peft_config=peft_config,
    processing_class=mistral_tokenizer,
    formatting_func=formatting_func,
)

Applying formatting function to train dataset:   0%|          | 0/5000 [00:00<?, ? examples/s]

Converting train dataset to ChatML:   0%|          | 0/5000 [00:00<?, ? examples/s]

Adding EOS to train dataset:   0%|          | 0/5000 [00:00<?, ? examples/s]

Tokenizing train dataset:   0%|          | 0/5000 [00:00<?, ? examples/s]

Truncating train dataset:   0%|          | 0/5000 [00:00<?, ? examples/s]

Applying formatting function to eval dataset:   0%|          | 0/100 [00:00<?, ? examples/s]

Converting eval dataset to ChatML:   0%|          | 0/100 [00:00<?, ? examples/s]

Adding EOS to eval dataset:   0%|          | 0/100 [00:00<?, ? examples/s]

Tokenizing eval dataset:   0%|          | 0/100 [00:00<?, ? examples/s]

Truncating eval dataset:   0%|          | 0/100 [00:00<?, ? examples/s]

No label_names provided for model class `PeftModelForCausalLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


In [22]:
from torch import autocast 

# Print one training sample
print("Training Sample:\n", mistral_train_set[0]["text"], "\n\n")

# Example input
instruction = "Name the home team for carlton away team"
input_text = """CREATE TABLE table_name_77 (
    home_team VARCHAR,
    away_team VARCHAR
)"""

prompt = f"""### Instruction:
{instruction}

### Input:
{input_text}

### Response:
"""

# Tokenize
inputs = mistral_tokenizer(prompt, return_tensors="pt").to("cuda")

# Generate
with torch.no_grad():
    with autocast(device_type="cuda", dtype=torch.bfloat16):  # or torch.float16 if your model uses fp16
        outputs = mistral_model.generate(
            **inputs,
            max_new_tokens=50,
            do_sample=True,
            top_p=0.95,
            temperature=0.7,
            pad_token_id=mistral_tokenizer.eos_token_id,
        )

# Decode
response = mistral_tokenizer.decode(outputs[0], skip_special_tokens=True)
print("Model Output:\n", response)

Training Sample:
 ### Instruction:
Name the home team for carlton away team

### Input:
CREATE TABLE table_name_77 (
    home_team VARCHAR,
    away_team VARCHAR
)

### Response:
SELECT home_team FROM table_name_77 WHERE away_team = "carlton" 


Model Output:
 ### Instruction:
Name the home team for carlton away team

### Input:
CREATE TABLE table_name_77 (
    home_team VARCHAR,
    away_team VARCHAR
)

### Response:

INSERT INTO table_name_77 VALUES ('Melbourne', 'Carlton')

### Explanation:

This is the sixth entry in a sequence of seven questions in the same database.

The


In [34]:
trainer.train()

trainer.model.save_pretrained(f"{output_dir}/{model_three_name}")

Step,Training Loss,Validation Loss
500,0.5756,0.638401
1000,0.5398,0.576986
