In [1]:
import torch
from datasets import load_dataset
from torch.utils.data import DataLoader
from transformers import AutoTokenizer, TrainingArguments, AutoModelForCausalLM, Trainer, BitsAndBytesConfig, default_data_collator
from peft import PeftModel

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
model_name = 'microsoft/phi-2'
adapter_path = "./Fined-tuned-Phi2"
cache_dir = './Phi2_Model'



In [3]:
bitsnbytes = BitsAndBytesConfig(
    load_in_4bit = True,
    bnb_4bit_quant_dtype = 'nf4',
    bnb_4bit_compute_dtype = torch.bfloat16
)

In [4]:
base_model =  AutoModelForCausalLM.from_pretrained(
    model_name,
    cache_dir = cache_dir,
    quantization_config = bitsnbytes,
    device_map = 'auto',
    trust_remote_code = True
).eval()
tokenizer = AutoTokenizer.from_pretrained(model_name, cache_dir = './tokens', trust_remote_code = True)
tokenizer.pad_token = tokenizer.eos_token



Loading checkpoint shards: 100%|██████████| 2/2 [00:04<00:00,  2.32s/it]


In [5]:
temp_model =  AutoModelForCausalLM.from_pretrained(
    model_name,
    cache_dir = cache_dir,
    quantization_config = bitsnbytes,
    device_map = 'auto',
    trust_remote_code = True
).eval()

fine_tuned = PeftModel.from_pretrained(temp_model, adapter_path)
fine_tuned = fine_tuned.merge_and_unload().eval()

Loading checkpoint shards: 100%|██████████| 2/2 [00:03<00:00,  1.84s/it]


In [6]:
data = dataset = load_dataset("gretelai/synthetic_text_to_sql", cache_dir="./dataset")


In [7]:
def tokenize(batch):
    texts = [
        f'### Instruction: \n{instruction}\n### Response\n{out}'
        for instruction, out in zip(batch['sql'], batch['sql_explanation'])
    ]
    token = tokenizer(
        texts,
        padding = 'longest',
        max_length = 128,
        truncation = True,
        return_tensors = 'pt'
    )
    token['labels'] = token['input_ids'].clone()
    return token

In [8]:

tokenized_test = dataset['test'].shuffle(seed = 42).select(range(500))
tokenized_test = tokenized_test.map(tokenize, batched=True, remove_columns=['sql', 'sql_explanation'])
tokenized_test = tokenized_test.with_format('torch')


In [9]:
eval_loader = DataLoader(
    tokenized_test,
    batch_size = 8,
    collate_fn = default_data_collator  
)

In [10]:
import math
@torch.no_grad()
def compute_preplexity(model):
    losses = []
    for batch in eval_loader:
        batch = {k: v.to('cuda') for k, v in batch.items()}
        loss = model(**batch).loss
        losses.append(loss.item())
    return math.exp(sum(losses) / len(losses))

In [11]:
print(f"Base Model Perplexity: {compute_preplexity(base_model)}")
print(f"Tuned Model Perplexity: {compute_preplexity(fine_tuned)}")


Base Model Perplexity: 88.48299541246114
Tuned Model Perplexity: 1.82766329721753


In [12]:


def generator(prompt, model):
    inputs = tokenizer(prompt, return_tensors="pt").to(fine_tuned.device)
    outputs = model.generate(**inputs, max_new_tokens=128)
    print(tokenizer.decode(outputs[0], skip_special_tokens=True))

prompt = """ ### Instruction:
SELECT 
    c.customer_id,
    c.name,
    c.country,
    COUNT(o.order_id) AS total_orders,
    SUM(o.total_amount) AS total_spent,
    CASE
        WHEN SUM(o.total_amount) > 500 THEN 'VIP'
        WHEN SUM(o.total_amount) BETWEEN 200 AND 500 THEN 'Regular'
        ELSE 'Low'
    END AS customer_status
FROM customers c
JOIN orders o ON c.customer_id = o.customer_id
GROUP BY c.customer_id, c.name, c.country
HAVING total_spent > 100
ORDER BY total_spent DESC;

### Response:
"""



In [14]:
generator(prompt, fine_tuned)

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


 ### Instruction:
SELECT 
    c.customer_id,
    c.name,
    c.country,
    COUNT(o.order_id) AS total_orders,
    SUM(o.total_amount) AS total_spent,
    CASE
        WHEN SUM(o.total_amount) > 500 THEN 'VIP'
        WHEN SUM(o.total_amount) BETWEEN 200 AND 500 THEN 'Regular'
        ELSE 'Low'
    END AS customer_status
FROM customers c
JOIN orders o ON c.customer_id = o.customer_id
GROUP BY c.customer_id, c.name, c.country
HAVING total_spent > 100
ORDER BY total_spent DESC;

### Response:
This query lists the customers who have spent more than $100 in total, sorted by the total amount spent in descending order. It uses a CASE statement to categorize customers as VIP, Regular, or Low based on their total spending.
