In [None]:
!pip install transformers datasets accelerate sentencepiece -q


In [None]:
# Force a clean re-installation of the key libraries
!pip uninstall -y transformers accelerate datasets
!pip install transformers accelerate datasets

In [None]:
from huggingface_hub import notebook_login

# SECURITY: never paste an access token into a notebook cell. Authenticate
# through the interactive prompt below instead. Any token that has ever been
# saved in a notebook or committed to version control must be treated as
# compromised and revoked at https://huggingface.co/settings/tokens.
print("Please log in to your Hugging Face account.")
notebook_login()

In [None]:
from datasets import load_dataset
from transformers import BartTokenizer

# --- 1. Load the dataset from the Hugging Face Hub ---
try:
    full_dataset = load_dataset("gretelai/synthetic_text_to_sql")
except Exception as e:
    print(f"Failed to load dataset. Error: {e}")
    # Re-raise: every statement below needs `full_dataset`, so continuing would
    # only replace the real error with a confusing NameError.
    raise
print("Dataset loaded successfully!")

# --- 2. Create our small "smoke test" sample ---
# We'll work with just 200 examples to ensure our pipeline works quickly.
smoke_test_sample = full_dataset['train'].select(range(200))
print(f"\nCreated a smoke test sample with {len(smoke_test_sample)} examples.")

# --- 3. Load the Tokenizer ---
# We need the tokenizer that matches our model (BART). It will convert
# our text into numerical IDs that the model can understand.
model_checkpoint = "facebook/bart-base"
tokenizer = BartTokenizer.from_pretrained(model_checkpoint)
print(f"\nTokenizer for '{model_checkpoint}' loaded.")

# --- 4. Define the Preprocessing Function ---
# This is the most important step for data preparation.
# We format the input as "Schema: [SCHEMA] | Question: [QUESTION]"
# and the output (labels) as the corresponding SQL query.
def preprocess_function(examples, tok=None):
    """Tokenize a batch of text-to-SQL examples for seq2seq training.

    Args:
        examples: A batch (dict of equal-length lists) with the keys
            'sql_context' (schema text), 'sql_prompt' (user question) and
            'sql' (target query).
        tok: Optional tokenizer to use instead of the notebook-level
            `tokenizer`. It must be callable like a Hugging Face tokenizer and
            expose `pad_token_id`.

    Returns:
        The tokenizer output for the formatted inputs (padded/truncated to
        1024 tokens) with an added "labels" entry holding the target token ids
        (padded/truncated to 256), where every padding position is -100.
    """
    tok = tokenizer if tok is None else tok

    # The 'sql_context' field holds the 'CREATE TABLE...' schema. 'sql_prompt' is the user question.
    inputs = [
        f"Schema: {schema} | Question: {question}"
        for schema, question in zip(examples['sql_context'], examples['sql_prompt'])
    ]
    # The 'sql' field is our target.
    targets = list(examples['sql'])

    # Tokenize the inputs and targets
    model_inputs = tok(inputs, max_length=1024, truncation=True, padding="max_length")
    labels = tok(targets, max_length=256, truncation=True, padding="max_length")

    # Padding positions in the labels must be -100 so the loss ignores them;
    # leaving the pad token id in place trains the model to predict padding.
    pad_id = tok.pad_token_id
    model_inputs["labels"] = [
        [-100 if token_id == pad_id else token_id for token_id in sequence]
        for sequence in labels["input_ids"]
    ]
    return model_inputs

# --- 5. Tokenize the smoke test sample ---
# Run the preprocessing over the whole sample in batches.
tokenized_smoke_test_sample = smoke_test_sample.map(
    function=preprocess_function,
    batched=True,
)
print("\nPreprocessing complete! Our data is now tokenized and ready for training.")

# Show which fields the first processed example carries; the tokenizer
# outputs and 'labels' are added alongside the original columns.
print("\n--- Example of a Processed Data Point ---")
first_processed_example = tokenized_smoke_test_sample[0]
print(first_processed_example.keys())

In [None]:
from transformers import BartForConditionalGeneration, Trainer, TrainingArguments

# --- 1. Start from the pre-trained checkpoint ---
model = BartForConditionalGeneration.from_pretrained(model_checkpoint)
print("Pre-trained BART model loaded.")

# --- 2. Training configuration ---
# Target repository on the Hub (namespace/model-name).
hub_model_id = "rkgupta3/bart-base-text-to-sql-smoke-test"

smoke_test_config = {
    # Optimisation
    "output_dir": "bart-text-to-sql-trainer",
    "num_train_epochs": 3,
    "per_device_train_batch_size": 4,
    "learning_rate": 5e-5,
    "weight_decay": 0.01,
    # Logging
    "logging_dir": './logs',
    "logging_steps": 10,
    "report_to": "none",
    # Hub integration
    "push_to_hub": True,
    "hub_model_id": hub_model_id,
    "hub_strategy": "every_save",
}
training_args = TrainingArguments(**smoke_test_config)
print("\nTraining arguments configured.")

# --- 3. Build the Trainer ---
trainer = Trainer(
    args=training_args,
    model=model,
    train_dataset=tokenized_smoke_test_sample,
)
print("Trainer instance created. Starting training...")

# --- 4. Fine-tune ---
trainer.train()
print("\n--- Training Complete! ---")

# --- 5. Write the tokenizer files next to the model ---
# This Trainer was not given the tokenizer, so it is saved into the output
# directory by hand before that directory is uploaded.
output_dir = training_args.output_dir
tokenizer.save_pretrained(output_dir)
print(f"Tokenizer explicitly saved to {output_dir}")

# --- 6. Upload the output directory (model + tokenizer) to the Hub ---
trainer.push_to_hub("Training complete with tokenizer files!")
print(f"Model and tokenizer successfully pushed to the Hub at: https://huggingface.co/{hub_model_id}")

In [None]:
from transformers import BartForConditionalGeneration, BartTokenizer

# --- 1. Load your fine-tuned model and tokenizer from the Hub ---
# This should be the same as the 'hub_model_id' used for the smoke-test training run.
model_id = "rkgupta3/bart-base-text-to-sql-smoke-test"

print(f"Loading model '{model_id}' from the Hub...")
model = BartForConditionalGeneration.from_pretrained(model_id)
tokenizer = BartTokenizer.from_pretrained(model_id)
print("Model and tokenizer loaded successfully!")


def generate_sql(schema, question, max_length=128):
    """Generate a SQL query for `question` against `schema`.

    Uses the notebook-level `model` and `tokenizer` that are current at call
    time.

    Args:
        schema: The schema text (e.g. CREATE TABLE statements).
        question: The natural-language question.
        max_length: Maximum length passed to `model.generate`.

    Returns:
        The decoded model output with special tokens removed.
    """
    # The prompt must be formatted exactly as it was during training.
    prompt = f"Schema: {schema} | Question: {question}"
    # Truncate to the same 1024-token limit that preprocessing applied in training.
    inputs = tokenizer(prompt, return_tensors="pt", truncation=True, max_length=1024)
    outputs = model.generate(**inputs, max_length=max_length)
    return tokenizer.decode(outputs[0], skip_special_tokens=True)


# --- 2. Define a sample schema and a question ---
# More examples can be taken from the GretelAI dataset.
db_schema = """CREATE TABLE artists (
  `Artist_ID` real,
  `Artist_Name` text,
  `Age` real,
  `Famous_for` text,
  `Birth_Year` real
)"""

question = "What are the names of all artists older than 25?"
print(f"\nSchema: {db_schema}")
print(f"Question: {question}")

# --- 3. Generate the SQL query ---
generated_sql = generate_sql(db_schema, question)

print("\n--- Generated SQL ---")
print(generated_sql)

# --- A second, more complex example ---
question_2 = "Return the name and birth year of the youngest artist."
print(f"\nQuestion: {question_2}")

generated_sql_2 = generate_sql(db_schema, question_2)

print("\n--- Generated SQL ---")
print(generated_sql_2)

In [None]:
from datasets import load_dataset, DatasetDict
from transformers import BartTokenizer

# --- 1. Fetch the 'train' split of the dataset from the Hub ---
print("Loading the full gretelai/synthetic_text_to_sql dataset...")
full_dataset = load_dataset("gretelai/synthetic_text_to_sql", split='train')
print("Dataset loaded successfully!")

# --- 2. Carve out disjoint training and test subsets ---
# Shuffle with a fixed seed so the same rows are selected on every run.
shuffled_dataset = full_dataset.shuffle(seed=42)

# 5000 rows for training, the following 1000 rows for testing.
train_sample_size = 5000
test_sample_size = 1000

train_indices = range(train_sample_size)
test_indices = range(train_sample_size, train_sample_size + test_sample_size)

train_dataset = shuffled_dataset.select(train_indices)
test_dataset = shuffled_dataset.select(test_indices)

# Keep both subsets together so they can be processed in one call.
split_datasets = DatasetDict({
    'train': train_dataset,
    'test': test_dataset,
})

print(f"\nCreated a training set with {len(split_datasets['train'])} examples.")
print(f"Created a test set with {len(split_datasets['test'])} examples.")

# --- 3. Load the tokenizer of the base checkpoint ---
model_checkpoint = "facebook/bart-base"
tokenizer = BartTokenizer.from_pretrained(model_checkpoint)
print(f"\nTokenizer for '{model_checkpoint}' loaded.")

# --- 4. Define the same Preprocessing Function ---
def preprocess_function(examples, tok=None):
    """Tokenize a batch of text-to-SQL examples for seq2seq training.

    Args:
        examples: A batch (dict of equal-length lists) with the keys
            'sql_context' (schema text), 'sql_prompt' (user question) and
            'sql' (target query).
        tok: Optional tokenizer to use instead of the notebook-level
            `tokenizer`. It must be callable like a Hugging Face tokenizer and
            expose `pad_token_id`.

    Returns:
        The tokenizer output for the formatted inputs (padded/truncated to
        1024 tokens) with an added "labels" entry holding the target token ids
        (padded/truncated to 256), where every padding position is -100.
    """
    tok = tokenizer if tok is None else tok

    inputs = [
        f"Schema: {schema} | Question: {question}"
        for schema, question in zip(examples['sql_context'], examples['sql_prompt'])
    ]
    targets = list(examples['sql'])

    model_inputs = tok(inputs, max_length=1024, truncation=True, padding="max_length")
    labels = tok(targets, max_length=256, truncation=True, padding="max_length")

    # Padding positions in the labels must be -100 so the loss ignores them;
    # leaving the pad token id in place trains the model to predict padding.
    pad_id = tok.pad_token_id
    model_inputs["labels"] = [
        [-100 if token_id == pad_id else token_id for token_id in sequence]
        for sequence in labels["input_ids"]
    ]
    return model_inputs

# --- 5. Tokenize the train and test splits in one call ---
print("\nTokenizing the datasets... (This may take a minute)")
tokenized_datasets = split_datasets.map(
    function=preprocess_function,
    batched=True,
)
print("Tokenization complete!")

# Print the resulting container to see its splits and columns.
print("\n--- Processed Datasets Structure ---")
print(tokenized_datasets)

In [None]:
from transformers import BartForConditionalGeneration, Trainer, TrainingArguments

# --- 1. Start from the original pre-trained checkpoint ---
# The full run begins from the base model, not from the smoke-test model.
model_checkpoint = "facebook/bart-base"
model = BartForConditionalGeneration.from_pretrained(model_checkpoint)
print("Pre-trained BART model loaded.")

# --- 2. Training configuration for the full run ---
# Target repository on the Hub (namespace/model-name).
hub_model_id = "rkgupta3/bart-base-text-to-sql-full"

full_run_config = {
    # Optimisation
    "output_dir": "bart-text-to-sql-trainer-full",
    "num_train_epochs": 3,
    "per_device_train_batch_size": 8,
    "learning_rate": 5e-5,
    "weight_decay": 0.01,
    # Logging
    "logging_dir": './logs-full',
    "logging_steps": 100,
    "report_to": "none",
    # Evaluate and checkpoint once per epoch, then reload the best checkpoint.
    "eval_strategy": "epoch",
    "save_strategy": "epoch",
    "load_best_model_at_end": True,
    # Hub integration
    "push_to_hub": True,
    "hub_model_id": hub_model_id,
    "hub_strategy": "every_save",
}
training_args = TrainingArguments(**full_run_config)
print("\nTraining arguments configured for the full run.")

# --- 3. Build the Trainer on the 5k/1k splits ---
trainer = Trainer(
    args=training_args,
    model=model,
    tokenizer=tokenizer,  # handed over so the tokenizer files are saved with the model
    train_dataset=tokenized_datasets['train'],
    eval_dataset=tokenized_datasets['test'],
)
print("Trainer instance created. Starting the full training run... 🚀")

# --- 4. Fine-tune ---
trainer.train()
print("\n--- Full Training Complete! ---")

# --- 5. Final upload to the Hub ---
# Checkpoints are pushed on every save; this last push uploads the final state.
trainer.push_to_hub("Full training of bart-base-text-to-sql complete!")
print(f"Model successfully pushed to the Hub at: https://huggingface.co/{hub_model_id}")

In [None]:
# Print the first raw (non-tokenized) test example to see its fields.
test_dataset = split_datasets['test']
example = next(iter(test_dataset), None)
if example is not None:
    print(example)

In [None]:
import sqlite3
from tqdm import tqdm
from transformers import BartForConditionalGeneration, BartTokenizer

# --- 1. Load the fine-tuned model and its tokenizer from the Hub ---
# Repository written by the full training run (namespace/model-name).
model_id = "rkgupta3/bart-base-text-to-sql-full"

print(f"Loading model '{model_id}' from the Hub...")
try:
    model = BartForConditionalGeneration.from_pretrained(model_id)
    tokenizer = BartTokenizer.from_pretrained(model_id)
except Exception as e:
    print(f"Error loading model: {e}")
    # Nothing below can run without the model, so propagate the failure.
    raise
else:
    print("Model and tokenizer loaded successfully!")

# --- 2. Set up the evaluation ---
from collections import Counter
from contextlib import closing


def _execution_match(context_sql, ground_truth_sql, predicted_sql):
    """Compare the results of two queries on a fresh in-memory SQLite database.

    Args:
        context_sql: Script that creates and populates the database.
        ground_truth_sql: Reference query.
        predicted_sql: Query produced by the model.

    Returns:
        None if the context script or the reference query cannot be executed
        by SQLite (the example cannot be scored), False if the predicted query
        fails or returns different rows, True if both queries return the same
        rows (order-insensitive, duplicate rows counted).
    """
    # sqlite3.Warning is not a subclass of sqlite3.Error, so it is listed explicitly.
    sql_errors = (sqlite3.Error, sqlite3.Warning)

    # `closing` guarantees the connection is released on every path.
    with closing(sqlite3.connect(':memory:')) as conn:
        cursor = conn.cursor()
        try:
            cursor.executescript(context_sql)
            cursor.execute(ground_truth_sql)
            ground_truth_results = cursor.fetchall()
        except sql_errors:
            return None

        try:
            cursor.execute(predicted_sql)
            predicted_results = cursor.fetchall()
        except sql_errors:
            return False

    # Multiset comparison: ignores row order but, unlike set(), keeps duplicates.
    return Counter(predicted_results) == Counter(ground_truth_results)


# We use the non-tokenized test set because we need the raw text
test_dataset = split_datasets['test']
num_correct = 0
num_unscorable = 0  # examples whose context/reference SQL does not run in SQLite
num_total = len(test_dataset)

print(f"\nStarting evaluation on {num_total} test examples...")

# --- 3. Loop through the test set ---
for example in tqdm(test_dataset, desc="Evaluating Execution Accuracy"):
    context_sql = example['sql_context']
    question = example['sql_prompt']
    ground_truth_sql = example['sql']

    # Same prompt format as in training.
    prompt = f"Schema: {context_sql} | Question: {question}"

    # Truncate to the 1024-token limit used by the training preprocessing so an
    # over-long context cannot abort the whole evaluation.
    inputs = tokenizer(prompt, return_tensors="pt", truncation=True, max_length=1024)
    outputs = model.generate(**inputs, max_length=256)
    predicted_sql = tokenizer.decode(outputs[0], skip_special_tokens=True)

    # --- 4. Execute and Compare ---
    verdict = _execution_match(context_sql, ground_truth_sql, predicted_sql)
    if verdict is None:
        num_unscorable += 1
    elif verdict:
        num_correct += 1

# --- 5. Calculate and Print Final Score ---
# Unscorable examples still count against the headline accuracy (as before);
# they are reported separately so the number can be interpreted.
accuracy = (num_correct / num_total) * 100 if num_total else 0.0
print("\n--- Evaluation Complete! ---")
print(f"Correct Predictions: {num_correct} / {num_total}")
print(f"Execution Accuracy: {accuracy:.2f}% 🎯")

num_scorable = num_total - num_unscorable
if num_unscorable:
    print(f"Examples whose reference SQL could not be executed in SQLite: {num_unscorable}")
    if num_scorable:
        print(f"Execution Accuracy on executable examples: {(num_correct / num_scorable) * 100:.2f}%")