In [None]:
!pip install bitsandbytes

In [None]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, Trainer, TrainingArguments, DataCollatorForLanguageModeling
from datasets import load_dataset

# Model configuration: identifiers handed to the tokenizer/model loaders and
# to the dataset loader further down in the notebook.
model_name = "codellama/CodeLlama-13b-Instruct-hf"
dataset_name = "bugdaryan/sql-create-context-instruction"

# LoRA configuration: these three values are later copied onto the loaded
# model as attributes carrying the same names.
lora_config = dict(
    lora_r=64,
    lora_alpha=16,
    lora_dropout=0.1,
)

# Training configuration: unpacked verbatim as keyword arguments into
# TrainingArguments, so every key here must be a field that class accepts.
training_config = dict(
    # Schedule length and batch shape.
    num_train_epochs=1,
    per_device_train_batch_size=2,
    per_device_eval_batch_size=1,
    gradient_accumulation_steps=1,
    gradient_checkpointing=True,
    # Optimisation.
    max_grad_norm=0.3,
    learning_rate=2e-4,
    weight_decay=0.001,
    optim="adamw_torch",
    lr_scheduler_type="cosine",
    max_steps=-1,  # presumably "no step cap, use num_train_epochs" — confirm
    warmup_ratio=0.03,
    group_by_length=True,
    # Checkpointing and logging cadence.
    save_steps=0,  # presumably disables periodic checkpoints — confirm
    logging_steps=25,
)

# Load the tokenizer for the base checkpoint and reuse its end-of-sequence
# token as the padding token (presumably because the tokenizer does not
# define a pad token of its own — confirm).
tokenizer = AutoTokenizer.from_pretrained(model_name)
tokenizer.pad_token = tokenizer.eos_token

# Load the pretrained causal language model with the library defaults
# (no dtype, device placement or quantization options are passed).
model = AutoModelForCausalLM.from_pretrained(model_name)

# Copy the LoRA hyperparameters onto the model object.
# NOTE(review): these are plain attribute assignments. Nothing visible in this
# notebook wraps the model with LoRA adapters (no adapter library is
# imported), so the Trainer presumably updates every model parameter rather
# than a low-rank adapter — confirm whether an adapter-injection step is
# missing here.
model.lora_r, model.lora_alpha, model.lora_dropout = (
    lora_config["lora_r"],
    lora_config["lora_alpha"],
    lora_config["lora_dropout"],
)

In [None]:
# Load the dataset
# `dataset_name` is the identifier set in the configuration cell. The returned
# object is indexed by split name further down ("train", plus "test" when the
# dataset provides one).
dataset = load_dataset(dataset_name)

# Tokenize the dataset
def tokenize_function(examples):
    """Tokenize the "text" field of a batch of examples.

    Args:
        examples: Mapping with a "text" entry holding the raw strings to
            encode (the function is applied with ``batched=True``).

    Returns:
        Whatever the module-level ``tokenizer`` returns for those strings,
        with truncation enabled and a maximum length of 512.
    """
    texts = examples["text"]
    return tokenizer(texts, max_length=512, truncation=True)

# Tokenize every split in batches and drop the raw "text" column so that only
# the tokenizer's output columns remain.
tokenized_dataset = dataset.map(tokenize_function, batched=True, remove_columns=["text"])

# Reduce the dataset size to 1/10th: keep the first tenth of the train split.
full_train_split = tokenized_dataset["train"]
train_size = len(full_train_split) // 10
train_dataset = full_train_split.select(range(train_size))

# Check if the dataset has a "test" split
if "test" in tokenized_dataset:
    # Use the first tenth of the dedicated test split for evaluation.
    test_split = tokenized_dataset["test"]
    eval_dataset = test_split.select(range(len(test_split) // 10))
else:
    # No "test" split: hold out the rows that immediately follow the training
    # slice. The evaluation set keeps the same size as before (one tenth of
    # the training subset) but no longer overlaps with the rows the model is
    # trained on, so evaluation metrics are not inflated by leakage.
    eval_size = train_size // 10
    eval_dataset = full_train_split.select(range(train_size, train_size + eval_size))

# Set up the training arguments: results go under "output" and every entry of
# training_config is forwarded as a keyword argument.
# NOTE(review): training_config sets no evaluation schedule, so eval_dataset is
# presumably only used when trainer.evaluate() is called explicitly — confirm
# against the TrainingArguments defaults.
training_args = TrainingArguments(output_dir="output", **training_config)

# Create the data collator with masked-language-modelling disabled
# (mlm=False), i.e. batches are prepared for causal language modelling.
# NOTE(review): the pad token was set to the EOS token earlier; the collator
# presumably excludes pad-token positions from the loss, which would also
# exclude genuine EOS tokens — confirm this is intended.
data_collator = DataCollatorForLanguageModeling(mlm=False, tokenizer=tokenizer)

# Create the trainer from the model, arguments, data subsets and collator
# prepared above.
trainer = Trainer(
    args=training_args,
    model=model,
    data_collator=data_collator,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
)

In [None]:
# Train the model
# Runs the Trainer assembled in the previous cell on train_dataset. Because
# this call is the last expression in the cell, its return value is rendered
# as the cell output.
trainer.train()

In [None]:
# Save the fine-tuned model together with its tokenizer.
# Writing the tokenizer into the same directory keeps the pad-token setting
# applied earlier and makes the directory loadable on its own: both the model
# and the tokenizer can be restored from this single path.
final_model_dir = "output/Text2sql-Llama2-13b"
tokenizer.save_pretrained(final_model_dir)
# The model is saved last so the cell still ends on a call that returns
# nothing and therefore renders no output.
model.save_pretrained(final_model_dir)