<a href="https://colab.research.google.com/github/rafia9005/LLM-Trainer/blob/main/main.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

**LLM Training: fine-tuning Llama 3.2 1B for text-to-SQL with Unsloth**

In [None]:
%%capture
# Environment setup; `%%capture` keeps pip's output out of the notebook.
# Sequence: install Unsloth from PyPI, remove it again, then install the
# current GitHub main branch with the `colab-new` extra (no pip cache).
# NOTE(review): presumably the first install exists only to pull in the
# dependencies before the package itself is replaced — confirm.
!pip install unsloth
!pip uninstall unsloth -y
!pip install --upgrade --no-cache-dir "unsloth[colab-new] @ git+https://github.com/unslothai/unsloth.git"

In [None]:
from unsloth import FastLanguageModel
import torch

# Loading options. `max_seq_length` is reused further down when tokenizing.
max_seq_length = 2048
load_in_4bit = True
dtype = None  # None for autodetection

# Load the pre-quantized 4-bit Llama 3.2 1B checkpoint together with its tokenizer.
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = "unsloth/Llama-3.2-1B-bnb-4bit",
    max_seq_length = max_seq_length,
    load_in_4bit = load_in_4bit,
    dtype = dtype,
)

==((====))==  Unsloth 2024.9.post4: Fast Llama patching. Transformers = 4.44.2.
   \\   /|    GPU: Tesla T4. Max memory: 14.748 GB. Platform = Linux.
O^O/ \_/ \    Pytorch: 2.4.1+cu121. CUDA = 7.5. CUDA Toolkit = 12.1.
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.28.post1. FA2 = False]
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!




In [None]:
# Attach LoRA adapters (rank 16, alpha 16, no dropout, no bias terms) to the
# listed projection layers. `model` is rebound to the adapter-wrapped model.
model = FastLanguageModel.get_peft_model(
    model,
    r = 16,
    lora_alpha = 16,
    lora_dropout = 0,
    bias = "none",
    target_modules = [
        "q_proj", "k_proj", "v_proj", "o_proj",
        "gate_proj", "up_proj", "down_proj",
    ],
    use_gradient_checkpointing = True,
    use_rslora = False,
    loftq_config = None,
    random_state = 3047,
)

In [None]:
from datasets import load_dataset

# Load the "train" split of the synthetic text-to-SQL dataset.
dataset = load_dataset(path="gretelai/synthetic_text_to_sql", split="train")

# List the available columns; the preprocessing step reads "sql_prompt" and "sql".
print(dataset.column_names)

['id', 'domain', 'domain_description', 'sql_complexity', 'sql_complexity_description', 'sql_task_type', 'sql_task_type_description', 'sql_prompt', 'sql_context', 'sql', 'sql_explanation']


In [None]:
def preprocess_function(examples):
    """Tokenize a batch of rows into aligned causal-LM training sequences.

    For every row the question (``sql_prompt``) and the answer (``sql``) are
    joined into ONE token sequence: prompt, a newline, the SQL, then EOS.
    A causal LM scores ``labels`` against the logits of the *same* sequence
    (shifted by one position), so tokenizing the prompt and the SQL into
    separate ``input_ids`` / ``labels`` arrays would pair unrelated tokens.

    Args:
        examples: Batched mapping of column name -> list of values, as passed
            by ``Dataset.map(..., batched=True)``. Reads "sql_prompt" and "sql".

    Returns:
        dict with two keys, each a list of ``max_seq_length``-long int lists:
        "input_ids" (right-padded token ids) and "labels" (the same ids, with
        prompt and padding positions set to -100 so they add no loss).
    """
    ignore_index = -100  # label value that the cross-entropy loss skips
    eos_id = tokenizer.eos_token_id
    pad_id = tokenizer.pad_token_id
    if pad_id is None:
        # Any id works for padding because padded labels are masked out.
        pad_id = eos_id

    # Prompt keeps the tokenizer's special tokens (e.g. BOS); the answer is
    # tokenized without them so nothing is inserted mid-sequence.
    prompt_batch = tokenizer([f"{prompt}\n" for prompt in examples["sql_prompt"]])["input_ids"]
    answer_batch = tokenizer(examples["sql"], add_special_tokens=False)["input_ids"]

    input_ids, labels = [], []
    for prompt_ids, answer_ids in zip(prompt_batch, answer_batch):
        # EOS teaches the model where the SQL statement ends.
        answer_ids = answer_ids + [eos_id]

        sequence = (prompt_ids + answer_ids)[:max_seq_length]
        # Only the answer tokens are training targets.
        targets = ([ignore_index] * len(prompt_ids) + answer_ids)[:max_seq_length]

        # Pad on the right by hand: with causal attention, trailing pad tokens
        # cannot influence earlier positions, so no attention mask is needed.
        # A fixed length keeps default DataLoader collation valid for any batch size.
        padding = max_seq_length - len(sequence)
        input_ids.append(sequence + [pad_id] * padding)
        labels.append(targets + [ignore_index] * padding)

    return {"input_ids": input_ids, "labels": labels}


tokenized_dataset = dataset.map(preprocess_function, batched=True)

Map:   0%|          | 0/100000 [00:00<?, ? examples/s]

In [None]:
# DataLoader was used here without ever being imported, which raises
# NameError on a fresh kernel; import it in the cell that needs it.
from torch.utils.data import DataLoader

batch_size = 1
# Expose only the two columns the training loop reads, as torch tensors.
tokenized_dataset.set_format(type='torch', columns=['input_ids', 'labels'])
train_dataloader = DataLoader(tokenized_dataset, batch_size=batch_size, shuffle=True)

In [None]:
from torch.optim import AdamW

# AdamW over the model's parameters with a fixed learning rate of 1e-4.
optimizer = AdamW(params=model.parameters(), lr=1e-4)

In [None]:
from torch.nn import CrossEntropyLoss

# Number of full passes over the training data.
num_epochs = 3

# Stand-alone cross-entropy criterion with default settings.
# NOTE(review): the training loop visible in this notebook reads `outputs.loss`
# from the model and does not call `loss_fn` — confirm whether it is still needed.
loss_fn = CrossEntropyLoss()

In [None]:
# Fine-tune for `num_epochs` passes over `train_dataloader`, printing the mean
# per-batch loss after each epoch.
for epoch in range(num_epochs):
    model.train()
    total_loss = 0
    for batch in train_dataloader:
        # Clear gradients left over from the previous step.
        optimizer.zero_grad()
        input_ids = batch['input_ids'].to(model.device)
        labels = batch['labels'].to(model.device)

        # Passing `labels` makes the model compute the language-modeling loss itself.
        outputs = model(input_ids, labels=labels)
        loss = outputs.loss

        # Backpropagate BEFORE stepping: without this call no gradients exist
        # and optimizer.step() leaves every weight unchanged.
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

    avg_loss = total_loss / len(train_dataloader)
    print(f"Epoch {epoch + 1}/{num_epochs}, Loss: {avg_loss:.4f}")