In [None]:
!pip install transformers peft torch ijson gdown
!pip install -U datasets
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, TrainingArguments, Trainer, DataCollatorForLanguageModeling
from transformers.trainer_callback import EarlyStoppingCallback
from peft import LoraConfig, get_peft_model
from datasets import IterableDataset
import gdown
import json
import ijson
from decimal import Decimal
import random
import os

# Download dataset
# NOTE: replace both placeholders below with a real Google Drive share link
# (of the form ".../d/<file_id>/...") and a local destination path.
drive_link = "your dataset link"
link_parts = drive_link.split('/d/')
if len(link_parts) < 2 or not link_parts[1].split('/')[0]:
    # Fail with an actionable message instead of a bare IndexError.
    raise ValueError(
        f"Cannot extract a Google Drive file id from {drive_link!r}; "
        "expected a share link containing '/d/<file_id>/'."
    )
file_id = link_parts[1].split('/')[0]
download_url = f"https://drive.google.com/uc?id={file_id}"
local_dataset_path = "dataset path"

try:
    downloaded_path = gdown.download(download_url, local_dataset_path, quiet=False)
except Exception as e:
    # Log for the notebook user, then let the original error propagate.
    print(f"Error downloading dataset: {e}")
    raise

# NOTE(review): some gdown releases are understood to signal a failed download
# by returning None instead of raising -- treat that as an error as well so the
# rest of the script never runs against a missing file.
if downloaded_path is None:
    raise RuntimeError(f"Download of {download_url} failed: gdown returned no output path.")

# JSON streaming generator
def json_array_generator(file_path, fraction=0.12, train_split=0.833, seed=42):
    """Stream a random subsample of a top-level JSON array, tagging each item with a split.

    The file is parsed incrementally with ijson, so it is never loaded into
    memory as a whole.

    Args:
        file_path: Path of a JSON file whose top-level value is an array.
        fraction: Probability with which each array element is kept.
        train_split: Probability with which a kept element is tagged "train"
            (otherwise it is tagged "validation").
        seed: Seed for the private random generator. With a fixed seed every
            pass over the file keeps the same elements and assigns them the
            same split, so separate passes (e.g. one filtered for "train" and
            one for "validation") stay disjoint. Pass None for a fresh,
            non-reproducible draw on every pass.

    Yields:
        Dicts of the form {"item": <parsed element>, "split": "train" | "validation"}.
    """
    # A private generator keeps the draws independent of the global `random`
    # state and restarts the identical sequence on every call.
    rng = random.Random(seed)
    # Binary mode: ijson is designed to read bytes from file objects.
    with open(file_path, 'rb') as f:
        for item in ijson.items(f, 'item'):
            if rng.random() >= fraction:
                continue
            split = "train" if rng.random() < train_split else "validation"
            yield {"item": item, "split": split}

# Wrap the streaming generator so samples are produced lazily on iteration
dataset = IterableDataset.from_generator(
    json_array_generator, gen_kwargs=dict(file_path=local_dataset_path)
)

# Tokenizer and base model
model_name = "HuggingFaceTB/SmolLM2-135M-Instruct"
tokenizer = AutoTokenizer.from_pretrained(model_name)
# Padding reuses the end-of-sequence token
tokenizer.pad_token = tokenizer.eos_token
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    device_map="auto",  # let the loader decide device placement
    torch_dtype=torch.float16,
)
# Switch the module into training mode
model.train()

# JSON encoder that can serialise Decimal values
class DecimalEncoder(json.JSONEncoder):
    """JSONEncoder subclass that emits ``Decimal`` objects as floats."""

    def default(self, obj):
        """Return ``float(obj)`` for Decimals; defer every other type to the base class."""
        if not isinstance(obj, Decimal):
            return super().default(obj)
        return float(obj)

# Format example
def format_example(example):
    """Turn one streamed record into a chat-formatted training text.

    Args:
        example: Dict with an "item" mapping (the parsed record) and a
            "split" label.

    Returns:
        Dict with "text" (the user prompt plus assistant answer rendered by
        the tokenizer's chat template) and the unchanged "split" label.
    """
    item = example["item"]
    # Serialise everything EXCEPT the target field: leaving "Generated Action"
    # in the prompt would hand the model the answer it is supposed to predict.
    observation = {key: value for key, value in item.items() if key != "Generated Action"}
    json_str = json.dumps(observation, cls=DecimalEncoder)
    prompt = f"Based on the following data:\n{json_str}\nGenerate the drone action in the format: vx, vy, vz, yaw"
    # Records without a "Generated Action" fall back to an all-zero action.
    action = item.get("Generated Action", {"vx": 0, "vy": 0, "vz": 0, "yaw": 0})
    response = f"{action['vx']:.2f}, {action['vy']:.2f}, {action['vz']:.2f}, {action['yaw']:.2f}"
    messages = [
        {"role": "user", "content": prompt},
        {"role": "assistant", "content": response}
    ]
    return {
        "text": tokenizer.apply_chat_template(messages, tokenize=False),
        "split": example["split"]
    }

# Tokenize a batch of texts and derive the training labels
def tokenize_function(examples):
    """Tokenize ``examples["text"]`` to fixed-length tensors and attach labels.

    Every sequence is truncated / padded to 512 tokens. The labels are a copy
    of the input ids in which each position whose attention mask is 0 is
    replaced by -100.
    """
    encoded = tokenizer(
        examples["text"],
        truncation=True,
        max_length=512,
        padding="max_length",
        return_tensors="pt",
    )
    padding_positions = encoded["attention_mask"] == 0
    # masked_fill returns a new tensor, so input_ids itself stays untouched
    encoded["labels"] = encoded["input_ids"].masked_fill(padding_positions, -100)
    return encoded

# Render every record as chat text, then separate the two splits
formatted_dataset = dataset.map(format_example, batched=False)

train_dataset = formatted_dataset.filter(lambda row: row["split"] == "train")
train_dataset = train_dataset.select_columns(['text'])

val_dataset = formatted_dataset.filter(lambda row: row["split"] == "validation")
val_dataset = val_dataset.select_columns(['text'])

# Tokenize in batches; the raw text column is dropped afterwards
tokenized_train = train_dataset.map(tokenize_function, remove_columns=["text"], batched=True)
tokenized_val = val_dataset.map(tokenize_function, remove_columns=["text"], batched=True)

# Collator for causal (non-masked) language modelling
data_collator = DataCollatorForLanguageModeling(mlm=False, tokenizer=tokenizer)

# LoRA adapters on the query / value projections
lora_config = LoraConfig(
    task_type="CAUSAL_LM",
    target_modules=["q_proj", "v_proj"],
    r=16,
    lora_alpha=32,
    lora_dropout=0.05,
    bias="none",
)
# Wrap the base model with the adapter configuration
model = get_peft_model(model, lora_config)

# Training arguments
output_dir = "/content/drive/MyDrive/your_fine_tuned_llm_dir"

# Resume only when output_dir really contains a "checkpoint-<step>" folder.
# The flag is later forwarded to trainer.train(); there, True means "continue
# from the last checkpoint found in output_dir", so the check looks for any
# checkpoint folder rather than for one specific, hard-coded path.
has_checkpoint = os.path.isdir(output_dir) and any(
    entry.startswith("checkpoint-")
    and entry[len("checkpoint-"):].isdigit()
    and os.path.isdir(os.path.join(output_dir, entry))
    for entry in os.listdir(output_dir)
)

training_args = TrainingArguments(
    output_dir=output_dir,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    gradient_accumulation_steps=4,
    max_steps=2000,  # Reduced from 4000 based on loss trends
    learning_rate=1e-4,
    fp16=True,
    logging_steps=50,
    save_steps=200,
    save_total_limit=2,
    eval_strategy="steps",
    eval_steps=50,
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
    greater_is_better=False,
    warmup_steps=100,
    lr_scheduler_type="cosine",
    gradient_checkpointing=True,
    gradient_checkpointing_kwargs={'use_reentrant': False},
    max_grad_norm=0.5,
    dataloader_num_workers=1,
    remove_unused_columns=False,
    resume_from_checkpoint=has_checkpoint,
    label_smoothing_factor=0.1,
    label_names=["labels"],  # Fix for the warning
)

# Wire model, data streams, collator and early stopping into the Trainer
trainer = Trainer(
    args=training_args,
    model=model,
    data_collator=data_collator,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_val,
    # Early stopping with a patience of 3
    callbacks=[EarlyStoppingCallback(early_stopping_patience=3)],
)

# Train
resume_flag = training_args.resume_from_checkpoint
if resume_flag:
    # A resume was requested: fall back to a fresh run if it cannot be honoured.
    try:
        trainer.train(resume_from_checkpoint=resume_flag)
    except ValueError as e:
        print(f"No valid checkpoint found or other error: {e}. Starting training from scratch.")
        trainer.train()
else:
    # No resume requested: let any error propagate instead of silently
    # launching a second full training run.
    trainer.train()

# Save model and tokenizer
model.save_pretrained("/content/drive/MyDrive/lora_adapters")
tokenizer.save_pretrained("/content/drive/MyDrive/lora_adapters")



Downloading...
From (original): https://drive.google.com/uc?id=1AYCccocBJsar7M_DGc-pljNM21wbU9mg
From (redirected): https://drive.google.com/uc?id=1AYCccocBJsar7M_DGc-pljNM21wbU9mg&confirm=t&uuid=8422f612-383e-4e13-bcc9-df6b3daaef92
To: /content/synthetic_data.json
100%|██████████| 404M/404M [00:03<00:00, 133MB/s]
The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json: 0.00B [00:00, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/655 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/861 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/269M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/132 [00:00<?, ?B/s]



<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize?ref=models
wandb: Paste an API key from your profile and hit enter:

 ··········


[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mniloynill17[0m ([33mniloynill17-rajshahi-university-of-engineering-technology[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`.


Step,Training Loss,Validation Loss
50,16.5202,4.024743
100,15.1465,3.51342
150,13.1586,3.06175
200,11.7515,2.84719
250,11.2103,2.772994
300,11.0214,2.743638
350,10.921,2.725133
400,10.8484,2.712803
450,10.8105,2.699375
500,10.7649,2.69511


Resume from Any Given Checkpoint

In [None]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, TrainingArguments, Trainer
from peft import LoraConfig, get_peft_model
from datasets import IterableDataset
import os

# Define paths
output_dir = "/content/drive/MyDrive/your_fine_tuned_llm_dir"
checkpoint_dir = output_dir

# Load model and tokenizer
model_name = "Your Preferred LLM (eg.TinyLlama/TinyLlama-1.1B-Chat-v1.0)"
tokenizer = AutoTokenizer.from_pretrained(model_name)
# The tokenizer is later called with padding="max_length", which needs a pad
# token; reuse EOS when the checkpoint does not define one.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype=torch.float16
).to("cuda" if torch.cuda.is_available() else "cpu")

# Configure LoRA
# NOTE(review): when resuming, these settings (r, target_modules, ...) presumably
# have to match the ones the checkpoint was trained with -- confirm before running.
lora_config = LoraConfig(
    r=8,
    lora_alpha=32,
    target_modules=["q_proj", "v_proj"],
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM"
)
model = get_peft_model(model, lora_config)

# Placeholder streaming source (swap in your real dataset)
def dummy_dataset_generator():
    """Yield 1000 records of the form {"text": "Sample text <i>"} for i = 0..999."""
    yield from ({"text": f"Sample text {idx}"} for idx in range(1000))

# Build the streaming dataset from the generator function defined above
dataset = IterableDataset.from_generator(dummy_dataset_generator)

# Tokenization function
def tokenize_function(examples):
    """Tokenize ``examples["text"]`` to fixed-length tensors and attach labels.

    Every sequence is truncated / padded to 512 tokens. The labels are a copy
    of the input ids in which padded positions (attention mask 0) are set to
    -100 so that they are ignored by the loss.
    """
    tokenized_inputs = tokenizer(
        examples["text"],
        padding="max_length",
        truncation=True,
        max_length=512,
        return_tensors="pt"
    )
    tokenized_inputs["labels"] = tokenized_inputs["input_ids"].clone()
    # Without this mask the loss would also be computed on the padding tokens,
    # which dominate short samples padded to 512 positions.
    tokenized_inputs["labels"][tokenized_inputs["attention_mask"] == 0] = -100
    return tokenized_inputs

# Tokenize in batches, then drop the raw text column
tokenized_dataset = dataset.map(tokenize_function, batched=True).remove_columns(["text"])

# Training configuration; checkpoints are written to checkpoint_dir
training_args = TrainingArguments(
    output_dir=checkpoint_dir,
    max_steps=10000,
    learning_rate=2e-4,
    per_device_train_batch_size=2,
    gradient_accumulation_steps=8,
    fp16=True,
    logging_steps=100,
    save_steps=1000,
    save_total_limit=2,
    dataloader_num_workers=1,
    remove_unused_columns=False,
)

# Trainer over the tokenized stream (no evaluation set is supplied)
trainer = Trainer(
    args=training_args,
    model=model,
    train_dataset=tokenized_dataset,
)

# Checkpoint directory to continue from (replace the placeholder path)
checkpoint_to_resume = "/content/drive/MyDrive/your_finetuned_llm/checkpoint-####"  # Set to your desired checkpoint path, e.g., "checkpoint-1000"

# Start fresh unless the value is a string naming an existing directory
if not isinstance(checkpoint_to_resume, str) or not os.path.isdir(checkpoint_to_resume):
    print(f"Checkpoint {checkpoint_to_resume} not found or invalid, starting from scratch.")
    trainer.train()
else:
    print(f"Resuming training from {checkpoint_to_resume}")
    trainer.train(resume_from_checkpoint=checkpoint_to_resume)

# Persist the adapter weights and the tokenizer side by side
model.save_pretrained("/content/drive/MyDrive/lora_adapters")
tokenizer.save_pretrained("/content/drive/MyDrive/lora_adapters")