In [1]:
from unsloth import FastModel
import torch

ü¶• Unsloth: Will patch your computer to enable 2x faster free finetuning.
ü¶• Unsloth Zoo will now patch everything to make training faster!




In [2]:
model, tokenizer = FastModel.from_pretrained(
    model_name = "unsloth/Meta-Llama-3.1-8B-Instruct-bnb-4bit",
    max_seq_length = 4096,
    load_in_4bit = True,
    load_in_8bit = False,
)

==((====))==  Unsloth 2025.9.7: Fast Llama patching. Transformers: 4.55.4.
   \\   /|    NVIDIA A100-PCIE-40GB. Num GPUs = 1. Max memory: 39.494 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.8.0+cu128. CUDA: 8.0. CUDA Toolkit: 12.8. Triton: 3.4.0
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.32.post2. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


In [3]:
model = FastModel.get_peft_model(
    model,
    finetune_vision_layers     = False,
    finetune_language_layers   = True,
    finetune_attention_modules = True,
    finetune_mlp_modules       = True,
    
    r = 32,
    lora_alpha = 64,
    lora_dropout = 0,
    bias = "none",
    random_state = 3407,
)

Unsloth: Making `model.base_model.model.model` require gradients


In [4]:
from unsloth.chat_templates import get_chat_template
tokenizer = get_chat_template(
    tokenizer,
    chat_template = "llama-3.1",
)

from datasets import load_dataset

In [65]:
# Load from your JSONL file
dataset = load_dataset("json", data_files="data/combined_dataset.jsonl", split="train")
print("Dataset structure:")
print(dataset[0])
print("\nDataset columns:", dataset.column_names)
print("Dataset size:", len(dataset))

Dataset structure:
{'instruction': 'Natalia sold clips to 48 of her friends in April, and then she sold half as many clips in May. How many clips did Natalia sell altogether in April and May?', 'input': '', 'output': 'Natalia sold 48/2 = <<48/2=24>>24 clips in May.\nNatalia sold 48+24 = <<48+24=72>>72 clips altogether in April and May.\n#### 72'}

Dataset columns: ['instruction', 'input', 'output']
Dataset size: 84059


In [66]:
def formatting_prompts_func(examples):
    instructions = examples["instruction"]
    inputs = examples["input"]
    outputs = examples["output"]
    
    messages_list = []
    for instruction, input_text, output in zip(instructions, inputs, outputs):
        # Combine instruction and input if input exists
        if input_text and input_text.strip():
            user_message = f"{instruction}\n\n{input_text}"
        else:
            user_message = instruction
            
        messages = [
            {"role": "user", "content": user_message},
            {"role": "assistant", "content": output}
        ]
        messages_list.append(messages)
    
    texts = [
        tokenizer.apply_chat_template(
            message, 
            tokenize=False, 
            add_generation_prompt=False
        ).removeprefix('<|begin_of_text|>')
        for message in messages_list
    ]
    return {"text": texts}

In [67]:
dataset = dataset.map(formatting_prompts_func, batched=True)

Map:   0%|          | 0/84059 [00:00<?, ? examples/s]

In [68]:
# Cell 8: Check formatted text
print("Sample formatted text:")
print(dataset[0]["text"])

Sample formatted text:
<|start_header_id|>system<|end_header_id|>

Cutting Knowledge Date: December 2023
Today Date: 26 July 2024

<|eot_id|><|start_header_id|>user<|end_header_id|>

Natalia sold clips to 48 of her friends in April, and then she sold half as many clips in May. How many clips did Natalia sell altogether in April and May?<|eot_id|><|start_header_id|>assistant<|end_header_id|>

Natalia sold 48/2 = <<48/2=24>>24 clips in May.
Natalia sold 48+24 = <<48+24=72>>72 clips altogether in April and May.
#### 72<|eot_id|>


In [9]:
import wandb
wandb.login()

[34m[1mwandb[0m: Currently logged in as: [33mkushalramaiya[0m to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


True

In [69]:
wandb.init(
    project="bigdata-llama-finetuning",  # Your project name
    name="llama-3.1-8b-combined-run-2",   # This run's name
    config={
        "model": "Meta-Llama-3.1-8B-Instruct",
        "dataset": "combined_dataset",
        "lora_r": 32,
        "lora_alpha": 64,
        "learning_rate": 0.00019,
        "max_steps": 25,
        "batch_size": 1,
    }
)

0,1
train/epoch,‚ñÅ‚ñÅ‚ñÇ‚ñÇ‚ñÇ‚ñÇ‚ñÉ‚ñÉ‚ñÉ‚ñÑ‚ñÑ‚ñÑ‚ñÑ‚ñÖ‚ñÖ‚ñÖ‚ñÜ‚ñÜ‚ñÜ‚ñá‚ñá‚ñá‚ñá‚ñà‚ñà‚ñà
train/global_step,‚ñÅ‚ñÅ‚ñÇ‚ñÇ‚ñÇ‚ñÇ‚ñÉ‚ñÉ‚ñÉ‚ñÑ‚ñÑ‚ñÑ‚ñÖ‚ñÖ‚ñÖ‚ñÖ‚ñÜ‚ñÜ‚ñÜ‚ñá‚ñá‚ñá‚ñá‚ñà‚ñà‚ñà
train/grad_norm,‚ñá‚ñÖ‚ñÉ‚ñÇ‚ñÅ‚ñÅ‚ñÉ‚ñÖ‚ñÇ‚ñÇ‚ñÅ‚ñÖ‚ñÇ‚ñÇ‚ñÇ‚ñÇ‚ñÅ‚ñÇ‚ñÑ‚ñÉ‚ñÇ‚ñÇ‚ñÉ‚ñÅ‚ñà
train/learning_rate,‚ñÅ‚ñÇ‚ñÑ‚ñÖ‚ñá‚ñà‚ñà‚ñà‚ñà‚ñá‚ñá‚ñá‚ñÜ‚ñÜ‚ñÖ‚ñÖ‚ñÑ‚ñÉ‚ñÉ‚ñÇ‚ñÇ‚ñÇ‚ñÅ‚ñÅ‚ñÅ
train/loss,‚ñà‚ñá‚ñÉ‚ñÇ‚ñÅ‚ñÅ‚ñÇ‚ñÇ‚ñÉ‚ñÇ‚ñÅ‚ñÉ‚ñÇ‚ñÇ‚ñÅ‚ñÅ‚ñÅ‚ñÇ‚ñÇ‚ñÇ‚ñÇ‚ñÇ‚ñÇ‚ñÅ‚ñÜ

0,1
total_flos,73808773373952.0
train/epoch,0.00035
train/global_step,25.0
train/grad_norm,144.89449
train/learning_rate,0.0
train/loss,6.6018
train_loss,1.85396
train_runtime,8.6101
train_samples_per_second,2.904
train_steps_per_second,2.904


In [70]:
from trl import SFTTrainer, SFTConfig

trainer = SFTTrainer(
    model = model,
    tokenizer = tokenizer,
    train_dataset = dataset,
    eval_dataset = None,
    args = SFTConfig(
        dataset_text_field = "text",
        per_device_train_batch_size = 1,
        gradient_accumulation_steps = 1,
        warmup_steps = 5,
        max_steps = 25,  # Adjust based on your dataset size
        learning_rate = 0.00019,
        logging_steps = 1,
        optim = "paged_adamw_8bit",
        weight_decay = 0.01,
        lr_scheduler_type = "cosine",
        seed = 3407,
        report_to = "wandb",
        fp16 = False,
        max_grad_norm = 0.1,
    ),
)

Unsloth: Tokenizing ["text"] (num_proc=68):   0%|          | 0/84059 [00:00<?, ? examples/s]

In [71]:
from unsloth.chat_templates import train_on_responses_only
trainer = train_on_responses_only(
    trainer,
    instruction_part = "<|start_header_id|>user<|end_header_id|>\n\n",
    response_part = "<|start_header_id|>assistant<|end_header_id|>\n\n",
)

Map (num_proc=64):   0%|          | 0/84059 [00:00<?, ? examples/s]

In [72]:
gpu_stats = torch.cuda.get_device_properties(0)
start_gpu_memory = round(torch.cuda.max_memory_reserved() / 1024 / 1024 / 1024, 3)
max_memory = round(gpu_stats.total_memory / 1024 / 1024 / 1024, 3)
print(f"GPU = {gpu_stats.name}. Max memory = {max_memory} GB.")
print(f"{start_gpu_memory} GB of memory reserved.")

GPU = NVIDIA A100-PCIE-40GB. Max memory = 39.494 GB.
7.117 GB of memory reserved.


In [73]:
# Cell 12: Train the model
trainer_stats = trainer.train()

==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 84,059 | Num Epochs = 1 | Total steps = 25
O^O/ \_/ \    Batch size per device = 1 | Gradient accumulation steps = 1
\        /    Data Parallel GPUs = 1 | Total batch size (1 x 1 x 1) = 1
 "-____-"     Trainable parameters = 83,886,080 of 8,114,147,328 (1.03% trained)


Step,Training Loss
1,0.7858
2,1.3858
3,1.1013
4,0.8515
5,0.2914
6,1.0366
7,0.0906
8,0.0087
9,0.0098
10,1.2952


In [74]:
# Cell 13: Calculate memory usage
used_memory = round(torch.cuda.max_memory_reserved() / 1024 / 1024 / 1024, 3)
used_memory_for_lora = round(used_memory - start_gpu_memory, 3)
used_percentage = round(used_memory / max_memory * 100, 3)
lora_percentage = round(used_memory_for_lora / max_memory * 100, 3)

In [75]:
# Cell 14: Print training stats
print(f"{trainer_stats.metrics['train_runtime']} seconds used for training.")
print(f"{round(trainer_stats.metrics['train_runtime']/60, 2)} minutes used for training.")
print(f"Peak reserved memory = {used_memory} GB.")
print(f"Peak reserved memory for training = {used_memory_for_lora} GB.")


8.5985 seconds used for training.
0.14 minutes used for training.
Peak reserved memory = 7.117 GB.
Peak reserved memory for training = 0.0 GB.


In [76]:
messages = [{
    "role": "user", 
    "content": "A has 10 chips, he shares half with B, how many does he have now?"
}]
text = tokenizer.apply_chat_template(
    messages,
    add_generation_prompt = True,
    tokenize = False,
)


In [77]:
from transformers import TextStreamer
print("\nTesting fine-tuned model:")
print("Input prompt:", messages[0]["content"][:100] + "...")
output = model.generate(
    **tokenizer(text, return_tensors="pt").to("cuda"),
    max_new_tokens = 192,
    temperature = 1.0,
    top_p = 0.8,
    top_k = 32,
    streamer = TextStreamer(tokenizer, skip_prompt=True),
)


Testing fine-tuned model:
Input prompt: A has 10 chips, he shares half with B, how many does he have now?...
5<|eot_id|>


In [78]:
# Cell 17: Save the model
project_name="combined_llama_3.1-8b-combined-finetuned"
model.save_pretrained(f"outputs/group_project_{project_name}")
tokenizer.save_pretrained(f"outputs/group_project_{project_name}")

print(f"\nModel saved to: outputs/group_project_{project_name}")


Model saved to: outputs/group_project_combined_llama_3.1-8b-combined-finetuned


In [79]:
test_questions = [
    "Weng earns $12 an hour for babysitting. Yesterday, she just did 50 minutes of babysitting. How much did she earn?",
    "Betty is saving money for a new wallet which costs $100. Betty has only half of the money she needs. Her parents decided to give her $15 for that purpose, and her grandparents twice as much as her parents. How much more money does Betty need to buy the wallet?",
    "If John has 15 apples and gives away 7, how many does he have left?"
]

for i, question in enumerate(test_questions, 1):
    messages = [{"role": "user", "content": question}]
    text = tokenizer.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)
    
    output = model.generate(
        **tokenizer(text, return_tensors="pt").to("cuda"),
        max_new_tokens=256,
        temperature=1.0,
        top_p=0.8,
        top_k=32,
    )
    
    output_text = tokenizer.decode(output[0], skip_special_tokens=True)
    
    
    print(f"\n--- Test {i} ---")
    print(f"Question: {question}")
    print(f"Output : {output_text}")


--- Test 1 ---
Question: Weng earns $12 an hour for babysitting. Yesterday, she just did 50 minutes of babysitting. How much did she earn?
Output : system

Cutting Knowledge Date: December 2023
Today Date: 26 July 2024

user

Weng earns $12 an hour for babysitting. Yesterday, she just did 50 minutes of babysitting. How much did she earn?assistant

$6

--- Test 2 ---
Question: Betty is saving money for a new wallet which costs $100. Betty has only half of the money she needs. Her parents decided to give her $15 for that purpose, and her grandparents twice as much as her parents. How much more money does Betty need to buy the wallet?
Output : system

Cutting Knowledge Date: December 2023
Today Date: 26 July 2024

user

Betty is saving money for a new wallet which costs $100. Betty has only half of the money she needs. Her parents decided to give her $15 for that purpose, and her grandparents twice as much as her parents. How much more money does Betty need to buy the wallet?assistant

1

In [22]:
# from huggingface_hub import HfApi, login
# login()

In [60]:
from huggingface_hub import HfApi,create_repo

api = HfApi()

In [82]:
# Your model details
model_path = f"outputs/group_project_{project_name}"  # Local path to your saved model
repo_id = "KushalRamaiya/BigData_llama-3.1-8b-combined"  # Your desired repo name


In [83]:
try:
    create_repo(
        repo_id=repo_id,
        repo_type="model",
        private=False,  # Set to True for private repo
        exist_ok=True,  # Won't error if repo already exists
    )
    print(f"‚úÖ Repository created: {repo_id}")
except Exception as e:
    print(f"Note: {e}")

‚úÖ Repository created: KushalRamaiya/BigData_llama-3.1-8b-combined


In [84]:
# Step 2: Upload the entire folder
api.upload_folder(
    folder_path=model_path,
    repo_id=repo_id,
    repo_type="model",
)

print(f"‚úÖ Model uploaded to: https://huggingface.co/{repo_id}")

Processing Files (0 / 0): |          |  0.00B /  0.00B            

New Data Upload: |          |  0.00B /  0.00B            

‚úÖ Model uploaded to: https://huggingface.co/KushalRamaiya/BigData_llama-3.1-8b-combined
