# Code Generation

#Setup & Install Required Packages

In [None]:
!pip install -q unsloth accelerate peft trl transformers bitsandbytes datasets

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m46.2/46.2 kB[0m [31m1.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m192.7/192.7 kB[0m [31m9.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m318.9/318.9 kB[0m [31m13.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m76.0/76.0 MB[0m [31m10.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m491.2/491.2 kB[0m [31m22.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m7.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m183.9/183.9 kB[0m [31m11.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m143.5/143.5 kB[0m [31m7.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━

# Load Llama 3 (8B) with Unsloth

In [None]:
from unsloth import FastLanguageModel
import torch
import transformers

# Loading options are gathered first so they can be scanned and tweaked in
# one place, then handed to Unsloth's loader in a single call.
load_options = {
    "model_name": "unsloth/llama-3-8b-bnb-4bit",
    "max_seq_length": 2048,
    "dtype": torch.float16,
    "load_in_4bit": True,
}
model, tokenizer = FastLanguageModel.from_pretrained(**load_options)

NotImplementedError: Unsloth: No NVIDIA GPU found? Unsloth currently only supports GPUs!

# Define a Small Code Instruction Dataset (or use a small sample from an open dataset such as CodeAlpaca)

In [None]:
from datasets import Dataset

# Two hand-written instruction/response pairs. Each solution is listed line
# by line and joined with newlines; "input" stays empty for both tasks.
data = [
    {"instruction": task, "input": "", "output": "\n".join(solution_lines)}
    for task, solution_lines in (
        (
            "Write a Python function to reverse a string.",
            [
                "def reverse_string(s):",
                "    return s[::-1]",
            ],
        ),
        (
            "Write a function to check if a number is prime.",
            [
                "def is_prime(n):",
                "    if n <= 1:",
                "        return False",
                "    for i in range(2, int(n**0.5) + 1):",
                "        if n % i == 0:",
                "            return False",
                "    return True",
            ],
        ),
    )
]

# Wrap the list of example dicts in a `datasets.Dataset` for the later map/training steps.
dataset = Dataset.from_list(data)


# Apply the Llama 3 Chat Format

In [None]:
from unsloth.chat_templates import get_chat_template

# Attach the Llama 3 chat template to the tokenizer before rendering. The
# `get_chat_template` helper imported above was otherwise never applied, so
# `apply_chat_template` depended on whatever template the checkpoint shipped.
tokenizer = get_chat_template(tokenizer, chat_template="llama-3")


def format(example):
    """Render one instruction example as a single chat-formatted training string.

    Args:
        example: Mapping with "instruction", "input" and "output" string fields.

    Returns:
        The same mapping with an added "text" field holding the user/assistant
        exchange rendered through the tokenizer's chat template.
    """
    # Keep the instruction and its optional input on separate lines so they
    # are not fused into one run-on string when "input" is non-empty.
    user_content = example["instruction"]
    if example["input"]:
        user_content = user_content + "\n" + example["input"]

    messages = [
        {"role": "user", "content": user_content},
        {"role": "assistant", "content": example["output"]},
    ]
    example["text"] = tokenizer.apply_chat_template(messages, tokenize=False)
    return example


# Add the rendered "text" column to every row.
dataset = dataset.map(format)


#Prepare for Training

In [None]:
from transformers import TrainingArguments

# Call Unsloth's training-mode switch first, then wrap the model with LoRA
# adapters on the attention projections.
FastLanguageModel.for_training(model)

# LoRA hyper-parameters, collected in one mapping for readability.
lora_settings = dict(
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj"],
    r=16,
    lora_alpha=32,
    lora_dropout=0.05,
    bias="none",
    use_gradient_checkpointing=True,
    use_rslora=False,
    loftq_config=None,
    random_state=42,
)
model = FastLanguageModel.get_peft_model(model, **lora_settings)


#Training Arguments

In [None]:
# Hyper-parameters for a short fine-tuning run: 20 optimizer steps in fp16,
# with no intermediate checkpoints written.
training_args = TrainingArguments(
    output_dir="codegen_llama",
    per_device_train_batch_size=2,
    gradient_accumulation_steps=4,  # effective batch per device = 2 x 4 = 8
    warmup_steps=5,
    max_steps=20,
    learning_rate=2e-4,
    fp16=True,
    logging_steps=1,  # report the loss after every optimizer step
    save_strategy="no",
    # Turn off auto-detected experiment trackers. When left unset, an installed
    # Weights & Biases client stops training to ask for an API key, which
    # breaks non-interactive "Restart & Run All" execution.
    report_to="none",
)


#Train the Model

In [None]:
from trl import SFTTrainer

# Supervised fine-tuning over the pre-rendered "text" column of the dataset.
sft_kwargs = dict(
    model=model,
    tokenizer=tokenizer,
    train_dataset=dataset,
    dataset_text_field="text",
    max_seq_length=2048,
    args=training_args,
)
trainer = SFTTrainer(**sft_kwargs)

# Left as the final expression so the notebook displays the training summary.
trainer.train()


#Save Model

In [None]:
# Write the model and its tokenizer side by side into one directory.
save_dir = "llama-codegen-lora"
model.save_pretrained(save_dir)
tokenizer.save_pretrained(save_dir)


#Inference

In [None]:
from transformers import pipeline

# Text-generation pipeline around the fine-tuned model and its tokenizer.
pipe = pipeline("text-generation", model=model, tokenizer=tokenizer)

# A single user turn, rendered with the generation prompt appended so the
# model continues as the assistant.
chat_turns = [
    {"role": "user", "content": "Write a Python function to calculate factorial."}
]
prompt = tokenizer.apply_chat_template(
    chat_turns,
    tokenize=False,
    add_generation_prompt=True,
)

# Sample up to 200 new tokens and show the pipeline's generated text.
result = pipe(prompt, max_new_tokens=200, do_sample=True, temperature=0.7)
print(result[0]['generated_text'])


#Setup & Install Required Packages

In [None]:
!pip install -q unsloth accelerate peft trl transformers bitsandbytes datasets

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m46.2/46.2 kB[0m [31m719.9 kB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m192.7/192.7 kB[0m [31m8.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m318.9/318.9 kB[0m [31m11.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m76.0/76.0 MB[0m [31m7.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m491.2/491.2 kB[0m [31m16.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m4.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m183.9/183.9 kB[0m [31m12.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m143.5/143.5 kB[0m [31m8.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━

# Load Llama 3 (8B) with Unsloth

In [None]:
from unsloth import FastLanguageModel
import torch
import transformers

# Loading options are gathered first so they can be scanned and tweaked in
# one place, then handed to Unsloth's loader in a single call.
load_options = {
    "model_name": "unsloth/llama-3-8b-bnb-4bit",
    "max_seq_length": 2048,
    "dtype": torch.float16,
    "load_in_4bit": True,
}
model, tokenizer = FastLanguageModel.from_pretrained(**load_options)

🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.
🦥 Unsloth Zoo will now patch everything to make training faster!
==((====))==  Unsloth 2025.3.19: Fast Llama patching. Transformers: 4.50.3.
   \\   /|    Tesla T4. Num GPUs = 1. Max memory: 14.741 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.6.0+cu124. CUDA: 7.5. CUDA Toolkit: 12.4. Triton: 3.2.0
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.29.post3. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


model.safetensors:   0%|          | 0.00/5.70G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/198 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/50.6k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.09M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

# Define a Small Code Instruction Dataset (or use a small sample from an open dataset such as CodeAlpaca)

In [None]:
from datasets import Dataset

# Two hand-written instruction/response pairs. Each solution is listed line
# by line and joined with newlines; "input" stays empty for both tasks.
data = [
    {"instruction": task, "input": "", "output": "\n".join(solution_lines)}
    for task, solution_lines in (
        (
            "Write a Python function to reverse a string.",
            [
                "def reverse_string(s):",
                "    return s[::-1]",
            ],
        ),
        (
            "Write a function to check if a number is prime.",
            [
                "def is_prime(n):",
                "    if n <= 1:",
                "        return False",
                "    for i in range(2, int(n**0.5) + 1):",
                "        if n % i == 0:",
                "            return False",
                "    return True",
            ],
        ),
    )
]

# Wrap the list of example dicts in a `datasets.Dataset` for the later map/training steps.
dataset = Dataset.from_list(data)


# Apply the Llama 3 Chat Format

In [None]:
def format(example):
    """Render one instruction example as a Llama-3-style chat transcript.

    Args:
        example: Mapping with "instruction", "input" and "output" string fields.

    Returns:
        The same mapping, with an added "text" field containing one user turn
        (instruction plus optional input) followed by one assistant turn
        (output), each terminated by ``<|eot_id|>``.
    """
    user_turn = example["instruction"]
    if example["input"]:
        # Put the optional input on its own line; previously it was glued
        # directly onto the end of the instruction with no separator.
        user_turn = user_turn + "\n" + example["input"]

    example["text"] = (
        "<|begin_of_text|><|start_header_id|>user<|end_header_id|>\n"
        f"{user_turn}<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n"
        f'{example["output"]}<|eot_id|>'
    )
    return example

# Run `format` over every row so each example gains the rendered "text" column.
dataset = dataset.map(format)


Map:   0%|          | 0/2 [00:00<?, ? examples/s]

#Prepare for Training

In [None]:
from transformers import TrainingArguments

# Call Unsloth's training-mode switch first, then wrap the model with LoRA
# adapters on the attention projections.
FastLanguageModel.for_training(model)

# LoRA hyper-parameters, collected in one mapping for readability.
lora_settings = dict(
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj"],
    r=16,
    lora_alpha=32,
    lora_dropout=0.05,
    bias="none",
    use_gradient_checkpointing=True,
    use_rslora=False,
    loftq_config=None,
    random_state=42,
)
model = FastLanguageModel.get_peft_model(model, **lora_settings)


Unsloth: Dropout = 0 is supported for fast patching. You are using dropout = 0.05.
Unsloth will patch all other layers, except LoRA matrices, causing a performance hit.
Unsloth 2025.3.19 patched 32 layers with 0 QKV layers, 0 O layers and 0 MLP layers.


#Training Arguments

In [None]:
# Hyper-parameters for a short fine-tuning run: 20 optimizer steps in fp16,
# with no intermediate checkpoints written.
training_args = TrainingArguments(
    output_dir="codegen_llama",
    per_device_train_batch_size=2,
    gradient_accumulation_steps=4,  # effective batch per device = 2 x 4 = 8
    warmup_steps=5,
    max_steps=20,
    learning_rate=2e-4,
    fp16=True,
    logging_steps=1,  # report the loss after every optimizer step
    save_strategy="no",
    # Turn off auto-detected experiment trackers. When left unset, an installed
    # Weights & Biases client stops training to ask for an API key, which
    # breaks non-interactive "Restart & Run All" execution.
    report_to="none",
)


#Train the Model

In [None]:
from trl import SFTTrainer

# Supervised fine-tuning over the pre-rendered "text" column of the dataset.
sft_kwargs = dict(
    model=model,
    tokenizer=tokenizer,
    train_dataset=dataset,
    dataset_text_field="text",
    max_seq_length=2048,
    args=training_args,
)
trainer = SFTTrainer(**sft_kwargs)

# Left as the final expression so the notebook displays the training summary.
trainer.train()


Unsloth: Tokenizing ["text"] (num_proc=2):   0%|          | 0/2 [00:00<?, ? examples/s]

==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 2 | Num Epochs = 20 | Total steps = 20
O^O/ \_/ \    Batch size per device = 2 | Gradient accumulation steps = 4
\        /    Data Parallel GPUs = 1 | Total batch size (2 x 4 x 1) = 8
 "-____-"     Trainable parameters = 13,631,488/8,000,000,000 (0.17% trained)
[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter:

 ··········


[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mprafulcalendar[0m ([33mprafulcalendar-san-jose-state-university[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


Step,Training Loss
1,3.7674
2,3.7674
3,3.7674
4,3.7674
5,3.7674
6,3.6065
7,3.3878
8,3.1282
9,2.8029
10,2.3977


TrainOutput(global_step=20, training_loss=2.5835011184215544, metrics={'train_runtime': 225.8433, 'train_samples_per_second': 0.708, 'train_steps_per_second': 0.089, 'total_flos': 133529557401600.0, 'train_loss': 2.5835011184215544})

#Save Model

In [None]:
# Write the model and its tokenizer side by side into one directory.
save_dir = "llama-codegen-lora"
model.save_pretrained(save_dir)
tokenizer.save_pretrained(save_dir)


('llama-codegen-lora/tokenizer_config.json',
 'llama-codegen-lora/special_tokens_map.json',
 'llama-codegen-lora/tokenizer.json')

#Inference

In [None]:
# Inference prompt: one user turn followed by an open assistant header, so
# the model's continuation is the assistant's reply.
prompt = (
    "<|begin_of_text|><|start_header_id|>user<|end_header_id|>\n"
    "Write a Python function to calculate factorial.<|eot_id|>"
    "<|start_header_id|>assistant<|end_header_id|>\n"
)

In [None]:
inputs = tokenizer(prompt, return_tensors="pt").to("cuda")

# The training text ends every turn with <|eot_id|>, so generation must stop
# on that token as well as on the tokenizer's default EOS. Otherwise the model
# keeps emitting further turns after its answer.
stop_token_ids = [tokenizer.eos_token_id]
eot_token_id = tokenizer.convert_tokens_to_ids("<|eot_id|>")
if (
    eot_token_id is not None
    and eot_token_id != tokenizer.unk_token_id
    and eot_token_id not in stop_token_ids
):
    stop_token_ids.append(eot_token_id)

with torch.no_grad():
    outputs = model.generate(
        **inputs,
        max_new_tokens=200,
        temperature=0.7,
        top_p=0.95,
        do_sample=True,
        eos_token_id=stop_token_ids,
        pad_token_id=tokenizer.eos_token_id,
    )

# Decode only the newly generated tokens. Splitting the full transcript on the
# word "assistant" picked the text after its LAST occurrence, which came out
# empty whenever the model went on to emit another assistant header.
prompt_length = inputs["input_ids"].shape[1]
response = tokenizer.decode(outputs[0][prompt_length:], skip_special_tokens=True)
print("=== Assistant Response ===")
print(response.strip())


=== Assistant Response ===



In [None]:
import torch

# Step 1: Prompt with proper formatting (one user turn, open assistant header)
prompt = """<|begin_of_text|><|start_header_id|>user<|end_header_id|>
Write a Python function to calculate factorial.<|eot_id|><|start_header_id|>assistant<|end_header_id|>
"""

# Step 2: Tokenize input
inputs = tokenizer(prompt, return_tensors="pt").to("cuda")

# Step 3: Prefer <|eot_id|> as the end token, falling back to the tokenizer's
# default EOS when the token is unknown to this tokenizer.
try:
    eos_token_id = tokenizer.convert_tokens_to_ids("<|eot_id|>")
    if eos_token_id is None or eos_token_id == tokenizer.unk_token_id:
        eos_token_id = tokenizer.eos_token_id
except Exception:
    # Narrowed from a bare `except:` so that KeyboardInterrupt / SystemExit
    # are no longer swallowed by the fallback.
    eos_token_id = tokenizer.eos_token_id

# Step 4: Generate the response
with torch.no_grad():
    outputs = model.generate(
        **inputs,
        max_new_tokens=150,
        temperature=0.7,
        top_p=0.9,
        do_sample=True,
        eos_token_id=eos_token_id,
        pad_token_id=tokenizer.eos_token_id,
    )

# Step 5: Decode the full sequence, keeping special tokens so the assistant
# header can be located when the reply is extracted.
decoded = tokenizer.decode(outputs[0], skip_special_tokens=False)


=== Assistant Response ===

def factorial(n):
    if n == 0:
        return 1
    return n * factorial(n-1)GuidId


In [None]:
# Step 6: Extract clean assistant response
print("=== Assistant Response ===\n")

assistant_header = "<|start_header_id|>assistant<|end_header_id|>"
_, header_found, assistant_part = decoded.partition(assistant_header)

if header_found:
    # Cut only at the next special-token marker ("<|..."). The earlier version
    # also cut at the plain words "user", "assistant" and "content", which
    # truncated valid generated code that happened to contain those words.
    print(assistant_part.split("<|")[0].strip())
else:
    # No assistant header in the decoded text: show everything instead of
    # hiding the failure behind a bare `except:`.
    print("⚠️ Couldn't parse the assistant's response. Here's the raw output instead:\n")
    print(decoded)

=== Assistant Response ===

def factorial(n):
    if n == 0:
        return 1
    return n * factorial(n-1)GuidId
