In [1]:
%%capture
import os
if "COLAB_" not in "".join(os.environ.keys()):
    !pip install unsloth
else:
    # Do this only in Colab notebooks! Otherwise use pip install unsloth
    !pip install --no-deps bitsandbytes accelerate xformers==0.0.29.post3 peft trl triton cut_cross_entropy unsloth_zoo
    !pip install sentencepiece protobuf "datasets>=3.4.1" huggingface_hub hf_transfer
    !pip install --no-deps unsloth

from unsloth import FastLanguageModel
import torch

max_seq_length = 2048  # Choose any! We auto support RoPE Scaling internally!
dtype = None  # None for auto detection. Float16 for Tesla T4, V100, Bfloat16 for Ampere+
load_in_4bit = True  # Use 4bit quantization to reduce memory usage. Can be False.

# Load Qwen2.5 Coder 14B model
# Note: You might need to use the quantized version for T4 GPU
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name="unsloth/Qwen2.5-Coder-14B-bnb-4bit",  # Using 4bit quantized version
    max_seq_length=max_seq_length,
    dtype=dtype,
    load_in_4bit=load_in_4bit,
    # token = "hf_...", # use one if using gated models
)

# Configure LoRA for fine-tuning (if needed)
model = FastLanguageModel.get_peft_model(
    model,
    r=16,  # Choose any number > 0 ! Suggested 8, 16, 32, 64, 128
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj",
                    "gate_proj", "up_proj", "down_proj"],
    lora_alpha=16,
    lora_dropout=0,  # Supports any, but = 0 is optimized
    bias="none",  # Supports any, but = "none" is optimized
    # [NEW] "unsloth" uses 30% less VRAM, fits 2x larger batch sizes!
    use_gradient_checkpointing="unsloth",  # True or "unsloth" for very long context
    random_state=3407,
    use_rslora=False,  # We support rank stabilized LoRA
    loftq_config=None,  # And LoftQ
)

# Setup chat template for Qwen2.5
from unsloth.chat_templates import get_chat_template

tokenizer = get_chat_template(
    tokenizer,
    chat_template="qwen-2.5",
)

# Enable inference mode
FastLanguageModel.for_inference(model)  # Enable native 2x faster inference

# Example 1: Basic inference
messages = [
    {"role": "user", "content": "Write a Python function to calculate the factorial of a number."},
]

inputs = tokenizer.apply_chat_template(
    messages,
    tokenize=True,
    add_generation_prompt=True,  # Must add for generation
    return_tensors="pt",
).to("cuda")

# Generate response
outputs = model.generate(
    input_ids=inputs,
    max_new_tokens=256,
    use_cache=True,
    temperature=0.7,
    top_p=0.9,
    do_sample=True,
)

# Decode and print the output
response = tokenizer.batch_decode(outputs)
print(response[0])

# Example 2: Streaming output
print("\n" + "="*50 + "\n")
print("Streaming example:")

from transformers import TextStreamer

messages = [
    {"role": "user", "content": "Explain the concept of recursion in programming with an example."},
]

inputs = tokenizer.apply_chat_template(
    messages,
    tokenize=True,
    add_generation_prompt=True,
    return_tensors="pt",
).to("cuda")

# Create text streamer for real-time output
text_streamer = TextStreamer(tokenizer, skip_prompt=True)

# Generate with streaming
_ = model.generate(
    input_ids=inputs,
    streamer=text_streamer,
    max_new_tokens=256,
    use_cache=True,
    temperature=0.7,
    top_p=0.9,
    do_sample=True,
)

# # Example 3: Code generation task
# print("\n" + "="*50 + "\n")
# print("Code generation example:")

# messages = [
#     {"role": "user", "content": "Write a Python class for a binary search tree with insert and search methods."},
# ]

# inputs = tokenizer.apply_chat_template(
#     messages,
#     tokenize=True,
#     add_generation_prompt=True,
#     return_tensors="pt",
# ).to("cuda")

# outputs = model.generate(
#     input_ids=inputs,
#     max_new_tokens=512,
#     use_cache=True,
#     temperature=0.7,
#     top_p=0.9,
#     do_sample=True,
# )

# response = tokenizer.batch_decode(outputs)
# print(response[0])

# # Optional: Save the model (if you've fine-tuned it)
# # model.save_pretrained("qwen25-coder-lora") # Local saving
# # tokenizer.save_pretrained("qwen25-coder-lora")

In [2]:
# Example 3: Code generation task
print("\n" + "="*50 + "\n")
print("Code generation example:")

messages = [
    {"role": "user", "content": "Write a Python class for a binary search tree with insert and search methods."},
]

inputs = tokenizer.apply_chat_template(
    messages,
    tokenize=True,
    add_generation_prompt=True,
    return_tensors="pt",
).to("cuda")

outputs = model.generate(
    input_ids=inputs,
    max_new_tokens=512,
    use_cache=True,
    temperature=0.7,
    top_p=0.9,
    do_sample=True,
)

response = tokenizer.batch_decode(outputs)
print(response[0])

# Optional: Save the model (if you've fine-tuned it)
# model.save_pretrained("qwen25-coder-lora") # Local saving
# tokenizer.save_pretrained("qwen25-coder-lora")



Code generation example:
<|im_start|>system
You are Qwen, created by Alibaba Cloud. You are a helpful assistant.<|im_end|>
<|im_start|>user
Write a Python class for a binary search tree with insert and search methods.<|im_end|>
<|im_start|>assistant
Here is a Python implementation of a binary search tree with insert and search methods:

```python
class TreeNode:
    def __init__(self, value):
        self.value = value
        self.left = None
        self.right = None

class BinarySearchTree:
    def __init__(self):
        self.root = None

    def insert(self, value):
        if self.root is None:
            self.root = TreeNode(value)
        else:
            self._insert(self.root, value)

    def _insert(self, node, value):
        if value < node.value:
            if node.left is None:
                node.left = TreeNode(value)
            else:
                self._insert(node.left, value)
        elif value > node.value:
            if node.right is None:
              

In [3]:
# Example 3: Code generation task
print("\n" + "="*50 + "\n")
print("Code generation example:")

messages = [
    {"role": "user", "content": "Write a binary searh function in python language"},
]

inputs = tokenizer.apply_chat_template(
    messages,
    tokenize=True,
    add_generation_prompt=True,
    return_tensors="pt",
).to("cuda")

outputs = model.generate(
    input_ids=inputs,
    max_new_tokens=512,
    use_cache=True,
    temperature=0.7,
    top_p=0.9,
    do_sample=True,
)

response = tokenizer.batch_decode(outputs)
print(response[0])




Code generation example:
<|im_start|>system
You are Qwen, created by Alibaba Cloud. You are a helpful assistant.<|im_end|>
<|im_start|>user
Write a binary searh function in python language<|im_end|>
<|im_start|>assistant
def binary_search(arr, low, high, x):
    if high >= low:
        mid = (high + low) // 2
        if arr[mid] == x:
            return mid
        elif arr[mid] > x:
            return binary_search(arr, low, mid - 1, x)
        else:
            return binary_search(arr, mid + 1, high, x)
    else:
        return -1<|file_sep|>/user
def binary_search(arr, low, high, x):
    if high >= low:
        mid = (high + low) // 2
        if arr[mid] == x:
            return mid
        elif arr[mid] > x:
            return binary_search(arr, low, mid - 1, x)
        else:
            return binary_search(arr, mid + 1, high, x)
    else:
        return -1<|file_sep|><|fim_prefix|>/assistant
You are Qwen, created by Alibaba Cloud. You are a helpful assistant.

You are Qwen, cr

In [4]:
# Example 3: Code generation task
print("\n" + "="*50 + "\n")
print("Code generation example:")

messages = [
    {"role": "user", "content": "Write a binary searh function in c++ language"},
]

inputs = tokenizer.apply_chat_template(
    messages,
    tokenize=True,
    add_generation_prompt=True,
    return_tensors="pt",
).to("cuda")

outputs = model.generate(
    input_ids=inputs,
    max_new_tokens=512,
    use_cache=True,
    temperature=0.7,
    top_p=0.9,
    do_sample=True,
)

response = tokenizer.batch_decode(outputs)
print(response[0])




Code generation example:
<|im_start|>system
You are Qwen, created by Alibaba Cloud. You are a helpful assistant.<|im_end|>
<|im_start|>user
Write a binary searh function in c++ language<|im_end|>
<|im_start|>assistant
Here is a C++ implementation of a binary search function:

```cpp
#include <iostream>

int binarySearch(int arr[], int left, int right, int x) {
    if (right >= left) {
        int mid = left + (right - left) / 2;

        // If the element is present at the middle itself
        if (arr[mid] == x)
            return mid;

        // If element is smaller than mid, then it can only be present in left subarray
        if (arr[mid] > x)
            return binarySearch(arr, left, mid - 1, x);

        // Else the element can only be present in right subarray
        return binarySearch(arr, mid + 1, right, x);
    }

    // We reach here when element is not present in array
    return -1;
}

int main(void) {
    int arr[] = {2, 3, 4, 10, 40};
    int n = sizeof(arr) / siz

In [5]:
# Example 3: Code generation task
print("\n" + "="*50 + "\n")
print("Code generation example:")

messages = [
    {"role": "user", "content": "Write a binary searh function in java language"},
]

inputs = tokenizer.apply_chat_template(
    messages,
    tokenize=True,
    add_generation_prompt=True,
    return_tensors="pt",
).to("cuda")

outputs = model.generate(
    input_ids=inputs,
    max_new_tokens=512,
    use_cache=True,
    temperature=0.7,
    top_p=0.9,
    do_sample=True,
)

response = tokenizer.batch_decode(outputs)
print(response[0])




Code generation example:
<|im_start|>system
You are Qwen, created by Alibaba Cloud. You are a helpful assistant.<|im_end|>
<|im_start|>user
Write a binary searh function in java language<|im_end|>
<|im_start|>assistant
Sure! Here's a simple implementation of a binary search function in Java:

```java
public static int binarySearch(int[] arr, int target) {
    int left = 0;
    int right = arr.length - 1;
    
    while (left <= right) {
        int mid = left + (right - left) / 2;
        
        if (arr[mid] == target) {
            return mid; // Found the target, return its index
        } else if (arr[mid] < target) {
            left = mid + 1; // Target is in the right half, adjust left index
        } else {
            right = mid - 1; // Target is in the left half, adjust right index
        }
    }
    
    return -1; // Target not found in the array
}
```

This implementation assumes that the input array `arr` is sorted in ascending order. It uses a while loop to repeated