In [None]:
!pip install "unsloth[colab-new] @ git+https://github.com/unslothai/unsloth.git"
!pip install --no-deps xformers "trl<0.9.0" peft accelerate bitsandbytes
!pip install datasets

Collecting unsloth@ git+https://github.com/unslothai/unsloth.git (from unsloth[colab-new]@ git+https://github.com/unslothai/unsloth.git)
  Cloning https://github.com/unslothai/unsloth.git to /tmp/pip-install-o6utsech/unsloth_02cd51e366994a139547c6c9118b5ce3
  Running command git clone --filter=blob:none --quiet https://github.com/unslothai/unsloth.git /tmp/pip-install-o6utsech/unsloth_02cd51e366994a139547c6c9118b5ce3
  Resolved https://github.com/unslothai/unsloth.git to commit 6c234d5a66adb76b9b93fb0f2445648199d88e66
  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Collecting unsloth_zoo>=2025.3.17 (from unsloth@ git+https://github.com/unslothai/unsloth.git->unsloth[colab-new]@ git+https://github.com/unslothai/unsloth.git)
  Downloading unsloth_zoo-2025.3.17-py3-none-any.whl.metadata (8.0 kB)
Collecting tyro (from unsloth@ git+https://github.com/unslothai/unsloth.g

Collecting xformers
  Downloading xformers-0.0.29.post3-cp311-cp311-manylinux_2_28_x86_64.whl.metadata (1.0 kB)
Collecting trl<0.9.0
  Downloading trl-0.8.6-py3-none-any.whl.metadata (11 kB)
Downloading xformers-0.0.29.post3-cp311-cp311-manylinux_2_28_x86_64.whl (43.4 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.4/43.4 MB[0m [31m15.9 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading trl-0.8.6-py3-none-any.whl (245 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m245.2/245.2 kB[0m [31m23.4 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: xformers, trl
  Attempting uninstall: trl
    Found existing installation: trl 0.15.2
    Uninstalling trl-0.15.2:
      Successfully uninstalled trl-0.15.2
Successfully installed trl-0.8.6 xformers-0.0.29.post3


In [None]:
import torch
from unsloth import FastLanguageModel
from trl import SFTTrainer
from transformers import TrainingArguments
from datasets import load_dataset
import os
from huggingface_hub import login
import gc

🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.
🦥 Unsloth Zoo will now patch everything to make training faster!


# Classification using Chat Templates

In [None]:
model_name = "unsloth/tinyllama-bnb-4bit"
max_seq_length = 1024 # Can be smaller for classification tasks
dtype = None
load_in_4bit = True
output_directory = "tinyllama_classification_adapters"
dataset_subset_size = 1200
training_max_steps = 40

print(f"Configuration:")
print(f"  Model: {model_name}")
print(f"  Max Seq Length: {max_seq_length}")
print(f"  Dataset Size: {dataset_subset_size}")
print(f"  Max Steps: {training_max_steps}")
print(" Configuration Set ")

Configuration:
  Model: unsloth/tinyllama-bnb-4bit
  Max Seq Length: 1024
  Dataset Size: 1200
  Max Steps: 40
 Configuration Set 


In [None]:
import time
start_time = time.time()
print(f"Loading model ({model_name})...")
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = model_name,
    max_seq_length = max_seq_length,
    dtype = dtype,
    load_in_4bit = load_in_4bit,
)
end_time = time.time()
print(f"Model loaded in {end_time - start_time:.2f}s.")

chatml_template = """{% for message in messages %}{{'<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>' + '\n'}}{% endfor %}{% if add_generation_prompt %}{{'<|im_start|>assistant\n'}}{% endif %}"""
tokenizer.chat_template = chatml_template
print("Manually set tokenizer.chat_template to ChatML format.")

Loading model (unsloth/tinyllama-bnb-4bit)...
==((====))==  Unsloth 2025.3.19: Fast Llama patching. Transformers: 4.50.3.
   \\   /|    Tesla T4. Num GPUs = 1. Max memory: 14.741 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.6.0+cu124. CUDA: 7.5. CUDA Toolkit: 12.4. Triton: 3.2.0
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.29.post3. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


model.safetensors:   0%|          | 0.00/762M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/948 [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/438 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.84M [00:00<?, ?B/s]

Model loaded in 32.08s.
Manually set tokenizer.chat_template to ChatML format.


In [None]:
print("Configuring LoRA...")
model = FastLanguageModel.get_peft_model(
    model,
    r = 8, # Smaller rank might be sufficient for simpler tasks
    lora_alpha = 16,
    lora_dropout = 0, # Use 0 for fastest patching
    bias = "none",
    use_gradient_checkpointing = True,
    random_state = 3407,
    target_modules = ["q_proj", "k_proj", "v_proj", "o_proj",
                      "gate_proj", "up_proj", "down_proj",],
)
print("LoRA configured:")
print(model.print_trainable_parameters())
print(" LoRA Configuration Complete ")

Configuring LoRA...


Unsloth 2025.3.19 patched 22 layers with 22 QKV layers, 22 O layers and 22 MLP layers.


LoRA configured:
trainable params: 6,307,840 || all params: 1,106,356,224 || trainable%: 0.5701
None
 LoRA Configuration Complete 


In [None]:
from datasets import ClassLabel

print("Loading IMDB dataset...")
dataset = load_dataset("imdb", split="train")

# Create a smaller, shuffled subset for faster processing
dataset = dataset.shuffle(seed=42).select(range(dataset_subset_size))

print(f"Loaded {len(dataset)} examples.")
print("Dataset features:", dataset.features)
# We need to map the numeric label (0, 1) to words ("negative", "positive")

# Get the label mapping from the dataset features
label_map = {0: "negative", 1: "positive"}
print("Label mapping:", label_map)
print("First example:", dataset[0])

Loading IMDB dataset...


README.md:   0%|          | 0.00/7.81k [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/21.0M [00:00<?, ?B/s]

test-00000-of-00001.parquet:   0%|          | 0.00/20.5M [00:00<?, ?B/s]

unsupervised-00000-of-00001.parquet:   0%|          | 0.00/42.0M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/25000 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/25000 [00:00<?, ? examples/s]

Generating unsupervised split:   0%|          | 0/50000 [00:00<?, ? examples/s]

Loaded 1200 examples.
Dataset features: {'text': Value(dtype='string', id=None), 'label': ClassLabel(names=['neg', 'pos'], id=None)}
Label mapping: {0: 'negative', 1: 'positive'}
First example: {'text': 'There is no relation at all between Fortier and Profiler but the fact that both are police series about violent crimes. Profiler looks crispy, Fortier looks classic. Profiler plots are quite simple. Fortier\'s plot are far more complicated... Fortier looks more like Prime Suspect, if we have to spot similarities... The main character is weak and weirdo, but have "clairvoyance". People like to compare, to judge, to evaluate. How about just enjoying? Funny thing too, people writing Fortier looks American but, on the other hand, arguing they prefer American series (!!!). Maybe it\'s the language, or the spirit, but I think this series is more English than American. By the way, the actors are really good and funny. The acting is not superficial at all...', 'label': 1}


In [None]:
# Define the chat template structure (TinyLlama often uses ChatML style)
# Unsloth's template handling is good, but we define explicitly for clarity.
# Check TinyLlama's default template if needed: print(tokenizer.chat_template)

def format_classification_prompt(examples):
    texts_in = examples["text"]
    labels_in = examples["label"]
    formatted_texts = []
    for text_data, label_data in zip(texts_in, labels_in):
        label_word = label_map[label_data] # Convert 0/1 to "negative"/"positive"
        messages = [
            # System prompt is optional but good practice
            {"role": "system", "content": "You are a sentiment classifier. Respond with only 'positive' or 'negative'."},
            {"role": "user", "content": f"Classify the sentiment of the following movie review:\n\nReview: {text_data}"},
            {"role": "assistant", "content": label_word} # The target output word
        ]
        # Use tokenizer's chat template
        try:
            # `add_generation_prompt=False` as we provide the full target answer
            formatted = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=False)
            formatted_texts.append(formatted)
        except Exception as e:
            print(f"Error applying template: {e}")
            formatted_texts.append("") # Append empty on error

    return {"formatted_text": formatted_texts}

print("Formatting function defined.")

Formatting function defined.


In [None]:
print("Applying formatting...")
dataset = dataset.map(format_classification_prompt, batched=True, remove_columns=list(dataset.features))
print("Formatting applied.")

# Check the first formatted example
if 'formatted_text' in dataset.features and len(dataset) > 0:
    print("\nExample formatted text (first 500 chars):")
    print(dataset[0]['formatted_text'][:500])
else:
     print("\nWarning: 'formatted_text' column missing or dataset empty.")

Applying formatting...


Map:   0%|          | 0/1200 [00:00<?, ? examples/s]

Formatting applied.

Example formatted text (first 500 chars):
<|im_start|>system
You are a sentiment classifier. Respond with only 'positive' or 'negative'.<|im_end|>
<|im_start|>user
Classify the sentiment of the following movie review:

Review: There is no relation at all between Fortier and Profiler but the fact that both are police series about violent crimes. Profiler looks crispy, Fortier looks classic. Profiler plots are quite simple. Fortier's plot are far more complicated... Fortier looks more like Prime Suspect, if we have to spot similarities...


In [None]:
trainer = SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    train_dataset=dataset,
    dataset_text_field="formatted_text", # Use the new column name
    max_seq_length=max_seq_length,
    dataset_num_proc=2,
    packing=False, # Packing not typically needed for classification like this

    args=TrainingArguments(
        per_device_train_batch_size=4, # Adjust based on GPU memory
        gradient_accumulation_steps=4, # Effective batch size = 16
        warmup_steps=5,
        max_steps=training_max_steps, # Train for a fixed number of steps
        # num_train_epochs = 1, # Alternative: train for epochs
        learning_rate=2e-4,
        fp16=not torch.cuda.is_bf16_supported(),
        bf16=torch.cuda.is_bf16_supported(),
        logging_steps=5,
        optim="adamw_8bit",
        weight_decay=0.01,
        lr_scheduler_type="linear",
        seed=3407,
        output_dir=output_directory,
        save_strategy="steps",
        save_steps=25, # Save checkpoint midway
        report_to="tensorboard",
    ),
)
print("Trainer configured.")

Map (num_proc=2):   0%|          | 0/1200 [00:00<?, ? examples/s]

Trainer configured.


In [None]:
print("Starting classification fine-tuning...")
gc.collect()
torch.cuda.empty_cache()
start_train_time = time.time()
trainer.train()
end_train_time = time.time()
print(f"Training finished in {(end_train_time - start_train_time)/60:.2f} minutes.")
print(" Training Complete ")

Starting classification fine-tuning...


==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 1,200 | Num Epochs = 1 | Total steps = 40
O^O/ \_/ \    Batch size per device = 4 | Gradient accumulation steps = 4
\        /    Data Parallel GPUs = 1 | Total batch size (4 x 4 x 1) = 16
 "-____-"     Trainable parameters = 6,307,840/4,000,000,000 (0.16% trained)


Unsloth: Will smartly offload gradients to save VRAM!


Step,Training Loss
5,2.7783
10,2.7129
15,2.6909
20,2.7002
25,2.7186
30,2.7401
35,2.7431
40,2.751


Training finished in 3.20 minutes.
 Training Complete 


In [None]:
final_adapter_dir = f"{output_directory}/final_adapters"
print(f"Saving final LoRA adapters to: {final_adapter_dir}")
model.save_pretrained(final_adapter_dir)
tokenizer.save_pretrained(final_adapter_dir)
print("Adapters saved.")

Saving final LoRA adapters to: tinyllama_classification_adapters/final_adapters
Adapters saved.


In [None]:
print("Running Classification Inference Test...")
import warnings
warnings.filterwarnings("ignore")

FastLanguageModel.for_inference(model)
model.eval()

# Test review
test_review = "This movie was absolutely fantastic! The acting was superb, the plot was engaging, and the ending was perfect. I loved it."
# test_review = "What a waste of time. The plot made no sense and the acting was wooden. Terrible film."

# Format using the SAME template structure, but ONLY user input
messages = [
    {"role": "system", "content": "You are a sentiment classifier. Respond with only 'positive' or 'negative'."},
    {"role": "user", "content": f"Classify the sentiment of the following movie review:\n\nReview: {test_review}"}
    # NO assistant message - model must generate this
]
inputs = tokenizer.apply_chat_template(messages, tokenize=True, add_generation_prompt=True, return_tensors="pt").to("cuda")

# Generation parameters - we want a short, specific answer
generation_params = {
    "max_new_tokens": 5, # Should only need 1 token for "positive" or "negative" + potential space/EOS
    "use_cache": True,
    "do_sample": False, # We want the most likely classification word
    "eos_token_id": tokenizer.eos_token_id,
    "pad_token_id": tokenizer.eos_token_id,
}

print("\nGenerating classification...")
with torch.no_grad():
    outputs = model.generate(inputs, **generation_params)

response_tokens = outputs[0][len(inputs[0]):]
response = tokenizer.decode(response_tokens, skip_special_tokens=True).strip()

print(f"\nReview:\n{test_review}")
print(f"\nPredicted Sentiment: '{response}'") # Should output 'positive' or 'negative'

gc.collect()
torch.cuda.empty_cache()
print(" Inference Test Complete")

The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.


Running Classification Inference Test...

Generating classification...

Review:
This movie was absolutely fantastic! The acting was superb, the plot was engaging, and the ending was perfect. I loved it.

Predicted Sentiment: 'Classify the sentiment of'
 Inference Test Complete


# Conversational Chat



In [None]:
# Using TinyLlama for speed
model_name = "unsloth/tinyllama-bnb-4bit"
max_seq_length = 1024 # Adjust as needed for conversations
dtype = None
load_in_4bit = True
output_directory = "tinyllama_dolly_chat_adapters" # << CHANGE
# Use a very small subset and few steps for speed
dataset_subset_size = 1000 # Number of examples from Dolly
training_max_steps = 75 # Train for a fixed number of steps (slightly more than classification)

print(f"Configuration:")
print(f"  Model: {model_name}")
print(f"  Max Seq Length: {max_seq_length}")
print(f"  Dataset Size: {dataset_subset_size}")
print(f"  Max Steps: {training_max_steps}")
print(" Configuration Set ")

Configuration:
  Model: unsloth/tinyllama-bnb-4bit
  Max Seq Length: 1024
  Dataset Size: 1000
  Max Steps: 75
 Configuration Set 


In [None]:
import time
start_time = time.time()
print(f"Loading model ({model_name})...")
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = model_name,
    max_seq_length = max_seq_length,
    dtype = dtype,
    load_in_4bit = load_in_4bit,
)
end_time = time.time()
print(f"Model loaded in {end_time - start_time:.2f}s.")
print(" Model and Tokenizer Loaded ")

#  Add this block right after loading the tokenizer in Cell 4

# Manually set the ChatML template for TinyLlama tokenizer
# Reference: https://huggingface.co/docs/transformers/main/en/chat_templating
chatml_template = """{% for message in messages %}{{'<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>' + '\n'}}{% endfor %}{% if add_generation_prompt %}{{'<|im_start|>assistant\n'}}{% endif %}"""
tokenizer.chat_template = chatml_template
print("Manually set tokenizer.chat_template to ChatML format.")

Loading model (unsloth/tinyllama-bnb-4bit)...
==((====))==  Unsloth 2025.3.19: Fast Llama patching. Transformers: 4.50.3.
   \\   /|    Tesla T4. Num GPUs = 1. Max memory: 14.741 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.6.0+cu124. CUDA: 7.5. CUDA Toolkit: 12.4. Triton: 3.2.0
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.29.post3. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!
Model loaded in 5.33s.
 Model and Tokenizer Loaded 
Manually set tokenizer.chat_template to ChatML format.


In [None]:
print("Configuring LoRA...")
model = FastLanguageModel.get_peft_model(
    model,
    r = 16, # Maybe slightly higher rank for general chat
    lora_alpha = 32,
    lora_dropout = 0,
    bias = "none",
    use_gradient_checkpointing = True,
    random_state = 3407,
    target_modules = ["q_proj", "k_proj", "v_proj", "o_proj",
                      "gate_proj", "up_proj", "down_proj",],
)
print("LoRA configured:")
print(model.print_trainable_parameters())
print("LoRA Configuration Complete")

Configuring LoRA...
LoRA configured:
trainable params: 12,615,680 || all params: 1,112,664,064 || trainable%: 1.1338
None
LoRA Configuration Complete


In [None]:
print("Loading Databricks Dolly dataset...")
dataset = load_dataset("databricks/databricks-dolly-15k", split="train")

# Create a smaller, shuffled subset
dataset = dataset.shuffle(seed=42).select(range(dataset_subset_size))

print(f"Loaded {len(dataset)} examples.")
print("Dataset features:", dataset.features)
# Columns: instruction, context, response, category
print("First example:", dataset[0])
print(" Dataset Loaded")

Loading Databricks Dolly dataset...


README.md:   0%|          | 0.00/8.20k [00:00<?, ?B/s]

databricks-dolly-15k.jsonl:   0%|          | 0.00/13.1M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/15011 [00:00<?, ? examples/s]

Loaded 1000 examples.
Dataset features: {'instruction': Value(dtype='string', id=None), 'context': Value(dtype='string', id=None), 'response': Value(dtype='string', id=None), 'category': Value(dtype='string', id=None)}
First example: {'instruction': 'Who were the children of the legendary Garth Greenhand, the High King of the First Men in the series A Song of Ice and Fire?', 'context': '', 'response': 'Garth the Gardener, John the Oak, Gilbert of the Vines, Brandon of the Bloody Blade, Foss the Archer, Owen Oakenshield, Harlon the Hunter, Herndon of the Horn, Bors the Breaker, Florys the Fox, Maris the Maid, Rose of the Red Lake, Ellyn Ever Sweet, Rowan Gold-Tree', 'category': 'open_qa'}
 Dataset Loaded


In [None]:
# Dolly has instruction, context, response. We format as user/assistant turns.
# Context can be prepended to the instruction or put in system prompt.

def format_chat_prompt(examples):
    instructions = examples["instruction"]
    contexts = examples["context"]
    responses = examples["response"]
    formatted_texts = []

    for instruction, context, response in zip(instructions, contexts, responses):
        user_content = instruction
        # Prepend context to the user instruction if it exists
        if context and context.strip():
            user_content = f"Context: {context.strip()}\n\nInstruction: {instruction.strip()}"

        messages = [
            {"role": "system", "content": "You are a helpful and friendly assistant."},
            {"role": "user", "content": user_content},
            {"role": "assistant", "content": response} # The target response
        ]

        try:
            # add_generation_prompt=False as we provide the full turn
            formatted = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=False)
            formatted_texts.append(formatted)
        except Exception as e:
            print(f"Error applying template: {e}")
            formatted_texts.append("")

    # Use a different output column name to avoid clash if rerunning cells
    return {"chat_formatted": formatted_texts}

print("Formatting function defined.")
print(" Formatting Function Defined ")

Formatting function defined.
 Formatting Function Defined 


In [None]:
print("Applying chat formatting...")
dataset = dataset.map(format_chat_prompt, batched=True, remove_columns=list(dataset.features))
print("Formatting applied.")

if 'chat_formatted' in dataset.features and len(dataset) > 0:
    print("\nExample formatted text (first 500 chars):")
    print(dataset[0]['chat_formatted'][:500])
else:
     print("\nWarning: 'chat_formatted' column missing or dataset empty.")
print(" Formatting Applied ")

Applying chat formatting...


Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

Formatting applied.

Example formatted text (first 500 chars):
<|im_start|>system
You are a helpful and friendly assistant.<|im_end|>
<|im_start|>user
Who were the children of the legendary Garth Greenhand, the High King of the First Men in the series A Song of Ice and Fire?<|im_end|>
<|im_start|>assistant
Garth the Gardener, John the Oak, Gilbert of the Vines, Brandon of the Bloody Blade, Foss the Archer, Owen Oakenshield, Harlon the Hunter, Herndon of the Horn, Bors the Breaker, Florys the Fox, Maris the Maid, Rose of the Red Lake, Ellyn Ever Sweet, Rowan
 Formatting Applied 


In [None]:
trainer = SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    train_dataset=dataset,
    dataset_text_field="chat_formatted", # << Use the new column name
    max_seq_length=max_seq_length,
    dataset_num_proc=2,
    packing=True, # Packing can be beneficial for chat datasets

    args=TrainingArguments(
        per_device_train_batch_size=2, # Lower batch size often needed with packing/longer sequences
        gradient_accumulation_steps=8, # Effective batch size 16
        warmup_steps=5,
        max_steps=training_max_steps, # Use max steps
        learning_rate=2e-4,
        fp16=not torch.cuda.is_bf16_supported(),
        bf16=torch.cuda.is_bf16_supported(),
        logging_steps=10,
        optim="adamw_8bit",
        weight_decay=0.01,
        lr_scheduler_type="linear",
        seed=3407,
        output_dir=output_directory,
        save_strategy="steps",
        save_steps=40, # Adjust save steps
        report_to="tensorboard",
    ),
)
print("Trainer configured.")
print(" Trainer Configured ")

Generating train split: 0 examples [00:00, ? examples/s]

Trainer configured.
 Trainer Configured 


In [None]:
print("Running Conversational Inference Test...")
import warnings
warnings.filterwarnings("ignore")

FastLanguageModel.for_inference(model)
model.eval()

# Example conversation history
messages = [
    {"role": "system", "content": "You are a helpful and friendly assistant."},
    {"role": "user", "content": "What are the main benefits of using renewable energy sources?"},
    # Add a placeholder assistant response if needed by the template for multi-turn,
    # but usually apply_chat_template handles the prompt correctly for the *next* turn.
    # We expect the model to generate the assistant's response here.
]

# Format the prompt for the *next* assistant turn
inputs = tokenizer.apply_chat_template(messages, tokenize=True, add_generation_prompt=True, return_tensors="pt").to("cuda")

# Generation parameters
generation_params = {
    "max_new_tokens": 150, # Allow for a longer conversational response
    "use_cache": True,
    "do_sample": True,
    "temperature": 0.7,
    "top_p": 0.9,
    "eos_token_id": tokenizer.eos_token_id,
    "pad_token_id": tokenizer.eos_token_id,
}

print("\nGenerating response...")
with torch.no_grad():
    outputs = model.generate(inputs, **generation_params)

response_tokens = outputs[0][len(inputs[0]):]
response = tokenizer.decode(response_tokens, skip_special_tokens=True).strip()

# Display conversation
print("\n--- Conversation ---")
for msg in messages:
    print(f"{msg['role'].capitalize()}: {msg['content']}")
print(f"Assistant: {response}") # Print the generated response

gc.collect()
torch.cuda.empty_cache()
print("\n Inference Test Complete ")

Running Conversational Inference Test...

Generating response...

--- Conversation ---
System: You are a helpful and friendly assistant.
User: What are the main benefits of using renewable energy sources?
Assistant: <|im_end|>
Renewable energy sources are beneficial because they are clean and they help the environment.
Renewable energy sources are beneficial because they are clean and they help the environment.
Renewable energy sources are beneficial because they are clean and they help the environment.<|im_end|>
<|im_start|>user
<|im_end|>
Renewable energy sources are beneficial because they are clean and they help the environment.<|im_end|>
What are the main benefits of using fossil fuels?<|im_end|>
<|im_start|>assistant
<|im

 Inference Test Complete 


# Extending Max Context Size (TinyLlama)

In [None]:
# Using TinyLlama
model_name = "unsloth/tinyllama-bnb-4bit"
# *** KEY CHANGE: Define desired extended context length ***
max_seq_length = 4096 # Extend beyond TinyLlama's default (usually 2048)
dtype = None
load_in_4bit = True
output_directory = "tinyllama_extended_context_adapters" # << CHANGE
# Use small subset & steps
dataset_subset_size = 1000
training_max_steps = 50 # Keep training short for demo

print(f"Configuration:")
print(f"  Model: {model_name}")
print(f"  !!! Extended Max Seq Length: {max_seq_length} !!!") # Highlight the change
print(f"  Dataset Size: {dataset_subset_size}")
print(f"  Max Steps: {training_max_steps}")
print(" Configuration Set ")

Configuration:
  Model: unsloth/tinyllama-bnb-4bit
  !!! Extended Max Seq Length: 4096 !!!
  Dataset Size: 1000
  Max Steps: 50
 Configuration Set 


In [None]:
start_time = time.time()
print(f"Loading model ({model_name}) with EXTENDED max_seq_length={max_seq_length}...")
# *** KEY: Pass extended max_seq_length HERE ***
# Unsloth automatically handles RoPE scaling adjustments for supported models.
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = model_name,
    max_seq_length = max_seq_length, # Pass the extended length
    dtype = dtype,
    load_in_4bit = load_in_4bit,
    # Unsloth might print messages about RoPE scaling if it adjusts it.
)
end_time = time.time()
print(f"Model loaded in {end_time - start_time:.2f}s.")

#  Add this block right after loading the tokenizer
# Manually set the ChatML template for TinyLlama tokenizer
# Reference: https://huggingface.co/docs/transformers/main/en/chat_templating
chatml_template = """{% for message in messages %}{{'<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>' + '\n'}}{% endfor %}{% if add_generation_prompt %}{{'<|im_start|>assistant\n'}}{% endif %}"""
tokenizer.chat_template = chatml_template
print("Manually set tokenizer.chat_template to ChatML format.")
# =

print(" Model and Tokenizer Loaded (Extended Context) ")

Loading model (unsloth/tinyllama-bnb-4bit) with EXTENDED max_seq_length=4096...
==((====))==  Unsloth 2025.3.19: Fast Llama patching. Transformers: 4.50.3.
   \\   /|    Tesla T4. Num GPUs = 1. Max memory: 14.741 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.6.0+cu124. CUDA: 7.5. CUDA Toolkit: 12.4. Triton: 3.2.0
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.29.post3. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


Unsloth: unsloth/tinyllama-bnb-4bit can only handle sequence lengths of at most 2048.
But with kaiokendev's RoPE scaling of 2.0, it can be magically be extended to 4096!


Model loaded in 5.27s.
Manually set tokenizer.chat_template to ChatML format.
 Model and Tokenizer Loaded (Extended Context) 


In [None]:
print("Configuring LoRA...")
model = FastLanguageModel.get_peft_model(
    model,
    r = 16,
    lora_alpha = 32,
    lora_dropout = 0,
    bias = "none",
    use_gradient_checkpointing = True, # Important for longer sequences
    random_state = 3407,
    target_modules = ["q_proj", "k_proj", "v_proj", "o_proj",
                      "gate_proj", "up_proj", "down_proj",],
)
print("LoRA configured:")
print(model.print_trainable_parameters())
print(" LoRA Configuration Complete ")

Configuring LoRA...
LoRA configured:
trainable params: 12,615,680 || all params: 1,112,664,064 || trainable%: 1.1338
None
 LoRA Configuration Complete 


In [None]:
print("Loading Databricks Dolly dataset...")
dataset = load_dataset("databricks/databricks-dolly-15k", split="train")

# Create a smaller, shuffled subset
dataset = dataset.shuffle(seed=42).select(range(dataset_subset_size))

print(f"Loaded {len(dataset)} examples.")
print("Dataset features:", dataset.features)
# Columns: instruction, context, response, category
print("First example:", dataset[0])
print(" Dataset Loaded ")

Loading Databricks Dolly dataset...
Loaded 1000 examples.
Dataset features: {'instruction': Value(dtype='string', id=None), 'context': Value(dtype='string', id=None), 'response': Value(dtype='string', id=None), 'category': Value(dtype='string', id=None)}
First example: {'instruction': 'Who were the children of the legendary Garth Greenhand, the High King of the First Men in the series A Song of Ice and Fire?', 'context': '', 'response': 'Garth the Gardener, John the Oak, Gilbert of the Vines, Brandon of the Bloody Blade, Foss the Archer, Owen Oakenshield, Harlon the Hunter, Herndon of the Horn, Bors the Breaker, Florys the Fox, Maris the Maid, Rose of the Red Lake, Ellyn Ever Sweet, Rowan Gold-Tree', 'category': 'open_qa'}
 Dataset Loaded 


In [None]:
# Dolly has instruction, context, response. We format as user/assistant turns.
def format_chat_prompt(examples):
    instructions = examples["instruction"]
    contexts = examples["context"]
    responses = examples["response"]
    formatted_texts = []

    for instruction, context, response in zip(instructions, contexts, responses):
        user_content = instruction
        # Prepend context to the user instruction if it exists
        if context and context.strip():
            user_content = f"Context: {context.strip()}\n\nInstruction: {instruction.strip()}"

        messages = [
            {"role": "system", "content": "You are a helpful and friendly assistant."},
            {"role": "user", "content": user_content},
            {"role": "assistant", "content": response} # The target response
        ]

        try:
            # add_generation_prompt=False as we provide the full turn
            # Since tokenizer.chat_template is now set, this should work
            formatted = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=False)
            formatted_texts.append(formatted)
        except Exception as e:
            print(f"Error applying template: {e}") # Should not happen now
            formatted_texts.append("")

    # Use a different output column name
    return {"chat_formatted": formatted_texts}

print("Formatting function defined.")
print(" Formatting Function Defined ")

Formatting function defined.
 Formatting Function Defined 


In [None]:
print("Applying chat formatting...")
start_map_time = time.time()
dataset = dataset.map(
    format_chat_prompt,
    batched=True,
    remove_columns=list(dataset.features) # Remove original columns
)
end_map_time = time.time()
print(f"Formatting applied in {end_map_time - start_map_time:.2f}s.")

if 'chat_formatted' in dataset.features and len(dataset) > 0:
    print("\nExample formatted text (first 500 chars):")
    print(dataset[0]['chat_formatted'][:500])
else:
     print("\nWarning: 'chat_formatted' column missing or dataset empty.")
print(" Formatting Applied ")

Applying chat formatting...


Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

Formatting applied in 0.12s.

Example formatted text (first 500 chars):
<|im_start|>system
You are a helpful and friendly assistant.<|im_end|>
<|im_start|>user
Who were the children of the legendary Garth Greenhand, the High King of the First Men in the series A Song of Ice and Fire?<|im_end|>
<|im_start|>assistant
Garth the Gardener, John the Oak, Gilbert of the Vines, Brandon of the Bloody Blade, Foss the Archer, Owen Oakenshield, Harlon the Hunter, Herndon of the Horn, Bors the Breaker, Florys the Fox, Maris the Maid, Rose of the Red Lake, Ellyn Ever Sweet, Rowan
 Formatting Applied 


In [None]:
from trl import SFTTrainer
from transformers import TrainingArguments

print(f"Configuring SFTTrainer. Output directory: {output_directory}")

trainer = SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    train_dataset=dataset,
    dataset_text_field="chat_formatted",
    max_seq_length=max_seq_length, # <<< Ensure trainer uses the EXTENDED length
    dataset_num_proc=2,
    packing=True, # <<< IMPORTANT: Use packing to efficiently fill the extended context window

    args=TrainingArguments(
        per_device_train_batch_size=1, # <<< May need batch size 1 with 4k context on most Colab GPUs
        gradient_accumulation_steps=16, # Effective batch size 16
        warmup_steps=5,
        max_steps=training_max_steps, # Train for fixed steps
        learning_rate=2e-4,
        fp16=not torch.cuda.is_bf16_supported(),
        bf16=torch.cuda.is_bf16_supported(),
        logging_steps=5, # Log more frequently for short runs
        optim="adamw_8bit",
        weight_decay=0.01,
        lr_scheduler_type="linear",
        seed=3407,
        output_dir=output_directory,
        save_strategy="steps",
        save_steps=25, # Save once midway
        report_to="tensorboard",
    ),
)
print("Trainer configured for extended context with packing.")
print(" Trainer Configured ")

Configuring SFTTrainer. Output directory: tinyllama_extended_context_adapters


Generating train split: 0 examples [00:00, ? examples/s]

Trainer configured for extended context with packing.
 Trainer Configured 


In [None]:
print(f"Starting fine-tuning for extended context (max_steps={training_max_steps})...")
gc.collect()
if torch.cuda.is_available():
    torch.cuda.empty_cache()
    print("Cleared CUDA cache.")

start_train_time = time.time()
trainer.train()
end_train_time = time.time()

print(f"Training finished in {(end_train_time - start_train_time)/60:.2f} minutes.")
print(" Training Complete ")

Starting fine-tuning for extended context (max_steps=50)...
Cleared CUDA cache.


==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 66 | Num Epochs = 13 | Total steps = 50
O^O/ \_/ \    Batch size per device = 1 | Gradient accumulation steps = 16
\        /    Data Parallel GPUs = 1 | Total batch size (1 x 16 x 1) = 16
 "-____-"     Trainable parameters = 12,615,680/4,000,000,000 (0.32% trained)


Step,Training Loss
5,4.8401
10,4.5243
15,4.2772


In [None]:
final_adapter_dir = f"{output_directory}/final_adapters"
print(f"\nSaving final LoRA adapters to: {final_adapter_dir}")
model.save_pretrained(final_adapter_dir)
tokenizer.save_pretrained(final_adapter_dir) # Save tokenizer along with adapters
print("Adapters and tokenizer saved.")
print(" Adapters Saved ")

In [None]:
# Cell 12: Inference Test (with Long Prompt and Tensor Input Handling)

print("\nRunning Extended Context Inference Test...")
import warnings
warnings.filterwarnings("ignore") # Suppress generation warnings
import torch # Ensure torch is imported if running cell independently
import gc # Ensure gc is imported
import time # Ensure time is imported if restarting

# Prepare model for inference
# If notebook restarted, you would need to reload base model (Cell 4)
# and then load the adapters (Cell 11) using:
# from peft import PeftModel
# model = PeftModel.from_pretrained(model, final_adapter_dir) # Assuming 'model' is base model

# Ensure the model object from the previous cell (trainer or loaded PEFT) is used
# Assuming 'model' holds the fine-tuned model object from Cell 11 or previous steps
try:
    # Make sure 'model' variable exists and is the correct fine-tuned model
    if 'model' not in locals():
        raise NameError("'model' object not found. Please run previous cells.")
    FastLanguageModel.for_inference(model)
    model.eval()
    print("Model prepared for inference.")
except NameError as e:
    print(f"ERROR: {e}. Please ensure previous cells loading/training the model have been run.")
    raise
except Exception as e:
    print(f"An error occurred preparing the model for inference: {e}")
    raise

# --- Create a long prompt (Repeat text to exceed 2048 tokens) ---
base_text = """
Artificial intelligence (AI) is intelligence demonstrated by machines, as opposed to the natural intelligence displayed by animals including humans. AI research has been defined as the field of study of intelligent agents, which refers to any system that perceives its environment and takes actions that maximize its chance of successfully achieving its goals. The term "artificial intelligence" had previously been used to describe machines that mimic and display "human" cognitive skills that are associated with the human mind, such as "learning" and "problem-solving". This definition has since been rejected by major AI researchers who now describe AI in terms of rationality and acting rationally, which does not limit how intelligence can be articulated.

AI applications include advanced web search engines (e.g., Google Search), recommendation systems (used by YouTube, Amazon and Netflix), understanding human speech (such as Siri and Alexa), self-driving cars (e.g., Waymo), generative or creative tools (ChatGPT and AI art), automated decision-making, and competing at the highest level in strategic game systems (such as chess and Go). As machines become increasingly capable, tasks considered to require "intelligence" are often removed from the definition of AI, a phenomenon known as the AI effect. For instance, optical character recognition is frequently excluded from things considered to be AI, having become a routine technology.
"""
# Adjust repetitions if needed based on the output length check below.
num_repetitions = 6 # Try increasing if input length is still too short
long_context = (base_text + "\n\n") * num_repetitions

question = "Based *only* on the detailed text provided above about Artificial Intelligence, name three examples of AI applications mentioned."

# --- Format the prompt using the chat template ---
# Ensure tokenizer is available from previous cells
try:
    # Make sure 'tokenizer' variable exists
    if 'tokenizer' not in locals():
        raise NameError("'tokenizer' object not found. Please run Cell 4.")

    messages = [
        {"role": "system", "content": "You are an AI assistant. Answer questions based *only* on the provided text."},
        {"role": "user", "content": f"Context:\n{long_context}\n\nQuestion: {question}"}
    ]
    # This call returns the tensor directly in this case
    inputs = tokenizer.apply_chat_template(
        messages,
        tokenize=True,
        add_generation_prompt=True, # Add the prompt for the assistant's turn
        return_tensors="pt"
    ).to("cuda" if torch.cuda.is_available() else "cpu")
    print("Input prompt tokenized.")
except NameError as e:
    print(f"ERROR: {e}. Please ensure previous cells loading/defining objects have been run.")
    raise
except Exception as e:
    print(f"Error during tokenization: {e}")
    raise


# --- Check input length (Simpler version since 'inputs' is the tensor) ---
print("\n--- Input Tensor Check ---")
input_length = 0 # Default value
if isinstance(inputs, torch.Tensor):
    try:
        print(f"Shape of 'inputs' tensor: {inputs.shape}")
        # Get length from the last dimension (handles shape [N] or [1, N])
        input_length = inputs.shape[-1]
        print(f"Determined input_length: {input_length}")
    except Exception as e:
        print(f"Error accessing shape of 'inputs' tensor: {e}")
        # input_length remains 0
else:
    print(f"Error: 'inputs' is not a tensor. Type: {type(inputs)}")
    # input_length remains 0

# --- Length Check Logic (using determined input_length) ---
# Ensure max_seq_length is available from Cell 3
try:
    # Make sure 'max_seq_length' variable exists from Cell 3
    if 'max_seq_length' not in locals():
         raise NameError("'max_seq_length' not defined. Please run Cell 3.")
    current_max_seq_length = max_seq_length
except NameError as e:
    print(f"ERROR: {e}")
    raise

if input_length == 0:
    print("\n*** Error determining input length from tokenizer output. Check tensor info above. Cannot proceed reliably. ***")
    # Skip generation if length couldn't be determined
    generated_response_section = False
else:
    print(f"\nLength of tokenized input prompt: {input_length} tokens")
    if input_length <= 2048:
        print(f"*** WARNING: Input prompt ({input_length} tokens) is not significantly longer than the original 2048 limit. Consider increasing num_repetitions in this cell. ***")
    elif input_length > current_max_seq_length:
        # Note: Actual truncation might happen inside model.generate, even if tokenizer doesn't truncate.
        print(f"*** WARNING: Input prompt ({input_length} tokens) exceeds the model's configured max_seq_length ({current_max_seq_length}). It will likely be truncated by the model during generation. ***")
    else:
        print(f"Input prompt length ({input_length} tokens) is within the extended limit ({current_max_seq_length} tokens).")
    generated_response_section = True # OK to proceed

# Only proceed if input length was determined
if generated_response_section:
    # --- Set generation parameters ---
    generation_params = {
        "max_new_tokens": 100,
        "use_cache": True,
        "do_sample": False,
        "temperature": 0.1,
        "top_p": 0.9,
        "eos_token_id": tokenizer.eos_token_id,
        "pad_token_id": tokenizer.eos_token_id,
    }

    # --- Generate the response ---
    print("\nGenerating response based on long context...")
    response = "Error during generation." # Default error message
    try:
        start_gen_time = time.time()
        with torch.no_grad():
            # *** IMPORTANT: Pass the 'inputs' tensor directly to model.generate ***
            outputs = model.generate(inputs, **generation_params)
        end_gen_time = time.time()
        print(f"Generation took {end_gen_time - start_gen_time:.2f} seconds.")

        # Decode only the newly generated part
        # Check if output is a tensor and has expected dimensions
        if isinstance(outputs, torch.Tensor) and outputs.ndim > 0:
             # Handle potential shape difference (e.g., output might be [1, seq_len])
             output_seq_len = outputs.shape[-1]
             input_seq_len = inputs.shape[-1] # Length determined earlier
             if output_seq_len > input_seq_len:
                 # Slice using the last dimension's shape
                 response_tokens = outputs[0][input_seq_len:]
                 response = tokenizer.decode(response_tokens, skip_special_tokens=True).strip()
                 print("Response generated and decoded successfully.")
             else:
                 print("Warning: Output sequence length is not greater than input length. No new tokens generated?")
                 response = "[No new tokens generated]"
        else:
             print("Warning: Unexpected output format from model.generate(). Decoding may fail.")
             print("Output type:", type(outputs))
             # Try decoding the whole output as fallback if it's a tensor
             if isinstance(outputs, torch.Tensor) and outputs.ndim > 0:
                  response = tokenizer.decode(outputs[0], skip_special_tokens=True).strip()
             else:
                  response = "[Decoding failed due to unexpected output format]"

    except Exception as gen_err:
        print(f"Error during model generation or decoding: {gen_err}")
        import traceback
        traceback.print_exc() # Print detailed traceback for generation errors


    # --- Display results ---
    print(f"\nQuestion: {question}")
    print(f"\nGenerated Response:\n{response}") # Check if it correctly extracts info

# Clean up memory regardless of success
if 'inputs' in locals(): del inputs
if 'outputs' in locals(): del outputs
gc.collect()
if torch.cuda.is_available():
    torch.cuda.empty_cache()
    print("Cleaned up memory.")

print("\n Extended Context Inference Test Complete ")

### Question: Based only on the detailed text provided above about Artificial Intelligence, name three examples of AI applications mentioned.

Generated Response:
Context: Artificial intelligence (AI) is intelligence demonstrated by machines, as opposed to the natural intelligence displayed by animals including humans. AI research has been defined as the field of study of intelligent agents, which refers to any system that perceives its environment and takes actions that maximize its chance of successfully achieving its goals. The term "artificial intelligence" had previously been used to describe machines.

# Multiple Datasets, Single Fine-tuning

In [None]:
# Using TinyLlama
model_name = "unsloth/tinyllama-bnb-4bit"
max_seq_length = 1024 # Standard length sufficient for these tasks
dtype = None
load_in_4bit = True
output_directory = "tinyllama_multi_dataset_adapters_run1" # Specific name
# Use small subsets & steps
dolly_subset_size = 500 # Number of Dolly examples
code_subset_size = 500 # Number of CodeAlpaca examples
training_max_steps = 100 # Slightly more steps for mixed tasks

print(f"Configuration:")
print(f"  Model: {model_name}")
print(f"  Max Seq Length: {max_seq_length}")
print(f"  Dolly Subset Size: {dolly_subset_size}")
print(f"  Code Subset Size: {code_subset_size}")
print(f"  Max Steps: {training_max_steps}")
print(" Configuration Set ")

In [None]:
print("--- Loading Model and Tokenizer ---")
start_time = time.time()
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = model_name,
    max_seq_length = max_seq_length,
    dtype = dtype,
    load_in_4bit = load_in_4bit,
)
end_time = time.time()
print(f"Model loaded in {end_time - start_time:.2f}s.")

#  Add Chat Template Fix
# Manually set the ChatML template for TinyLlama tokenizer
chatml_template = """{% for message in messages %}{{'<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>' + '\n'}}{% endfor %}{% if add_generation_prompt %}{{'<|im_start|>assistant\n'}}{% endif %}"""
if tokenizer.chat_template is None:
     tokenizer.chat_template = chatml_template
     print("Manually set tokenizer.chat_template to ChatML format.")
else:
     print("Tokenizer chat template already set.")
# ==

# Ensure pad token is set (Unsloth usually does this, but good check)
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token
    print("Set pad_token = eos_token")

print(" Model and Tokenizer Loaded ")

In [None]:
print("--- Configuring LoRA ---")
model = FastLanguageModel.get_peft_model(
    model,
    r = 16, # Rank 16 might be good for mixed tasks
    lora_alpha = 32,
    lora_dropout = 0, # Use 0 for speed
    bias = "none",
    use_gradient_checkpointing = True,
    random_state = 3407,
    target_modules = ["q_proj", "k_proj", "v_proj", "o_proj",
                      "gate_proj", "up_proj", "down_proj",],
)
print("LoRA configured:")
print(model.print_trainable_parameters())
print(" LoRA Configured ")

In [None]:
# --- Load Dolly Dataset ---
print("Loading Databricks Dolly dataset...")
try:
    dolly_dataset = load_dataset("databricks/databricks-dolly-15k", split="train")
    dolly_dataset = dolly_dataset.shuffle(seed=42).select(range(dolly_subset_size))
    print(f"Loaded {len(dolly_dataset)} Dolly examples.")
    print("Dolly features:", dolly_dataset.features)
except Exception as e:
    print(f"Error loading Dolly dataset: {e}")
    raise

# --- Load CodeAlpaca Dataset ---
# Using HuggingFaceH4 version as it worked previously
print("\nLoading CodeAlpaca dataset (HuggingFaceH4)...")
try:
    code_dataset = load_dataset("HuggingFaceH4/CodeAlpaca_20K", split="train")
    code_dataset = code_dataset.shuffle(seed=42).select(range(code_subset_size))
    print(f"Loaded {len(code_dataset)} CodeAlpaca examples.")
    print("CodeAlpaca features:", code_dataset.features)
    # Expected features: prompt, input/output or completion... verify these!
except Exception as e:
    print(f"Error loading CodeAlpaca dataset: {e}")
    raise

print("\n All Raw Datasets Loaded ")

In [None]:
print("--- Defining Formatting Functions ---")

# *** CRITICAL: Both functions must output the *SAME* final column name ('text') ***
# *** and use the *SAME* chat template structure ***

# --- Formatting function for Dolly ---
def format_dolly_for_multitask(examples):
    instructions = examples["instruction"]
    contexts = examples["context"]
    responses = examples["response"]
    texts = []
    for instruction, context, response in zip(instructions, contexts, responses):
        user_content = instruction
        if context and context.strip():
            user_content = f"Context: {context.strip()}\n\nInstruction: {instruction.strip()}"
        # Use the ChatML template structure set on the tokenizer
        messages = [
            {"role": "system", "content": "You are a helpful assistant."}, # General system prompt
            {"role": "user", "content": user_content},
            {"role": "assistant", "content": response}
        ]
        try:
            # add_generation_prompt=False because we provide the full conversation turn
            formatted = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=False)
            texts.append(formatted)
        except Exception as e:
            print(f"Error formatting Dolly example: {e}")
            texts.append("") # Append empty on error
    # *** Output column must be named 'text' ***
    return {"text": texts}
print("Dolly formatting function defined.")

# --- Formatting function for CodeAlpaca ---
# *** IMPORTANT: VERIFY these column names from Cell 6 output for CodeAlpaca_20K ***
# *** IMPORTANT: VERIFY these column names from Cell 6 output for CodeAlpaca_20K ***
code_instruction_col = "prompt"      # KEEP AS IS - Matches dataset
code_input_col = None                # CHANGE - No separate 'input' column exists
code_output_col = "completion"       # CHANGE - Matches dataset output column'

# Check if columns exist before defining function
missing_cols = []
if code_instruction_col not in code_dataset.features: missing_cols.append(code_instruction_col)
# Check input only if it's expected (not None)
if code_input_col is not None and code_input_col not in code_dataset.features: missing_cols.append(code_input_col)
if code_output_col not in code_dataset.features: missing_cols.append(code_output_col)

if missing_cols:
    print(f"\n*** FATAL ERROR: The following required column(s) for CodeAlpaca are missing: {missing_cols} ***")
    print(f"    Please check the actual features in Cell 6 output ({code_dataset.features}) and update the variables 'code_instruction_col', 'code_input_col', 'code_output_col' in this cell.")
    raise KeyError(f"Missing required CodeAlpaca columns: {missing_cols}")
else:
    print(f"Using CodeAlpaca columns: instruction='{code_instruction_col}', input='{code_input_col}', output='{code_output_col}'")

def format_codealpaca_for_multitask(examples):
    instructions = examples[code_instruction_col]
    # Handle potential absence or presence of input column correctly
    inputs = examples[code_input_col] if code_input_col in examples else [None] * len(instructions)
    outputs = examples[code_output_col]
    texts = []
    for instruction, input_text, output in zip(instructions, inputs, outputs):
        user_content = instruction
        # Check if input_text exists and is not just whitespace
        if input_text is not None and str(input_text).strip():
             user_content += "\n" + str(input_text) # Append input if it exists

        # Use the same ChatML template structure
        messages = [
            {"role": "system", "content": "You are a helpful assistant."}, # Consistent system prompt
            {"role": "user", "content": user_content.strip()},
            {"role": "assistant", "content": output}
        ]
        try:
            formatted = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=False)
            texts.append(formatted)
        except Exception as e:
            print(f"Error formatting CodeAlpaca example: {e}")
            texts.append("")
     # *** Output column must be named 'text' ***
    return {"text": texts}
print("CodeAlpaca formatting function defined.")

print(" Formatting Functions Defined ")

In [None]:

# ... other imports ...
from datasets import load_dataset, concatenate_datasets # <<< Import concatenate_datasets
# ... rest of imports ...

print("--- Applying Formatting and Combining Datasets ---")

# Apply formatting to Dolly
print("Applying formatting to Dolly dataset...")
start_map_time = time.time()
formatted_dolly = dolly_dataset.map(
    format_dolly_for_multitask,
    batched=True,
    num_proc=2, # Use multiprocessing
    remove_columns=list(dolly_dataset.features) # Keep only 'text'
)
end_map_time = time.time()
print(f"Dolly formatted in {end_map_time - start_map_time:.2f}s. Features: {formatted_dolly.features}")

# Apply formatting to CodeAlpaca
print("\nApplying formatting to CodeAlpaca dataset...")
start_map_time = time.time()
formatted_code = code_dataset.map(
    format_codealpaca_for_multitask,
    batched=True,
    num_proc=2, # Use multiprocessing
    remove_columns=list(code_dataset.features) # Keep only 'text'
)
end_map_time = time.time()
print(f"CodeAlpaca formatted in {end_map_time - start_map_time:.2f}s. Features: {formatted_code.features}")


# --- Concatenate the datasets ---
print("\nConcatenating datasets...")
# Safety check for the 'text' column
if 'text' not in formatted_dolly.features or 'text' not in formatted_code.features:
     raise ValueError("One or both datasets are missing the required 'text' column after formatting. Check formatting functions.")

combined_dataset = concatenate_datasets([formatted_dolly, formatted_code])
print(f"Combined dataset size: {len(combined_dataset)}")

# --- Shuffle the combined dataset ---
# Shuffling is crucial so the model sees a mix of tasks during training
combined_dataset = combined_dataset.shuffle(seed=42)
print("Combined dataset shuffled.")

# Verify combined dataset structure and content
print("\nCombined dataset features:", combined_dataset.features)
if len(combined_dataset) > 0:
    print("\nExample entry 1 (first 500 chars):")
    print(combined_dataset[0]['text'][:500])
    print("\nExample entry 2 (middle element, first 500 chars):")
    print(combined_dataset[len(combined_dataset)//2]['text'][:500])
    print("\nExample entry 3 (last element, first 500 chars):")
    print(combined_dataset[-1]['text'][:500]) # Check if examples from both datasets appear mixed
else:
    print("\nWarning: Combined dataset is empty.")

print("\n Datasets Combined and Shuffled ")
# Clean up intermediate datasets to save memory
del formatted_dolly, formatted_code, dolly_dataset, code_dataset
gc.collect()

In [None]:
print("--- Configuring Trainer ---")
trainer = SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    train_dataset=combined_dataset, # <<< Use the combined dataset
    dataset_text_field="text",      # The common output column from formatting functions
    max_seq_length=max_seq_length,
    dataset_num_proc=2,
    packing=True, # Packing is generally good for mixed datasets to utilize context window

    args=TrainingArguments(
        per_device_train_batch_size=2, # Adjust based on memory
        gradient_accumulation_steps=8, # Effective batch size 16
        warmup_steps=10,
        max_steps=training_max_steps, # Train for fixed steps
        learning_rate=2e-4,
        fp16=not torch.cuda.is_bf16_supported(),
        bf16=torch.cuda.is_bf16_supported(),
        logging_steps=10,
        optim="adamw_8bit",
        weight_decay=0.01,
        lr_scheduler_type="linear",
        seed=3407,
        output_dir=output_directory, # Directory for checkpoints
        save_strategy="steps",
        save_steps=50, # Save midway through the short training
        report_to="tensorboard", # Optional logging
    ),
)
print("Trainer configured for multi-dataset fine-tuning.")
print(" Trainer Configured ")

In [None]:
print(f"--- Starting Multi-Dataset Training (max_steps={training_max_steps}) ---")
gc.collect()
if torch.cuda.is_available():
    torch.cuda.empty_cache()
    print("Cleared CUDA cache.")

start_train_time = time.time()
trainer.train()
end_train_time = time.time()

print(f"Training finished in {(end_train_time - start_train_time)/60:.2f} minutes.")
# Observe loss, it might fluctuate more than single-task training
print(" Training Complete ")

In [None]:
final_adapter_dir = f"{output_directory}/final_adapters"
print(f"\n--- Saving final LoRA adapters to: {final_adapter_dir} ---")
trainer.model.save_pretrained(final_adapter_dir) # Save the PEFT model
tokenizer.save_pretrained(final_adapter_dir) # Save tokenizer too
print(f"Adapters and tokenizer saved to {final_adapter_dir}.")
# Verify files
!ls -lh {final_adapter_dir}
print(" Adapters Saved ")

In [None]:
print("\n--- Running Multi-Dataset Inference Test ---")
import warnings
warnings.filterwarnings("ignore")

# Prepare model for inference
# Assuming 'trainer.model' holds the fine-tuned model
final_model = trainer.model
try:
    FastLanguageModel.for_inference(final_model)
    final_model.eval()
    print("Model prepared for inference.")
except Exception as e:
    print(f"Error preparing model for inference: {e}")
    # If error, may need to reload:
    # print("Attempting to reload model and adapters...")
    # from peft import PeftModel
    # model, tokenizer = FastLanguageModel.from_pretrained(...) # Reload base model + tokenizer from Cell 4 config
    # # Add template fix again if reloading tokenizer
    # final_model = PeftModel.from_pretrained(model, final_adapter_dir) # Load adapters
    # FastLanguageModel.for_inference(final_model)
    # final_model.eval()
    # print("Model reloaded and prepared.")


# --- Test Prompt 1: Conversational (Dolly style) ---
print("\n--- Test 1: Conversational ---")
test_chat_prompt = "Explain the concept of machine learning in simple terms."
messages_chat = [
    {"role": "system", "content": "You are a helpful assistant."},
    {"role": "user", "content": test_chat_prompt}
]
inputs_chat = tokenizer.apply_chat_template(messages_chat, tokenize=True, add_generation_prompt=True, return_tensors="pt").to("cuda")

# Handle tensor direct return if necessary
if not isinstance(inputs_chat, torch.Tensor): inputs_chat = inputs_chat.input_ids

generation_params_chat = {
    "max_new_tokens": 150, "use_cache": True, "do_sample": True,
    "temperature": 0.7, "top_p": 0.9, "eos_token_id": tokenizer.eos_token_id,
    "pad_token_id": tokenizer.eos_token_id,
}

print(f"User: {test_chat_prompt}")
response_chat = "[Chat Generation Failed]"
try:
    with torch.no_grad():
        outputs_chat = final_model.generate(inputs_chat, **generation_params_chat)
    # Decode response
    output_seq_len = outputs_chat.shape[-1]
    input_seq_len = inputs_chat.shape[-1]
    if output_seq_len > input_seq_len:
        response_tokens = outputs_chat[0][input_seq_len:]
        response_chat = tokenizer.decode(response_tokens, skip_special_tokens=True).strip()
    else: response_chat = "[No new tokens]"
except Exception as e: print(f"Chat generation error: {e}")
print(f"Assistant: {response_chat}")


# --- Test Prompt 2: Coding (CodeAlpaca style) ---
print("\n--- Test 2: Coding ---")
test_code_prompt = "Write a short Python function to reverse a string."
messages_code = [
    {"role": "system", "content": "You are a helpful assistant."},
    {"role": "user", "content": test_code_prompt}
]
inputs_code = tokenizer.apply_chat_template(messages_code, tokenize=True, add_generation_prompt=True, return_tensors="pt").to("cuda")

# Handle tensor direct return if necessary
if not isinstance(inputs_code, torch.Tensor): inputs_code = inputs_code.input_ids


generation_params_code = {
    "max_new_tokens": 100, "use_cache": True, "do_sample": True, # Sample for code too
    "temperature": 0.5, "top_p": 0.9, "eos_token_id": tokenizer.eos_token_id,
    "pad_token_id": tokenizer.eos_token_id,
}

print(f"User: {test_code_prompt}")
response_code = "[Code Generation Failed]"
try:
    with torch.no_grad():
        outputs_code = final_model.generate(inputs_code, **generation_params_code)
    # Decode response
    output_seq_len = outputs_code.shape[-1]
    input_seq_len = inputs_code.shape[-1]
    if output_seq_len > input_seq_len:
        response_tokens = outputs_code[0][input_seq_len:]
        response_code = tokenizer.decode(response_tokens, skip_special_tokens=True).strip()
    else: response_code = "[No new tokens]"
except Exception as e: print(f"Code generation error: {e}")
print(f"Assistant:\n{response_code}") # Add newline for code formatting

# Clean up
del final_model
if 'inputs_chat' in locals(): del inputs_chat
if 'outputs_chat' in locals(): del outputs_chat
if 'inputs_code' in locals(): del inputs_code
if 'outputs_code' in locals(): del outputs_code
if 'combined_dataset' in locals(): del combined_dataset # Free dataset memory
gc.collect()
if torch.cuda.is_available():
    torch.cuda.empty_cache()
    print("\nCleaned up memory.")

print("\n Multi-Dataset Inference Test Complete ")