<a href="https://colab.research.google.com/github/ratnesh90859/AI/blob/main/Custom_TinyLlama_Chatbot.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import subprocess
import sys
import os

def run_command(cmd):
    """Run ``cmd`` through the shell and report whether it succeeded.

    The command's stdout/stderr are captured and discarded.

    Args:
        cmd: Command line passed to ``subprocess.run`` with ``shell=True``.

    Returns:
        True when the process exits with status 0; False on a non-zero exit
        status or when launching the command raises an exception.
    """
    try:
        completed = subprocess.run(cmd, shell=True, capture_output=True, text=True)
    except Exception:
        # Best-effort probe: any failure to run the command counts as "failed".
        return False
    return completed.returncode == 0

# Probe for an NVIDIA GPU: run_command() returns False when nvidia-smi is
# missing or exits with a non-zero status.
gpu_available = run_command("nvidia-smi")
if not gpu_available:
    import time

    # Explain the pause instead of stalling silently, then give the user a
    # chance to interrupt before the (slow) CPU-only run continues.
    print(
        "WARNING: no GPU detected (nvidia-smi failed); continuing in 10 seconds...",
        file=sys.stderr,
    )
    time.sleep(10)

# Pinned versions of the libraries the notebook depends on.
packages = [
    "transformers==4.36.0",
    "datasets==2.14.0",
    "peft==0.7.0",
    "trl==0.7.0",
    "bitsandbytes==0.41.0",
    "accelerate==0.24.0",
    "torch==2.1.0",
    "sentencepiece==0.1.99"
]

for package in packages:
    # Invoke pip through the running interpreter so the packages are installed
    # into this kernel's environment rather than whichever "pip" is on PATH.
    if not run_command(f'"{sys.executable}" -m pip install {package}'):
        # sys.exit(str) prints the message to stderr and exits with status 1,
        # so the failing package is visible to the user.
        sys.exit(f"Failed to install {package}")

In [None]:
import torch
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    TrainingArguments,
    Trainer,
    BitsAndBytesConfig
)
from datasets import Dataset
from peft import LoraConfig, get_peft_model, TaskType
import json
import gc

# Prefer the GPU whenever CUDA is usable; otherwise fall back to the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"

In [None]:
# Base checkpoint to fine-tune and the directory that receives the results.
MODEL_NAME = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"
OUTPUT_DIR = "./finetuned-model"
# No access token is configured.
HF_TOKEN = None

# Hyperparameters read by the tokenization and training cells, and written
# next to the saved model as training_config.json.
config = dict(
    max_seq_length=512,
    batch_size=1,
    gradient_accumulation_steps=4,
    learning_rate=2e-4,
    num_epochs=3,
    warmup_steps=10,
    logging_steps=10,
    save_steps=100,
    eval_steps=100,
)

In [None]:
# (instruction, expected answer) pairs for the toy fine-tuning set.
_SAMPLE_PAIRS = [
    (
        "What is the capital of France?",
        "The capital of France is Paris.",
    ),
    (
        "Explain what machine learning is",
        "Machine learning is a subset of artificial intelligence that enables computers to learn and make decisions from data without being explicitly programmed.",
    ),
    (
        "Write a simple Python function to add two numbers",
        "```python\ndef add_numbers(a, b):\n    return a + b\n```",
    ),
    (
        "What are the benefits of exercise?",
        "Exercise has many benefits including improved cardiovascular health, stronger muscles and bones, better mental health, and increased energy levels.",
    ),
    (
        "How do you make a cup of tea?",
        "To make tea: 1) Boil water, 2) Add tea bag or leaves to cup, 3) Pour hot water over tea, 4) Steep for 3-5 minutes, 5) Remove tea bag/strain, 6) Add milk/sugar if desired.",
    ),
]

# Expand the pairs into instruction/input/output records; every sample leaves
# the optional "input" field empty.
sample_data = [
    {"instruction": instruction, "input": "", "output": answer}
    for instruction, answer in _SAMPLE_PAIRS
]



In [None]:
def format_prompt(example):
    """Render one instruction record as a single chat-formatted training string.

    Args:
        example: Mapping with "instruction" and "output" keys and an optional
            "input" key. A missing, ``None`` or empty "input" is treated as
            "no extra input".

    Returns:
        A dict ``{"text": prompt}`` where ``prompt`` holds the system, user and
        assistant turns, each terminated by ``</s>``. When extra input is
        present it is appended to the instruction on its own line.
    """
    user_message = example["instruction"]
    # .get() keeps records without an "input" field from raising KeyError.
    extra_input = example.get("input")
    if extra_input:
        user_message = f"{user_message}\n{extra_input}"

    # Single template shared by both cases so the two variants cannot drift.
    prompt = (
        "<|system|>\nYou are a helpful assistant.</s>\n"
        f"<|user|>\n{user_message}</s>\n"
        f"<|assistant|>\n{example['output']}</s>"
    )
    return {"text": prompt}


# Build a Dataset from the raw samples and replace every original column
# with the single formatted "text" column.
raw_dataset = Dataset.from_list(sample_data)
dataset = raw_dataset.map(format_prompt, remove_columns=raw_dataset.column_names)

# The first 80% of the rows (in their existing order) are used for training;
# the remaining rows are used for evaluation.
split_index = int(0.8 * len(dataset))
train_dataset = dataset.select(range(split_index))
eval_dataset = dataset.select(range(split_index, len(dataset)))



Map:   0%|          | 0/5 [00:00<?, ? examples/s]

In [None]:


# Choose loading options that match the available hardware.
if torch.cuda.is_available():
    # Quantization config for memory efficiency (GPU only): 4-bit NF4 weights
    # with double quantization and bfloat16 compute.
    bnb_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_use_double_quant=True,
        bnb_4bit_quant_type="nf4",
        bnb_4bit_compute_dtype=torch.bfloat16
    )
    quantization_config = bnb_config
    device_map = "auto"
    torch_dtype = torch.bfloat16
else:
    # CPU configuration - no quantization
    quantization_config = None
    device_map = None
    torch_dtype = torch.float32
    print("⚠️  Running on CPU - training will be very slow!")

# Load tokenizer; the EOS token doubles as the padding token and padding is
# applied on the right.
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, trust_remote_code=True)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"

# Load model with the settings selected above. Using the branch-specific
# variables (rather than hard-coded GPU values) keeps the CPU path from
# failing on an undefined bnb_config and from requesting bfloat16/"auto".
model = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME,
    quantization_config=quantization_config,
    device_map=device_map,
    trust_remote_code=True,
    torch_dtype=torch_dtype
)

# LoRA configuration: rank-8 adapters on the attention and MLP projections.
lora_config = LoraConfig(
    task_type=TaskType.CAUSAL_LM,
    inference_mode=False,
    r=8,
    lora_alpha=32,
    lora_dropout=0.1,
    target_modules=["q_proj", "v_proj", "k_proj", "o_proj", "gate_proj", "up_proj", "down_proj"]
)

# Apply LoRA and report how many parameters remain trainable.
model = get_peft_model(model, lora_config)
model.print_trainable_parameters()





tokenizer_config.json: 0.00B [00:00, ?B/s]

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/551 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/608 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/2.20G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

trainable params: 6,307,840 || all params: 1,106,356,224 || trainable%: 0.570145479653396


In [None]:
def tokenize_function(examples):
    """Tokenize a batch of formatted prompts for ``Dataset.map(batched=True)``.

    Sequences are truncated to ``config["max_seq_length"]`` and returned as
    plain Python lists without padding: padding is applied per training batch
    by the data collator, so rows are not stretched to the longest example in
    the whole map batch. Tensors are not requested here because the dataset
    stores the result as lists anyway.

    Args:
        examples: Batch mapping with a "text" column of prompt strings.

    Returns:
        The tokenizer output for the batch (one entry per input text).
    """
    return tokenizer(
        examples["text"],
        truncation=True,
        max_length=config["max_seq_length"],
    )

print("\n=== Tokenizing Dataset ===")
# Both splits are tokenized batch-wise and lose their raw "text" column.
_map_options = {"batched": True, "remove_columns": ["text"]}
tokenized_train = train_dataset.map(tokenize_function, **_map_options)
tokenized_eval = eval_dataset.map(tokenize_function, **_map_options)




=== Tokenizing Dataset ===


Map:   0%|          | 0/4 [00:00<?, ? examples/s]

Map:   0%|          | 0/1 [00:00<?, ? examples/s]

In [None]:
# Check GPU capabilities and set precision accordingly
def check_gpu_capabilities():
    """Pick mixed-precision flags and an optimizer name for the current device.

    Returns:
        A dict with keys "fp16", "bf16" and "optim":

        * no CUDA device: both precision flags False, optimizer "adamw_torch";
        * CUDA device with compute capability major >= 8 on which a bfloat16
          tensor can be created: bf16 enabled, optimizer "paged_adamw_32bit";
        * any other CUDA device, or a failed probe: fp16 enabled, optimizer
          "adamw_torch".
    """
    if not torch.cuda.is_available():
        return {"fp16": False, "bf16": False, "optim": "adamw_torch"}

    try:
        major, _minor = torch.cuda.get_device_capability()
        if major >= 8:  # Ampere GPUs (RTX 30xx, A100, etc.)
            # Try to create a bf16 tensor to double-check
            torch.tensor([1.0], dtype=torch.bfloat16, device="cuda")
            return {"fp16": False, "bf16": True, "optim": "paged_adamw_32bit"}
    except Exception:
        # Probe failed: fall through to the fp16 settings below. Catching
        # Exception (not a bare except) lets KeyboardInterrupt/SystemExit
        # propagate.
        pass

    return {"fp16": True, "bf16": False, "optim": "adamw_torch"}

precision_config = check_gpu_capabilities()

# Batch sizing taken from the shared hyperparameter dict.
_batching = {
    "per_device_train_batch_size": config["batch_size"],
    "per_device_eval_batch_size": config["batch_size"],
    "gradient_accumulation_steps": config["gradient_accumulation_steps"],
}

# Optimizer, schedule and precision; optimizer and precision flags come from
# the GPU capability probe.
_optimization = {
    "optim": precision_config["optim"],
    "learning_rate": config["learning_rate"],
    "lr_scheduler_type": "cosine",
    "warmup_steps": config["warmup_steps"],
    "num_train_epochs": config["num_epochs"],
    "fp16": precision_config["fp16"],
    "bf16": precision_config["bf16"],
}

# Evaluation, logging and checkpointing cadence; the best checkpoint is the
# one with the lowest eval_loss.
_checkpointing = {
    "evaluation_strategy": "steps",
    "eval_steps": config["eval_steps"],
    "logging_steps": config["logging_steps"],
    "save_steps": config["save_steps"],
    "save_total_limit": 2,
    "load_best_model_at_end": True,
    "metric_for_best_model": "eval_loss",
    "greater_is_better": False,
}

# Training arguments
training_args = TrainingArguments(
    output_dir=OUTPUT_DIR,
    remove_unused_columns=False,
    report_to="none",
    dataloader_pin_memory=False,
    **_batching,
    **_optimization,
    **_checkpointing,
)

# Custom data collator
class DataCollatorForCausalLM:
    """Pad tokenized examples into a batch for causal language modelling.

    Args:
        tokenizer: Object exposing ``pad_token_id``; that id is used to
            right-pad ``input_ids``.
    """

    # Label value skipped by the loss; used for every padded position.
    IGNORE_INDEX = -100

    def __init__(self, tokenizer):
        self.tokenizer = tokenizer

    def __call__(self, examples):
        """Collate ``examples`` into padded tensors.

        Args:
            examples: Sequence of mappings with "input_ids" and
                "attention_mask" entries (lists or tensors of ints).

        Returns:
            Dict with "input_ids", "attention_mask" and "labels" tensors, all
            right-padded to the longest example in the batch. "labels" copies
            "input_ids" except where the attention mask is 0, which is set to
            -100 so padding does not contribute to the loss.
        """
        # as_tensor accepts both lists and existing tensors without the
        # copy-construct warning torch.tensor() emits for tensors.
        input_ids = torch.nn.utils.rnn.pad_sequence(
            [torch.as_tensor(example["input_ids"], dtype=torch.long) for example in examples],
            batch_first=True,
            padding_value=self.tokenizer.pad_token_id
        )
        attention_mask = torch.nn.utils.rnn.pad_sequence(
            [torch.as_tensor(example["attention_mask"], dtype=torch.long) for example in examples],
            batch_first=True,
            padding_value=0
        )

        # Mask by attention mask rather than by token id: the pad token may be
        # the same id as EOS, and genuine EOS tokens must stay in the labels.
        labels = input_ids.clone()
        labels[attention_mask == 0] = self.IGNORE_INDEX

        return {
            "input_ids": input_ids,
            "attention_mask": attention_mask,
            "labels": labels,
        }

# Collator instance passed to the Trainer; it pads with the tokenizer's pad token id.
data_collator = DataCollatorForCausalLM(tokenizer)

In [None]:
print("\n=== Starting Training ===")

# Create trainer from the LoRA-wrapped model, the tokenized splits and the
# custom padding collator.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_eval,
    data_collator=data_collator,
    tokenizer=tokenizer,
)

# Clear memory before training. Collect garbage first so tensors held only by
# unreachable objects are freed, then release the cached CUDA blocks; in the
# opposite order the memory freed by the collector would stay cached.
gc.collect()
if torch.cuda.is_available():
    torch.cuda.empty_cache()

# Start training; failures are reported and then re-raised so the cell still
# stops with the original traceback.
try:
    trainer.train()
except Exception as e:
    print(f" Training failed: {e}")
    raise
else:
    print(" Training completed successfully!")


=== Starting Training ===


Step,Training Loss,Validation Loss


 Training completed successfully!


In [None]:
print("\n=== Saving Model ===")

# Persist the trained model via the Trainer (no explicit path is given here;
# presumably it writes to the configured output_dir - verify) and store the
# tokenizer files in OUTPUT_DIR.
trainer.save_model()
tokenizer.save_pretrained(OUTPUT_DIR)

# Keep the hyperparameters alongside the saved files as indented JSON.
config_path = f"{OUTPUT_DIR}/training_config.json"
with open(config_path, "w") as config_file:
    config_file.write(json.dumps(config, indent=2))

print(f" Model saved to {OUTPUT_DIR}")



=== Saving Model ===
 Model saved to ./finetuned-model


In [None]:
print("\n=== Testing Fine-tuned Model ===")

# Reuse the in-memory fine-tuned model and switch it to evaluation mode;
# nothing is reloaded from OUTPUT_DIR here.
model.eval()

def generate_response(prompt, max_length=100):
    """Generate the assistant's reply to ``prompt`` with the fine-tuned model.

    The prompt is wrapped in the same system/user/assistant chat layout used
    for training and sampled with temperature 0.7 and top-p 0.9.

    Args:
        prompt: User message to answer.
        max_length: Maximum number of tokens to generate for the reply. The
            prompt tokens are not counted against this budget, so long
            prompts no longer shorten (or eliminate) the answer.

    Returns:
        The decoded reply text with special tokens removed and surrounding
        whitespace stripped.
    """
    formatted_prompt = f"<|system|>\nYou are a helpful assistant.</s>\n<|user|>\n{prompt}</s>\n<|assistant|>\n"

    inputs = tokenizer(formatted_prompt, return_tensors="pt").to(model.device)
    prompt_token_count = inputs["input_ids"].shape[1]

    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            # max_new_tokens bounds only the continuation; max_length would
            # also count the prompt tokens.
            max_new_tokens=max_length,
            temperature=0.7,
            do_sample=True,
            top_p=0.9,
            pad_token_id=tokenizer.eos_token_id
        )

    # The output sequence starts with the prompt tokens; drop them by position
    # instead of searching the decoded text for the "<|assistant|>" marker.
    reply_tokens = outputs[0][prompt_token_count:]
    return tokenizer.decode(reply_tokens, skip_special_tokens=True).strip()

# Prompts for a quick qualitative check of the fine-tuned model.
test_prompts = [
    "What is Python?",
    "How do you create a list in Python?",
    "What are the benefits of reading books?",
    "How do you make a cup of tea?",
]

# Echo each prompt, then print the model's reply to it.
for question in test_prompts:
    print(f"\n🔸 Prompt: {question}")
    answer = generate_response(question)
    print(f"🔹 Response: {answer}")

# Closing summary.
for closing_line in (
    "\n Fine-tuning completed successfully!",
    f" Model saved to: {OUTPUT_DIR}",
    " You can now use your fine-tuned model for inference!",
):
    print(closing_line)


=== Testing Fine-tuned Model ===

🔸 Prompt: What is Python?
🔹 Response: Python is a programming language that is designed to be easy to learn and use. It is popular for its ability to create interactive graphical user interfaces (GUIs) and its use in web development. Python is a high-level, interpreted, dynamic language that compiles to machine code, making it fast and efficient

🔸 Prompt: How do you create a list in Python?
🔹 Response: To create a list in Python, you use the `list()` function. Here's an example:

```python
my_list = [1, 2, 3, 4, 5]
print(my_list)
```

This creates a

🔸 Prompt: What are the benefits of reading books?
🔹 Response: 1. Improves vocabulary and grammar: Reading books provides a rich and diverse range of vocabulary and grammar, which can help in improving your language skills.

2. Enhances cognitive abilities: Reading books can help improve your cognitive abilities

🔸 Prompt: How do you make a cup of tea?
🔹 Response: To make a cup of tea, follow these steps: