<a href="https://colab.research.google.com/github/sriram5156689/1/blob/main/Compass_AI_The_AI_Forge_Notebook.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# =========================================================
# 🏫 SIH Hackathon - Multilingual Chatbot Fine-tuning
# Problem Statement 25104 - Language Agnostic Chatbot
# Team Notebook
# =========================================================

# --- Install Libraries ---
!pip install -U -q transformers datasets accelerate peft trl bitsandbytes sentencepiece

from huggingface_hub import notebook_login
notebook_login()   # paste your HF token here

import torch

# --- Configuration ---
BASE_MODEL_NAME = "facebook/mbart-large-50"   # multilingual base
NEW_MODEL_NAME  = "sih_multilingual_chatbot"
HF_USERNAME     = "YourHFUsername"  # <<< replace with your username

# Dataset choice
BASE_DATASET_NAME = "databricks/databricks-dolly-15k"


[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m42.2/42.2 kB[0m [31m1.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m11.6/11.6 MB[0m [31m38.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m544.8/544.8 kB[0m [31m35.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m61.3/61.3 MB[0m [31m18.8 MB/s[0m eta [36m0:00:00[0m
[?25h

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [3]:
from datasets import load_dataset, Dataset, concatenate_datasets
import pandas as pd
import io

# Load base dataset (general instructions)
print(f"Loading base dataset: {BASE_DATASET_NAME}")
base_dataset = load_dataset(BASE_DATASET_NAME, split="train")

# Custom "Golden Dataset" with FAQs in Hindi/English/Regional language
golden_data_string = """{"question": "What is the deadline for the BTER exam fee payment?", "ideal_answer": "As per circular BTER/EXAM/2025-F1, the final date is October 30, 2025.", "context": "Board of Technical Education Rajasthan circular."}
{"question": "बीटीईआर परीक्षा शुल्क का अंतिम दिन कब है?", "ideal_answer": "BTER/EXAM/2025-F1 के अनुसार अंतिम तिथि 30 अक्टूबर 2025 है।", "context": "राजस्थान तकनीकी शिक्षा बोर्ड परिपत्र।"}
{"question": "Scholarship application के लिए किन documents की ज़रूरत है?", "ideal_answer": "Aadhar card, caste certificate और पिछले वर्ष का marksheet जरूरी है।", "context": "DTE scholarship guidelines."}
{"question": "હોસ્ટેલ ફી ક્યારે જમા કરવી પડશે?", "ideal_answer": "હોસ્ટેલ ફી ભરવાની અંતિમ તારીખ 15 નવેમ્બર 2025 છે.", "context": "Hostel office notice (Gujarati)."}"""

golden_df = pd.read_json(io.StringIO(golden_data_string), lines=True)
golden_dataset = Dataset.from_pandas(golden_df)

# Combine
dataset = concatenate_datasets([base_dataset, golden_dataset]).shuffle(seed=42)

# Formatting
def format_instruction(example):
    if "ideal_answer" in example:
        return f"""System: You are a multilingual college assistant.
Answer ONLY from context. Use the same language as the question.

--- CONTEXT ---
{example['context']}

--- QUESTION ---
{example['question']}

--- ANSWER ---
{example['ideal_answer']}"""
    else:
        return f"""System: Helpful assistant.

--- CONTEXT ---
{example.get('context','')}

--- QUESTION ---
{example.get('instruction','')}

--- ANSWER ---
{example.get('response','')}"""


Loading base dataset: databricks/databricks-dolly-15k


In [17]:
# ✅ Fine-tuning Mistral-7B + LoRA using transformers.Trainer (no SFTTrainer)

import torch
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    Trainer,
    TrainingArguments,
    DataCollatorForLanguageModeling
)
from peft import LoraConfig, get_peft_model
from datasets import load_dataset

# -------------------------
# Config
# -------------------------
BASE_MODEL_NAME = "mistralai/Mistral-7B-v0.1"
NEW_MODEL_NAME = "mistral7b-lora-finetuned"
HF_USERNAME = "dragoon1"  # change to your HF username

# -------------------------
# Load base model (4-bit quantization)
# -------------------------
from transformers import BitsAndBytesConfig

bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16
)

model = AutoModelForCausalLM.from_pretrained(
    BASE_MODEL_NAME,
    quantization_config=bnb_config,
    device_map="auto"
)

# -------------------------
# Tokenizer
# -------------------------
tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL_NAME)
tokenizer.pad_token = tokenizer.eos_token

# -------------------------
# LoRA Config
# -------------------------
lora_config = LoraConfig(
    r=8,
    lora_alpha=16,
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM"
)
model = get_peft_model(model, lora_config)

# -------------------------
# Load Dataset
# -------------------------
dataset = load_dataset("wikitext", "wikitext-2-raw-v1", split="train")

def tokenize_function(examples):
    return tokenizer(examples["text"], truncation=True, max_length=512)

tokenized_dataset = dataset.map(tokenize_function, batched=True)
tokenized_dataset.set_format(type="torch", columns=["input_ids", "attention_mask"])

# -------------------------
# Data Collator
# -------------------------
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer, mlm=False
)

# -------------------------
# Training Arguments
# -------------------------
training_args = TrainingArguments(
    output_dir=f"./{NEW_MODEL_NAME}",
    num_train_epochs=3,
    per_device_train_batch_size=1,
    gradient_accumulation_steps=4,
    learning_rate=5e-5,
    bf16=True,
    logging_dir="./logs",
    logging_steps=10,
    save_steps=500,
    save_total_limit=2,
    push_to_hub=True,
    hub_model_id=f"{HF_USERNAME}/{NEW_MODEL_NAME}",
    report_to="none"
)

# -------------------------
# Trainer
# -------------------------
trainer = Trainer(
    model=model,
    train_dataset=tokenized_dataset,
    tokenizer=tokenizer,
    data_collator=data_collator,
    args=training_args
)

# -------------------------
# Train
# -------------------------
print("🚀 Starting fine-tuning...")
trainer.train()

# -------------------------
# Push to Hub
# -------------------------
trainer.push_to_hub()
print("✅ Model pushed to Hugging Face Hub!")


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Map:   0%|          | 0/36718 [00:00<?, ? examples/s]

  trainer = Trainer(


[2025-09-08 12:58:12,169] [INFO] [real_accelerator.py:260:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[2025-09-08 12:58:17,601] [INFO] [logging.py:107:log_dist] [Rank -1] [TorchCheckpointEngine] Initialized with serialization = False


The tokenizer has new PAD/BOS/EOS tokens that differ from the model config and generation config. The model config and generation config were aligned accordingly, being updated with the tokenizer's values. Updated tokens: {'pad_token_id': 2}.


🚀 Starting fine-tuning...


Step,Training Loss
10,2.4138
20,2.2142
30,2.5619
40,2.5711
50,2.013


KeyboardInterrupt: 

In [20]:
# ✅ Fine-tuning Mistral-7B + LoRA using Trainer on TPU

import torch
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    Trainer,
    TrainingArguments,
    DataCollatorForLanguageModeling
)
from peft import LoraConfig, get_peft_model
from datasets import load_dataset

# -------------------------
# Config
# -------------------------
BASE_MODEL_NAME = "mistralai/Mistral-7B-v0.1"
NEW_MODEL_NAME = "mistral7b-lora-finetuned"
HF_USERNAME = "dragoon1"

# -------------------------
# Load base model (FP16/BF16 for TPU)
# -------------------------
model = AutoModelForCausalLM.from_pretrained(
    BASE_MODEL_NAME,
    torch_dtype=torch.bfloat16
)

# -------------------------
# Tokenizer
# -------------------------
tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL_NAME)
tokenizer.pad_token = tokenizer.eos_token

# -------------------------
# LoRA Config
# -------------------------
lora_config = LoraConfig(
    r=8,
    lora_alpha=16,
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM"
)
model = get_peft_model(model, lora_config)

# -------------------------
# Dataset
# -------------------------
dataset = load_dataset("wikitext", "wikitext-2-raw-v1", split="train")

def tokenize_function(examples):
    return tokenizer(examples["text"], truncation=True, max_length=512)

tokenized_dataset = dataset.map(tokenize_function, batched=True)
tokenized_dataset.set_format(type="torch", columns=["input_ids", "attention_mask"])

# -------------------------
# Data Collator
# -------------------------
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

# -------------------------
# Training Arguments (TPU)
# -------------------------
training_args = TrainingArguments(
    output_dir=f"./{NEW_MODEL_NAME}",
    num_train_epochs=3,
    per_device_train_batch_size=2,  # adjust based on TPU memory
    gradient_accumulation_steps=4,
    learning_rate=5e-5,
    bf16=True,
    logging_dir="./logs",
    logging_steps=10,
    save_steps=500,
    save_total_limit=2,
    push_to_hub=True,
    hub_model_id=f"{HF_USERNAME}/{NEW_MODEL_NAME}",
    report_to="none"
)


# -------------------------
# Trainer
# -------------------------
trainer = Trainer(
    model=model,
    train_dataset=tokenized_dataset,
    tokenizer=tokenizer,
    data_collator=data_collator,
    args=training_args
)

# -------------------------
# Train
# -------------------------
print("🚀 Starting fine-tuning on TPU...")
trainer.train()

# -------------------------
# Push to Hub
# -------------------------
trainer.push_to_hub()
print("✅ Model pushed to Hugging Face Hub!")


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Map:   0%|          | 0/36718 [00:00<?, ? examples/s]

  trainer = Trainer(


OutOfMemoryError: CUDA out of memory. Tried to allocate 112.00 MiB. GPU 0 has a total capacity of 14.74 GiB of which 110.12 MiB is free. Process 2519 has 14.63 GiB memory in use. Of the allocated memory 14.32 GiB is allocated by PyTorch, and 184.93 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)

In [None]:
test_prompt = """System: You are a multilingual assistant.
--- CONTEXT ---
Board of Technical Education Rajasthan: final date for payment of exam fees is October 30, 2025.
--- QUESTION ---
बीटीईઆર परीक्षा शुल्क की अंतिम तिथि कब है?
--- ANSWER ---
"""

inputs = tokenizer(test_prompt, return_tensors="pt").to("cuda")
outputs = model.generate(**inputs, max_new_tokens=64)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))


In [None]:
# Merge LoRA + base model
from peft import AutoPeftModelForSeq2SeqLM
ft_model = AutoPeftModelForSeq2SeqLM.from_pretrained(f"{HF_USERNAME}/{NEW_MODEL_NAME}", torch_dtype=torch.float16)
merged_model = ft_model.merge_and_unload()

save_path = f"./{NEW_MODEL_NAME}-merged"
merged_model.save_pretrained(save_path)
tokenizer.save_pretrained(save_path)

# Convert to GGUF
!git clone https://github.com/ggerganov/llama.cpp.git
%cd llama.cpp
!pip install -r requirements.txt

!python convert.py {save_path} --outfile /content/{NEW_MODEL_NAME}.gguf --outtype q4_k_m

from google.colab import files
files.download(f"/content/{NEW_MODEL_NAME}.gguf")
