In [2]:
# 1. Install Dependencies
!pip install --upgrade pip
!pip install "unsloth[colab-new] @ git+https://github.com/unslothai/unsloth.git"
!pip install --no-deps xformers "trl<0.9.0" peft accelerate bitsandbytes



Collecting pip
  Downloading pip-25.3-py3-none-any.whl.metadata (4.7 kB)
Downloading pip-25.3-py3-none-any.whl (1.8 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.8/1.8 MB[0m [31m30.3 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pip
  Attempting uninstall: pip
    Found existing installation: pip 24.1.2
    Uninstalling pip-24.1.2:
      Successfully uninstalled pip-24.1.2
Successfully installed pip-25.3
Collecting unsloth@ git+https://github.com/unslothai/unsloth.git (from unsloth[colab-new]@ git+https://github.com/unslothai/unsloth.git)
  Cloning https://github.com/unslothai/unsloth.git to /tmp/pip-install-lrublba3/unsloth_1511c9240da540b1aabe5f48a6f51208
  Running command git clone --filter=blob:none --quiet https://github.com/unslothai/unsloth.git /tmp/pip-install-lrublba3/unsloth_1511c9240da540b1aabe5f48a6f51208
  Resolved https://github.com/unslothai/unsloth.git to commit b2143c6b61221bf7717311f640f2cdf51ecefa8b
  Installing build depe

In [3]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [4]:
# 2. Mount Google Drive (You will be asked to authorize)
from google.colab import drive
drive.mount('/content/drive')

import os
# Create a folder in your Drive to store the model
drive_model_dir = "/content/drive/My Drive/Mtech_NCERT_Model_Project"
if not os.path.exists(drive_model_dir):
    os.makedirs(drive_model_dir)
    print(f"📁 Created new folder in Drive: {drive_model_dir}")
else:
    print(f"📁 Drive folder exists: {drive_model_dir}")

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
📁 Drive folder exists: /content/drive/My Drive/Mtech_NCERT_Model_Project


In [5]:
import torch
from unsloth import FastLanguageModel
from trl import SFTTrainer
from transformers import TrainingArguments
from datasets import Dataset
from unsloth.chat_templates import get_chat_template
import shutil

# ==========================================
# 1. CONFIGURATION
# ==========================================
max_seq_length = 2048
dtype = None # Auto-detect (Float16 for T4, Bfloat16 for Ampere)
load_in_4bit = True # Essential for T4 GPU

# ==========================================
# 2. LOAD MODEL
# ==========================================
print("⏳ Loading Qwen 2.5-7B Model...")
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = "unsloth/Qwen2.5-7B-Instruct-bnb-4bit",
    max_seq_length = max_seq_length,
    dtype = dtype,
    load_in_4bit = load_in_4bit,
)

# ==========================================
# 3. ADD LORA ADAPTERS (The Brains)
# ==========================================
model = FastLanguageModel.get_peft_model(
    model,
    r = 16, # Rank 16 is research standard for good quality
    target_modules = ["q_proj", "k_proj", "v_proj", "o_proj",
                      "gate_proj", "up_proj", "down_proj"],
    lora_alpha = 16,
    lora_dropout = 0,
    bias = "none",
    use_gradient_checkpointing = "unsloth",
    random_state = 3407,
)

# ==========================================
# 4. PREPARE DATASET
# ==========================================
print("⏳ Loading Dataset...")
# Ensure 'ncert_final.jsonl' is uploaded to Colab Files
dataset = Dataset.from_json("ncert_final.jsonl")

# Apply Chat Template (System -> User -> Assistant)
tokenizer = get_chat_template(
    tokenizer,
    chat_template = "qwen-2.5",
    mapping = {"role" : "role", "content" : "content", "user" : "user", "assistant" : "assistant"},
)

def formatting_prompts_func(examples):
    convos = examples["messages"]
    texts = [tokenizer.apply_chat_template(convo, tokenize = False, add_generation_prompt = False) for convo in convos]
    return { "text" : texts }

dataset = dataset.map(formatting_prompts_func, batched = True)
print(f"✅ Dataset Loaded: {len(dataset)} samples ready.")

# ==========================================
# 5. TRAINING (The Heavy Lifting)
# ==========================================
trainer = SFTTrainer(
    model = model,
    tokenizer = tokenizer,
    train_dataset = dataset,
    dataset_text_field = "text",
    max_seq_length = max_seq_length,
    dataset_num_proc = 2,
    packing = False,
    args = TrainingArguments(
        per_device_train_batch_size = 2,
        gradient_accumulation_steps = 4, # Effective batch size = 8
        warmup_steps = 50,
        num_train_epochs = 1, # FULL PASS over all 3k examples
        learning_rate = 2e-4,
        fp16 = not torch.cuda.is_bf16_supported(),
        bf16 = torch.cuda.is_bf16_supported(),
        logging_steps = 10,
        optim = "adamw_8bit",
        weight_decay = 0.01,
        lr_scheduler_type = "linear",
        seed = 3407,
        output_dir = "outputs",
    ),
)

print("🚀 Starting Training... (This may take 30-45 mins)")
trainer_stats = trainer.train()
print("🎉 Training Complete!")



🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.
🦥 Unsloth Zoo will now patch everything to make training faster!


Flax classes are deprecated and will be removed in Diffusers v1.0.0. We recommend migrating to PyTorch classes or pinning your version of Diffusers.
Flax classes are deprecated and will be removed in Diffusers v1.0.0. We recommend migrating to PyTorch classes or pinning your version of Diffusers.


⏳ Loading Qwen 2.5-7B Model...
==((====))==  Unsloth 2025.12.5: Fast Qwen2 patching. Transformers: 4.57.3.
   \\   /|    Tesla T4. Num GPUs = 1. Max memory: 14.741 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.9.0+cu126. CUDA: 7.5. CUDA Toolkit: 12.6. Triton: 3.5.0
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.33.post2. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


model.safetensors:   0%|          | 0.00/5.55G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/271 [00:00<?, ?B/s]

tokenizer_config.json: 0.00B [00:00, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

added_tokens.json:   0%|          | 0.00/605 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/614 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/11.4M [00:00<?, ?B/s]

Unsloth 2025.12.5 patched 28 layers with 28 QKV layers, 28 O layers and 28 MLP layers.


⏳ Loading Dataset...


Generating train split: 0 examples [00:00, ? examples/s]

Map:   0%|          | 0/3591 [00:00<?, ? examples/s]

✅ Dataset Loaded: 3591 samples ready.


Map (num_proc=2):   0%|          | 0/3591 [00:00<?, ? examples/s]

==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 3,591 | Num Epochs = 1 | Total steps = 449
O^O/ \_/ \    Batch size per device = 2 | Gradient accumulation steps = 4
\        /    Data Parallel GPUs = 1 | Total batch size (2 x 4 x 1) = 8
 "-____-"     Trainable parameters = 40,370,176 of 7,655,986,688 (0.53% trained)


🚀 Starting Training... (This may take 30-45 mins)


  | |_| | '_ \/ _` / _` |  _/ -_)
[34m[1mwandb[0m: (1) Create a W&B account
[34m[1mwandb[0m: (2) Use an existing W&B account
[34m[1mwandb[0m: (3) Don't visualize my results
[34m[1mwandb[0m: Enter your choice:

 3


[34m[1mwandb[0m: You chose "Don't visualize my results"


[34m[1mwandb[0m: Detected [openai] in use.
[34m[1mwandb[0m: Use W&B Weave for improved LLM call tracing. Install Weave with `pip install weave` then add `import weave` to the top of your script.
[34m[1mwandb[0m: For more information, check out the docs at: https://weave-docs.wandb.ai/


Unsloth: Will smartly offload gradients to save VRAM!


Step,Training Loss
10,1.1036
20,1.0745
30,1.0444
40,0.9348
50,0.8784
60,0.7687
70,0.7289
80,0.7216
90,0.7551
100,0.745


🎉 Training Complete!


In [6]:
# ==========================================
# 6. SAVE TO GOOGLE DRIVE (The Safety Net)
# ==========================================
local_save_name = "ncert_qwen_finetuned"
drive_save_path = f"{drive_model_dir}/{local_save_name}"

print(f"💾 Saving locally to '{local_save_name}'...")
model.save_pretrained(local_save_name)
tokenizer.save_pretrained(local_save_name)

print(f"☁️  Copying to Google Drive: {drive_save_path}...")
if os.path.exists(drive_save_path):
    shutil.rmtree(drive_save_path) # Remove old version if exists to avoid conflicts
shutil.copytree(local_save_name, drive_save_path)

print(f"✅ SUCCESS: Model permanently saved to: {drive_save_path}")
print("You can disconnect now. Your model is safe in Drive.")

💾 Saving locally to 'ncert_qwen_finetuned'...
☁️  Copying to Google Drive: /content/drive/My Drive/Mtech_NCERT_Model_Project/ncert_qwen_finetuned...
✅ SUCCESS: Model permanently saved to: /content/drive/My Drive/Mtech_NCERT_Model_Project/ncert_qwen_finetuned
You can disconnect now. Your model is safe in Drive.


### How to Load Your Model Tomorrow (Without Re-Training)
When you come back to Colab next time, you don't need to train. Just run this to load your brain from Drive:



> Add blockquote



In [12]:
from unsloth import FastLanguageModel
from google.colab import drive
import torch

# 1. Mount Drive
drive.mount('/content/drive')

# 2. Path where you saved it
model_path = "/content/drive/My Drive/Mtech_NCERT_Model_Project/ncert_qwen_finetuned"

print(f"🔄 Loading Fine-Tuned Model from: {model_path}")

# 3. Load Model (Unsloth automatically merges your adapters with the base model)
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = model_path, # Point to YOUR folder
    max_seq_length = 2048,
    dtype = None,
    load_in_4bit = True,
)

# 4. Enable Fast Inference
FastLanguageModel.for_inference(model)

# 5. Test it immediately
messages = [
    {"role": "system", "content": "You are a Hindi NCERT tutor."},
    {"role": "user", "content": "Question: भोजन के मुख्य स्रोत क्या हैं?"}
]
inputs = tokenizer.apply_chat_template(messages, tokenize=True, return_tensors="pt").to("cuda")
outputs = model.generate(input_ids=inputs, max_new_tokens=100, use_cache=True)
print(tokenizer.batch_decode(outputs)[0])

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
🔄 Loading Fine-Tuned Model from: /content/drive/My Drive/Mtech_NCERT_Model_Project/ncert_qwen_finetuned
==((====))==  Unsloth 2025.12.5: Fast Qwen2 patching. Transformers: 4.57.3.
   \\   /|    Tesla T4. Num GPUs = 1. Max memory: 14.741 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.9.0+cu126. CUDA: 7.5. CUDA Toolkit: 12.6. Triton: 3.5.0
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.33.post2. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!
<|im_start|>system
You are a Hindi NCERT tutor.<|im_end|>
<|im_start|>user
Question: भोजन के मुख्य स्रोत क्या हैं?<|im_end|>
<|im_start|>assistant
भोजन के मुख्य स्रोत पानी, शिकर, जंगली रसायन, ग्रामीण खेती के फलों और आवश्यक रसायनों के साथ विस्तृत होते है


### The "Multi-Metric" Evaluation Script

In [10]:
!pip install --upgrade evaluate rouge_score bert_score absl-py nltk

Collecting evaluate
  Downloading evaluate-0.4.6-py3-none-any.whl.metadata (9.5 kB)
Collecting rouge_score
  Downloading rouge_score-0.1.2.tar.gz (17 kB)
  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Collecting bert_score
  Downloading bert_score-0.3.13-py3-none-any.whl.metadata (15 kB)
Collecting absl-py
  Downloading absl_py-2.3.1-py3-none-any.whl.metadata (3.3 kB)
Collecting nltk
  Downloading nltk-3.9.2-py3-none-any.whl.metadata (3.2 kB)
Downloading evaluate-0.4.6-py3-none-any.whl (84 kB)
Downloading bert_score-0.3.13-py3-none-any.whl (61 kB)
Downloading absl_py-2.3.1-py3-none-any.whl (135 kB)
Downloading nltk-3.9.2-py3-none-any.whl (1.5 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.5/1.5 MB[0m [31m73.2 MB/s[0m  [33m0:00:00[0m
[?25hBuilding wheels for collected packages: rouge_score
  Building wheel for rouge_score (pyproject.toml)

### only fine tuned model

In [13]:
import torch
from unsloth import FastLanguageModel
from evaluate import load
import pandas as pd
from tqdm import tqdm
import nltk

# Download wordnet for METEOR if needed
nltk.download('wordnet')
nltk.download('punkt')

# ==========================================
# 1. SETUP: LOAD ALL METRICS
# ==========================================
print("⏳ Loading Evaluation Metrics...")
bertscore = load("bertscore")
rouge = load("rouge")
meteor = load("meteor")
bleu = load("bleu")

# ==========================================
# 2. DIVERSE TEST DATASET (Thesis Showcase)
# ==========================================
# We include varied subjects to prove the model is a "General NCERT Tutor"
test_data = [
    # --- Class 2 Hindi (Creative/Literature) ---
    {
        "Subject": "Class 2 Hindi",
        "Concept": "Poem Comprehension",
        "Question": "ऊँट बालू में कैसे चल पाता है?",
        "Reference": "ऊँट के पैर गद्देदार होते हैं जो बालू में धँसते नहीं हैं, इसलिए वह आसानी से चल पाता है।"
    },
    {
        "Subject": "Class 2 Hindi",
        "Concept": "Story Logic",
        "Question": "भालू ने शेर के बच्चे को फुटबॉल क्यों समझा?",
        "Reference": "शेर का बच्चा गोल-मटोल होकर सिमटा हुआ था, इसलिए भालू को लगा कि वह फुटबॉल है।"
    },

    # --- Class 8 Science (Technical/Factual) ---
    {
        "Subject": "Class 8 Science",
        "Concept": "Material Properties",
        "Question": "पॉलिएस्टर के कपड़े के क्या फायदे हैं?",
        "Reference": "पॉलिएस्टर के कपड़े में आसानी से सिकुड़न नहीं आती, यह सपाट रहता है, इसे धोना आसान होता है और यह टिकाऊ होता है।"
    },
    {
        "Subject": "Class 8 Science",
        "Concept": "Biology (Eye)",
        "Question": "रेटिना (Retina) का क्या कार्य है?",
        "Reference": "रेटिना आंख के पिछले हिस्से में एक पर्दा है जिस पर वस्तु का प्रतिबिंब बनता है। यह प्रकाश संकेतों को मस्तिष्क तक भेजता है।"
    },
    {
        "Subject": "Class 8 Science",
        "Concept": "Astronomy",
        "Question": "आकाशीय पिंड (Celestial Objects) किसे कहते हैं?",
        "Reference": "तारे, ग्रह, चंद्रमा और आकाश में स्थित अन्य सभी वस्तुओं को आकाशीय पिंड कहते हैं।"
    },

    # --- Class 6 Social Science (Civics) ---
    {
        "Subject": "Class 6 Civics",
        "Concept": "Definition",
        "Question": "लोकतंत्र (Democracy) का मुख्य लक्षण क्या है?",
        "Reference": "लोकतंत्र का मुख्य लक्षण यह है कि इसमें जनता अपने शासकों का चुनाव स्वयं करती है। यह लोगों का, लोगों द्वारा शासन है।"
    }
]

# ==========================================
# 3. GENERATION LOOP
# ==========================================
FastLanguageModel.for_inference(model) # Enable fast inference

generated_answers = []
references = []
subjects = []
questions = []

print(f"🚀 Running Inference on {len(test_data)} Thesis Examples...")

for item in tqdm(test_data):
    # 1. Format Prompt
    messages = [
        {"role": "system", "content": "You are a Hindi NCERT tutor."},
        {"role": "user", "content": f"Question: {item['Question']}"}
    ]
    inputs = tokenizer.apply_chat_template(messages, tokenize=True, add_generation_prompt=True, return_tensors="pt").to("cuda")

    # 2. Generate
    outputs = model.generate(input_ids=inputs, max_new_tokens=150, use_cache=True, temperature=0.1)

    # 3. Decode
    ans = tokenizer.batch_decode(outputs)[0].split("<|im_start|>assistant\n")[-1].replace("<|im_end|>", "").strip()

    # 4. Store
    generated_answers.append(ans)
    references.append(item['Reference'])
    subjects.append(item['Subject'])
    questions.append(item['Question'])

# ==========================================
# 4. COMPUTE SCORES
# ==========================================
print("\n📊 Calculating Research Metrics...")

# A. BERTScore (Semantic Similarity - The most important for Hindi)
# lang="hi" uses a multilingual model to check if meanings match, even if words differ.
bert_res = bertscore.compute(predictions=generated_answers, references=references, lang="hi")

# B. ROUGE (Word Overlap)
rouge_res = rouge.compute(predictions=generated_answers, references=references)

# C. METEOR (Good for synonyms/morphology)
meteor_res = meteor.compute(predictions=generated_answers, references=references)

# D. BLEU (Precision)
bleu_res = bleu.compute(predictions=generated_answers, references=references)

# ==========================================
# 5. CREATE DATAFRAME & SAVE
# ==========================================
df_results = pd.DataFrame({
    "Subject": subjects,
    "Question": questions,
    "Reference (Ground Truth)": references,
    "Model Prediction": generated_answers,
    "BERTScore (F1)": bert_res['f1'],  # Key Metric
    "ROUGE-L": [rouge_res['rougeL']] * len(subjects), # Broadcast avg score for display
})

# Formatting scores for display
avg_bert = sum(bert_res['f1']) / len(bert_res['f1'])
avg_rouge = rouge_res['rougeL']
avg_meteor = meteor_res['meteor']
avg_bleu = bleu_res['bleu']

print("\n" + "="*60)
print("🏆 FINAL THESIS EVALUATION RESULTS (Chapter 5)")
print("="*60)
print(f"1. BERTScore F1 (Semantic):  {avg_bert:.4f}  (>0.70 is State-of-the-Art for Hindi)")
print(f"2. ROUGE-L (Overlap):        {avg_rouge:.4f}")
print(f"3. METEOR (Synonyms):        {avg_meteor:.4f}")
print(f"4. BLEU (Precision):         {avg_bleu:.4f}")
print("="*60)

# Display table
pd.set_option('display.max_colwidth', None)
display(df_results.head())

# Save for Thesis
filename = "thesis_comprehensive_results.csv"
df_results.to_csv(filename, index=False)
print(f"\n✅ Detailed results saved to '{filename}'. Download and put in your report!")

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


⏳ Loading Evaluation Metrics...


[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


🚀 Running Inference on 6 Thesis Examples...


100%|██████████| 6/6 [01:04<00:00, 10.69s/it]



📊 Calculating Research Metrics...

🏆 FINAL THESIS EVALUATION RESULTS (Chapter 5)
1. BERTScore F1 (Semantic):  0.7293  (>0.70 is State-of-the-Art for Hindi)
2. ROUGE-L (Overlap):        0.0000
3. METEOR (Synonyms):        0.2101
4. BLEU (Precision):         0.0000


Unnamed: 0,Subject,Question,Reference (Ground Truth),Model Prediction,BERTScore (F1),ROUGE-L
0,Class 2 Hindi,ऊँट बालू में कैसे चल पाता है?,"ऊँट के पैर गद्देदार होते हैं जो बालू में धँसते नहीं हैं, इसलिए वह आसानी से चल पाता है।","उत्तर: ऊँट बालू में चलने के लिए उनकी बड़ी बालों का उपयोग करते हैं। वे बालों के सहारे अपनी शरीर को बढ़ाते हैं, जबकि बालों के नीचे बूढ़े बुखार देते हैं। य",0.718461,0.0
1,Class 2 Hindi,भालू ने शेर के बच्चे को फुटबॉल क्यों समझा?,"शेर का बच्चा गोल-मटोल होकर सिमटा हुआ था, इसलिए भालू को लगा कि वह फुटबॉल है।",भालू ने शेर के बच्चे को फुटबॉल समझाने की कोशिश की क्योंकि वह अपने प्रेमी शेर के बच्चे को खेलने की आवश्यकता महसूस कर रहा था। भालू एक खुशियाँ देने वाला और जीवन के �,0.735116,0.0
2,Class 8 Science,पॉलिएस्टर के कपड़े के क्या फायदे हैं?,"पॉलिएस्टर के कपड़े में आसानी से सिकुड़न नहीं आती, यह सपाट रहता है, इसे धोना आसान होता है और यह टिकाऊ होता है।","पॉलिएस्टर के कपड़े के कुछ महत्वपूर्ण फायदे इनकी अवशोषण शक्ति, आकर्षक रंग और बेहतरीन धारण क्षमता हैं। इनकी अवशोषण शक्ति उन्हें अधिक समय तक चालीसे रखने में मद",0.708642,0.0
3,Class 8 Science,रेटिना (Retina) का क्या कार्य है?,रेटिना आंख के पिछले हिस्से में एक पर्दा है जिस पर वस्तु का प्रतिबिंब बनता है। यह प्रकाश संकेतों को मस्तिष्क तक भेजता है।,"रेटिना (Retina) का मुख्य कार्य वस्तुओं के प्रति दृश्य को अपनाना है। यह बीमारी और आवाज़ के रूप में भी उदाहरण देती है, लेकिन इसका मुख्य कार्य वस्तुओं के प्रति दृश्य को �",0.693426,0.0
4,Class 8 Science,आकाशीय पिंड (Celestial Objects) किसे कहते हैं?,"तारे, ग्रह, चंद्रमा और आकाश में स्थित अन्य सभी वस्तुओं को आकाशीय पिंड कहते हैं।","आकाशीय पिंड (Celestial Objects) वह प्रकृति के अवशेष होते हैं जो आकाश में उपस्थित होते हैं। ये ग्रह, ग्रहाओं, सूर्य, चन्द्रमा, तारे, तारामाणिक, बुलबुलाइयाँ और अन्य आकाशीय वस्त",0.789055,0.0



✅ Detailed results saved to 'thesis_comprehensive_results.csv'. Download and put in your report!


### Base and fine tuned model both

In [14]:
import torch
from unsloth import FastLanguageModel
from evaluate import load
import pandas as pd
from tqdm import tqdm
import nltk

# 1. SETUP: LOAD METRICS
# ---------------------------------------------------------
print("⏳ Loading Metrics...")
bertscore = load("bertscore")
rouge = load("rouge")
# nltk.download('wordnet') # Uncomment if METEOR needs it
# meteor = load("meteor")  # Optional: Keep it simple if it errors out

# 2. TEST DATA (Varied Subjects)
# ---------------------------------------------------------
test_data = [
    {
        "Subject": "Class 2 Hindi",
        "Question": "ऊँट बालू में कैसे चल पाता है?",
        "Reference": "ऊँट के पैर गद्देदार होते हैं जो बालू में धँसते नहीं हैं, इसलिए वह आसानी से चल पाता है।"
    },
    {
        "Subject": "Class 8 Science",
        "Question": "पॉलिएस्टर के कपड़े के क्या फायदे हैं?",
        "Reference": "पॉलिएस्टर के कपड़े में आसानी से सिकुड़न नहीं आती, यह सपाट रहता है, इसे धोना आसान होता है।"
    },
    {
        "Subject": "Class 8 Science",
        "Question": "रेटिना (Retina) का क्या कार्य है?",
        "Reference": "रेटिना आंख के पिछले हिस्से में एक पर्दा है जिस पर वस्तु का प्रतिबिंब बनता है।"
    },
    {
        "Subject": "Class 6 Civics",
        "Question": "लोकतंत्र (Democracy) का मुख्य लक्षण क्या है?",
        "Reference": "लोकतंत्र का मुख्य लक्षण यह है कि इसमें जनता अपने शासकों का चुनाव स्वयं करती है।"
    }
]

# 3. INFERENCE LOOP (BASE vs FINE-TUNED)
# ---------------------------------------------------------
FastLanguageModel.for_inference(model)

results = []

print(f"🚀 Running Comparison on {len(test_data)} examples...")

for item in tqdm(test_data):
    # A. Setup Input
    messages = [{"role": "system", "content": "You are a Hindi NCERT tutor."},
                {"role": "user", "content": f"Question: {item['Question']}"}]
    inputs = tokenizer.apply_chat_template(messages, tokenize=True, add_generation_prompt=True, return_tensors="pt").to("cuda")

    # B. Run Fine-Tuned Model (Adapters ON)
    outputs_ft = model.generate(input_ids=inputs, max_new_tokens=128, use_cache=True, temperature=0.1)
    ans_ft = tokenizer.batch_decode(outputs_ft)[0].split("<|im_start|>assistant\n")[-1].replace("<|im_end|>", "").strip()

    # C. Run Base Model (Adapters OFF) -> This acts like original Qwen
    with model.disable_adapter():
        outputs_base = model.generate(input_ids=inputs, max_new_tokens=128, use_cache=True, temperature=0.1)
        ans_base = tokenizer.batch_decode(outputs_base)[0].split("<|im_start|>assistant\n")[-1].replace("<|im_end|>", "").strip()

    results.append({
        "Subject": item['Subject'],
        "Question": item['Question'],
        "Reference": item['Reference'],
        "Base Model": ans_base,
        "Fine-Tuned Model": ans_ft
    })

# 4. CALCULATE SCORES FOR BOTH
# ---------------------------------------------------------
df = pd.DataFrame(results)

print("\n📊 Calculating Scores...")

# Score for Fine-Tuned
ft_bert = bertscore.compute(predictions=df["Fine-Tuned Model"], references=df["Reference"], lang="hi")
ft_rouge = rouge.compute(predictions=df["Fine-Tuned Model"], references=df["Reference"])

# Score for Base Model
base_bert = bertscore.compute(predictions=df["Base Model"], references=df["Reference"], lang="hi")
base_rouge = rouge.compute(predictions=df["Base Model"], references=df["Reference"])

# 5. FINAL TABLE & SAVING
# ---------------------------------------------------------
print("\n" + "="*60)
print("🏆 THESIS COMPARISON RESULTS")
print("="*60)
print(f"Metric              | Base Model | Fine-Tuned (Yours)")
print("-" * 60)
print(f"BERTScore F1 (Mean) | {sum(base_bert['f1'])/len(base_bert['f1']):.4f}     | {sum(ft_bert['f1'])/len(ft_bert['f1']):.4f}")
print(f"ROUGE-L             | {base_rouge['rougeL']:.4f}     | {ft_rouge['rougeL']:.4f}")
print("="*60)

# Display Side-by-Side text for your Qualitative Analysis
pd.set_option('display.max_colwidth', None)
display(df[["Question", "Base Model", "Fine-Tuned Model"]])

# Save
df.to_csv("thesis_comparison_final.csv", index=False)
print("\n✅ Saved comparison to 'thesis_comparison_final.csv'")

⏳ Loading Metrics...
🚀 Running Comparison on 4 examples...


100%|██████████| 4/4 [01:03<00:00, 15.90s/it]



📊 Calculating Scores...

🏆 THESIS COMPARISON RESULTS
Metric              | Base Model | Fine-Tuned (Yours)
------------------------------------------------------------
BERTScore F1 (Mean) | 0.6979     | 0.7107
ROUGE-L             | 0.0000     | 0.0000


Unnamed: 0,Question,Base Model,Fine-Tuned Model
0,ऊँट बालू में कैसे चल पाता है?,ऊँट बालू में चलना एक विशिष्ट तरीके से होता है। यह तरीका उन जगहों में चलना आसान बनाता है जहाँ भूमि खरा और बालूपूर्ण होती है। निम्�,उत्तर: ऊँट बालू में चलने के लिए उनकी बड़ी बालों का उपयोग करते हैं। उनकी बालें बहुत बड़ी होती हैं और उनके बालों के नीचे बड़े-बड़े
1,पॉलिएस्टर के कपड़े के क्या फायदे हैं?,"पॉलिएस्टर के कपड़े कई फायदे हैं, जो निम्नलिखित हैं:\n\n1. **क्षारणीयता**: पॉलिएस्टर के कपड़े से धूम्रपान या अन्य क्षारणीय वस्तुओं की धुंआं का �",पॉलिएस्टर के कपड़े कई महत्वपूर्ण फायदे हैं। इनमें से कुछ शामिल हैं:\n1. अवश्यकता: इन कपड़ों की बहुत अधिक जरूरत है क्योंकि वे अच्छी तरह
2,रेटिना (Retina) का क्या कार्य है?,रेटिना (Retina) का मुख्य कार्य आँकड़ों को संग्रहित करना और विश्लेषण करना है। इसके अतिरिक्त कुछ महत्वपूर्ण कार्य निम्नलिखित हैं:\n\n1. **आँकड़ों की,"रेटिना (Retina) एक मोटी रेखा वाला भाग है, जो दृश्य प्रतिबिंब को अपने आप में संचालित करता है। इसका मुख्य कार्य उपलब्ध तापमान और तारे को अपने आप"
3,लोकतंत्र (Democracy) का मुख्य लक्षण क्या है?,लोकतंत्र (Democracy) के मुख्य लक्षण निम्नलिखित हैं:\n\n1. **सार्वजनिक सम्मान**:\n - सभी निवासियों को बराबर और समान सम्मान दिया जाता है।\n\n2. **महिलाओं और पुरु,लोकतंत्र (Democracy) का मुख्य लक्षण निम्नलिखित हैं:\n1. समान अधिकार: सभी व्यक्ति के लिए समान अधिकार और जन्मजात अधिकार का पालन किया जाता है।\n2. स्वतंत



✅ Saved comparison to 'thesis_comparison_final.csv'
