<a href="https://colab.research.google.com/github/sathu0622/25-26J-438-AI-Powered-LMS-for-Visually-Impaired-Students/blob/AI-Powered-Braille-to-Text-Conversion-and-Automated-Evaluation-System-for-O%2FL-History-Examinations/meta_llama_Meta_Llama_3_8B_Instruct.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# ============================================================
#  COMPLETE O/L HISTORY MODEL TRAINING CODE
#  Google Colab Pro - Run All Cells
# ============================================================

# ============================================================
#  STEP 1 — Install Dependencies
# ============================================================
!pip install -q transformers accelerate bitsandbytes peft datasets sentencepiece openpyxl scikit-learn

# ============================================================
#  STEP 2 — Mount Google Drive & Clear Memory
# ============================================================
from google.colab import drive
drive.mount('/content/drive')

# Clear any cached memory
import gc
import torch
gc.collect()
torch.cuda.empty_cache()
print("‚úÖ Memory cleared")

# ============================================================
#  STEP 3 — Import Libraries
# ============================================================
import pandas as pd
import json
import torch
import numpy as np
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    TrainingArguments,
    Trainer,
    DataCollatorForLanguageModeling
)
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training
from datasets import Dataset
from sklearn.model_selection import train_test_split
from huggingface_hub import login

print("‚úÖ All libraries imported successfully")

# ============================================================
#  STEP 3.5 — Hugging Face Authentication
# ============================================================
print("\n" + "="*60)
print("üîê HUGGING FACE AUTHENTICATION REQUIRED")
print("="*60)
print("Llama 3.1 is a gated model. You need to:")
print("1. Go to: https://huggingface.co/meta-llama/Llama-3.1-8B-Instruct")
print("2. Click 'Request Access' and accept the terms")
print("3. Create a token at: https://huggingface.co/settings/tokens")
print("4. Enter your token below")
print("="*60 + "\n")

# Get token from user
from getpass import getpass
hf_token = getpass("Enter your Hugging Face token (input will be hidden): ")

# Login to Hugging Face
try:
    login(token=hf_token, add_to_git_credential=True)
    print("‚úÖ Successfully authenticated with Hugging Face!")
except Exception as e:
    print(f"‚ùå Authentication failed: {str(e)}")
    print("\nPlease make sure:")
    print("1. You've requested access to Llama 3.1 model")
    print("2. Your access has been approved (check your email)")
    print("3. Your token has 'read' permissions")
    raise

# ============================================================
#  STEP 4 — Load Excel Dataset from Google Drive
# ============================================================
# Location of the question/answer spreadsheet on the mounted Google Drive.
dataset_path = "/content/drive/MyDrive/Model/Final.xlsx"

# Only the read itself is guarded. Every later step needs `df`, so a failed
# load is reported and then re-raised instead of letting the script continue
# and crash later with an unrelated NameError.
try:
    df = pd.read_excel(dataset_path)
except FileNotFoundError:
    print("‚ùå Error: Dataset file not found!")
    print(f"Please ensure your Excel file is at: {dataset_path}")
    print("Expected columns: 'question' and 'answer'")
    raise
except Exception as e:
    print(f"‚ùå Error loading dataset: {str(e)}")
    print("Columns found: Unable to read file")
    raise

# Check and standardize column names (cast first so non-string headers
# do not break the .str accessor).
df.columns = df.columns.astype(str).str.strip()

# Map exactly one source column onto each required name. An exact
# (case-insensitive) match wins over a substring match, so a header such as
# "Question No" cannot shadow a real "Question" column or create duplicates.
column_mapping = {}
for target in ("question", "answer"):
    unmapped = [col for col in df.columns if col not in column_mapping]
    exact = [col for col in unmapped if col.lower() == target]
    partial = [col for col in unmapped if target in col.lower()]
    matches = exact or partial
    if matches:
        column_mapping[matches[0]] = target

df = df.rename(columns=column_mapping)

# Verify required columns exist (and exist only once)
if 'question' not in df.columns or 'answer' not in df.columns:
    raise ValueError(f"Required columns not found. Found columns: {list(df.columns)}")
if df.columns.duplicated().any():
    raise ValueError(f"Ambiguous column names after standardizing: {list(df.columns)}")

# Rows with a missing question or answer would be rendered as the literal
# text "nan" in the training prompt, so drop them up front.
rows_before = len(df)
df = df.dropna(subset=["question", "answer"]).reset_index(drop=True)
if len(df) != rows_before:
    print(f"Dropped {rows_before - len(df)} rows with a missing question or answer")

print(f"‚úÖ Loaded {len(df)} questions from dataset")
print(f"‚úÖ Columns: {list(df.columns)}")
print("\nFirst 5 rows:")
print(df.head())

# ============================================================
#  STEP 5 — Format Dataset for Llama 3.1 Training
# ============================================================
def format_instruction(question, answer):
    """Render one question/answer pair as a Llama 3.1 Instruct transcript.

    The returned string starts with ``<|begin_of_text|>`` and contains three
    turns (a fixed system prompt, ``question`` as the user turn and ``answer``
    as the assistant turn), each introduced by a role header and closed with
    ``<|eot_id|>``.
    """
    system_prompt = (
        "You are an expert in Sri Lankan O/L History. "
        "Provide accurate and concise answers to history questions."
    )

    def turn(role, content):
        # One chat turn: role header, blank line, content, end-of-turn marker.
        return f"<|start_header_id|>{role}<|end_header_id|>\n\n{content}<|eot_id|>"

    turns = (
        ("system", system_prompt),
        ("user", question),
        ("assistant", answer),
    )
    return "<|begin_of_text|>" + "".join(turn(role, content) for role, content in turns)

# Create formatted training data
formatted_data = []
for _, row in df.iterrows():
    formatted_text = format_instruction(row["question"], row["answer"])
    formatted_data.append({
        "text": formatted_text
    })

# Convert to Hugging Face Dataset
train_dataset = Dataset.from_pandas(pd.DataFrame(formatted_data))
print(f"‚úÖ Formatted {len(train_dataset)} training examples")

# ============================================================
#  STEP 6 — Load Llama 3.1 8B Model with 4-bit Quantization
# ============================================================
model_name = "meta-llama/Llama-3.1-8B-Instruct"

print("\nüîÑ Loading tokenizer...")
tokenizer = AutoTokenizer.from_pretrained(
    model_name,
    use_fast=True,
    trust_remote_code=True
)

# Set padding token
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token
    tokenizer.pad_token_id = tokenizer.eos_token_id

tokenizer.padding_side = "right"
print("‚úÖ Tokenizer loaded")

print("\nüîÑ Loading model with 4-bit quantization...")

from transformers import BitsAndBytesConfig

# Configure 4-bit quantization
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_compute_dtype=torch.float16,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4"
)

model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=bnb_config,
    device_map="auto",
    torch_dtype=torch.float16,
    trust_remote_code=True
)

# Prepare model for k-bit training
model = prepare_model_for_kbit_training(model)
print("‚úÖ Model loaded successfully")

# ============================================================
#  STEP 7 — Configure LoRA for Efficient Fine-tuning
# ============================================================
lora_config = LoraConfig(
    r=16,                                    # LoRA rank
    lora_alpha=32,                           # LoRA alpha scaling
    target_modules=[                         # Target attention modules
        "q_proj",
        "k_proj",
        "v_proj",
        "o_proj",
        "gate_proj",
        "up_proj",
        "down_proj"
    ],
    lora_dropout=0.05,                       # Dropout for regularization
    bias="none",                             # Don't train biases
    task_type="CAUSAL_LM"                    # Task type
)

model = get_peft_model(model, lora_config)
print("\n‚úÖ LoRA configuration applied")
model.print_trainable_parameters()

# ============================================================
#  STEP 8 — Tokenize Dataset
# ============================================================
def tokenize_function(examples):
    """Tokenize a batch of formatted chat transcripts for causal-LM training.

    Intended for ``Dataset.map(..., batched=True)``: ``examples["text"]`` is a
    list of strings. Every sequence is truncated/padded to 512 tokens.

    Returns the tokenizer output with an added ``labels`` entry that mirrors
    ``input_ids`` except that padded positions are set to -100 so they are
    ignored by the loss.
    """
    result = tokenizer(
        examples["text"],
        truncation=True,
        max_length=512,
        padding="max_length",
        # The text produced by format_instruction already begins with
        # <|begin_of_text|>; letting the tokenizer add its own special tokens
        # would put a second BOS token at the start of every sample.
        add_special_tokens=False,
        return_tensors=None
    )
    # Build labels from the attention mask rather than from the pad token id:
    # the pad token is aliased to EOS in this script, so masking by id would
    # also hide the genuine end-of-turn tokens.
    result["labels"] = [
        [token if attended else -100 for token, attended in zip(ids, mask)]
        for ids, mask in zip(result["input_ids"], result["attention_mask"])
    ]
    return result

print("\nüîÑ Tokenizing dataset...")
tokenized_dataset = train_dataset.map(
    tokenize_function,
    batched=True,
    remove_columns=train_dataset.column_names,
    desc="Tokenizing"
)
print("‚úÖ Dataset tokenized")

# ============================================================
#  STEP 9 — Create Validation Split
# ============================================================
from sklearn.model_selection import train_test_split

# Split dataset: 80% training, 20% validation
train_indices, val_indices = train_test_split(
    range(len(tokenized_dataset)),
    test_size=0.2,
    random_state=42
)

train_subset = tokenized_dataset.select(train_indices)
val_subset = tokenized_dataset.select(val_indices)

print(f"\n‚úÖ Dataset split:")
print(f"   Training samples: {len(train_subset)}")
print(f"   Validation samples: {len(val_subset)}")

# ============================================================
#  STEP 10 — Set Up Training Arguments
# ============================================================
output_dir = "/content/drive/MyDrive/Model/ol_history_model"

training_args = TrainingArguments(
    # Output directory
    output_dir=output_dir,

    # Training parameters
    num_train_epochs=3,                      # Number of epochs
    per_device_train_batch_size=1,           # Reduced from 2 to 1
    per_device_eval_batch_size=1,            # Reduced from 2 to 1
    gradient_accumulation_steps=8,           # Increased from 4 to 8

    # Optimizer settings
    learning_rate=2e-4,                      # Learning rate
    weight_decay=0.01,                       # Weight decay
    warmup_steps=50,                         # Warmup steps
    optim="paged_adamw_8bit",               # 8-bit optimizer

    # Evaluation settings
    eval_strategy="epoch",                   # Evaluate after each epoch
    eval_steps=None,                         # Eval every N steps (None = use strategy)
    load_best_model_at_end=True,            # Load best model at end
    metric_for_best_model="eval_loss",      # Metric to track
    eval_accumulation_steps=4,               # Accumulate eval to save memory

    # Logging and saving
    logging_steps=20,                        # Increased from 10
    save_strategy="epoch",                   # Save after each epoch
    save_total_limit=1,                      # Keep only 1 checkpoint (was 2)

    # Performance & Memory optimization
    fp16=True,                               # Mixed precision training
    gradient_checkpointing=True,             # Save memory
    max_grad_norm=0.3,                       # Gradient clipping

    # Other settings
    report_to="none",                        # Don't report to wandb/tensorboard
    remove_unused_columns=False,             # Keep all columns
    dataloader_pin_memory=False,             # Disable pin memory to save RAM
)

print("\n‚úÖ Training arguments configured (Memory optimized)")

# ============================================================
#  STEP 11 — Define Accuracy Metrics
# ============================================================
def compute_metrics(eval_pred):
    """Compute token-level perplexity from evaluation logits and labels.

    Args:
        eval_pred: A ``(predictions, labels)`` pair. ``predictions`` holds the
            model logits with shape (batch, seq_len, vocab), or a tuple whose
            first element is that array; ``labels`` holds the target token ids
            with shape (batch, seq_len), using -100 for ignored positions.

    Returns:
        ``{"perplexity": float}`` where perplexity is ``exp`` of the mean
        next-token cross-entropy over all non-ignored positions, or NaN when
        no position carries a label.
    """
    predictions, labels = eval_pred
    if isinstance(predictions, tuple):
        predictions = predictions[0]

    total_nll = 0.0
    total_tokens = 0
    # Work one sequence at a time so the temporaries stay small even though
    # the full logits array can be very large.
    for seq_logits, seq_labels in zip(predictions, labels):
        # Causal LM: the logits at position t predict the token at t + 1.
        seq_logits = np.asarray(seq_logits, dtype=np.float32)[:-1]
        targets = np.asarray(seq_labels)[1:]

        keep = targets != -100
        if not keep.any():
            continue
        seq_logits = seq_logits[keep]
        targets = targets[keep]

        # Numerically stable log-sum-exp over the vocabulary axis.
        peak = seq_logits.max(axis=-1, keepdims=True)
        log_norm = peak[:, 0] + np.log(np.exp(seq_logits - peak).sum(axis=-1))
        target_logits = seq_logits[np.arange(len(targets)), targets]

        total_nll += float((log_norm - target_logits).sum())
        total_tokens += int(keep.sum())

    if total_tokens == 0:
        return {"perplexity": float("nan")}

    return {
        "perplexity": float(np.exp(total_nll / total_tokens)),
    }

print("‚úÖ Metrics function defined")

# ============================================================
#  STEP 12 — Initialize Trainer with Metrics
# ============================================================
def _attention_masked_lm_collator(features):
    """Collate tokenized samples into a causal-LM batch.

    Labels are a copy of ``input_ids`` with padded positions (attention mask
    of 0) set to -100. Masking by attention mask instead of by pad-token id
    matters here: the pad token is aliased to the EOS token earlier in this
    script, so a collator that masks every pad-id label would also remove
    each real ``<|eot_id|>`` from the loss and the model would never be
    trained to end its answer.
    """
    batch = tokenizer.pad(
        [
            {"input_ids": f["input_ids"], "attention_mask": f["attention_mask"]}
            for f in features
        ],
        return_tensors="pt",
    )
    labels = batch["input_ids"].clone()
    labels[batch["attention_mask"] == 0] = -100
    batch["labels"] = labels
    return batch


# We're doing causal LM, not masked LM
data_collator = _attention_masked_lm_collator

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_subset,
    eval_dataset=val_subset,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

print("‚úÖ Trainer initialized with validation dataset")

# ============================================================
#  STEP 13 — Train the Model with Evaluation
# ============================================================

# Clear memory before training
import gc
gc.collect()
torch.cuda.empty_cache()
print("‚úÖ Memory cleared before training")

print("\n" + "="*60)
print("üöÄ STARTING TRAINING WITH VALIDATION")
print("="*60)
print(f"Training samples: {len(train_subset)}")
print(f"Validation samples: {len(val_subset)}")
print(f"Epochs: {training_args.num_train_epochs}")
print(f"Batch size: {training_args.per_device_train_batch_size}")
print(f"Gradient accumulation: {training_args.gradient_accumulation_steps}")
print(f"Effective batch size: {training_args.per_device_train_batch_size * training_args.gradient_accumulation_steps}")
print("="*60 + "\n")

# Start training
try:
    train_result = trainer.train()
except RuntimeError as e:
    # A CUDA out-of-memory failure is detected by its message text. Print
    # actionable hints for that case; every RuntimeError is re-raised.
    if "out of memory" in str(e):
        print("\n‚ùå GPU Out of Memory Error!")
        print("\nüîß Solutions:")
        print("1. Restart runtime: Runtime ‚Üí Restart runtime")
        print("2. Reduce max_length in tokenize_function (STEP 8) from 512 to 256")
        print("3. Or lower the LoRA rank / target_modules (STEP 7) before re-running")
    raise
else:
    print("\n‚úÖ Training completed!")

# ============================================================
#  STEP 14 — Display Training Results
# ============================================================
def _fmt_metric(value, digits=4):
    """Format a numeric metric to ``digits`` decimals, or 'N/A' if missing."""
    try:
        return f"{float(value):.{digits}f}"
    except (TypeError, ValueError):
        return "N/A"


print("\n" + "="*60)
print("üìä TRAINING RESULTS")
print("="*60)

# Training metrics. `train_loss` is the loss averaged over the whole run, so
# it is labelled as an average to distinguish it from the last logged loss
# printed further down.
metrics = train_result.metrics
print(f"Average Training Loss: {_fmt_metric(metrics.get('train_loss'))}")
print(f"Training Runtime: {_fmt_metric(metrics.get('train_runtime', 0), 2)} seconds")
print(f"Samples per second: {_fmt_metric(metrics.get('train_samples_per_second', 0), 2)}")

# Snapshot the per-epoch validation losses BEFORE the extra evaluate() call
# below; that call appends another eval entry to the log history, which would
# otherwise be listed as a spurious extra epoch.
log_history = getattr(trainer.state, 'log_history', [])
eval_losses = [log['eval_loss'] for log in log_history if 'eval_loss' in log]

# Get final evaluation metrics
print("\nüîç Evaluating on validation set...")
eval_metrics = trainer.evaluate()

print("\n" + "="*60)
print("üìà VALIDATION RESULTS")
print("="*60)
print(f"Validation Loss: {_fmt_metric(eval_metrics.get('eval_loss'))}")
print(f"Perplexity: {_fmt_metric(eval_metrics.get('eval_perplexity'))}")
print(f"Validation Runtime: {_fmt_metric(eval_metrics.get('eval_runtime', 0), 2)} seconds")
print("="*60)

# Calculate improvement metrics
print("\nüìâ TRAINING PROGRESS:")
# Get first and last logged training loss
train_losses = [log['loss'] for log in log_history if 'loss' in log]
if len(train_losses) >= 2:
    initial_loss = train_losses[0]
    final_loss = train_losses[-1]
    print(f"Initial Training Loss: {initial_loss:.4f}")
    print(f"Final Training Loss: {final_loss:.4f}")
    if initial_loss:
        improvement = ((initial_loss - final_loss) / initial_loss) * 100
        print(f"Loss Reduction: {improvement:.2f}%")

# Validation losses recorded during training, one per epoch
if eval_losses:
    print(f"\nüìä Validation Loss per Epoch:")
    for i, loss in enumerate(eval_losses, 1):
        print(f"   Epoch {i}: {loss:.4f}")

print("="*60)

# ============================================================
#  STEP 15 — Save the Fine-tuned Model
# ============================================================
def _fmt_metric(value, digits=4):
    """Format a numeric metric to ``digits`` decimals, or 'N/A' if missing."""
    try:
        return f"{float(value):.{digits}f}"
    except (TypeError, ValueError):
        return "N/A"


# The LoRA adapter and tokenizer are written next to the training checkpoints.
final_model_path = f"{output_dir}/final_lora_model"

print(f"\nüîÑ Saving model to {final_model_path}...")
model.save_pretrained(final_model_path)
tokenizer.save_pretrained(final_model_path)

# Save training metrics
metrics_path = f"{output_dir}/training_metrics.json"
all_metrics = {
    "train_loss": metrics.get('train_loss'),
    "train_runtime": metrics.get('train_runtime'),
    "train_samples_per_second": metrics.get('train_samples_per_second'),
    "eval_loss": eval_metrics.get('eval_loss'),
    "eval_perplexity": eval_metrics.get('eval_perplexity'),
    "eval_runtime": eval_metrics.get('eval_runtime'),
    "num_train_samples": len(train_subset),
    "num_val_samples": len(val_subset),
    "num_epochs": training_args.num_train_epochs,
}

import json
with open(metrics_path, 'w', encoding='utf-8') as f:
    json.dump(all_metrics, f, indent=2)

print(f"‚úÖ Metrics saved to {metrics_path}")

# Summary. Missing metrics are shown as 'N/A' instead of raising: applying a
# float format spec to the string fallback would fail with ValueError.
print("\n" + "="*60)
print("‚úÖ MODEL TRAINING COMPLETE!")
print("="*60)
print(f"Model saved at: {final_model_path}")
print(f"Training Loss: {_fmt_metric(metrics.get('train_loss'))}")
print(f"Validation Loss: {_fmt_metric(eval_metrics.get('eval_loss'))}")
print(f"Perplexity: {_fmt_metric(eval_metrics.get('eval_perplexity'))}")
print("\nYou can now use this model for evaluation.")
print("Next step: Run the evaluation code to test student answers.")
print("="*60)

[2K   [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m59.4/59.4 MB[0m [31m45.8 MB/s[0m eta [36m0:00:00[0m
[?25hMounted at /content/drive
‚úÖ Memory cleared
‚úÖ All libraries imported successfully

üîê HUGGING FACE AUTHENTICATION REQUIRED
Llama 3.1 is a gated model. You need to:
1. Go to: https://huggingface.co/meta-llama/Llama-3.1-8B-Instruct
2. Click 'Request Access' and accept the terms
3. Create a token at: https://huggingface.co/settings/tokens
4. Enter your token below

Enter your Hugging Face token (input will be hidden): ¬∑¬∑¬∑¬∑¬∑¬∑¬∑¬∑¬∑¬∑
‚úÖ Successfully authenticated with Hugging Face!
‚úÖ Loaded 806 questions from dataset
‚úÖ Columns: ['question', 'answer']

First 5 rows:
                                            question  \
0  State two ways in which inheritance to the thr...   
1  Describe two duties which the ancient kings co...   
2  Explain with reference to three points t

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/55.4k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.09M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/296 [00:00<?, ?B/s]

‚úÖ Tokenizer loaded

üîÑ Loading model with 4-bit quantization...


config.json:   0%|          | 0.00/855 [00:00<?, ?B/s]

`torch_dtype` is deprecated! Use `dtype` instead!


model.safetensors.index.json:   0%|          | 0.00/23.9k [00:00<?, ?B/s]

Fetching 4 files:   0%|          | 0/4 [00:00<?, ?it/s]

model-00004-of-00004.safetensors:   0%|          | 0.00/1.17G [00:00<?, ?B/s]

model-00003-of-00004.safetensors:   0%|          | 0.00/4.92G [00:00<?, ?B/s]

model-00002-of-00004.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

model-00001-of-00004.safetensors:   0%|          | 0.00/4.98G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/184 [00:00<?, ?B/s]

‚úÖ Model loaded successfully

‚úÖ LoRA configuration applied
trainable params: 41,943,040 || all params: 8,072,204,288 || trainable%: 0.5196

üîÑ Tokenizing dataset...


Tokenizing:   0%|          | 0/806 [00:00<?, ? examples/s]

‚úÖ Dataset tokenized

‚úÖ Dataset split:
   Training samples: 644
   Validation samples: 162

‚úÖ Training arguments configured (Memory optimized)
‚úÖ Metrics function defined
‚úÖ Trainer initialized with validation dataset
‚úÖ Memory cleared before training

üöÄ STARTING TRAINING WITH VALIDATION
Training samples: 644
Validation samples: 162
Epochs: 3
Batch size: 1
Gradient accumulation: 8
Effective batch size: 8



`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`.


Epoch,Training Loss,Validation Loss,Perplexity
1,1.5054,1.34451,1.089545
2,1.1944,1.282621,1.208353
3,0.734,1.369885,1.258573



‚úÖ Training completed!

üìä TRAINING RESULTS
Final Training Loss: 1.3205
Training Runtime: 2303.87 seconds
Samples per second: 0.84

üîç Evaluating on validation set...



üìà VALIDATION RESULTS
Validation Loss: 1.2826
Perplexity: 1.2084
Validation Runtime: 376.27 seconds

üìâ TRAINING PROGRESS:
Initial Training Loss: 2.8629
Final Training Loss: 0.7340
Loss Reduction: 74.36%

üìä Validation Loss per Epoch:
   Epoch 1: 1.3445
   Epoch 2: 1.2826
   Epoch 3: 1.3699
   Epoch 4: 1.2826

üîÑ Saving model to /content/drive/MyDrive/Model/ol_history_model/final_lora_model...
‚úÖ Metrics saved to /content/drive/MyDrive/Model/ol_history_model/training_metrics.json

‚úÖ MODEL TRAINING COMPLETE!
Model saved at: /content/drive/MyDrive/Model/ol_history_model/final_lora_model
Training Loss: 1.3205
Validation Loss: 1.2826
Perplexity: 1.2084

You can now use this model for evaluation.
Next step: Run the evaluation code to test student answers.


In [None]:
# ============================================================
#  O/L HISTORY ANSWER EVALUATION SYSTEM - COMPLETE & FIXED
#  Model: LLaMA 3.1 + LoRA
#  Evaluation: Multi-Metric Scoring
# ============================================================

# =======================
# STEP 1 — Install libs
# =======================
!pip install -q sentence-transformers transformers bitsandbytes peft accelerate huggingface_hub scikit-learn

# =======================
# STEP 2 — Mount Drive
# =======================
from google.colab import drive
drive.mount('/content/drive')

# =======================
# STEP 3 — Hugging Face Login
# =======================
from huggingface_hub import login
from getpass import getpass

print("\n" + "="*60)
print("üîê HUGGING FACE AUTHENTICATION")
print("="*60)
print("Enter your Hugging Face token for model access")
print("="*60 + "\n")

hf_token = getpass("Enter your Hugging Face token: ")

try:
    login(token=hf_token, add_to_git_credential=True)
    print("‚úÖ Successfully authenticated with Hugging Face!")
except Exception as e:
    print(f"‚ùå Authentication failed: {str(e)}")
    raise

# =======================
# STEP 4 — Imports
# =======================
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
from peft import PeftModel
from sentence_transformers import SentenceTransformer, util
from sklearn.feature_extraction.text import TfidfVectorizer
import pandas as pd

print("‚úÖ All libraries imported")

# =======================
# STEP 5 — Model Paths
# =======================
BASE_MODEL = "meta-llama/Llama-3.1-8B-Instruct"
LORA_MODEL_PATH = "/content/drive/MyDrive/Model/ol_history_model/final_lora_model"

# =======================
# STEP 6 — Quantization Config
# =======================
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_compute_dtype=torch.float16,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4"
)

# =======================
# STEP 7 — Load Tokenizer & Base Model
# =======================
print("\nüîÑ Loading tokenizer...")
tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL)

if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token
    tokenizer.pad_token_id = tokenizer.eos_token_id

print("‚úÖ Tokenizer loaded")

print("\nüîÑ Loading base model with 4-bit quantization...")
base_model = AutoModelForCausalLM.from_pretrained(
    BASE_MODEL,
    quantization_config=bnb_config,
    device_map="auto",
    torch_dtype=torch.float16
)
print("‚úÖ Base model loaded")

# =======================
# STEP 8 — Load LoRA Weights
# =======================
print("\nüîÑ Loading fine-tuned LoRA weights...")
model = PeftModel.from_pretrained(base_model, LORA_MODEL_PATH)
model.eval()
print("‚úÖ Fine-tuned O/L History Model Loaded Successfully")

# =======================
# STEP 9 — Load Semantic Similarity Model
# =======================
print("\nüîÑ Loading semantic similarity model...")
sbert = SentenceTransformer("all-MiniLM-L6-v2")
print("‚úÖ Semantic Similarity Model Loaded")

# =======================
# STEP 10 — Load Dataset (Optional)
# =======================
try:
    dataset_path = "/content/drive/MyDrive/Model/Final.xlsx"
    df_dataset = pd.read_excel(dataset_path)
    df_dataset.columns = df_dataset.columns.str.strip().str.lower()
    print(f"‚úÖ Reference dataset loaded: {len(df_dataset)} questions")
except Exception as e:
    print(f"‚ö†Ô∏è Could not load reference dataset: {e}")
    df_dataset = None

# =======================
# STEP 11 — Generate Correct Answer
# =======================
def _tidy_generated_answer(text, max_sentences=5, min_chars=10):
    """Drop repeated or very short sentences and cap the answer length.

    ``text`` is split on '.', each piece is stripped, and a piece is kept only
    if it is longer than ``min_chars`` and has not been seen before. At most
    ``max_sentences`` pieces are kept; they are re-joined with '. ' and a
    final '.' is appended. Returns '' when nothing survives.
    """
    kept = []
    seen = set()
    for sentence in text.split('.'):
        sentence = sentence.strip()
        if sentence and sentence not in seen and len(sentence) > min_chars:
            kept.append(sentence)
            seen.add(sentence)
            if len(kept) >= max_sentences:
                break

    answer = '. '.join(kept)
    if answer and not answer.endswith('.'):
        answer += '.'
    return answer


def generate_correct_answer(question):
    """Generate a model answer for ``question`` using the fine-tuned LLaMA.

    Builds the same system/user chat prompt used during fine-tuning, samples
    up to 150 new tokens, and returns the cleaned-up answer text (deduplicated
    sentences, at most five, ending with a period).

    NOTE(review): decoding uses sampling (do_sample=True), so the reference
    answer, and therefore the score derived from it, can differ between
    calls for the same question. Confirm this is intended for grading.
    """
    prompt = f"""<|begin_of_text|><|start_header_id|>system<|end_header_id|>

You are an expert in Sri Lankan O/L History. Provide accurate and concise answers to history questions.<|eot_id|><|start_header_id|>user<|end_header_id|>

{question}<|eot_id|><|start_header_id|>assistant<|end_header_id|>

"""

    # The prompt already starts with <|begin_of_text|>; disable automatic
    # special tokens so the model does not see a duplicated BOS token.
    inputs = tokenizer(
        prompt, return_tensors="pt", add_special_tokens=False
    ).to(model.device)

    with torch.no_grad():
        output = model.generate(
            **inputs,
            max_new_tokens=150,
            min_new_tokens=20,
            temperature=0.7,
            top_p=0.9,
            top_k=50,
            do_sample=True,
            repetition_penalty=1.2,
            no_repeat_ngram_size=3,
            pad_token_id=tokenizer.eos_token_id,
            eos_token_id=tokenizer.eos_token_id
        )

    # Decode only the newly generated tokens. Decoding the whole sequence and
    # splitting on the word "assistant" would cut off any answer that itself
    # contains that word.
    prompt_length = inputs["input_ids"].shape[1]
    answer = tokenizer.decode(
        output[0][prompt_length:], skip_special_tokens=True
    ).strip()

    # Remove repetitive patterns
    return _tidy_generated_answer(answer)

# =======================
# STEP 12 — Basic Scoring Functions
# =======================
def semantic_similarity(correct, student):
    """Return the SBERT cosine similarity of two texts as a percentage.

    Both texts are embedded with the module-level ``sbert`` model; the cosine
    similarity is multiplied by 100 and rounded to two decimals.
    """
    reference_vec, student_vec = (
        sbert.encode(text, convert_to_tensor=True) for text in (correct, student)
    )
    cosine = util.cos_sim(reference_vec, student_vec)[0][0]
    return round(float(cosine) * 100, 2)

def keyword_overlap_score(correct, student):
    """Return the share of reference keywords that the student also used.

    Both texts are vectorized together with a TF-IDF vectorizer (English stop
    words removed, at most 20 features). A keyword of a text is any feature
    with a non-zero weight in that text.

    Returns:
        Percentage (0-100, two decimals) of the reference answer's keywords
        that also appear in the student answer. 0.0 is returned when either
        input is not a string, when no vocabulary can be built, or when the
        reference answer has no keywords.
    """
    if not isinstance(correct, str) or not isinstance(student, str):
        return 0.0

    vectorizer = TfidfVectorizer(stop_words='english', max_features=20)
    try:
        tfidf_matrix = vectorizer.fit_transform([correct, student])
    except ValueError:
        # Raised when the vocabulary is empty, e.g. both texts contain only
        # stop words. Only this case is swallowed; anything else propagates.
        return 0.0

    # Column indices with non-zero weight identify the keywords of each text.
    correct_keywords = set(tfidf_matrix[0].nonzero()[1])
    student_keywords = set(tfidf_matrix[1].nonzero()[1])

    if not correct_keywords:
        return 0.0

    overlap = len(correct_keywords & student_keywords)
    return round((overlap / len(correct_keywords)) * 100, 2)

def length_penalty(correct, student):
    """Return a multiplier reflecting how the answer length compares.

    The ratio of student word count to reference word count decides the
    factor: 1.0 for a ratio within [0.5, 1.5], 0.8 below 0.5 and 0.9 above
    1.5. An empty reference is treated as ratio 0.
    """
    reference_len = len(correct.split())
    answer_len = len(student.split())

    if reference_len > 0:
        ratio = answer_len / reference_len
    else:
        ratio = 0

    # Guard clauses for the two out-of-range cases; in-range falls through.
    if ratio < 0.5:
        return 0.8
    if ratio > 1.5:
        return 0.9
    return 1.0

# =======================
# STEP 12.5 — Historical Error Detection
# =======================
# Maps a suspicious term in the answer to the question keywords that make it
# count as an error. A term only counts when at least one of its keywords
# occurs in the question.
HISTORICAL_ERRORS = {
    # Anachronistic terms (things that didn't exist in that era)
    "british": ["parakramabahu", "ancient", "medieval", "polonnaruwa"],
    "factory": ["parakramabahu", "ancient", "medieval"],
    "industrial": ["parakramabahu", "ancient", "medieval"],
    "technology": ["ancient"],

    # Contradictory statements
    "ignored agriculture": ["parakramabahu", "irrigation"],
    "no contribution": ["parakramabahu", "development"],
    "little impact": ["parakramabahu", "king"],
    "traveling abroad": ["parakramabahu"],
    "did not": ["contribution", "development"],
    "failed": ["king", "successful"],
}

def detect_historical_errors(question, student_answer):
    """Return a penalty factor for suspected factual errors in an answer.

    Matching is case-insensitive substring search. Two sources are counted:
    entries of ``HISTORICAL_ERRORS`` whose term appears in the answer while
    one of its context keywords appears in the question, and a fixed list of
    negative phrases that count whenever they appear in the answer.

    Returns 1.0 for no hits, 0.7 for one, 0.4 for two or three, and 0.2 for
    four or more.
    """
    question_text = question.lower()
    answer_text = student_answer.lower()

    # Context-dependent terms: only an error for matching questions.
    contextual_hits = sum(
        1
        for term, contexts in HISTORICAL_ERRORS.items()
        if term in answer_text and any(ctx in question_text for ctx in contexts)
    )

    # Negative statements that count regardless of the question.
    blanket_phrases = (
        "did not contribute",
        "ignored",
        "little impact",
        "no impact",
        "failed to",
        "mainly known for wars",
    )
    blanket_hits = sum(1 for phrase in blanket_phrases if phrase in answer_text)

    hits = contextual_hits + blanket_hits

    # Severity bands, checked from most to least severe.
    for minimum_hits, factor in ((4, 0.2), (2, 0.4), (1, 0.7)):
        if hits >= minimum_hits:
            return factor
    return 1.0

def calculate_final_score(question, correct, student):
    """Combine the individual metrics into one score for a student answer.

    The semantic similarity (weight 0.6) and keyword overlap (weight 0.3) are
    summed, then multiplied by the length factor and the historical-error
    penalty.

    Returns:
        Tuple ``(final_score, semantic_score, keyword_score, error_penalty)``
        with ``final_score`` rounded to two decimals.

    NOTE(review): the two weights add up to 0.9, so the score tops out at 90
    even for a perfect match. Confirm this is intended.
    """
    semantic_pct = semantic_similarity(correct, student)
    keyword_pct = keyword_overlap_score(correct, student)
    length_factor = length_penalty(correct, student)
    error_factor = detect_historical_errors(question, student)

    weighted = (semantic_pct * 0.6) + (keyword_pct * 0.3)
    combined = weighted * length_factor * error_factor

    return round(combined, 2), semantic_pct, keyword_pct, error_factor

# =======================
# STEP 13 — Feedback Generation
# =======================
def generate_feedback(score):
    """Pick the feedback sentence for a final score.

    Bands are checked from the highest floor down: 60 and above, 55 and
    above, 40 and above, and everything else.
    """
    bands = (
        (60, "Excellent answer! You have clearly understood the historical events and explained them accurately using relevant facts."),
        (55, "Good answer. The key points are correct and well explained. Your understanding of the topic is clear."),
        (40, "Your answer shows some understanding. However, important historical points are missing or not clearly explained."),
    )
    for floor, message in bands:
        if score >= floor:
            return message
    return "The answer does not adequately address the question. Please revise the lesson and include accurate historical facts."

# =======================
# STEP 14 — Main Evaluation Function
# =======================
def evaluate_student_answer(question, student_answer):
    """Grade one student answer and return a report dictionary.

    A reference answer is generated for ``question``, the student answer is
    scored against it, and the score is turned into feedback plus a verdict:
    60 and above passes, 50 and above is partially correct, anything lower
    fails. Progress messages are printed along the way.

    Returns:
        Dict with the question, student answer, verdict, status, the three
        scores, the error penalty as a percentage string, the feedback text
        and the generated model answer.
    """
    rule = "=" * 60
    print("\n" + rule)
    print("üîç EVALUATING ANSWER")
    print(rule)

    print("üìù Generating model answer...")
    correct_answer = generate_correct_answer(question)

    print("üî¢ Calculating similarity scores...")
    scores = calculate_final_score(question, correct_answer, student_answer)
    final_score, semantic_score, keyword_score, error_penalty = scores

    feedback = generate_feedback(final_score)

    # Add error detection info only when a penalty was applied
    error_msg = ""
    if error_penalty < 1.0:
        error_msg = f"\n‚ö†Ô∏è Historical errors detected (penalty: {int((1-error_penalty)*100)}%)"

    # Verdict bands from highest cutoff down; the default is the failing one.
    result, status = "‚ùå Incorrect", "FAIL"
    verdict_bands = (
        (60, "‚úÖ Correct", "PASS"),
        (50, "‚ö†Ô∏è  Partially Correct", "NEEDS IMPROVEMENT"),
    )
    for cutoff, label, outcome in verdict_bands:
        if final_score >= cutoff:
            result, status = label, outcome
            break

    print("‚úÖ Evaluation complete!")

    report = {
        "Question": question,
        "Student Answer": student_answer,
        "Result": result,
        "Status": status,
        "Final Score (%)": final_score,
        "Semantic Similarity (%)": semantic_score,
        "Keyword Match (%)": keyword_score,
        "Error Penalty": f"{int(error_penalty*100)}%",
        "Feedback": feedback + error_msg,
        "Model Answer": correct_answer
    }
    return report

# =======================
# STEP 15 — Display Results
# =======================
def display_results(result):
    """Print a formatted evaluation report to stdout.

    Args:
        result: Mapping that provides the keys ``"Question"``,
            ``"Student Answer"``, ``"Result"``, ``"Status"``,
            ``"Final Score (%)"``, ``"Semantic Similarity (%)"``,
            ``"Keyword Match (%)"``, ``"Error Penalty"``, ``"Feedback"``
            and ``"Model Answer"``.

    Returns:
        None.
    """
    rule = "=" * 60

    # Each entry is (template, keys): the keys are looked up in ``result``
    # only when that line is about to be printed, one line at a time.
    layout = (
        ("\n" + rule, ()),
        ("üìä EVALUATION RESULTS", ()),
        (rule, ()),
        ("\nüìù Question:\n{}", ("Question",)),
        ("\n‚úçÔ∏è  Your Answer:\n{}", ("Student Answer",)),
        ("\n{} - {}", ("Result", "Status")),
        ("\nüìà Scoring Breakdown:", ()),
        ("   ‚Ä¢ Final Score: {}%", ("Final Score (%)",)),
        ("   ‚Ä¢ Semantic Similarity: {}%", ("Semantic Similarity (%)",)),
        ("   ‚Ä¢ Keyword Match: {}%", ("Keyword Match (%)",)),
        ("   ‚Ä¢ Error Penalty Applied: {}", ("Error Penalty",)),
        ("\nüí¨ Feedback:\n{}", ("Feedback",)),
        ("\n‚úÖ Model Answer:\n{}", ("Model Answer",)),
        (rule + "\n", ()),
    )

    for template, keys in layout:
        print(template.format(*(result[key] for key in keys)))

# =======================
# STEP 16 — Interactive Function
# =======================
def run_evaluation():
    """Prompt for a question and an answer, then grade and display them.

    Both values are read with ``input()`` and stripped of surrounding
    whitespace. When both are non-empty, the pair is passed to
    ``evaluate_student_answer`` and the report is shown with
    ``display_results``.

    Returns:
        The result returned by ``evaluate_student_answer``, or ``None`` when
        either input is empty after stripping.
    """
    rule = "=" * 60
    print("\n" + rule)
    print("üìù STUDENT ANSWER EVALUATION")
    print(rule)

    question = input("\nüìö Enter the question: ").strip()
    student_answer = input("\n‚úçÔ∏è  Enter the student's answer: ").strip()

    if question and student_answer:
        result = evaluate_student_answer(question, student_answer)
        display_results(result)
        return result

    # Either field was blank: report it and hand back nothing.
    print("‚ùå Question and answer cannot be empty!")
    return None

# =======================
# SYSTEM READY
# =======================
# Announce that the evaluation helpers are defined and show how to call them.
# One joined print emits the same seven lines as separate print() calls.
print("\n".join([
    "\n" + "=" * 60,
    "üéì O/L HISTORY EVALUATION SYSTEM READY!",
    "=" * 60,
    "\nüìö Usage:",
    "  ‚Ä¢ run_evaluation()  - Interactive mode",
    "  ‚Ä¢ evaluate_student_answer(q, a)  - Direct evaluation",
    "=" * 60,
]))

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).

üîê HUGGING FACE AUTHENTICATION
Enter your Hugging Face token for model access

Enter your Hugging Face token: ¬∑¬∑¬∑¬∑¬∑¬∑¬∑¬∑¬∑¬∑
‚úÖ Successfully authenticated with Hugging Face!
‚úÖ All libraries imported

üîÑ Loading tokenizer...
‚úÖ Tokenizer loaded

üîÑ Loading base model with 4-bit quantization...


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

‚úÖ Base model loaded

üîÑ Loading fine-tuned LoRA weights...
‚úÖ Fine-tuned O/L History Model Loaded Successfully

üîÑ Loading semantic similarity model...
‚úÖ Semantic Similarity Model Loaded
‚úÖ Reference dataset loaded: 806 questions

üéì O/L HISTORY EVALUATION SYSTEM READY!

üìö Usage:
  ‚Ä¢ run_evaluation()  - Interactive mode
  ‚Ä¢ evaluate_student_answer(q, a)  - Direct evaluation


In [None]:
# =======================
# STEP 11 — INTERACTIVE MODE
# =======================
print("\nüéì O/L HISTORY ANSWER EVALUATION SYSTEM")
question = input("\nüìö Enter the question: ").strip()
student_answer = input("\n‚úçÔ∏è Enter the student's answer: ").strip()

if question and student_answer:
    result = evaluate_student_answer(question, student_answer)
    display_results(result)
else:
    print("‚ùå Question and answer cannot be empty")


üéì O/L HISTORY ANSWER EVALUATION SYSTEM

üìö Enter the question: Explain the contributions of King Parakramabahu I to the development of Sri Lanka.

‚úçÔ∏è Enter the student's answer: King Parakramabahu I was mainly known for leading wars against the British and building large factories for trade. He ignored agriculture and did not contribute to religion or education. Most of his reign was spent traveling abroad and he had little impact on the development of Sri Lanka.

üîç EVALUATING ANSWER
üìù Generating model answer...
üî¢ Calculating similarity scores...
‚úÖ Evaluation complete!

üìä EVALUATION RESULTS

üìù Question:
Explain the contributions of King Parakramabahu I to the development of Sri Lanka.

‚úçÔ∏è  Your Answer:
King Parakramabahu I was mainly known for leading wars against the British and building large factories for trade. He ignored agriculture and did not contribute to religion or education. Most of his reign was spent traveling abroad and he had little impact 