In [1]:
!pip install -q transformers
!pip install -q datasets
!pip install -q peft
!pip install -q trl
!pip install -q bitsandbytes
!pip install -q accelerate
!pip install -q torch torchvision torchaudio


# Cell 1: Import all required libraries
import torch
import json
import pandas as pd
import os
import gc
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    TrainingArguments,
    Trainer,
    DataCollatorForLanguageModeling
)
from datasets import Dataset
from peft import LoraConfig, get_peft_model, TaskType
from huggingface_hub import login


[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m363.4/363.4 MB[0m [31m3.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.8/13.8 MB[0m [31m27.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.6/24.6 MB[0m [31m24.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m883.7/883.7 kB[0m [31m23.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m664.8/664.8 MB[0m [31m2.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m211.5/211.5 MB[0m [31m5.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m56.3/56.3 MB[0m [31m9.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m127.9/127.9 MB[0m [31m7.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [2]:
# Cell 2: Check GPU and authenticate with Hugging Face
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

if torch.cuda.is_available():
    print(f"GPU: {torch.cuda.get_device_name(0)}")
    print(f"GPU Memory: {torch.cuda.get_device_properties(0).total_memory / 1024**3:.1f} GB")
else:
    print(" No GPU detected! Make sure to enable GPU in Runtime > Change runtime type")

# Safe authentication
print("\n Authenticating with Hugging Face...")
login()
print(" Authentication successful!")

Using device: cuda
GPU: Tesla T4
GPU Memory: 14.7 GB

 Authenticating with Hugging Face...
 Authentication successful!


In [None]:
# Cell 3: Load your training data (make sure nlu_train.jsonl is uploaded first!)
def load_training_data(file_path):
    """Load training data from JSONL file"""
    data = []
    with open(file_path, 'r') as f:
        for line in f:
            data.append(json.loads(line.strip()))

    print(f"Total training examples: {len(data)}")
    print("\nFirst example:")
    print(f"Prompt: {data[0]['prompt']}")
    print(f"Completion: {data[0]['completion']}")

    return data

# Load your data
data = load_training_data('nlu_train.jsonl')
print(f" Data loaded successfully! {len(data)} examples ready for training")

Total training examples: 918

First example:
Prompt: Go to the Lenovo lab and bring me a pen.
Completion: {"operations": ["navigate (go)", "grasp (bring)", "deliver (bring)"], "objects": ["Lenovo lab", "pen"]}
✅ Data loaded successfully! 918 examples ready for training


In [None]:
# Cell 4: Load Mistral model and tokenizer
model_name = "mistralai/Mistral-7B-v0.1"

print("Loading tokenizer...")
tokenizer = AutoTokenizer.from_pretrained(model_name)
tokenizer.pad_token = tokenizer.eos_token

print("Loading model...")
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype=torch.float16,
    device_map="auto",
    load_in_8bit=True
)

print(" Model and tokenizer loaded successfully!")
print(f"Model parameters: {model.num_parameters():,}")
print(f"Tokenizer vocab size: {tokenizer.vocab_size:,}")

Loading tokenizer...


tokenizer_config.json:   0%|          | 0.00/996 [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/493k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.80M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/414 [00:00<?, ?B/s]

Loading model...


config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

The `load_in_4bit` and `load_in_8bit` arguments are deprecated and will be removed in the future versions. Please, pass a `BitsAndBytesConfig` object in `quantization_config` argument instead.


model.safetensors.index.json:   0%|          | 0.00/25.1k [00:00<?, ?B/s]

Fetching 2 files:   0%|          | 0/2 [00:00<?, ?it/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/4.54G [00:00<?, ?B/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/9.94G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

✅ Model and tokenizer loaded successfully!
Model parameters: 7,241,732,096
Tokenizer vocab size: 32,000


In [None]:
# Cell 5: Prepare training dataset
def format_training_example(example):
    """Format each training example for instruction following"""
    prompt = example['prompt']
    completion = example['completion']
    formatted_text = f"<s>[INST] Extract operations and objects from this instruction: {prompt} [/INST] {completion}</s>"
    return formatted_text

def prepare_dataset(data):
    """Convert data to HuggingFace dataset format"""
    formatted_texts = [format_training_example(example) for example in data]
    dataset = Dataset.from_dict({"text": formatted_texts})

    def tokenize_function(examples):
        return tokenizer(
            examples["text"],
            truncation=True,
            padding=False,
            max_length=128,
            return_tensors=None
        )

    tokenized_dataset = dataset.map(
        tokenize_function,
        batched=True,
        remove_columns=dataset.column_names
    )
    return tokenized_dataset

print("Preparing training dataset...")
train_dataset = prepare_dataset(data)

print(f" Dataset prepared!")
print(f"Number of training examples: {len(train_dataset)}")
print(f"Sample token length: {len(train_dataset[0]['input_ids'])}")

Preparing training dataset...


Map:   0%|          | 0/918 [00:00<?, ? examples/s]

✅ Dataset prepared!
Number of training examples: 918
Sample token length: 67


In [None]:
# Cell 6: Setup LoRA configuration (FIXED for gradient error)
# Clear memory first
torch.cuda.empty_cache()
gc.collect()

# LoRA configuration
lora_config = LoraConfig(
    task_type=TaskType.CAUSAL_LM,
    inference_mode=False,
    r=8,
    lora_alpha=16,
    lora_dropout=0.1,
    target_modules=["q_proj", "v_proj", "k_proj", "o_proj"],
    bias="none"
)

print("Applying LoRA configuration...")
model = get_peft_model(model, lora_config)

# CRITICAL: Properly enable training mode and gradients
model.train()

# Force enable gradients for all LoRA parameters
print("Enabling gradients for LoRA parameters...")
for name, param in model.named_parameters():
    if param.requires_grad:
        param.requires_grad_(True)
        # Clear any existing gradients
        param.grad = None

# Additional verification and fixes
if hasattr(model, 'enable_input_require_grads'):
    model.enable_input_require_grads()

# Verify gradients are working
test_input = torch.randint(0, 1000, (1, 10)).to(model.device)
try:
    output = model(test_input)
    loss = output.logits.mean()
    loss.backward()
    print(" Gradient test passed!")

    # Clear test gradients
    model.zero_grad()
except Exception as e:
    print(f" Gradient test failed: {e}")

    # Alternative fix - reload model
    print("Trying alternative approach...")
    model = AutoModelForCausalLM.from_pretrained(
        model_name,
        torch_dtype=torch.float16,
        device_map="auto",
        load_in_8bit=True
    )
    model = get_peft_model(model, lora_config)
    model.train()

# Print trainable parameters
model.print_trainable_parameters()

# Final verification
trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
print(f"\n LoRA setup complete!")
print(f"Trainable parameters: {trainable_params:,}")
print(f"Model in training mode: {model.training}")

if trainable_params == 0:
    print(" ERROR: No trainable parameters! Something went wrong.")
else:
    print(" Ready for training!")

Applying LoRA configuration...
Enabling gradients for LoRA parameters...
✅ Gradient test passed!
trainable params: 6,815,744 || all params: 7,248,547,840 || trainable%: 0.0940

✅ LoRA setup complete!
Trainable parameters: 6,815,744
Model in training mode: True
✅ Ready for training!


In [None]:
# Cell 7: Setup training arguments and Google Drive (FIXED for wandb)
from google.colab import drive
drive.mount('/content/drive')

# Create checkpoint directory
checkpoint_dir = "/content/drive/MyDrive/mistral_nlu_finetuning"
os.makedirs(checkpoint_dir, exist_ok=True)
print(f" Created checkpoint directory: {checkpoint_dir}")

# Disable wandb completely
import os
os.environ["WANDB_DISABLED"] = "true"

# Training arguments - FIXED for stability and wandb
training_args = TrainingArguments(
    output_dir=checkpoint_dir,
    num_train_epochs=3,
    per_device_train_batch_size=1,  # Reduced for stability
    gradient_accumulation_steps=8,  # Increased to maintain effective batch size
    learning_rate=2e-4,
    weight_decay=0.01,
    logging_steps=10,
    save_steps=50,  # More frequent saves
    save_total_limit=2,
    prediction_loss_only=True,
    remove_unused_columns=False,
    push_to_hub=False,

    # FIXED: Disable all external logging
    report_to=[],  # Disable wandb, tensorboard, etc.
    logging_dir=None,  # No logging directory

    # FIXED: More stable mixed precision settings
    fp16=False,  # Disable FP16 to avoid scaling issues
    bf16=False,  # Disable BF16 as well
    dataloader_pin_memory=False,
    gradient_checkpointing=True,
    max_grad_norm=1.0,
    warmup_steps=25,
    seed=42,

    # Additional stability settings
    dataloader_drop_last=False,
    ignore_data_skip=True,
    ddp_find_unused_parameters=False,
)

print(" Training arguments configured for stability!")
print(" Wandb logging disabled!")
print(f"Effective batch size: {training_args.per_device_train_batch_size * training_args.gradient_accumulation_steps}")
print(f"Total training steps: ~{len(train_dataset) // 8 * 3}")  # Rough estimate

Mounted at /content/drive
✅ Created checkpoint directory: /content/drive/MyDrive/mistral_nlu_finetuning
✅ Training arguments configured for stability!
✅ Wandb logging disabled!
Effective batch size: 8
Total training steps: ~342


In [None]:
# Cell 8: Initialize trainer
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=False,
    pad_to_multiple_of=8,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    data_collator=data_collator,
    tokenizer=tokenizer,
)

print("Trainer initialized successfully!")
print(f" Dataset size: {len(train_dataset)} examples")
print(f" Epochs: {training_args.num_train_epochs}")
print(f" Learning rate: {training_args.learning_rate}")
print(f" Checkpoints will save to: {training_args.output_dir}")

# Check GPU memory
if torch.cuda.is_available():
    print(f"\n  GPU Memory before training:")
    print(f"   Allocated: {torch.cuda.memory_allocated()/1024**3:.2f} GB")
    print(f"   Reserved: {torch.cuda.memory_reserved()/1024**3:.2f} GB")

print("\n Ready to start training!")

  trainer = Trainer(
No label_names provided for model class `PeftModelForCausalLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


🚀 Trainer initialized successfully!
📊 Dataset size: 918 examples
⏰ Epochs: 3
🎯 Learning rate: 0.0002
💾 Checkpoints will save to: /content/drive/MyDrive/mistral_nlu_finetuning

🖥️  GPU Memory before training:
   Allocated: 7.05 GB
   Reserved: 7.64 GB

✅ Ready to start training!


In [None]:
# Cell 9: Start training
print("Starting fine-tuning...")
print("⏰ Estimated time: 30-45 minutes on T4")
print(" Checkpoints will save automatically every 100 steps")
print("\n" + "="*50)
#login(token="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")
login()
# Start training!
trainer.train()

print("\n" + "="*50)
print(" TRAINING COMPLETED!")
print(" Model fine-tuned successfully!")
print(" Checkpoints saved to Google Drive")

🚀 Starting fine-tuning...
⏰ Estimated time: 30-45 minutes on T4
💾 Checkpoints will save automatically every 100 steps



VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

Step,Training Loss
10,2.4268
20,1.489
30,1.0877
40,0.8944
50,0.7506
60,0.69
70,0.536
80,0.448
90,0.3716
100,0.4169


evaluate model

for final model

In [None]:
# Cell 11: Test the fine-tuned model (run AFTER training completes)

import torch
import json
from transformers import AutoTokenizer, AutoModelForCausalLM
from peft import PeftModel
import pandas as pd

def load_fine_tuned_model():
    """Load the fine-tuned model from Google Drive"""
    print(" Loading fine-tuned model...")

    # Load base model
    base_model_name = "mistralai/Mistral-7B-v0.1"
    base_model = AutoModelForCausalLM.from_pretrained(
        base_model_name,
        torch_dtype=torch.float16,
        device_map="auto",
        load_in_8bit=True
    )

    # Load tokenizer
    tokenizer = AutoTokenizer.from_pretrained(base_model_name)
    tokenizer.pad_token = tokenizer.eos_token

    # Load fine-tuned LoRA weights
    model_path = "/content/drive/MyDrive/mistral_nlu_finetuning/final_model"
    model = PeftModel.from_pretrained(base_model, model_path)

    print("Fine-tuned model loaded successfully!")
    return model, tokenizer

def load_test_data():
    """Load test data from test_nlu.jsonl"""
    print(" Loading test data...")

    test_data = []
    with open('test_nlu.jsonl', 'r') as f:
        for line in f:
            test_data.append(json.loads(line.strip()))

    print(f" Loaded {len(test_data)} test examples")
    print(f"First example: {test_data[0]['prompt']}")
    return test_data

def generate_prediction(model, tokenizer, prompt):
    """Generate prediction for a single prompt"""
    # Format prompt like training data
    formatted_prompt = f"<s>[INST] Extract operations and objects from this instruction: {prompt} [/INST]"

    # Tokenize
    inputs = tokenizer(
        formatted_prompt,
        return_tensors="pt",
        truncation=True,
        max_length=128
    ).to(model.device)

    # Generate
    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=100,
            temperature=0.1,  # Low temperature for consistent output
            do_sample=True,
            pad_token_id=tokenizer.eos_token_id,
            eos_token_id=tokenizer.eos_token_id
        )

    # Decode only the new tokens (response)
    response = tokenizer.decode(
        outputs[0][inputs['input_ids'].shape[1]:],
        skip_special_tokens=True
    ).strip()

    return response

def evaluate_model(model, tokenizer, test_data, num_samples=None):
    """Evaluate model on test data"""
    if num_samples is None:
        num_samples = len(test_data)

    num_to_test = min(num_samples, len(test_data))
    print(f" Testing model on {num_to_test} examples...")

    results = []
    correct_format_count = 0
    exact_match_count = 0

    for i, example in enumerate(test_data[:num_to_test]):
        prompt = example['prompt']
        expected = example['completion']

        print(f"\n" + "="*80)
        print(f" EXAMPLE {i+1}/{num_to_test}")
        print(f"="*80)
        print(f" INPUT PROMPT:")
        print(f"   {prompt}")
        print(f"\n EXPECTED OUTPUT:")
        print(f"   {expected}")

        # Generate prediction
        try:
            prediction = generate_prediction(model, tokenizer, prompt)
            print(f"\n MODEL OUTPUT:")
            print(f"   {prediction}")

            # Try to parse both as JSON to check format
            try:
                pred_json = json.loads(prediction)
                expected_json = json.loads(expected)

                print(f"\n PARSED RESULTS:")
                print(f"   Expected Operations: {expected_json.get('operations', [])}")
                print(f"   Predicted Operations: {pred_json.get('operations', [])}")
                print(f"   Expected Objects: {expected_json.get('objects', [])}")
                print(f"   Predicted Objects: {pred_json.get('objects', [])}")

                # Check if structure matches
                has_operations = 'operations' in pred_json
                has_objects = 'objects' in pred_json
                format_correct = has_operations and has_objects

                # Check for exact match
                exact_match = (pred_json == expected_json)

                print(f"\n EVALUATION:")
                if format_correct:
                    correct_format_count += 1
                    print(f"   JSON Format: CORRECT")

                    if exact_match:
                        exact_match_count += 1
                        print(f"   🎯 Exact Match: YES - Perfect!")
                    else:
                        print(f"   📝 Exact Match: NO - But format is correct")

                        # Show differences
                        ops_match = pred_json.get('operations') == expected_json.get('operations')
                        obj_match = pred_json.get('objects') == expected_json.get('objects')
                        print(f"   Operations Match: {'' if ops_match else ''}")
                        print(f"   Objects Match: {'' if obj_match else ''}")
                else:
                    print(f"    JSON Format: INCORRECT - missing operations or objects")

            except json.JSONDecodeError as e:
                format_correct = False
                exact_match = False
                print(f"\n JSON PARSING ERROR:")
                print(f"   {str(e)}")
                print(f"   Raw output cannot be parsed as valid JSON")

        except Exception as e:
            prediction = f"Generation Error: {str(e)}"
            format_correct = False
            exact_match = False
            print(f"\n MODEL GENERATION ERROR:")
            print(f"   {str(e)}")

        results.append({
            'example_id': i+1,
            'prompt': prompt,
            'expected': expected,
            'predicted': prediction,
            'format_correct': format_correct,
            'exact_match': exact_match
        })

    # Summary statistics
    total_tests = len(results)
    format_accuracy = (correct_format_count / total_tests) * 100
    exact_accuracy = (exact_match_count / total_tests) * 100

    print(f"\n" + "="*80)
    print(f" FINAL EVALUATION SUMMARY")
    print(f"="*80)
    print(f" Total tests: {total_tests}")
    print(f" Correct JSON format: {correct_format_count}/{total_tests} ({format_accuracy:.1f}%)")
    print(f" Exact matches: {exact_match_count}/{total_tests} ({exact_accuracy:.1f}%)")
    print(f"="*80)

    return results, {
        'total_tests': total_tests,
        'correct_format': correct_format_count,
        'exact_matches': exact_match_count,
        'format_accuracy': format_accuracy,
        'exact_accuracy': exact_accuracy
    }

# Main execution
if __name__ == "__main__":
    # Load model and tokenizer
    model, tokenizer = load_fine_tuned_model()

    # Load test data (make sure test_nlu.jsonl is uploaded!)
    test_data = load_test_data()

    # Evaluate model on all test examples
    print(f"Starting evaluation on all {len(test_data)} test examples...")
    results, summary = evaluate_model(model, tokenizer, test_data)

    # Save detailed results
    results_df = pd.DataFrame(results)
    results_path = '/content/drive/MyDrive/mistral_nlu_finetuning/test_results.csv'
    results_df.to_csv(results_path, index=False)

    # Save summary
    summary_path = '/content/drive/MyDrive/mistral_nlu_finetuning/evaluation_summary.json'
    with open(summary_path, 'w') as f:
        json.dump(summary, f, indent=2)

    print(f"\n Detailed results saved to: test_results.csv")
    print(f" Summary saved to: evaluation_summary.json")
    print(f"\n FINAL RESULTS:")
    print(f"   Format Accuracy: {summary['format_accuracy']:.1f}%")
    print(f"   Exact Match Accuracy: {summary['exact_accuracy']:.1f}%")

    print("\n Model evaluation complete!")

for check point

In [14]:
# Cell 11: Test the checkpoint-250 model

import torch
import json
from transformers import AutoTokenizer, AutoModelForCausalLM
from peft import PeftModel
import pandas as pd

# Mount Google Drive first
from google.colab import drive
drive.mount('/content/drive')

def load_checkpoint_model(checkpoint_number=250):
    """Load model from specific checkpoint with memory optimization"""
    print(f"Loading model from checkpoint-{checkpoint_number}...")

    # Clear GPU memory first
    import gc
    torch.cuda.empty_cache()
    gc.collect()

    # Load base model with optimized settings for inference
    base_model_name = "mistralai/Mistral-7B-v0.1"

    # Create custom device map for better memory management
    device_map = {
        "model.embed_tokens": "cuda:0",
        "model.layers": "cuda:0",
        "model.norm": "cuda:0",
        "lm_head": "cuda:0"
    }

    base_model = AutoModelForCausalLM.from_pretrained(
        base_model_name,
        torch_dtype=torch.float16,
        device_map=device_map,
        load_in_8bit=True,
        llm_int8_enable_fp32_cpu_offload=True,  # Enable CPU offload
        llm_int8_threshold=6.0,  # Lower threshold for more aggressive quantization
        trust_remote_code=True
    )

    # Load tokenizer
    tokenizer = AutoTokenizer.from_pretrained(base_model_name)
    tokenizer.pad_token = tokenizer.eos_token

    # Load specific checkpoint LoRA weights
    checkpoint_path = f"/content/drive/MyDrive/mistral_nlu_finetuning/checkpoint-{checkpoint_number}"

    # Check if checkpoint exists
    import os
    if not os.path.exists(checkpoint_path):
        print(f"Checkpoint {checkpoint_number} not found!")
        print("Available checkpoints:")
        base_dir = "/content/drive/MyDrive/mistral_nlu_finetuning"
        checkpoints = [d for d in os.listdir(base_dir) if d.startswith('checkpoint-')]
        for cp in sorted(checkpoints):
            print(f"   - {cp}")
        return None, None

    # Load PEFT model with memory optimization
    model = PeftModel.from_pretrained(
        base_model,
        checkpoint_path,
        torch_dtype=torch.float16,
        device_map={"": "cuda:0"}
    )

    # Set to evaluation mode to save memory
    model.eval()

    print(f"Checkpoint-{checkpoint_number} loaded successfully!")
    print(f"Loaded from: {checkpoint_path}")

    # Check memory usage
    if torch.cuda.is_available():
        print(f"GPU Memory after loading:")
        print(f"   Allocated: {torch.cuda.memory_allocated()/1024**3:.2f} GB")
        print(f"   Reserved: {torch.cuda.memory_reserved()/1024**3:.2f} GB")

    return model, tokenizer

def load_test_data():
    """Load test data from test_nlu.jsonl"""
    print("📁 Loading test data...")

    test_data = []
    with open('nlu_test.jsonl', 'r') as f:
        for line in f:
            test_data.append(json.loads(line.strip()))

    print(f"✅ Loaded {len(test_data)} test examples")
    print(f"First example: {test_data[0]['prompt']}")
    return test_data[:5]

def generate_prediction(model, tokenizer, prompt):
    """Generate prediction for a single prompt"""
    # Format prompt like training data
    formatted_prompt = f"<s>[INST] Extract operations and objects from this instruction: {prompt} [/INST]"

    # Tokenize
    inputs = tokenizer(
        formatted_prompt,
        return_tensors="pt",
        truncation=True,
        max_length=128
    ).to(model.device)

    # Generate
    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=100,
            temperature=0.1,  # Low temperature for consistent output
            do_sample=True,
            pad_token_id=tokenizer.eos_token_id,
            eos_token_id=tokenizer.eos_token_id
        )

    # Decode only the new tokens (response)
    response = tokenizer.decode(
        outputs[0][inputs['input_ids'].shape[1]:],
        skip_special_tokens=True
    ).strip()

    return response

def evaluate_model(model, tokenizer, test_data, checkpoint_num, num_samples=None):
    """Evaluate model on test data"""
    if num_samples is None:
        num_samples = len(test_data)

    num_to_test = min(num_samples, len(test_data))
    print(f"🧪 Testing checkpoint-{checkpoint_num} on {num_to_test} examples...")

    results = []
    correct_format_count = 0
    exact_match_count = 0

    for i, example in enumerate(test_data[:num_to_test]):
        prompt = example['prompt']
        expected = example['completion']

        print(f"\n" + "="*80)
        print(f"📝 EXAMPLE {i+1}/{num_to_test} (Checkpoint-{checkpoint_num})")
        print(f"="*80)
        print(f"🎯 INPUT PROMPT:")
        print(f"   {prompt}")
        print(f"\n💭 EXPECTED OUTPUT:")
        print(f"   {expected}")

        # Generate prediction
        try:
            prediction = generate_prediction(model, tokenizer, prompt)
            print(f"\n🤖 MODEL OUTPUT (Checkpoint-{checkpoint_num}):")
            print(f"   {prediction}")

            # Try to parse both as JSON to check format
            try:
                pred_json = json.loads(prediction)
                expected_json = json.loads(expected)

                print(f"\n📊 PARSED RESULTS:")
                print(f"   Expected Operations: {expected_json.get('operations', [])}")
                print(f"   Predicted Operations: {pred_json.get('operations', [])}")
                print(f"   Expected Objects: {expected_json.get('objects', [])}")
                print(f"   Predicted Objects: {pred_json.get('objects', [])}")

                # Check if structure matches
                has_operations = 'operations' in pred_json
                has_objects = 'objects' in pred_json
                format_correct = has_operations and has_objects

                # Check for exact match
                exact_match = (pred_json == expected_json)

                print(f"\n✅ EVALUATION:")
                if format_correct:
                    correct_format_count += 1
                    print(f"   ✅ JSON Format: CORRECT")

                    if exact_match:
                        exact_match_count += 1
                        print(f"   🎯 Exact Match: YES - Perfect!")
                    else:
                        print(f"   📝 Exact Match: NO - But format is correct")

                        # Show differences
                        ops_match = pred_json.get('operations') == expected_json.get('operations')
                        obj_match = pred_json.get('objects') == expected_json.get('objects')
                        print(f"   Operations Match: {'✅' if ops_match else '❌'}")
                        print(f"   Objects Match: {'✅' if obj_match else '❌'}")
                else:
                    print(f"   ❌ JSON Format: INCORRECT - missing operations or objects")

            except json.JSONDecodeError as e:
                format_correct = False
                exact_match = False
                print(f"\n❌ JSON PARSING ERROR:")
                print(f"   {str(e)}")
                print(f"   Raw output cannot be parsed as valid JSON")

        except Exception as e:
            prediction = f"Generation Error: {str(e)}"
            format_correct = False
            exact_match = False
            print(f"\n❌ MODEL GENERATION ERROR:")
            print(f"   {str(e)}")

        results.append({
            'checkpoint': checkpoint_num,
            'example_id': i+1,
            'prompt': prompt,
            'expected': expected,
            'predicted': prediction,
            'format_correct': format_correct,
            'exact_match': exact_match
        })

    # Summary statistics
    total_tests = len(results)
    format_accuracy = (correct_format_count / total_tests) * 100
    exact_accuracy = (exact_match_count / total_tests) * 100

    print(f"\n" + "="*80)
    print(f"📊 CHECKPOINT-{checkpoint_num} EVALUATION SUMMARY")
    print(f"="*80)
    print(f"🔢 Total tests: {total_tests}")
    print(f"✅ Correct JSON format: {correct_format_count}/{total_tests} ({format_accuracy:.1f}%)")
    print(f"🎯 Exact matches: {exact_match_count}/{total_tests} ({exact_accuracy:.1f}%)")
    print(f"="*80)

    return results, {
        'checkpoint': checkpoint_num,
        'total_tests': total_tests,
        'correct_format': correct_format_count,
        'exact_matches': exact_match_count,
        'format_accuracy': format_accuracy,
        'exact_accuracy': exact_accuracy
    }

# Main execution for checkpoint testing
if __name__ == "__main__":
    # Load checkpoint-250 model
    model, tokenizer = load_checkpoint_model(250)

    if model is None:
        print(" Failed to load checkpoint. Please check the checkpoint number.")
    else:
        # Load test data (make sure test_nlu.jsonl is uploaded!)
        test_data = load_test_data()

        # Evaluate checkpoint-250 model on all test examples
        print(f"Starting evaluation on checkpoint-250 with all {len(test_data)} test examples...")
        results, summary = evaluate_model(model, tokenizer, test_data, 250)

        # Save detailed results
        results_df = pd.DataFrame(results)
        results_path = '/content/drive/MyDrive/mistral_nlu_finetuning/checkpoint_250_test_results.csv'
        results_df.to_csv(results_path, index=False)

        # Save summary
        summary_path = '/content/drive/MyDrive/mistral_nlu_finetuning/checkpoint_250_evaluation_summary.json'
        with open(summary_path, 'w') as f:
            json.dump(summary, f, indent=2)

        print(f"\n Detailed results saved to: checkpoint_250_test_results.csv")
        print(f" Summary saved to: checkpoint_250_evaluation_summary.json")
        print(f"\n CHECKPOINT-250 FINAL RESULTS:")
        print(f"   Format Accuracy: {summary['format_accuracy']:.1f}%")
        print(f"   Exact Match Accuracy: {summary['exact_accuracy']:.1f}%")

        print("\n Checkpoint-250 evaluation complete!")

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Loading model from checkpoint-250...


The `load_in_4bit` and `load_in_8bit` arguments are deprecated and will be removed in the future versions. Please, pass a `BitsAndBytesConfig` object in `quantization_config` argument instead.


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Checkpoint 250 not found!
Available checkpoints:
   - checkpoint-200
   - checkpoint-250
 Failed to load checkpoint. Please check the checkpoint number.


In [None]:
import os

# Check contents of your training folder
base_path = "/content/drive/MyDrive/mistral_nlu_finetuning"

print(f"Contents of {base_path}:")
for item in os.listdir(base_path):
    item_path = os.path.join(base_path, item)
    if os.path.isdir(item_path):
        print(f"{item}/")
        # If it's a checkpoint folder, show what's inside
        if item.startswith('checkpoint-'):
            print(f"   Contents of {item}:")
            try:
                for subitem in os.listdir(item_path):
                    print(f"     - {subitem}")
            except:
                print("     (cannot read contents)")
    else:
        print(f"📄 {item}")

Contents of /content/drive/MyDrive/mistral_nlu_finetuning:
checkpoint-250/
   Contents of checkpoint-250:
     - optimizer.pt
     - README.md
     - tokenizer_config.json
     - tokenizer.model
     - tokenizer.json
     - adapter_model.safetensors
     - scaler.pt
     - training_args.bin
     - scheduler.pt
     - adapter_config.json
     - special_tokens_map.json
     - rng_state.pth
     - trainer_state.json
runs/
checkpoint-200/
   Contents of checkpoint-200:
     - adapter_config.json
     - special_tokens_map.json
     - README.md
     - scaler.pt
     - trainer_state.json
     - rng_state.pth
     - tokenizer.model
     - training_args.bin
     - tokenizer.json
     - tokenizer_config.json
     - scheduler.pt
     - optimizer.pt
     - adapter_model.safetensors


In [4]:
def test_training_data_memorization():
    """Test if model can reproduce examples it was trained on"""

    # Load some examples from your original training data
    with open('nlu_train.jsonl', 'r') as f:
        train_data = [json.loads(line) for line in f]

    print("Testing model on TRAINING examples (should be perfect):")

    for i, example in enumerate(train_data[:3]):  # Test first 3 training examples
        prompt = example['prompt']
        expected = example['completion']

        print(f"\n--- Training Example {i+1} ---")
        print(f"Prompt: {prompt}")
        print(f"Expected: {expected}")

        prediction = generate_prediction(model, tokenizer, prompt)
        print(f"Model Output: {prediction}")

        if prediction.strip() == expected.strip():
            print("PERFECT MATCH - Model learned this example")
        else:
            print("MISMATCH - Model did not learn this example properly")
test_training_data_memorization()

Testing model on TRAINING examples (should be perfect):

--- Training Example 1 ---
Prompt: Go to the Lenovo lab and bring me a pen.
Expected: {"operations": ["navigate (go)", "grasp (bring)", "deliver (bring)"], "objects": ["Lenovo lab", "pen"]}
Model Output: [INST] Extract operations and objects from this instruction: Go to the Lenovo lab and bring me a pen. [/INST]

[INST] Extract operations and objects from this instruction: Go to the Lenovo lab and bring me a pen. [/INST]

[INST] Extract operations and objects from this instruction: Go to the Lenovo lab and bring me a pen. [/INST]

[INST] Extract operations and objects
MISMATCH - Model did not learn this example properly

--- Training Example 2 ---
Prompt: Pick up the red marker and place it on the desk.
Expected: {"operations": ["grasp (pick up)", "place (place)"] , "objects": ["marker", "desk"]}
Model Output: [INST] Extract operations and objects from this instruction: Pick up the red marker and place it on the desk. [/INST]

[I

In [5]:
def test_specific_training_example():
    """Test the specific example you mentioned"""

    prompt = "Go to the Lenovo lab and bring me a pen."
    expected = '{"operations": ["navigate (go)", "grasp (bring)", "deliver (bring)"], "objects": ["Lenovo lab", "pen"]}'

    print("=== TESTING SPECIFIC TRAINING EXAMPLE ===")
    print(f"Prompt: {prompt}")
    print(f"Expected: {expected}")

    # Test with current model
    prediction = generate_prediction(model, tokenizer, prompt)
    print(f"Model Output: {prediction}")

    # Check if it matches
    if prediction.strip() == expected.strip():
        print("SUCCESS: Perfect match - model learned this example")
    else:
        print("PROBLEM: Model did not reproduce training example correctly")

        # Try to parse both as JSON to see what's different
        try:
            pred_json = json.loads(prediction)
            exp_json = json.loads(expected)
            print("Both are valid JSON - comparing content...")
            print(f"Expected operations: {exp_json['operations']}")
            print(f"Predicted operations: {pred_json.get('operations', 'MISSING')}")
            print(f"Expected objects: {exp_json['objects']}")
            print(f"Predicted objects: {pred_json.get('objects', 'MISSING')}")
        except json.JSONDecodeError:
            print("Model output is not valid JSON")

    return prediction

# Run the test
result = test_specific_training_example()

=== TESTING SPECIFIC TRAINING EXAMPLE ===
Prompt: Go to the Lenovo lab and bring me a pen.
Expected: {"operations": ["navigate (go)", "grasp (bring)", "deliver (bring)"], "objects": ["Lenovo lab", "pen"]}
Model Output: [INST] Extract operations and objects from this instruction: Go to the Lenovo lab and bring me a pen. [/INST]

[INST] Extract operations and objects from this instruction: Go to the Lenovo lab and bring me a pen. [/INST]

[INST] Extract operations and objects from this instruction: Go to the Lenovo lab and bring me a pen. [/INST]

[INST] Extract operations and objects
PROBLEM: Model did not reproduce training example correctly
Model output is not valid JSON


In [6]:
def generate_prediction_fixed(model, tokenizer, prompt):
    """Fixed generation function with proper stopping"""

    # Format prompt like training data
    formatted_prompt = f"<s>[INST] Extract operations and objects from this instruction: {prompt} [/INST]"

    print(f"DEBUG - Input prompt: {formatted_prompt}")

    # Tokenize
    inputs = tokenizer(
        formatted_prompt,
        return_tensors="pt",
        truncation=True,
        max_length=128
    )

    # Move to device
    inputs = {k: v.to(model.device) for k, v in inputs.items()}

    print(f"DEBUG - Input length: {inputs['input_ids'].shape[1]} tokens")

    # Generate with fixed parameters
    with torch.no_grad():
        outputs = model.generate(
            input_ids=inputs['input_ids'],
            attention_mask=inputs['attention_mask'],
            max_new_tokens=80,
            min_new_tokens=10,  # Force it to generate at least 10 tokens
            temperature=0.1,
            do_sample=True,
            pad_token_id=tokenizer.eos_token_id,
            eos_token_id=tokenizer.eos_token_id,
            repetition_penalty=1.5,  # Prevent repetition
            no_repeat_ngram_size=3,  # Prevent 3-gram repetition
            early_stopping=True
        )

    # Get only the new tokens
    input_length = inputs['input_ids'].shape[1]
    generated_tokens = outputs[0][input_length:]

    print(f"DEBUG - Generated {len(generated_tokens)} new tokens")
    print(f"DEBUG - Generated token IDs: {generated_tokens.tolist()[:10]}...")

    # Decode
    response = tokenizer.decode(generated_tokens, skip_special_tokens=True).strip()

    print(f"DEBUG - Raw response: '{response}'")

    return response

# Test the fixed function
print("=== TESTING FIXED GENERATION ===")
test_prompt = "Go to the Lenovo lab and bring me a pen."
fixed_result = generate_prediction_fixed(model, tokenizer, test_prompt)
print(f"\nFinal result: {fixed_result}")

The following generation flags are not valid and may be ignored: ['early_stopping']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


=== TESTING FIXED GENERATION ===
DEBUG - Input prompt: <s>[INST] Extract operations and objects from this instruction: Go to the Lenovo lab and bring me a pen. [/INST]
DEBUG - Input length: 30 tokens
DEBUG - Generated 80 new tokens
DEBUG - Generated token IDs: [13, 13, 1064, 3133, 8373, 354, 938, 302, 9131, 5225]...
DEBUG - Raw response: '## Instructions for use of extract operation in English grammar

The word “extract” is used as an action verb that means taking something out or removing it, especially when you take liquid (such as juice) away by squeezing fruit with your hands; also refers to getting information about someone’s personality through observation over time without asking them directly what they think themselves – like reading'

Final result: ## Instructions for use of extract operation in English grammar

The word “extract” is used as an action verb that means taking something out or removing it, especially when you take liquid (such as juice) away by squeezing fruit wi

train again from check point

In [11]:
# Simple fresh training with maximum memory optimization

# First, completely restart runtime to clear all memory
print("IMPORTANT: If you get memory errors, go to Runtime > Restart Runtime first")

import torch
import gc
import os

# Clear everything
torch.cuda.empty_cache()
gc.collect()

# Disable wandb
os.environ["WANDB_DISABLED"] = "true"

print("Starting fresh training with better settings...")

# Import libraries
import json
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    TrainingArguments,
    Trainer,
    DataCollatorForLanguageModeling
)
from datasets import Dataset
from peft import LoraConfig, get_peft_model, TaskType
from google.colab import drive

# Mount drive
drive.mount('/content/drive')

# Load model with most aggressive memory settings
print("Loading model with maximum memory optimization...")
base_model_name = "mistralai/Mistral-7B-v0.1"

model = AutoModelForCausalLM.from_pretrained(
    base_model_name,
    torch_dtype=torch.float16,
    device_map="auto",
    load_in_4bit=True,  # Even more aggressive quantization
    trust_remote_code=True,
    low_cpu_mem_usage=True,
    use_cache=False  # Disable cache to save memory
)

# Load tokenizer
tokenizer = AutoTokenizer.from_pretrained(base_model_name)
tokenizer.pad_token = tokenizer.eos_token

# Smaller LoRA configuration to save memory
lora_config = LoraConfig(
    task_type=TaskType.CAUSAL_LM,
    inference_mode=False,
    r=4,  # Smaller rank
    lora_alpha=8,  # Smaller alpha
    lora_dropout=0.1,
    target_modules=["q_proj", "v_proj"],  # Fewer modules
    bias="none"
)

print("Applying smaller LoRA configuration...")
model = get_peft_model(model, lora_config)

# CRITICAL: Proper gradient setup
model.train()

# Force enable gradients for all LoRA parameters
print("Setting up gradients...")
for name, param in model.named_parameters():
    if param.requires_grad:
        param.requires_grad_(True)
        param.grad = None

# Enable input gradients if method exists
if hasattr(model, 'enable_input_require_grads'):
    model.enable_input_require_grads()

# Verify gradients work with a test
print("Testing gradient flow...")
test_input = torch.randint(0, 1000, (1, 10)).to(model.device)
try:
    with torch.enable_grad():
        output = model(test_input)
        loss = output.logits.mean()
        loss.backward()
    print("Gradient test PASSED")
    model.zero_grad()  # Clear test gradients
except Exception as e:
    print(f"Gradient test FAILED: {e}")
    print("Attempting fix...")

    # Alternative fix - explicitly enable gradients
    model.enable_adapters()
    for name, param in model.named_parameters():
        if 'lora' in name.lower():
            param.requires_grad_(True)

    print("Applied alternative gradient fix")

model.print_trainable_parameters()

# Load and prepare data
print("Loading training data...")
data = []
with open('nlu_train.jsonl', 'r') as f:
    for line in f:
        data.append(json.loads(line.strip()))

def format_training_example(example):
    prompt = example['prompt']
    completion = example['completion']
    formatted_text = f"<s>[INST] Extract operations and objects from this instruction: {prompt} [/INST] {completion}</s>"
    return formatted_text

def prepare_dataset(data):
    formatted_texts = [format_training_example(example) for example in data]
    dataset = Dataset.from_dict({"text": formatted_texts})

    def tokenize_function(examples):
        return tokenizer(
            examples["text"],
            truncation=True,
            padding=False,
            max_length=100,  # Shorter max length
            return_tensors=None
        )

    tokenized_dataset = dataset.map(
        tokenize_function,
        batched=True,
        remove_columns=dataset.column_names
    )
    return tokenized_dataset

train_dataset = prepare_dataset(data)

# Create output directory
checkpoint_dir = "/content/drive/MyDrive/mistral_nlu_extended"
os.makedirs(checkpoint_dir, exist_ok=True)

# Memory-optimized training arguments
training_args = TrainingArguments(
    output_dir=checkpoint_dir,
    num_train_epochs=6,  # More epochs to compensate for smaller model
    per_device_train_batch_size=1,  # Smallest batch size
    gradient_accumulation_steps=4,  # Smaller accumulation
    learning_rate=2e-4,
    weight_decay=0.01,
    logging_steps=20,
    save_steps=50,
    save_total_limit=3,
    prediction_loss_only=True,
    remove_unused_columns=False,
    push_to_hub=False,
    report_to=[],
    fp16=False,
    dataloader_pin_memory=False,
    gradient_checkpointing=True,
    max_grad_norm=1.0,
    warmup_steps=20,
    seed=42,
)

# Data collator
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=False,
    pad_to_multiple_of=8,
)

# Initialize trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    data_collator=data_collator,
    tokenizer=tokenizer,
)

print("Memory-optimized training setup complete!")
print(f"Training for {training_args.num_train_epochs} epochs")
print(f"Smaller LoRA (r=4) to save memory")
print(f"Output: {checkpoint_dir}")

# Check memory
if torch.cuda.is_available():
    print(f"\nGPU Memory:")
    print(f"   Allocated: {torch.cuda.memory_allocated()/1024**3:.2f} GB")
    print(f"   Reserved: {torch.cuda.memory_reserved()/1024**3:.2f} GB")

print("\nReady to start training!")
print("Run: trainer.train()")

# To start training immediately, uncomment:
trainer.train()

IMPORTANT: If you get memory errors, go to Runtime > Restart Runtime first
Starting fresh training with better settings...


The `load_in_4bit` and `load_in_8bit` arguments are deprecated and will be removed in the future versions. Please, pass a `BitsAndBytesConfig` object in `quantization_config` argument instead.


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Loading model with maximum memory optimization...


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Applying smaller LoRA configuration...
Setting up gradients...
Testing gradient flow...
Gradient test PASSED
trainable params: 1,703,936 || all params: 7,243,436,032 || trainable%: 0.0235
Loading training data...


Map:   0%|          | 0/918 [00:00<?, ? examples/s]

  trainer = Trainer(
No label_names provided for model class `PeftModelForCausalLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


Memory-optimized training setup complete!
Training for 6 epochs
Smaller LoRA (r=4) to save memory
Output: /content/drive/MyDrive/mistral_nlu_extended

GPU Memory:
   Allocated: 8.36 GB
   Reserved: 11.04 GB

Ready to start training!
Run: trainer.train()


Step,Training Loss
20,3.0273
40,1.3727
60,0.9271
80,0.8285
100,0.7813
120,0.7889
140,0.7272
160,0.7143
180,0.6261
200,0.602


KeyboardInterrupt: 

In [16]:
# Test checkpoint-450 on first 5 samples from test_nlu.jsonl

import torch
import json
from transformers import AutoTokenizer, AutoModelForCausalLM
from peft import PeftModel
import gc

def load_checkpoint_450():
    """Load checkpoint-450 with memory optimization"""
    print("Loading checkpoint-450...")

    # Clear memory
    torch.cuda.empty_cache()
    gc.collect()

    # Load base model
    base_model_name = "mistralai/Mistral-7B-v0.1"
    base_model = AutoModelForCausalLM.from_pretrained(
        base_model_name,
        torch_dtype=torch.float16,
        device_map="auto",
        load_in_4bit=True,
        trust_remote_code=True,
        low_cpu_mem_usage=True
    )

    # Load tokenizer
    tokenizer = AutoTokenizer.from_pretrained(base_model_name)
    tokenizer.pad_token = tokenizer.eos_token

    # Load checkpoint-450
    checkpoint_path = "/content/drive/MyDrive/mistral_nlu_extended/checkpoint-450"

    # Check if checkpoint exists
    import os
    if not os.path.exists(checkpoint_path):
        print(f"Checkpoint-450 not found at: {checkpoint_path}")
        print("Available checkpoints:")
        base_dir = "/content/drive/MyDrive/mistral_nlu_extended"
        if os.path.exists(base_dir):
            checkpoints = [d for d in os.listdir(base_dir) if d.startswith('checkpoint-')]
            for cp in sorted(checkpoints, key=lambda x: int(x.split('-')[1])):
                print(f"  {cp}")
        return None, None

    # Load PEFT model
    model = PeftModel.from_pretrained(base_model, checkpoint_path)
    model.eval()

    print(f"Checkpoint-450 loaded successfully!")
    return model, tokenizer

def load_test_data():
    """Load first 5 samples from test_nlu.jsonl"""
    print("Loading test data...")

    test_data = []
    with open('nlu_test.jsonl', 'r') as f:
        for line in f:
            test_data.append(json.loads(line.strip()))

    print(f"Loaded {len(test_data)} test examples")
    return test_data[:5]  # Only first 5

def generate_prediction(model, tokenizer, prompt):
    """Generate prediction for a single prompt"""
    formatted_prompt = f"<s>[INST] Extract operations and objects from this instruction: {prompt} [/INST]"

    inputs = tokenizer(
        formatted_prompt,
        return_tensors="pt",
        truncation=True,
        max_length=100
    ).to(model.device)

    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=80,
            temperature=0.1,
            do_sample=True,
            pad_token_id=tokenizer.eos_token_id,
            repetition_penalty=1.2
        )

    response = tokenizer.decode(
        outputs[0][inputs['input_ids'].shape[1]:],
        skip_special_tokens=True
    ).strip()

    return response

def evaluate_checkpoint_450():
    """Main evaluation function"""

    # Load model and data
    model, tokenizer = load_checkpoint_450()
    if model is None:
        return

    test_data = load_test_data()

    print(f"\n{'='*80}")
    print(f"TESTING CHECKPOINT-450 ON FIRST 5 SAMPLES")
    print(f"{'='*80}")

    results = []
    correct_format = 0
    exact_matches = 0

    for i, example in enumerate(test_data):
        prompt = example['prompt']
        expected = example['completion']

        print(f"\n--- EXAMPLE {i+1}/5 ---")
        print(f"INPUT: {prompt}")
        print(f"EXPECTED: {expected}")

        # Generate prediction
        try:
            prediction = generate_prediction(model, tokenizer, prompt)
            print(f"PREDICTED: {prediction}")

            # Check if valid JSON
            try:
                pred_json = json.loads(prediction)
                expected_json = json.loads(expected)

                has_operations = 'operations' in pred_json
                has_objects = 'objects' in pred_json
                format_correct = has_operations and has_objects
                exact_match = (pred_json == expected_json)

                if format_correct:
                    correct_format += 1
                    print("STATUS: CORRECT JSON FORMAT")

                    if exact_match:
                        exact_matches += 1
                        print("MATCH: PERFECT EXACT MATCH!")
                    else:
                        print("MATCH: PARTIAL - correct format, different content")
                        print(f"  Expected ops: {expected_json.get('operations', [])}")
                        print(f"  Predicted ops: {pred_json.get('operations', [])}")
                        print(f"  Expected objs: {expected_json.get('objects', [])}")
                        print(f"  Predicted objs: {pred_json.get('objects', [])}")
                else:
                    print("STATUS: INCORRECT - missing operations or objects fields")

            except json.JSONDecodeError:
                format_correct = False
                exact_match = False
                print("STATUS: INVALID JSON - cannot parse")

        except Exception as e:
            prediction = f"ERROR: {str(e)}"
            format_correct = False
            exact_match = False
            print(f"STATUS: GENERATION ERROR - {e}")

        results.append({
            'example': i+1,
            'prompt': prompt,
            'expected': expected,
            'predicted': prediction,
            'format_correct': format_correct,
            'exact_match': exact_match
        })

    # Summary
    print(f"\n{'='*80}")
    print(f"CHECKPOINT-450 EVALUATION SUMMARY")
    print(f"{'='*80}")
    print(f"Total samples tested: 5")
    print(f"Correct JSON format: {correct_format}/5 ({correct_format/5*100:.0f}%)")
    print(f"Exact matches: {exact_matches}/5 ({exact_matches/5*100:.0f}%)")

    if correct_format == 5:
        print("RESULT: EXCELLENT - All outputs have correct JSON format!")
    elif correct_format >= 3:
        print("RESULT: GOOD - Most outputs have correct format")
    elif correct_format >= 1:
        print("RESULT: PARTIAL - Some outputs work")
    else:
        print("RESULT: POOR - No valid JSON outputs")

    print(f"{'='*80}")

    return results

# Run the evaluation
if __name__ == "__main__":
    results = evaluate_checkpoint_450()

Loading checkpoint-450...


The `load_in_4bit` and `load_in_8bit` arguments are deprecated and will be removed in the future versions. Please, pass a `BitsAndBytesConfig` object in `quantization_config` argument instead.


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Checkpoint-450 loaded successfully!
Loading test data...
Loaded 60 test examples

TESTING CHECKPOINT-450 ON FIRST 5 SAMPLES

--- EXAMPLE 1/5 ---
INPUT: Navigate to the living room and pick up the child's building blocks from the floor.
EXPECTED: {"operations": ["navigate (navigate)", "pick up (pick up)"] , "objects": ["living room", "building blocks", "floor"]}
PREDICTED: {"operations": ["navigate (navigate)", "pick up (pick up)"] , "objects": ["living room", "building blocks"]}
STATUS: CORRECT JSON FORMAT
MATCH: PARTIAL - correct format, different content
  Expected ops: ['navigate (navigate)', 'pick up (pick up)']
  Predicted ops: ['navigate (navigate)', 'pick up (pick up)']
  Expected objs: ['living room', 'building blocks', 'floor']
  Predicted objs: ['living room', 'building blocks']

--- EXAMPLE 2/5 ---
INPUT: Grab the empty laundry basket and take it to the washing machine.
EXPECTED: {"operations": ["grab (grab)"] , "objects": ["laundry basket", "washing machine"]}
PREDICTED: {"

In [18]:
# Simple script to show input and model output only

import torch
import json
from transformers import AutoTokenizer, AutoModelForCausalLM
from peft import PeftModel
import gc

def load_checkpoint_450():
    """Load checkpoint-450"""
    print("Loading checkpoint-450...")

    torch.cuda.empty_cache()
    gc.collect()

    base_model_name = "mistralai/Mistral-7B-v0.1"
    base_model = AutoModelForCausalLM.from_pretrained(
        base_model_name,
        torch_dtype=torch.float16,
        device_map="auto",
        load_in_4bit=True,
        trust_remote_code=True,
        low_cpu_mem_usage=True
    )

    tokenizer = AutoTokenizer.from_pretrained(base_model_name)
    tokenizer.pad_token = tokenizer.eos_token

    checkpoint_path = "/content/drive/MyDrive/mistral_nlu_extended/checkpoint-450"
    model = PeftModel.from_pretrained(base_model, checkpoint_path)
    model.eval()

    print("Model loaded!")
    return model, tokenizer

def generate_prediction(model, tokenizer, prompt):
    """Generate prediction"""
    formatted_prompt = f"<s>[INST] Extract operations and objects from this instruction: {prompt} [/INST]"

    inputs = tokenizer(
        formatted_prompt,
        return_tensors="pt",
        truncation=True,
        max_length=100
    ).to(model.device)

    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=80,
            temperature=0.1,
            do_sample=True,
            pad_token_id=tokenizer.eos_token_id,
            repetition_penalty=1.2
        )

    response = tokenizer.decode(
        outputs[0][inputs['input_ids'].shape[1]:],
        skip_special_tokens=True
    ).strip()

    return response

def test_model():
    """Test model and show simple input/output"""

    # Load model
    model, tokenizer = load_checkpoint_450()

    # Load test data
    test_data = []
    with open('nlu_test.jsonl', 'r') as f:
        for line in f:
            test_data.append(json.loads(line.strip()))

    # Test first 5 examples
    for i, example in enumerate(test_data[:50]):
        prompt = example['prompt']
        output = generate_prediction(model, tokenizer, prompt)

        print(f"\nInput: {prompt}")
        print(f"Output: {output}")

# Run the test
test_model()

Loading checkpoint-450...


The `load_in_4bit` and `load_in_8bit` arguments are deprecated and will be removed in the future versions. Please, pass a `BitsAndBytesConfig` object in `quantization_config` argument instead.


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Model loaded!

Input: Navigate to the living room and pick up the child's building blocks from the floor.
Output: {"operations": ["navigate (navigate)", "pick up (pick up)"] , "objects": ["living room", "building blocks"]}

Input: Grab the empty laundry basket and take it to the washing machine.
Output: {"operations": ["grab (grab)", "navigate (take)"] , "objects": ["laundry basket", "washing machine"]}

Input: Identify the specific type of bird making that loud noise outside the window.
Output: {"operations": ["identify (identify)"] , "objects": ["bird", "noise"]}

Input: Scan the entire shelf for any expired food items.
Output: {"operations": ["scan (scan)"] , "objects": ["shelf", "food items"]}

Input: Push the heavy armchair slightly to the left to clean behind it.
Output: {"operations": ["push (push)"] , "objects": ["armchair", "cleaning"]}

Input: Shift the stack of magazines on the coffee table to make room for a drink.
Output: {"operations": ["shift (shift)"] , "objects": ["sta