In [None]:
%%capture
!pip install unsloth

In [None]:
from unsloth import FastLanguageModel
import torch
# Maximum sequence length passed to the model loader below.
max_seq_length = 2048 # Choose any! We auto support RoPE Scaling internally!
dtype = None # None for auto detection. Float16 for Tesla T4, V100, Bfloat16 for Ampere+
# Load the model and its tokenizer once at module level; main() further down
# reads these two names as globals when it runs the evaluation.
# NOTE(review): "unsloth/Your_finetuned_model" looks like a placeholder name —
# confirm it is replaced with the real checkpoint before running.
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = "unsloth/Your_finetuned_model", # Can also use normal models
    max_seq_length = max_seq_length,
    dtype = dtype
)

In [None]:
import re
import json
import ast
import numpy as np
from transformers import AutoModelForCausalLM, AutoTokenizer
import argparse
import os
from tqdm import tqdm


def format_input(answer, key_points):
    """Build the instruction prompt sent to the model for one example.

    Args:
        answer: The answer text to be checked; interpolated between double
            quotes after ``Answer:``.
        key_points: The expected key points; rendered with
            ``json.dumps(..., indent=2)``.

    Returns:
        The full prompt string, ending with the ``### Response:`` header and
        the "The missing key points are:" lead-in followed by a newline.
    """
    rendered_key_points = json.dumps(key_points, indent=2)
    prompt_lines = [
        "Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.",
        "",
        # The trailing space after the colon is part of the prompt text.
        "### Instruction: ",
        'Compare the given answer with the key points and identify which key points are missing from the answer. Return only the list of missing key points in JSON format as {"Points_Missed": [list of missing points]}.',
        "",
        "### Input:",
        f'Answer: "{answer}"',
        "",
        "Key points that should be included:",
        rendered_key_points,
        "",
        "### Response:",
        "The missing key points are:",
        # Final empty entry yields the trailing newline after the lead-in.
        "",
    ]
    return "\n".join(prompt_lines)

def _coerce_points_list(text):
    """Parse *text* as a list; return the list, or None if it is not one."""
    try:
        parsed = json.loads(text)
    except ValueError:
        # Not strict JSON (typically single-quoted strings). Parse it as a
        # Python literal instead of blindly swapping quote characters, which
        # would corrupt apostrophes and embedded double quotes.
        try:
            parsed = ast.literal_eval(text)
        except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
            return None
    return parsed if isinstance(parsed, list) else None


def _parse_missing_points(response):
    """Extract the list of missed points from the model's response text."""
    # The prompt ends with this lead-in, so it is echoed at the start of the
    # text following "### Response:". Drop it so the line-by-line fallback
    # does not report it as a missed point.
    lead_in = "The missing key points are:"
    if response.startswith(lead_in):
        response = response[len(lead_in):].strip()

    # 1) A {"Points_Missed": [...]} object, tolerating whitespace/pretty-printing.
    match = re.search(r'\{\s*"Points_Missed"\s*:\s*(\[.*?\])\s*\}', response, re.DOTALL)
    if match:
        points = _coerce_points_list(match.group(1))
        if points is not None:
            return points

    # 2) The first bare [...] list in the response.
    match = re.search(r'\[.*?\]', response, re.DOTALL)
    if match:
        points = _coerce_points_list(match.group(0))
        if points is not None:
            return points

    # 3) No parseable structure: treat each non-empty line as a candidate point.
    lines = [line.strip() for line in response.split('\n') if line.strip()]
    # Remove numbered-list prefixes such as "1. ".
    cleaned_lines = [re.sub(r'^\d+\.\s*', '', line) for line in lines]
    # Drop lines that are too short or look like all-caps headings.
    return [line for line in cleaned_lines if len(line) > 10 and not line.isupper()]


def generate_missing_points(json_input, model, tokenizer, stream=False):
    """Run the model on one example and return predicted and reference points.

    Args:
        json_input: Mapping with the keys "Answer", "key_points" and
            "PointsMissed" (the reference list of missed points).
        model: Object exposing ``device`` and ``generate(**inputs, ...)``.
        tokenizer: Callable tokenizer that also provides ``batch_decode``.
        stream: Currently unused; kept so existing callers keep working.

    Returns:
        A tuple ``(generated_json, reference_json)`` of JSON strings, each of
        the form ``{"Points_Missed": [...]}``. If the response cannot be
        parsed, the generated list is empty.

    Raises:
        KeyError: If ``json_input`` lacks one of the required keys.
    """
    answer = json_input["Answer"]
    key_points = json_input["key_points"]
    reference_output = json_input["PointsMissed"]
    reference_json = json.dumps({"Points_Missed": reference_output}, indent=4)

    formatted_input = format_input(answer, key_points)
    inputs = tokenizer([formatted_input], return_tensors="pt").to(model.device)

    with torch.no_grad():
        outputs = model.generate(**inputs, max_new_tokens=150, use_cache=True)
    decoded_output = tokenizer.batch_decode(outputs, skip_special_tokens=True)

    # The decoded text contains the prompt as well; keep only what follows
    # the last "### Response:" header.
    response = decoded_output[0].split("### Response:")[-1].strip()

    try:
        points = _parse_missing_points(response)
        # Always re-serialise so the caller receives valid, uniformly
        # formatted JSON rather than a raw regex match.
        generated_json = json.dumps({"Points_Missed": points}, indent=4)
    except Exception as e:
        # Best-effort boundary: a malformed generation must not abort the
        # evaluation run, so report it and fall back to an empty list.
        print(f"Error parsing output: {e}")
        print(f"Raw response: {response}")
        generated_json = json.dumps({"Points_Missed": []}, indent=4)

    return generated_json, reference_json

def calculate_metrics(predicted_json, reference_json):
    """Calculate precision, recall and F1 between predicted and reference points.

    Points are compared as exact set members (duplicates are ignored).

    Args:
        predicted_json: JSON string or dict containing a "Points_Missed" list.
        reference_json: JSON string or dict containing a "Points_Missed" list.

    Returns:
        A dict with "precision", "recall", "f1_score", "true_positives",
        "false_positives" and "false_negatives". If the inputs cannot be
        parsed or compared, all values are 0 and an "error" key holds the
        exception message.
    """
    try:
        # Accept either a JSON string or an already-decoded dict.
        if isinstance(predicted_json, str):
            predicted = json.loads(predicted_json)["Points_Missed"]
        else:
            predicted = predicted_json["Points_Missed"]

        if isinstance(reference_json, str):
            reference = json.loads(reference_json)["Points_Missed"]
        else:
            reference = reference_json["Points_Missed"]

        # Convert to sets for comparison (raises TypeError on unhashable items).
        predicted_set = set(predicted)
        reference_set = set(reference)
    except (KeyError, TypeError, ValueError) as e:
        print(f"Error calculating metrics: {e}")
        return {
            "precision": 0,
            "recall": 0,
            "f1_score": 0,
            "true_positives": 0,
            "false_positives": 0,
            "false_negatives": 0,
            "error": str(e)
        }

    true_positives = len(predicted_set & reference_set)
    false_positives = len(predicted_set - reference_set)
    false_negatives = len(reference_set - predicted_set)

    if not predicted_set and not reference_set:
        # Nothing was missed and nothing was predicted: the prediction is
        # exactly right, so score it as perfect instead of 0.
        precision = recall = f1 = 1.0
    else:
        predicted_total = true_positives + false_positives
        reference_total = true_positives + false_negatives
        precision = true_positives / predicted_total if predicted_total > 0 else 0
        recall = true_positives / reference_total if reference_total > 0 else 0
        f1 = (2 * precision * recall) / (precision + recall) if (precision + recall) > 0 else 0

    return {
        "precision": precision,
        "recall": recall,
        "f1_score": f1,
        "true_positives": true_positives,
        "false_positives": false_positives,
        "false_negatives": false_negatives
    }

def evaluate_examples(examples, model, tokenizer, output_file="evaluation_results.json"):
    """Score every example with the model and write the results as JSON.

    Args:
        examples: Sequence of example mappings passed one at a time to
            ``generate_missing_points``.
        model: Model forwarded to ``generate_missing_points``.
        tokenizer: Tokenizer forwarded to ``generate_missing_points``.
        output_file: Path the combined results are written to.

    Returns:
        A tuple ``(avg_metrics, individual_results)``: the mean precision,
        recall and F1 over the examples whose metrics were computed, and the
        per-example records (an "error" record for examples that failed).
    """
    scored = []
    records = []

    progress = tqdm(examples, desc="Processing examples")
    for idx, item in enumerate(progress):
        try:
            generated, reference = generate_missing_points(item, model, tokenizer)
            item_metrics = calculate_metrics(generated, reference)
            scored.append(item_metrics)

            # Decode the JSON strings so the saved record holds real objects.
            if isinstance(generated, str):
                generated_obj = json.loads(generated)
            else:
                generated_obj = generated
            if isinstance(reference, str):
                reference_obj = json.loads(reference)
            else:
                reference_obj = reference

            preview = item.get('Answer', '')[:100] + "..."  # first 100 chars
            records.append({
                "example_id": idx,
                "answer_text": preview,
                "generated": generated_obj,
                "reference": reference_obj,
                "metrics": item_metrics
            })

            done = idx + 1
            if done % 5 == 0:
                print(f"Processed {done}/{len(examples)} examples")
        except Exception as e:
            # One failing example must not stop the run; record it and go on.
            print(f"Error processing example {idx+1}: {str(e)}")
            records.append({
                "example_id": idx,
                "error": str(e)
            })

    # Start from zeros and overwrite with the means when anything was scored.
    avg_metrics = {
        "avg_precision": 0.0,
        "avg_recall": 0.0,
        "avg_f1_score": 0.0
    }
    if scored:
        key_pairs = (
            ("avg_precision", "precision"),
            ("avg_recall", "recall"),
            ("avg_f1_score", "f1_score"),
        )
        for avg_key, metric_key in key_pairs:
            values = [entry[metric_key] for entry in scored]
            avg_metrics[avg_key] = float(np.mean(values))

    payload = {
        "average_metrics": avg_metrics,
        "individual_results": records
    }
    with open(output_file, "w") as f:
        json.dump(payload, f, indent=4)

    return avg_metrics, records

# Main function for Kaggle environment
def main(input_file="/kaggle/input/test-dataset/test.json",
         output_file="/kaggle/working/evaluation_results.json"):
    """Load the test examples, evaluate them, and print the average metrics.

    Uses the module-level ``model`` and ``tokenizer`` globals.

    Args:
        input_file: Path to the JSON file holding the test examples.
            Defaults to the Kaggle dataset location.
        output_file: Path the evaluation results are written to.
            Defaults to the Kaggle working directory.
    """
    # Load examples from JSON file (explicit encoding so the result does not
    # depend on the platform's default locale).
    with open(input_file, 'r', encoding="utf-8") as f:
        examples = json.load(f)

    print(f"Loaded {len(examples)} examples from {input_file}")

    # Evaluate examples
    avg_metrics, individual_results = evaluate_examples(examples, model, tokenizer, output_file)

    # Print average metrics
    print("\n=== AVERAGE METRICS ===")
    print(f"Average Precision: {avg_metrics['avg_precision']:.4f}")
    print(f"Average Recall: {avg_metrics['avg_recall']:.4f}")
    print(f"Average F1-score: {avg_metrics['avg_f1_score']:.4f}")
    print(f"Results saved to {output_file}")

# Run the evaluation when this code is executed as the main module.
if __name__ == "__main__":
    main()