In [1]:
import unittest
import os 
import openai
from dotenv import load_dotenv
from agentneo.evaluation.metrics.learning_adaptability_rate import execute_learning_adaptability_rate_metric


INFO:httpx:HTTP Request: GET https://raw.githubusercontent.com/BerriAI/litellm/main/model_prices_and_context_window.json "HTTP/1.1 200 OK"


In [2]:
import unittest
from unittest.mock import patch, MagicMock
import json

In [3]:
# Load environment variables.
# The dotenv file location can be overridden through AGENTNEO_ENV_FILE so the
# notebook is not tied to a single machine; when the variable is unset the
# previously hard-coded path is used, so existing behaviour is unchanged.
ENV_FILE = os.getenv(
    "AGENTNEO_ENV_FILE",
    "/Users/ejaz/Desktop/hackathon/clone3/AgentNeo/key.env",
)
load_dotenv(ENV_FILE)


# Initialize OpenAI API
# os.getenv returns None when OPENAI_API_KEY is not defined, in which case the
# key is left unset here rather than raising.
openai.api_key = os.getenv("OPENAI_API_KEY")



In [8]:
from agentneo.evaluation.metrics.learning_adaptability_rate import (
    extract_tool_changes,
    analyze_post_change_performance,
    calculate_overall_lar,
    evaluate_learning_adaptability,
    execute_learning_adaptability_rate_metric
)

In [9]:
import unittest
from unittest.mock import patch, MagicMock
import json

class TestLearningAdaptabilityRateMetric(unittest.TestCase):
    def setUp(self):
        self.basic_config = {
            "model": "gpt-4-mini",
            "temperature": 0.0
        }
        
        self.sample_tool_calls = [
            {"name": "tool1", "start_time": "2024-01-01T00:00:00", "output": "success", "error": None},
            {"name": "tool2", "start_time": "2024-01-01T00:01:00", "output": "success", "error": None},
            {"name": "tool1", "start_time": "2024-01-01T00:02:00", "output": "success", "error": None},
        ]

    def test_extract_tool_changes(self):
        """Test detection of tool changes"""
        changes = extract_tool_changes(self.sample_tool_calls)
        
        self.assertEqual(len(changes), 2)
        self.assertEqual(changes[0]["added_tools"], ["tool2"])
        self.assertEqual(changes[0]["removed_tools"], ["tool1"])
        self.assertEqual(changes[1]["added_tools"], ["tool1"])
        self.assertEqual(changes[1]["removed_tools"], ["tool2"])

    def test_analyze_post_change_performance(self):
        """Test analysis of performance after tool changes"""
        change_points = [{
            "timestamp": "2024-01-01T00:01:00",
            "call_index": 1,
            "added_tools": ["tool2"],
            "removed_tools": ["tool1"]
        }]
        
        performance = analyze_post_change_performance(self.sample_tool_calls, change_points)
        
        self.assertEqual(len(performance), 1)
        self.assertEqual(performance[0]["successful_adaptations"], 1)
        self.assertEqual(performance[0]["total_attempts"], 1)

    def test_calculate_overall_lar(self):
        """Test LAR calculation with different scenarios"""
        # Test perfect adaptation
        perfect_data = [{
            "successful_adaptations": 5,
            "total_attempts": 5
        }]
        self.assertAlmostEqual(calculate_overall_lar(perfect_data), 1.0)
        
        # Test partial adaptation
        partial_data = [{
            "successful_adaptations": 3,
            "total_attempts": 5
        }]
        self.assertTrue(0 < calculate_overall_lar(partial_data) < 1)
        
        # Test no attempts
        no_attempts_data = [{
            "successful_adaptations": 0,
            "total_attempts": 0
        }]
        self.assertEqual(calculate_overall_lar(no_attempts_data), 0.0)

    @patch('litellm.completion')
    def test_evaluate_learning_adaptability(self, mock_completion):
        """Test evaluation with mocked LLM responses"""
        mock_response = MagicMock()
        mock_response.choices = [
            MagicMock(message=MagicMock(content=json.dumps({
                "score": 0.8,
                "explanation": "Good adaptation pattern",
                "observations": ["Quick recovery after changes"],
                "recommendations": ["Continue monitoring"]
            })))
        ]
        mock_completion.return_value = mock_response

        performance_data = [{
            "successful_adaptations": 4,
            "total_attempts": 5,
            "change": {
                "timestamp": "2024-01-01T00:00:00",
                "added_tools": ["tool2"],
                "removed_tools": ["tool1"]
            }
        }]

        result = evaluate_learning_adaptability(performance_data, self.basic_config, 0.8)
        
        self.assertIn("score", result)
        self.assertIn("explanation", result)
        self.assertIn("observations", result)
        self.assertIn("recommendations", result)

    def test_error_handling(self):
        """Test error handling in metric execution"""
        # Test with completely invalid trace
        invalid_trace = None
        result = execute_learning_adaptability_rate_metric(invalid_trace, self.basic_config)
        self.assertEqual(result["result"]["score"], 0.0)
        self.assertTrue(
            "Error executing metric" in result["result"]["reason"] or 
            "No LLM calls found" in result["result"]["reason"]
        )

        # Test with missing tool_calls
        invalid_trace = {"invalid": "data"}
        result = execute_learning_adaptability_rate_metric(invalid_trace, self.basic_config)
        self.assertEqual(result["result"]["score"], 0.0)
        self.assertIn("No LLM calls found", result["result"]["reason"])

        # Test with empty tool_calls
        empty_trace = {"tool_calls": []}
        result = execute_learning_adaptability_rate_metric(empty_trace, self.basic_config)
        self.assertEqual(result["result"]["score"], 0.0)
        self.assertIn("No LLM calls found", result["result"]["reason"])

    def test_complete_happy_path(self):
        """Test complete successful execution path"""
        trace = {
            "tool_calls": [
                {"name": "tool1", "start_time": "2024-01-01T00:00:00", "output": "success", "error": None},
                {"name": "tool2", "start_time": "2024-01-01T00:01:00", "output": "success", "error": None},
                {"name": "tool2", "start_time": "2024-01-01T00:02:00", "output": "success", "error": None}
            ]
        }
        
        with patch('litellm.completion') as mock_completion:
            # Mock for evaluate_learning_adaptability
            eval_response = MagicMock()
            eval_response.choices = [
                MagicMock(message=MagicMock(content=json.dumps({
                    "score": 0.8,
                    "explanation": "Good adaptation pattern",
                    "observations": ["Quick recovery"],
                    "recommendations": ["Monitor consistently"]
                })))
            ]
            
            # Mock for generate_reason
            reason_response = MagicMock()
            reason_response.choices = [
                MagicMock(message=MagicMock(content="Agent showed good adaptation capabilities"))
            ]
            
            mock_completion.side_effect = [eval_response, reason_response]
            
            result = execute_learning_adaptability_rate_metric(trace, self.basic_config)
            
            self.assertIn("score", result["result"])
            self.assertIn("detailed_evaluation", result["result"])
            self.assertIn("change_points", result["result"])
            self.assertIn("performance_data", result["result"])
            self.assertIn("reason", result["result"])
            self.assertGreater(result["result"]["score"], 0.0)

def run_tests():
    """
    Load every test of TestLearningAdaptabilityRateMetric and execute it.

    A TextTestRunner is driven directly (verbosity 2) instead of going through
    unittest.main(), so the call neither parses sys.argv nor exits the
    interpreter; the runner's result object is handed back to the caller.
    """
    loader = unittest.TestLoader()
    collected = loader.loadTestsFromTestCase(TestLearningAdaptabilityRateMetric)
    return unittest.TextTestRunner(verbosity=2).run(collected)

# Entry point: runs the suite when this code executes as the main module,
# which is the case for a notebook cell as well as for a script run directly.
if __name__ == "__main__":
    run_tests()

test_analyze_post_change_performance (__main__.TestLearningAdaptabilityRateMetric)
Test analysis of performance after tool changes ... ok
test_calculate_overall_lar (__main__.TestLearningAdaptabilityRateMetric)
Test LAR calculation with different scenarios ... ok
test_complete_happy_path (__main__.TestLearningAdaptabilityRateMetric)
Test complete successful execution path ... ok
test_error_handling (__main__.TestLearningAdaptabilityRateMetric)
Test error handling in metric execution ... ok
test_evaluate_learning_adaptability (__main__.TestLearningAdaptabilityRateMetric)
Test evaluation with mocked LLM responses ... ok
test_extract_tool_changes (__main__.TestLearningAdaptabilityRateMetric)
Test detection of tool changes ... ok

----------------------------------------------------------------------
Ran 6 tests in 0.011s

OK
