In [2]:
# Milestone 1: Task Planning for Financial Agent
# This notebook demonstrates the write_todos tool and basic planning

# Cell 1: Setup
# Add the parent directory to the module search path *before* the project
# imports below run; presumably the `config`, `state` and `agents` packages
# live one level above this notebook — TODO confirm.
import sys
sys.path.append('..')

# NOTE(review): the saved output of this cell is
# "ModuleNotFoundError: No module named 'langchain_core'", i.e. the kernel
# used for the last run did not have that package installed. Install the
# project's dependencies into the kernel environment before re-running.
from config.settings import settings
from state.agent_state import create_initial_state
from agents.main_agent import financial_agent

# Validate configuration
# Runs the settings object's own validation up front, before any agent call.
settings.validate()



ModuleNotFoundError: No module named 'langchain_core'

In [None]:
# Cell 2: Test Case 1 - Retirement Planning
print("=" * 60)
print("TEST 1: Retirement Planning")
print("=" * 60)

# Free-text user request that the agent has to break down into tasks.
request_1 = """
I'm 35 years old, making $80,000/year. I have $50,000 in student loans 
and $10,000 in credit card debt. I want to retire at 65 with a comfortable 
lifestyle. Can you help me create a comprehensive financial plan?
"""

# Build the starting state for this request (max_iterations=5) and run the agent.
state_1 = create_initial_state(request_1, max_iterations=5)
result_1 = financial_agent.invoke(state_1)

# List the generated tasks. The emoji is written as a real Unicode character
# (it was previously mojibake from a UTF-8 / Mac Roman mix-up).
# Assumes todo['id'] is a zero-based integer, shown 1-based — TODO confirm.
print(f"\n📋 Generated {len(result_1['todos'])} tasks:")
for todo in result_1['todos']:
    print(f"  {todo['id'] + 1}. {todo['description']}")



In [None]:
# Cell 3: Test Case 2 - Emergency Fund
print("\n" + "=" * 60)
print("TEST 2: Emergency Fund Planning")
print("=" * 60)

# Free-text user request that the agent has to break down into tasks.
request_2 = """
I need to build an emergency fund. I have no savings currently, 
make $4,000/month, and spend about $3,500/month. How should I approach this?
"""

# Build the starting state for this request (max_iterations=5) and run the agent.
state_2 = create_initial_state(request_2, max_iterations=5)
result_2 = financial_agent.invoke(state_2)

# List the generated tasks. The emoji is written as a real Unicode character
# (it was previously mojibake from a UTF-8 / Mac Roman mix-up).
# Assumes todo['id'] is a zero-based integer, shown 1-based — TODO confirm.
print(f"\n📋 Generated {len(result_2['todos'])} tasks:")
for todo in result_2['todos']:
    print(f"  {todo['id'] + 1}. {todo['description']}")



In [None]:
# Cell 4: Test Case 3 - Investment Strategy
print("\n" + "=" * 60)
print("TEST 3: Investment Portfolio Rebalancing")
print("=" * 60)

# Free-text user request that the agent has to break down into tasks.
request_3 = """
I'm 45 years old with $200,000 in my 401(k), all in stocks. I'm worried 
about market volatility as I approach retirement. Should I rebalance? 
What's the right asset allocation for my age?
"""

# Build the starting state for this request (max_iterations=5) and run the agent.
state_3 = create_initial_state(request_3, max_iterations=5)
result_3 = financial_agent.invoke(state_3)

# List the generated tasks. The emoji is written as a real Unicode character
# (it was previously mojibake from a UTF-8 / Mac Roman mix-up).
# Assumes todo['id'] is a zero-based integer, shown 1-based — TODO confirm.
print(f"\n📋 Generated {len(result_3['todos'])} tasks:")
for todo in result_3['todos']:
    print(f"  {todo['id'] + 1}. {todo['description']}")



In [None]:
# Cell 5: View Full Trace in LangSmith
# Prints the manual review checklist; nothing is computed here. The emoji are
# written as real Unicode characters (they were previously mojibake).
print("\n" + "=" * 60)
print("📊 EVALUATION INSTRUCTIONS")
print("=" * 60)
print("""
Now go to your LangSmith dashboard:
1. Navigate to: https://smith.langchain.com/
2. Click on your project: 'financial-planning-agent'
3. You should see 3 runs (one for each test)
4. Click on each run to see the trace tree
5. Look for the 'write_todos' tool call and verify the tasks

SUCCESS CRITERIA:
✅ Agent uses write_todos in all 3 tests
✅ Generated task lists are logical and comprehensive
✅ Tasks are specific to the financial scenario
✅ Tasks follow a sensible execution order
""")



In [None]:
# Cell 6: Inspect State Directly
print("\n" + "=" * 60)
print("🔍 STATE INSPECTION")
print("=" * 60)

# Report the same three counters for every run. Previously the block was
# copy-pasted per test and only Test 1 showed the message count, which made
# the runs hard to compare side by side.
for run_label, run_result in (
    ("Test 1", result_1),
    ("Test 2", result_2),
    ("Test 3", result_3),
):
    print(f"\n{run_label} State:")
    print(f"  Todos created: {len(run_result['todos'])}")
    print(f"  Iterations: {run_result['iteration_count']}")
    print(f"  Messages exchanged: {len(run_result['messages'])}")



In [None]:
# Cell 7: Manual Quality Check
# Section banner; the check mark is written as a real Unicode character
# (it was previously mojibake from a UTF-8 / Mac Roman mix-up).
print("\n" + "=" * 60)
print("✅ MANUAL QUALITY CHECK")
print("=" * 60)

def evaluate_task_quality(todos, context):
    """Score a generated task list with three simple heuristic checks.

    One point is awarded for each check that passes:

    1. The list contains at least three tasks.
    2. At least 60% of the tasks mention one of a fixed set of action verbs
       (an empty list never passes this check).
    3. At least one task mentions a meaningful keyword taken from the user
       request (filler words and very short tokens are ignored).

    A status line is printed for every check, followed by the overall score.

    Args:
        todos: Sequence of mappings, each with a 'description' string.
        context: The user's request text the tasks were generated for.

    Returns:
        The fraction of checks passed, as a float between 0.0 and 1.0.
    """
    def _words(text):
        """Lower-case `text` and split it on every non-alphanumeric character."""
        return "".join(ch if ch.isalnum() else " " for ch in text.lower()).split()

    # Filler words that say nothing about the topic of the request. Without
    # this filter (and the length filter below) tokens such as "i", "a" or
    # "to" match as substrings of almost any description, which made the
    # relevance check pass unconditionally.
    stopwords = {
        "about", "also", "been", "could", "currently", "from", "have",
        "help", "into", "make", "making", "much", "need", "should", "some",
        "that", "their", "them", "there", "they", "this", "want", "what",
        "when", "will", "with", "would", "your",
    }
    min_keyword_len = 4   # drops "i", "an", "to", bare number fragments, ...
    max_keywords = 10     # same cap the check has always used

    descriptions = [todo['description'].lower() for todo in todos]
    score = 0
    total = 3

    # Check 1: Has tasks
    if len(todos) >= 3:
        score += 1
        print(f"✅ Has sufficient tasks ({len(todos)} tasks)")
    else:
        print(f"❌ Too few tasks ({len(todos)} tasks)")

    # Check 2: Tasks are specific (not generic)
    specific_keywords = ['calculate', 'analyze', 'gather', 'estimate', 'create', 'develop']
    specific_count = sum(
        1 for text in descriptions
        if any(kw in text for kw in specific_keywords)
    )
    # `todos and ...` stops an empty list from passing via 0 >= 0.
    if todos and specific_count >= len(todos) * 0.6:
        score += 1
        print("✅ Tasks are specific and actionable")
    else:
        print("❌ Tasks are too generic")

    # Check 3: Relevant to context
    # Punctuation is stripped by _words, so "fund." still matches "fund".
    context_keywords = [
        word for word in _words(context)
        if len(word) >= min_keyword_len and word not in stopwords
    ][:max_keywords]
    relevant_count = sum(
        1 for text in descriptions
        if any(kw in text for kw in context_keywords)
    )
    if relevant_count >= 1:
        score += 1
        print("✅ Tasks are relevant to user request")
    else:
        print("❌ Tasks don't match user context")

    print(f"\n📊 Quality Score: {score}/{total} ({score/total*100:.0f}%)")
    return score / total

# Score each test's task list against the request that produced it.
print("\nTest 1 Quality:")
eval_1 = evaluate_task_quality(result_1['todos'], request_1)

print("\nTest 2 Quality:")
eval_2 = evaluate_task_quality(result_2['todos'], request_2)

print("\nTest 3 Quality:")
eval_3 = evaluate_task_quality(result_3['todos'], request_3)

# Milestone passes when the mean score across the three tests reaches this
# threshold; the same constant feeds the "Target" text so they cannot drift.
PASS_THRESHOLD = 0.8

avg_score = (eval_1 + eval_2 + eval_3) / 3
# Emoji are written as real Unicode characters (previously mojibake).
print(f"\n🎯 MILESTONE 1 SUCCESS RATE: {avg_score*100:.0f}%")
print(f"{'✅ PASSED' if avg_score >= PASS_THRESHOLD else '❌ NEEDS IMPROVEMENT'} (Target: {PASS_THRESHOLD*100:.0f}%)")