In [None]:
!pip install ./funcroute-0.1.0-py3-none-any.whl

In [2]:
from funcroute import FuncRoute, TrainingConfig
from funcroute.core.config import ToolDefinition
from funcroute.data.generator import SyntheticDataGenerator
from funcroute.data.splitter import PatternGroupSplitter
from funcroute.data.validator import DataValidator

In [3]:
import os

from huggingface_hub import login

# Read the Hugging Face token from the environment instead of hard-coding it in
# the notebook, so the secret is never saved or shared together with this file.
hf_token = os.environ.get("HF_TOKEN", "").strip()
if hf_token:
    login(token=hf_token)
else:
    # Authentication is optional for public models, so a missing token is not
    # treated as an error; gated/private repositories will still require one.
    print("HF_TOKEN is not set - skipping Hugging Face login.")

In [4]:
def _build_example_tools():
    """Return the three e-commerce ``ToolDefinition`` objects used by the example."""
    return [
        ToolDefinition(
            name="manage_order",
            signature="manage_order(order_id: str) -> dict",
            description="Track and manage customer orders, check delivery status, and update shipping information",
            examples=[
                "Where is my order?",
                "Track my package",
                "Order status for #12345",
                "When will my order arrive?",
                "Update shipping address",
            ],
            keywords=["order", "track", "package", "delivery", "shipment", "shipping"],
        ),
        ToolDefinition(
            name="search_products",
            signature="search_products(query: str) -> list",
            description="Search for products in the catalog by name, category, or attributes",
            examples=[
                "Show me red dresses",
                "Find laptops under $1000",
                "Do you have iPhone 15?",
                "Looking for running shoes",
                "Show new arrivals",
            ],
            keywords=["search", "find", "show", "looking", "browse", "products"],
        ),
        ToolDefinition(
            name="process_return",
            signature="process_return(order_id: str, reason: str) -> dict",
            description="Process product returns, refunds, and exchanges",
            examples=[
                "I want to return this",
                "Get a refund for my order",
                "Wrong item received",
                "Item is damaged",
                "Exchange for different size",
            ],
            keywords=["return", "refund", "exchange", "damaged", "wrong", "defective"],
        ),
    ]


def _route_and_score(router, labelled_queries):
    """Route each ``(query, expected_tool)`` pair and print one table row per query.

    Returns the number of queries whose predicted tool equals the expected tool.
    """
    correct = 0
    for query, expected in labelled_queries:
        result = router.route(query)
        is_correct = result.tool == expected
        if is_correct:
            correct += 1
        marker = "✅" if is_correct else "❌"
        print(f"{query:<40s} {result.tool:<20s} {result.confidence:>11.1%} {marker}")
    return correct


def main(output_dir="./simple_router", num_epochs=3):
    """Run the complete FuncRoute example: define, generate, split, validate, train, test.

    Args:
        output_dir: Directory passed to ``TrainingConfig`` as the model output
            location and later passed to ``FuncRoute.load``.
        num_epochs: Number of training epochs passed to ``TrainingConfig``.

    Raises:
        ValueError: If data generation yields no samples, if the validator
            reports the training data as invalid, or if the leakage check
            between the train and test splits fails.
    """
    print("=" * 80)
    print("FuncRoute Simple Example - Complete Workflow")
    print("=" * 80)

    # =========================================================================
    # Step 1: Define Tools
    # =========================================================================
    print("\n[Step 1/6] Defining tools...")

    tools = _build_example_tools()

    print(f"✅ Defined {len(tools)} tools:")
    for tool in tools:
        print(f"   - {tool.name}")

    # =========================================================================
    # Step 2: Generate Synthetic Training Data
    # =========================================================================
    print("\n[Step 2/6] Generating synthetic training data...")
    print("   Target: ~5000 samples with pattern groups")

    generator = SyntheticDataGenerator(method="rule_based")

    # Generate data with pattern groups
    # num_variations controls pattern diversity
    # num_samples is total target (actual may vary based on patterns)
    data = generator.generate(
        tools=tools,
        num_variations=50,  # 50 variations per pattern
        num_samples=5000,   # Target ~5000 samples
    )

    # Fail early with a clear message; the split percentages below divide by
    # len(data) and would otherwise raise an opaque ZeroDivisionError.
    if not data:
        raise ValueError("Synthetic data generation produced no samples")

    print(f"✅ Generated {len(data)} training samples")

    # Show sample data
    print("\n   Sample data:")
    for i, sample in enumerate(data[:3], 1):
        print(f"   {i}. '{sample['query']}' → {sample['tool']}")

    # =========================================================================
    # Step 3: Split Data (Pattern Group Anti-Leakage)
    # =========================================================================
    print("\n[Step 3/6] Splitting data with pattern group anti-leakage...")

    splitter = PatternGroupSplitter(seed=42)
    train_data, val_data, test_data = splitter.split(
        data,
        train_ratio=0.7,
        val_ratio=0.15,
        test_ratio=0.15,
        verify_no_leakage=True,  # Automatic leakage check
    )

    total = len(data)
    print("\n✅ Data split complete:")
    print(f"   Training:   {len(train_data)} samples ({len(train_data)/total*100:.1f}%)")
    print(f"   Validation: {len(val_data)} samples ({len(val_data)/total*100:.1f}%)")
    print(f"   Test:       {len(test_data)} samples ({len(test_data)/total*100:.1f}%)")

    # =========================================================================
    # Step 4: Validate Data (Optional but Recommended)
    # =========================================================================
    print("\n[Step 4/6] Validating data quality...")

    validator = DataValidator()

    # Validate data quality
    print("   Checking data quality...")
    report = validator.validate(train_data, min_samples_per_tool=100)
    if not report['is_valid']:
        print("   ❌ Data validation failed:")
        for error in report['errors']:
            print(f"      - {error}")
        raise ValueError("Data validation failed!")
    print("   ✅ Data quality is good")

    # Check for data leakage (already done by splitter, but good to verify)
    print("   Double-checking for data leakage...")
    no_leakage = validator.check_leakage(train_data, test_data)
    if not no_leakage:
        raise ValueError("❌ Data leakage detected!")

    # =========================================================================
    # Step 5: Train Model
    # =========================================================================
    print("\n[Step 5/6] Training routing model...")
    print("   This may take 10-20 minutes depending on your hardware...")

    router = FuncRoute()

    router.train(
        train_data=train_data,
        val_data=val_data,
        tools=tools,  # CRITICAL: Must provide tools!
        config=TrainingConfig(
            output_dir=output_dir,
            num_epochs=num_epochs,  # 3 epochs by default
            batch_size=4,           # Adjust based on GPU memory
            learning_rate=2e-4,     # Standard for fine-tuning
            save_steps=100,         # Save checkpoints every 100 steps
            eval_strategy="epoch",  # Evaluate at end of each epoch
            logging_steps=10,       # Log every 10 steps
            warmup_ratio=0.1,       # 10% warmup
            save_total_limit=2,     # Keep only 2 checkpoints
        ),
    )

    print("\n✅ Training complete!")
    print(f"   Model saved to: {output_dir}")

    # =========================================================================
    # Step 6: Test Trained Model
    # =========================================================================
    print("\n[Step 6/6] Testing trained model...")

    # Single source of truth: each query is stored next to its expected tool,
    # so the query list and the expectations can never drift apart.
    labelled_queries = [
        ("Where is my package?", "manage_order"),
        ("Show me laptops under $800", "search_products"),
        ("I want my money back", "process_return"),
        ("Track order #12345", "manage_order"),
        ("Find wireless keyboards", "search_products"),
        ("Return defective item", "process_return"),
        ("When will my order arrive?", "manage_order"),
        ("Do you have iPhone cases?", "search_products"),
        ("Exchange for different color", "process_return"),
        ("Search for running shoes", "search_products"),
    ]

    print(f"\n📊 Testing with {len(labelled_queries)} queries:\n")
    print(f"{'Query':<40s} {'Tool':<20s} {'Confidence':>12s}")
    print("-" * 80)

    correct = _route_and_score(router, labelled_queries)

    # NOTE: this accuracy covers only the hand-written queries above; the
    # held-out test_data split is not evaluated by this example.
    accuracy = correct / len(labelled_queries) * 100
    print("-" * 80)
    print(f"\n📈 Accuracy: {correct}/{len(labelled_queries)} ({accuracy:.1f}%)")

    # =========================================================================
    # Bonus: Load Model and Re-test
    # =========================================================================
    print("\n" + "=" * 80)
    print("Bonus: Loading Saved Model")
    print("=" * 80)

    print("\nLoading model from disk...")
    loaded_router = FuncRoute.load(output_dir)
    print("✅ Model loaded successfully!")

    print("\nTesting loaded model with new queries...")

    new_queries = [
        "Check my order status",
        "Looking for blue jeans",
        "I want a refund",
    ]

    print(f"\n{'Query':<40s} {'Tool':<20s} {'Confidence':>12s}")
    print("-" * 80)

    for query in new_queries:
        result = loaded_router.route(query)
        print(f"{query:<40s} {result.tool:<20s} {result.confidence:>11.1%}")

    # =========================================================================
    # Summary
    # =========================================================================
    print("\n" + "=" * 80)
    print("✅ Example Complete!")
    print("=" * 80)

    print("\nWhat we did:")
    print(f"  1. ✅ Defined {len(tools)} tools with proper metadata")
    print(f"  2. ✅ Generated {len(data)} synthetic samples with pattern groups")
    print("  3. ✅ Split data with anti-leakage (70/15/15)")
    print("  4. ✅ Validated data quality and checked for leakage")
    print(f"  5. ✅ Trained model for {num_epochs} epochs")
    print(f"  6. ✅ Saved model to {output_dir}")
    print("  7. ✅ Loaded model and tested predictions")

    print("\nResults:")
    print(f"  📊 Test Accuracy: {accuracy:.1f}%")
    print(f"  💾 Model Location: {output_dir}")
    print(f"  📈 Training Samples: {len(train_data)}")
    print(f"  📉 Validation Samples: {len(val_data)}")
    print(f"  🧪 Test Samples: {len(test_data)}")

    print("\nNext steps:")
    print(f"  - Load model: FuncRoute.load('{output_dir}')")
    print("  - Make predictions: router.route('your query')")
    print("  - Deploy as API: See examples/server_example.py")
    print(f"  - Use CLI: funcroute predict --model {output_dir} --query 'test'")

    print("\n" + "=" * 80)

In [5]:
# Run the full example workflow: define tools, generate and split data,
# validate it, train the router, then test the trained and reloaded model.
main()

FuncRoute Simple Example - Complete Workflow

[Step 1/6] Defining tools...
✅ Defined 3 tools:
   - manage_order
   - search_products
   - process_return

[Step 2/6] Generating synthetic training data...
   Target: ~5000 samples with pattern groups

RULE-BASED SYNTHETIC DATA GENERATION
Method: Rule-based pattern expansion
Variations per example: 50
Tools: 3


Processing tools: 100%|██████████| 3/3 [00:00<00:00, 483.55it/s]


✅ Generated 15 pattern groups
✅ Total samples: 750
✅ Generated 750 training samples

   Sample data:
   1. 'Get a refund for my order' → process_return
   2. 'Excuse me, get a refund for my order?' → process_return
   3. 'This is urgent - Please get a refund for my order pls' → process_return

[Step 3/6] Splitting data with pattern group anti-leakage...

Pattern Group Splitting:
  Total groups: 15
  Total samples: 750
  Train groups: 10
  Val groups: 2
  Test groups: 3

Expanded to samples:
  Train: 500
  Val: 100
  Test: 150

DATA LEAKAGE CHECK
Train-Val overlap: 0 queries
Train-Test overlap: 0 queries
Val-Test overlap: 0 queries
✅ NO DATA LEAKAGE - Splits are clean!

✅ Data split complete:
   Training:   500 samples (66.7%)
   Validation: 100 samples (13.3%)
   Test:       150 samples (20.0%)

[Step 4/6] Validating data quality...
   Checking data quality...
   ✅ Data quality is good
   Double-checking for data leakage...

✅ NO DATA LEAKAGE
   Train: 500 unique queries
   Test: 150 


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/1.16M [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/4.69M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/33.4M [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/63.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/706 [00:00<?, ?B/s]

chat_template.jinja:   0%|          | 0.00/13.8k [00:00<?, ?B/s]

✅ Tokenizer loaded
Loading model (this may take 1-2 minutes)...


config.json:   0%|          | 0.00/1.32k [00:00<?, ?B/s]

`torch_dtype` is deprecated! Use `dtype` instead!


model.safetensors:   0%|          | 0.00/536M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/176 [00:00<?, ?B/s]

✅ Model loaded: 0.22B parameters
Setting up LoRA...
trainable params: 1,474,560 || all params: 269,572,736 || trainable%: 0.5470
✅ LoRA adapters applied

[5/6] Training...

TRAINING CONFIGURATION
Epochs: 3
Batch size: 4
Gradient accumulation: 4
Effective batch size: 16
Learning rate: 0.0002
Output dir: ./simple_router

Initializing SFTTrainer...
Using 'processing_class' parameter (trl >= 0.8.0)


Adding EOS to train dataset:   0%|          | 0/500 [00:00<?, ? examples/s]

Tokenizing train dataset:   0%|          | 0/500 [00:00<?, ? examples/s]

Truncating train dataset:   0%|          | 0/500 [00:00<?, ? examples/s]

Adding EOS to eval dataset:   0%|          | 0/100 [00:00<?, ? examples/s]

Tokenizing eval dataset:   0%|          | 0/100 [00:00<?, ? examples/s]

Truncating eval dataset:   0%|          | 0/100 [00:00<?, ? examples/s]

The tokenizer has new PAD/BOS/EOS tokens that differ from the model config and generation config. The model config and generation config were aligned accordingly, being updated with the tokenizer's values. Updated tokens: {'eos_token_id': 1, 'bos_token_id': 2, 'pad_token_id': 1}.


✅ Trainer initialized

STARTING TRAINING


  return fn(*args, **kwargs)


Epoch,Training Loss,Validation Loss,Entropy,Num Tokens,Mean Token Accuracy
1,0.6002,0.475294,0.444778,62442.0,0.92661
2,0.2147,0.407035,0.31045,124884.0,0.936963
3,0.1778,0.388875,0.29942,187326.0,0.936311


  return fn(*args, **kwargs)
  return fn(*args, **kwargs)



✅ TRAINING COMPLETE!
Training time: 4.1 minutes

[6/6] Saving model...

Saving model to: ./simple_router


The following generation flags are not valid and may be ignored: ['temperature', 'top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


✅ Model saved
✅ Tool definitions saved to: simple_router/tool_definitions.json

✅ TRAINING COMPLETE
Model saved to: ./simple_router

To use the model:
  router = FuncRoute.load("./simple_router")
  result = router.route("your query")

✅ Training complete!
   Model saved to: ./simple_router

[Step 6/6] Testing trained model...

📊 Testing with 10 queries:

Query                                    Tool                   Confidence
--------------------------------------------------------------------------------
Where is my package?                     manage_order              100.0% ✅
Show me laptops under $800               search_products           100.0% ✅
I want my money back                     process_return            100.0% ✅
Track order #12345                       manage_order              100.0% ✅
Find wireless keyboards                  search_products           100.0% ✅
Return defective item                    process_return            100.0% ✅
When will my order arrive?     