# NSAI Model Validation: Level 0 vs Level 1A vs Level 1B

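**Purpose**: Compare all three architectural approaches on unseen, unlabeled test data. No ground-truth intents are available, so the models are compared by their predictions and decision states rather than by accuracy.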

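**Models**:
- **Level 0**: Baseline TF-IDF + Logistic Regression (confidence thresholding)
- **Level 1A**: Single multi-class classifier + symbolic rules
- **Level 1B**: Multi-detector architecture + explicit rule types

The Level 2 deterministic pipeline (clause extraction → normalization → validation) is also run on each utterance, using the Level 1B intent as context. Its output is recorded in the results but is not part of the three-way comparison.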

**Test Data**: `test_data.csv` (20 examples, 5 per intent, NOT in training set)


## Part 1: Load All Three Models

In [None]:
# Imports
import pandas as pd
import sys
import os
import json

# Add workspace-relative paths for modules (robustly find repo root)
def find_repo_root(start_dir=None):
    """Walk upward from a directory until a repository marker is found.

    A directory counts as the repository root when it contains either a
    ``requirements.txt`` file or a ``.git`` entry.

    Args:
        start_dir: Directory to start from. Defaults to the current working
            directory. Relative paths are resolved against the current
            working directory before walking.

    Returns:
        The first directory (the start directory or one of its ancestors)
        that holds a marker, or the current working directory if the
        filesystem root is reached without finding one.
    """
    # Absolutize first: os.path.dirname('.') is '', so a relative start would
    # otherwise stop after one step instead of climbing to its ancestors.
    d = os.path.abspath(start_dir or os.getcwd())
    while True:
        if any(os.path.exists(os.path.join(d, marker)) for marker in ('requirements.txt', '.git')):
            return d
        parent = os.path.dirname(d)
        if parent == d:
            # Reached the filesystem root without seeing a marker.
            return os.getcwd()
        d = parent
repo_root = find_repo_root()
# Add repository root so top-level packages like 'level0' and 'level1' import correctly.
# Skip the insert when the root is already first on sys.path, so re-running
# this cell does not keep stacking duplicate entries.
if sys.path[:1] != [repo_root]:
    sys.path.insert(0, repo_root)

from level0.level0_model import Level0Classifier
from level1.level1b_model import Level1BClassifier
# Import Level1 utilities (pipeline + rule functions) for fallback training
from level1.level1_model import create_level1_pipeline, extract_signals, apply_decision_rules

# Import Level2 deterministic pipeline pieces
from level2 import clause_extractor, normalizer, validator
try:
    from level2.llm_adapter import adapter as llm_adapter
except Exception:
    llm_adapter = None

# Lightweight L2 pipeline helper used only for validation reporting
def l2_pipeline(utterance: str, intent: str, use_adapter: bool = False) -> dict:
    """Run the Level 2 extract -> normalize -> validate steps for one utterance.

    Args:
        utterance: Raw user text passed to the clause extractor (and to the
            LLM adapter when it is used).
        intent: Intent label handed to ``validator.validate`` together with
            the normalized clauses.
        use_adapter: Request the optional LLM adapter. The adapter is only
            called when this is truthy AND the adapter module imported
            successfully.

    Returns:
        A dict with keys ``clauses`` (normalized clauses), ``decision_state``,
        ``ambiguity``, ``decision_trace`` and ``metadata``.
        ``metadata['adapter_used']`` is True only when the adapter was
        actually invoked, not merely requested.
    """
    adapter_result = {}
    # The adapter import is optional (llm_adapter is None when it failed), so
    # track whether it really ran instead of echoing the caller's request.
    adapter_called = bool(use_adapter) and llm_adapter is not None
    if adapter_called:
        # Treat a falsy adapter response as "no clauses".
        adapter_result = llm_adapter.extract_clauses(utterance) or {}
    # The extractor still receives the caller's flag unchanged; only the
    # reported metadata below reflects whether the adapter was available.
    clauses, detectors = clause_extractor.extract_candidates(
        utterance, use_adapter=use_adapter, adapter_result=adapter_result
    )
    normalized = normalizer.normalize_clauses(clauses)
    decision_state, details, ambiguity = validator.validate(normalized, intent)
    decision_trace = {
        'detectors_fired': detectors,
        'hard_rules_failed': details.get('hard_rules_failed', []),
        'soft_rules_passed': details.get('soft_rules_passed', []),
        # Always empty here: this helper does not compute eliminated alternatives.
        'alternatives_eliminated': {},
    }
    return {
        'clauses': normalized,
        'decision_state': decision_state,
        'ambiguity': ambiguity,
        'decision_trace': decision_trace,
        'metadata': {'adapter_used': adapter_called},
    }

print('✓ Imports successful')

✓ Imports successful


In [18]:
# Load Level 0 and Level 1B models; provide Level 1A fallback if model missing
# NOTE: model and data paths below are relative to the notebook's working directory.
level0 = Level0Classifier.load('../level0/models')

# Level 1B (multi-detector) - load from saved detectors
level1b = Level1BClassifier.load('../level1/models/level1b')

# Level 1A (multi-class) - try to load saved model, otherwise train a local pipeline fallback
level1a_model_dir = '../level1/models/level1a'
level1a = None

if os.path.isdir(level1a_model_dir):
    try:
        # If there's a saved Level1Classifier class, attempt to load it
        from level1.level1_model import Level1Classifier
        level1a = Level1Classifier.load(level1a_model_dir)
        print('✓ Loaded Level1A from models')
    except Exception as exc:
        # Best-effort load: fall through to the fallback below, but say why,
        # so a broken saved model is not mistaken for a missing one.
        level1a = None
        print(f'Level1A load from {level1a_model_dir} failed ({type(exc).__name__}: {exc})')

if level1a is None:
    print('Level1A model not found; training a local Level1A pipeline fallback (on full dataset)')
    # Train a quick Level1 pipeline using the shared dataset
    df_all = pd.read_csv('../data/intents_base.csv')
    # Normalize labels so casing/whitespace variants collapse to one class
    df_all['intent'] = df_all['intent'].str.lower().str.strip()
    X_all = df_all['utterance']
    y_all = df_all['intent']

    level1_pipeline = create_level1_pipeline()
    level1_pipeline.fit(X_all, y_all)

    # Wrapper to provide .predict(text) interface matching notebook expectations
    class Level1AWrapper:
        """Expose a fitted Level 1 pipeline through a ``predict(text)`` method."""

        def __init__(self, pipeline):
            """Store the fitted pipeline that ``extract_signals`` will be given."""
            self.pipeline = pipeline

        def predict(self, text):
            """Return the rule decision for ``text`` merged with selected model signals.

            Calls ``extract_signals(text, pipeline)`` and then
            ``apply_decision_rules(signals)``, and returns one flat dict
            combining fields from both results.
            """
            signals = extract_signals(text, self.pipeline)
            rules_out = apply_decision_rules(signals)
            return {
                'predicted_intent': rules_out['predicted_intent'],
                'decision_state': rules_out['decision_state'],
                'decision_reason': rules_out['decision_reason'],
                'max_confidence': signals['max_confidence'],
                'margin': signals['margin'],
                'meaningful_tokens': signals['meaningful_tokens'],
                'probabilities': signals['probabilities'],
                'triggered_rules': rules_out['triggered_rules']
            }

    level1a = Level1AWrapper(level1_pipeline)
    print('✓ Level1A fallback pipeline trained')

print('\n✓ All models ready (Level0, Level1A, Level1B)')


✓ Loaded Level 0 model from ..\level0\models
✓ Loaded Level 1B model from ..\level1\models\level1b
  Detectors: ['investigate', 'execution', 'summarization', 'out_of_scope']
Level1A model not found; training a local Level1A pipeline fallback (on full dataset)
✓ Level1A fallback pipeline trained

✓ All models ready (Level0, Level1A, Level1B)


## Part 2: Load Test Data (Not in Training Set) and Run Predictions

In [19]:
# Read the held-out utterances (the CSV provides the utterance text only)
test_df = pd.read_csv('test_data.csv')

# Report how many rows were read, then preview the first ten
example_count = len(test_df)
preview = test_df.head(10)
print(f'Test data loaded: {example_count} examples')
print("\nSample data:")
print(preview)


Test data loaded: 20 examples

Sample data:
                                           utterance
0    what's causing the spike in CPU usage on web-01
1  can you check why memory is so high on prod se...
2  find out what made the service crash this morning
3           look into the sudden increase in latency
4    diagnose why the app is running slow on staging
5                         reboot the web server farm
6              scale our API tier up to 10 instances
7           terminate the frozen process on worker-5
8                  push the hotfix to production now
9          clean up old logs from all database nodes


In [None]:
# Run predictions from all three models (plus the Level 2 pipeline) on every
# test utterance and collect one flat record per utterance.
results = []

# Only the utterance text is needed, so iterate the column directly rather
# than materializing a row Series per example with iterrows().
for utterance in test_df['utterance']:
    # Level 0
    l0_result = level0.predict(utterance, return_probabilities=True)

    # Level 1A
    l1a_result = level1a.predict(utterance)

    # Level 1B
    l1b_result = level1b.predict(utterance)

    # Level 2 (deterministic pipeline) - use L1B intent as context when available,
    # otherwise fall back to the Level 0 intent.
    try:
        l2_intent = l1b_result.get('predicted_intent') or l0_result.get('intent')
    except AttributeError:
        # Only tolerate a result object without .get(); anything else is a
        # real error and should surface instead of being swallowed.
        l2_intent = l0_result.get('intent')
    l2_result = l2_pipeline(utterance, intent=l2_intent, use_adapter=False)

    # Collapse the Level 2 decision state into a coarse first-pass outcome;
    # any state other than the two named ones counts as a failure.
    l2_state = l2_result.get('decision_state')
    if l2_state == 'accepted':
        l2_first_pass = 'success'
    elif l2_state == 'needs_clarification':
        l2_first_pass = 'partial'
    else:
        l2_first_pass = 'failure'

    # Compile results
    results.append({
        'utterance': utterance,

        # Level 0
        'l0_intent': l0_result['intent'],
        'l0_confidence': l0_result['confidence'],
        'l0_abstained': l0_result['abstained'],

        # Level 1A
        'l1a_intent': l1a_result['predicted_intent'],
        'l1a_decision_state': l1a_result['decision_state'],
        'l1a_decision_reason': l1a_result['decision_reason'],
        'l1a_confidence': l1a_result['max_confidence'],

        # Level 1B
        'l1b_intent': l1b_result['predicted_intent'],
        'l1b_decision_state': l1b_result['decision_state'],
        'l1b_decision_reason': l1b_result['decision_reason'],
        # Score of whichever detector Level 1B reports as its top detector
        'l1b_top_score': l1b_result['detector_scores'][l1b_result['top_detector']],
        'l1b_score_margin': l1b_result['score_margin'],
        'l1b_decision_trace': json.dumps(l1b_result.get('decision_trace', {})),

        # Level 2 (nested structures are stored as JSON strings so they fit in CSV cells)
        'l2_decision_state': l2_state,
        'l2_first_pass': l2_first_pass,
        'l2_ambiguity': json.dumps(l2_result.get('ambiguity', {})),
        'l2_clauses': json.dumps(l2_result.get('clauses', {})),
        'l2_decision_trace': json.dumps(l2_result.get('decision_trace', {}))
    })

results_df = pd.DataFrame(results)
print(f'✓ Predictions complete: {len(results_df)} examples')

✓ Predictions complete: 20 examples


## Part 3: Compare Model Outputs

In [21]:
# Print a per-utterance, side-by-side view of each model's decision.
# There are no labels, so this is a qualitative comparison only.
banner = '=' * 140
divider = '-' * 140

print(banner)
print('COMPREHENSIVE 3-MODEL COMPARISON (NO GROUND TRUTH)')
print(banner)

for idx, row in results_df.iterrows():
    # Utterance header, numbered from 1
    print(f"\n{divider}")
    print(f"[{idx+1}] {row['utterance']}")
    print(f"{divider}")

    # Level 0: intent, confidence, and an abstention marker when set
    if row['l0_abstained']:
        l0_status = ' (ABSTAINED)'
    else:
        l0_status = ''
    l0_intent, l0_conf = row['l0_intent'], row['l0_confidence']
    print(f"LEVEL 0 : {l0_intent:15s} | conf={l0_conf:.3f}{l0_status}")

    # Level 1A: intent, confidence, then decision state and reason
    l1a_state, l1a_reason = row['l1a_decision_state'], row['l1a_decision_reason']
    l1a_status = f" [{l1a_state}] ({l1a_reason})"
    l1a_intent, l1a_conf = row['l1a_intent'], row['l1a_confidence']
    print(f"LEVEL 1A: {l1a_intent:15s} | conf={l1a_conf:.3f}{l1a_status}")

    # Level 1B: intent, top detector score and margin, then decision state and reason
    l1b_state, l1b_reason = row['l1b_decision_state'], row['l1b_decision_reason']
    l1b_status = f" [{l1b_state}] ({l1b_reason})"
    l1b_intent = row['l1b_intent']
    l1b_score, l1b_margin = row['l1b_top_score'], row['l1b_score_margin']
    print(f"LEVEL 1B: {l1b_intent:15s} | score={l1b_score:.3f}, margin={l1b_margin:.3f}{l1b_status}")

print(f"\n{banner}")
print('COMPARISON COMPLETE')
print(banner)


COMPREHENSIVE 3-MODEL COMPARISON (NO GROUND TRUTH)

--------------------------------------------------------------------------------------------------------------------------------------------
[1] what's causing the spike in CPU usage on web-01
--------------------------------------------------------------------------------------------------------------------------------------------
LEVEL 0 : investigate     | conf=0.789
LEVEL 1A: investigate     | conf=0.608 [accepted] (model_prediction)
LEVEL 1B: investigate     | score=0.516, margin=0.327 [accepted] (R_DEFAULT)

--------------------------------------------------------------------------------------------------------------------------------------------
[2] can you check why memory is so high on prod server
--------------------------------------------------------------------------------------------------------------------------------------------
LEVEL 0 : investigate     | conf=0.815
LEVEL 1A: investigate     | conf=0.757 [accepted] (m

In [22]:
# Summary counts (no accuracy since no ground truth)
l0_abstain_count = sum(results_df['l0_abstained'])
l1a_blocked = sum(results_df['l1a_decision_state'] == 'blocked')
l1a_clarify = sum(results_df['l1a_decision_state'] == 'needs_clarification')
l1b_blocked = sum(results_df['l1b_decision_state'] == 'blocked')
l1b_clarify = sum(results_df['l1b_decision_state'] == 'needs_clarification')

total = len(results_df)


def _pct(count):
    """Return ``count`` as a percentage of ``total``; 0.0 when there are no rows."""
    # Guard the empty-results case, which would otherwise raise ZeroDivisionError.
    return count / total * 100 if total else 0.0


print('\n' + '='*80)
print('SUMMARY (NO GROUND TRUTH)')
print('='*80)
print(f'\nTotal test examples: {total}')
print()
print('Level 0 (Baseline):')
print(f'  Abstained: {l0_abstain_count} ({_pct(l0_abstain_count):.1f}%)')
print()
print('Level 1A (Multi-Class + Rules):')
print(f'  Blocked:       {l1a_blocked} ({_pct(l1a_blocked):.1f}%)')
print(f'  Needs Clarif:  {l1a_clarify} ({_pct(l1a_clarify):.1f}%)')
print()
print('Level 1B (Multi-Detector + Rule Types):')
print(f'  Blocked:       {l1b_blocked} ({_pct(l1b_blocked):.1f}%)')
print(f'  Needs Clarif:  {l1b_clarify} ({_pct(l1b_clarify):.1f}%)')
print('='*80)



SUMMARY (NO GROUND TRUTH)

Total test examples: 20

Level 0 (Baseline):
  Abstained: 17 (85.0%)

Level 1A (Multi-Class + Rules):
  Blocked:       8 (40.0%)
  Needs Clarif:  10 (50.0%)

Level 1B (Multi-Detector + Rule Types):
  Blocked:       18 (90.0%)
  Needs Clarif:  0 (0.0%)


In [23]:
# Persist the per-utterance comparison table (row index omitted from the file)
results_df.to_csv('validation_results.csv', index=False)
print('✓ Results saved to validation_results.csv')

# Echo the column names so the saved schema is visible in the cell output
column_names = results_df.columns.tolist()
print(f"\nColumns: {column_names}")

✓ Results saved to validation_results.csv

Columns: ['utterance', 'l0_intent', 'l0_confidence', 'l0_abstained', 'l1a_intent', 'l1a_decision_state', 'l1a_decision_reason', 'l1a_confidence', 'l1b_intent', 'l1b_decision_state', 'l1b_decision_reason', 'l1b_top_score', 'l1b_score_margin', 'l1b_decision_trace']
