## Setup & Configuration

In [None]:
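# Install required dependencies if not already installed.
# Uncomment the line below when running for the first time. %pip (rather than
# !pip) installs into the environment of the running kernel.
# `transformers`, imported in the next cell, must also be available.
# %pip install clickhouse-connect redis datasketch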

In [3]:
# Import required libraries
from tools.kato_client import KATOClient
from transformers import AutoTokenizer
from typing import List, Dict, Any, Tuple
import json

# Optional ClickHouse / Redis storage utilities (portable local copies of the
# KATO storage utilities). This import is currently disabled; uncomment it to
# make the helpers available.
# from tools.kato_storage import (
#     get_clickhouse_client,
#     get_redis_client,
#     ClickHouseWriter,
#     RedisWriter
# )

# Reached only if every import above succeeded
print("✓ Imports loaded")

PyTorch was not found. Models won't be available and only tokenizers, configuration and file/data utilities can be used.


✓ Imports loaded


In [4]:
# ============================================================================
# CONFIGURATION - MODIFY THESE TO MATCH YOUR TRAINING
# ============================================================================

# Hierarchical configuration (MUST match training!)
# CHUNK_SIZES: how many tokens/patterns to process at each level
# MAX_PREDICTIONS: how many predictions in each ensemble sent to the next level
# Both lists hold one entry per level, in the order [node0, node1, node2, node3].
CHUNK_SIZES = [8, 8, 8, 8]
MAX_PREDICTIONS = [10, 10, 10, 10]

# Tokenizer (must match training)
TOKENIZER_NAME = "gpt2"  # Options: "gpt2", "bert-base-uncased", "roberta-base", etc.

# Recall threshold (pattern matching strictness)
# Range: 0.0-1.0
#   0.1 = permissive (many matches, lower quality)
#   0.9 = strict (few matches, higher quality)
# Default: 0.6 (balanced)
RECALL_THRESHOLD_DEFAULT = 0.6

# KATO server URL
BASE_URL = "http://kato:8000"

# Fail fast on inconsistent settings: later cells index these lists by level,
# so a length mismatch would otherwise only show up as an IndexError further down.
assert len(CHUNK_SIZES) == len(MAX_PREDICTIONS), (
    "CHUNK_SIZES and MAX_PREDICTIONS must have one entry per level"
)
assert all(size > 0 for size in CHUNK_SIZES), "every chunk size must be positive"
assert all(count > 0 for count in MAX_PREDICTIONS), "every max_predictions value must be positive"
assert 0.0 <= RECALL_THRESHOLD_DEFAULT <= 1.0, "RECALL_THRESHOLD_DEFAULT must be within 0.0-1.0"

print("✓ Configuration loaded")
print(f"  Chunk sizes: {CHUNK_SIZES}")
print(f"  Max predictions: {MAX_PREDICTIONS}")

✓ Configuration loaded
  Chunk sizes: [8, 8, 8, 8]
  Max predictions: [10, 10, 10, 10]


In [27]:
# ============================================================================
# KATO CLIENT INITIALIZATION - ONE PER NODE
# ============================================================================
#
# Each hierarchical level talks to KATO through its own client. This cell
# creates the client for level 0 (node0).
#
# Client settings
# ---------------
# node_id
#     MUST match your training configuration. Check your MongoDB databases
#     to find the correct node_ids (databases are named {node_id}_kato).
# max_pattern_length=0
#     Prediction mode (no auto-learning).
# recall_threshold
#     Pattern matching strictness:
#       High   (0.7-0.9): strict matching, fewer but higher-quality predictions
#       Medium (0.4-0.6): balanced (default: 0.6)
#       Low    (0.1-0.3): permissive matching, more predictions (useful for novel inputs)
# max_predictions
#     Number of predictions per ensemble (KATO config):
#       - KATO returns the top N predictions per call
#       - the entire ensemble is sent as ONE event to the next level
#       - KATO's pattern matching handles missing/extra symbols gracefully
#       - higher values: more context but slower, potentially noisy
#       - lower values: faster but may miss important patterns
# process_predictions=False
#     Set to False so that every event doesn't trigger prediction processing;
#     the get_predictions API call triggers processing when we need it.
# timeout
#     Passed straight to the client (unit presumably seconds - TODO confirm).

print("Initializing KATO clients...")

node0 = KATOClient(
    node_id="node0",
    base_url=BASE_URL,
    timeout=1200,
    recall_threshold=RECALL_THRESHOLD_DEFAULT,
    max_predictions=MAX_PREDICTIONS[0],
    max_pattern_length=0,
    process_predictions=False,
)

print("\n✓ All KATO clients ready for generation")

Initializing KATO clients...

✓ All KATO clients ready for generation


In [28]:
# Show the session configuration currently in effect for node0
# (the returned dict is displayed as this cell's output).
node0.get_session_config()

{'max_pattern_length': 0,
 'persistence': 5,
 'recall_threshold': 0.6,
 'indexer_type': 'VI',
 'max_predictions': 10,
 'sort_symbols': True,
 'process_predictions': False,
 'use_token_matching': True,
 'stm_mode': 'CLEAR',
 'rank_sort_algo': 'potential',
 'filter_pipeline': [],
 'length_min_ratio': 0.5,
 'length_max_ratio': 2.0,
 'jaccard_threshold': 0.3,
 'jaccard_min_overlap': 2,
 'minhash_threshold': 0.7,
 'minhash_bands': 20,
 'minhash_rows': 5,
 'minhash_num_hashes': 100,
 'bloom_false_positive_rate': 0.01,
 'max_candidates_per_stage': 100000,
 'enable_filter_metrics': True}

## Filter Pipeline Algorithms


Performance Comparison (1.2M patterns)

  | Filter Pipeline        | Database Candidates      | Python Candidates | Est. Time | Accuracy |
  |------------------------|--------------------------|-------------------|-----------|----------|
  | [] (empty)             | 1,200,000                | 1,200,000         | TIMEOUT   | 100%     |
  | ["length"]             | ~500,000 (not effective) | ~500,000          | Minutes   | 100%     |
  | ["jaccard"]            | ~10,000-50,000           | ~10,000-50,000    | 5-30s     | 100%     |
  | ["minhash"]            | ~1,000                   | ~1,000            | <1s       | ~98%     |
  | ["minhash", "jaccard"] | ~500                     | ~500              | <1s       | ~98%     |

  ---
  Recommendations

  For this dataset (1.2M patterns of similar length):

  1. Most performant: ["minhash"] - designed for billion-scale pattern sets, so it handles 1.2M easily
  2. Most accurate: ["jaccard", "rapidfuzz"] - exact matching at a reasonable speed
  3. Best balance: ["minhash"] with minhash_threshold: 0.5 - fast and accurate enough

  Try option 1 (MinHash only) first; it is designed specifically for this scenario.

  Recommended Configuration

  For your 1.2M pattern dataset with similar lengths:


  {
      "filter_pipeline": ["minhash"],
      "minhash_threshold": 0.5,      # Lower = more recall (try 0.4-0.6)
      "recall_threshold": 0.3,       # Final similarity threshold
      "enable_filter_metrics": True
  }


In [29]:
# Switch the node0 session to Jaccard-filtered matching.
#
# Besides the basic session settings (max_pattern_length, persistence,
# recall_threshold, stm_mode, indexer_type, max_predictions, sort_symbols,
# process_predictions, use_token_matching), the filter settings below are
# applied as well: they appear in the session_config that get_session_info()
# returns after this update.
# Note: rank_sort_algo may not be available as a session config parameter.
node0.update_session_config({
    'use_token_matching': True,        # False = Character-level mode, True = Token-level mode
    'filter_pipeline': ['jaccard'],
    'jaccard_threshold': 0.3,          # Token set overlap
    'jaccard_min_overlap': 2,          # At least 2 shared tokens
    # 'minhash_threshold': 0.1,        # Lower = more recall (try 0.4-0.6)
    'recall_threshold': 0.3,           # Sequence similarity; overrides RECALL_THRESHOLD_DEFAULT used at client creation
    'max_predictions': MAX_PREDICTIONS[0],  # keep in step with the configuration cell
    'max_candidates_per_stage': 1000,
})


{'status': 'okay',
 'message': 'Configuration updated',
 'session_id': 'session-f1bf1691a1284ea9a8da5e8bf1fbbc4b-1770403673262'}

In [30]:
# Confirm the session is still usable; the reply carries 'exists' and 'expired' flags.
node0.check_session_exists()

{'exists': True,
 'expired': False,
 'session_id': 'session-f1bf1691a1284ea9a8da5e8bf1fbbc4b-1770403673262'}

In [31]:
# Full session details: node id, creation/expiry timestamps, remaining TTL and
# the session_config, which shows the values set by update_session_config().
node0.get_session_info()

{'session_id': 'session-f1bf1691a1284ea9a8da5e8bf1fbbc4b-1770403673262',
 'node_id': 'node0',
 'created_at': '2026-02-06T18:47:53.262672Z',
 'expires_at': '2026-02-06T19:48:02.809329Z',
 'ttl_seconds': 3599,
 'metadata': {},
 'session_config': {'max_pattern_length': 0,
  'persistence': 5,
  'recall_threshold': 0.3,
  'indexer_type': 'VI',
  'max_predictions': 10,
  'sort_symbols': True,
  'process_predictions': False,
  'use_token_matching': True,
  'stm_mode': 'CLEAR',
  'rank_sort_algo': 'potential',
  'filter_pipeline': ['jaccard'],
  'length_min_ratio': 0.5,
  'length_max_ratio': 2.0,
  'jaccard_threshold': 0.3,
  'jaccard_min_overlap': 2,
  'minhash_threshold': 0.7,
  'minhash_bands': 20,
  'minhash_rows': 5,
  'minhash_num_hashes': 100,
  'bloom_false_positive_rate': 0.01,
  'max_candidates_per_stage': 1000,
  'enable_filter_metrics': True}}

In [33]:
# Start from an empty STM before running the test observations.
node0.clear_stm()

{'status': 'cleared',
 'session_id': 'session-f1bf1691a1284ea9a8da5e8bf1fbbc4b-1770403673262'}

In [34]:
# Verify the STM is empty (expect 'stm': [] and 'length': 0).
node0.get_stm()

{'stm': [],
 'session_id': 'session-f1bf1691a1284ea9a8da5e8bf1fbbc4b-1770403673262',
 'length': 0}

In [39]:

## Test patterns known to be in KB
# Chunks (12) with chunk_size=8:
#   Chunk 0: ['ĠAmong', 'Ġfl', 'ukes', ',', 'Ġthe', 'Ġmost', 'Ġcommon', 'Ġin']  PTRN|6850d8ef6abf023e778693c4d5d9986db464e5cd
#   Chunk 1: ['ĠNorth', 'ĠAmerican', 'Ġwolves', 'Ġis', 'ĠAl', 'aria', ',', 'Ġwhich'] PTRN|b9f48713626a92cd5c533ea1bafea4c1ac50bbf2
#   Chunk 2: ['Ġinfect', 's', 'Ġsmall', 'Ġrodents', 'Ġand', 'Ġamphib', 'ians', 'Ġthat'] PTRN|7729f0ed56a13a9373fc1b1c17e34f61d4512ab4
#   Chunk 3: ['Ġare', 'Ġeaten', 'Ġby', 'Ġwolves', '.', 'ĠUpon', 'Ġreaching', 'Ġmaturity'] PTRN|017e91585ea0db850a387c8756f67da5c1510254
#   Chunk 4: [',', 'ĠAl', 'aria', 'Ġmigr', 'ates', 'Ġto', 'Ġthe', 'Ġwolf']
#   Chunk 5: ["'s", 'Ġintestine', ',', 'Ġbut', 'Ġharms', 'Ġit', 'Ġlittle', '.']
#   Chunk 6: ['ĠMet', 'or', 'ch', 'is', 'Ġconj', 'unct', 'us', ',']
#   Chunk 7: ['Ġwhich', 'Ġenters', 'Ġwolves', 'Ġthrough', 'Ġeating', 'Ġfish', ',', 'Ġinfect']
#   Chunk 8: ['s', 'Ġthe', 'Ġwolf', "'s", 'Ġliver', 'Ġor', 'Ġgall', 'Ġbladder']
#   Chunk 9: [',', 'Ġcausing', 'Ġliver', 'Ġdisease', ',', 'Ġinflammation', 'Ġof', 'Ġthe']
#   Chunk 10: ['Ġpanc', 're', 'as', ',', 'Ġand', 'Ġem', 'ac', 'iation']
#   Chunk 11: ['.', 'ĠMost', 'Ġother', 'Ġfl', 'uke', 'Ġspecies']

# Test chunk (swap in one of the commented alternatives to try another).
# The comma is spelled 'Ġ,' because that is how the stored pattern
# PTRN|6850d8ef... lists it in its pattern_data.
chunk = ['ĠAmong', 'Ġfl', 'ukes', 'Ġ,', 'Ġthe', 'Ġmost', 'Ġcommon', 'Ġin']
# chunk = ['ĠNorth', 'ĠAmerican', 'Ġwolves', 'Ġis', 'ĠAl', 'aria', ',', 'Ġwhich']
# chunk = ['Ġinfect', 's', 'Ġsmall', 'Ġrodents', 'Ġand', 'Ġamphib', 'ians', 'Ġthat']

# chunk = ["Ġleague","Ġminimum","Ġ,","Ġpasing","Ġupt","Ġa","Ġsuperior","Ġcontract"]

# Clear STM, then observe the chunk one token per event. The last token is
# deliberately left unobserved, so a matching pattern should report it under
# 'future' in the predictions.
node0.clear_stm()
for token in chunk[:-1]:
    node0.observe(strings=[token])

# Verify STM
stm = node0.get_stm()
print(f"STM: {stm['stm']}")


STM: [['ĠAmong'], ['Ġfl'], ['ukes'], ['Ġ,'], ['Ġthe'], ['Ġmost'], ['Ġcommon']]


In [40]:
# Fetch the predictions for the current STM and print a report for each one.
predictions = node0.get_predictions()
preds = predictions['predictions']
print(f"There are {len(preds)} predictions")

# Metrics printed for every prediction, in display order. Each one is labelled
# with the key it is read from in the response.
metric_keys = (
    'frequency',
    'potential',
    'similarity',
    'confidence',
    'evidence',
    'predictive_information',
    'itfdf_similarity',
    'tfidf_score',
    'pattern_probability',
    'weighted_strength',
    'fragmentation',
    'snr',
    'entropy',
    'normalized_entropy',
    'global_normalized_entropy',
    'bayesian_posterior',
    'bayesian_prior',
    'bayesian_likelihood',
)

# Sum of potentials over the whole result set; each prediction's potential is
# divided by it to give its share of the total.
total_potential = sum(p['potential'] for p in preds)

for p in preds:
    # If every potential is 0 the total is 0 as well; report 0.0 instead of
    # raising ZeroDivisionError.
    normalized_potential = p['potential'] / total_potential if total_potential else 0.0

    print("=" * 10)
    print(f"PTRN|{p['name']}")
    print(f"matches: {p['matches']}")
    print(f"anomalies: - {p['missing']}  + {p['extras']}")
    print(f"past: {p['past']}")
    print(f"present: {p['present']}")
    print(f"future: {p['future']}")
    print("metrics:")
    for key in metric_keys:
        print(f"    {key}: {p[key]}")
        if key == 'potential':
            # Computed in this cell, so it is printed right after its source value.
            print(f"    normalized_potential: {normalized_potential}")
    print()

There are 10 predictions
PTRN|6850d8ef6abf023e778693c4d5d9986db464e5cd
matches: ['ĠAmong', 'Ġfl', 'ukes', 'Ġ,', 'Ġthe', 'Ġmost', 'Ġcommon']
anomalies: - [[], [], [], [], [], [], []]  + [[], [], [], [], [], [], []]
past: []
present: [['ĠAmong'], ['Ġfl'], ['ukes'], ['Ġ,'], ['Ġthe'], ['Ġmost'], ['Ġcommon']]
future: [['Ġin']]
metrics:
    frequency: 1
    potential: 3.875
    normalized_potential: 0.14696516123420197
    similarity: 0.9333333333333333 
    confidence: 1.0 
    evidence: 0.875 
    predictive_information: 0.28444444444444444
    itfdf_similarity: 1.0
    tfidf_similarity: 1.3280445895742101
    pattern_probability: 0.06666666666666667
    weighted_strength: 0.06222222222222222
    fragmentation: 0.0
    snr: 1.0
    entropy: 2.807354922057604
    normalized_entropy: 0.18308131158888039
    global_normalized_entropy: 0.07292945147433891
    bayesian_posterior: 0.12280701754385963
    bayesian_prior: 0.06666666666666667
    bayesian_likelihood: 0.9333333333333333
    
    
PT

In [41]:
# Raw get_predictions() response kept from the previous cell, shown in full to inspect every returned field.
predictions

{'predictions': [{'type': 'prototypical',
   'name': '6850d8ef6abf023e778693c4d5d9986db464e5cd',
   'frequency': 1,
   'emotives': {},
   'matches': ['ĠAmong', 'Ġfl', 'ukes', 'Ġ,', 'Ġthe', 'Ġmost', 'Ġcommon'],
   'past': [],
   'present': [['ĠAmong'],
    ['Ġfl'],
    ['ukes'],
    ['Ġ,'],
    ['Ġthe'],
    ['Ġmost'],
    ['Ġcommon']],
   'missing': [[], [], [], [], [], [], []],
   'extras': [[], [], [], [], [], [], []],
   'anomalies': [],
   'potential': 3.875,
   'evidence': 0.875,
   'similarity': 0.9333333333333333,
   'fragmentation': 0.0,
   'snr': 1.0,
   'confluence': 1.2433388123130329e-06,
   'predictive_information': 0.28444444444444444,
   'sequence': [['ĠAmong'],
    ['Ġfl'],
    ['ukes'],
    ['Ġ,'],
    ['Ġthe'],
    ['Ġmost'],
    ['Ġcommon'],
    ['Ġin']],
   'future': [['Ġin']],
   'confidence': 1.0,
   'entropy': 2.807354922057604,
   'normalized_entropy': 0.18308131158888039,
   'global_normalized_entropy': 0.07292945147433891,
   'itfdf_similarity': 1.0,
   'tfidf

In [12]:
# Look up a stored pattern by its full 'PTRN|<hash>' name to inspect its pattern_data, length and frequency.
node0.get_pattern("PTRN|fd54bd662706befc7a4b6588f8a2e8903682a451")

{'pattern': {'status': 'okay',
  'pattern': {'name': 'PTRN|fd54bd662706befc7a4b6588f8a2e8903682a451',
   'pattern_data': [['Ġvocals'],
    ['Ġ,'],
    ['ĠEpic'],
    ['l'],
    ['oud'],
    ['Ġappeared'],
    ['Ġon'],
    ['Ġseveral']],
   'length': 8,
   'frequency': 1,
   'emotives': [],
   'metadata': {}}},
 'node_id': 'node0_kato'}

In [13]:
# Same lookup for the pattern listed as Chunk 0 ('ĠAmong', 'Ġfl', 'ukes', ...); its last event is ['Ġin'].
node0.get_pattern("PTRN|6850d8ef6abf023e778693c4d5d9986db464e5cd")

{'pattern': {'status': 'okay',
  'pattern': {'name': 'PTRN|6850d8ef6abf023e778693c4d5d9986db464e5cd',
   'pattern_data': [['ĠAmong'],
    ['Ġfl'],
    ['ukes'],
    ['Ġ,'],
    ['Ġthe'],
    ['Ġmost'],
    ['Ġcommon'],
    ['Ġin']],
   'length': 8,
   'frequency': 1,
   'emotives': [],
   'metadata': {}}},
 'node_id': 'node0_kato'}

In [14]:
# Lookup of a name this node does not have: the reply carries status 'error' and a 'not found' message.
node0.get_pattern("PTRN|acf6699f0326787929d6b615d7c744571ca0cff4")

{'pattern': {'status': 'error',
  'message': 'Pattern PTRN|acf6699f0326787929d6b615d7c744571ca0cff4 not found'},
 'node_id': 'node0_kato'}

## Test other predictions

In [15]:
# Test chunk for this section; every one of its tokens is observed below.
chunk = ['Machine', 'Ġlearning', 'Ġis', 'Ġa', 'Ġfield', 'Ġof', 'Ġartificial', 'Ġintelligence']

# Clear STM, then observe the test pattern one token per event
node0.clear_stm()
for token in chunk:
    node0.observe(strings=[token])

# Display the STM to confirm all tokens were recorded in order
node0.get_stm()

{'stm': [['Machine'],
  ['Ġlearning'],
  ['Ġis'],
  ['Ġa'],
  ['Ġfield'],
  ['Ġof'],
  ['Ġartificial'],
  ['Ġintelligence']],
 'session_id': 'session-c1ad243e3c204967a8c472d1d438a855-1770309748783',
 'length': 8}

In [16]:
# Request predictions for the tokens observed above and display the full response dict.
node0.get_predictions()  # append ['predictions'] to show only the list of predictions

{'predictions': [{'type': 'prototypical',
   'name': 'a6dd9b5cbba7b644cfc671af13d5d8f2f835d80f',
   'frequency': 1,
   'emotives': {},
   'matches': ['Ġis', 'Ġa', 'Ġfield', 'Ġof'],
   'past': [],
   'present': [['Ġis'], ['Ġa'], ['Ġfield'], ['Ġof']],
   'missing': [[], [], [], []],
   'extras': [['Machine'],
    ['Ġlearning'],
    [],
    [],
    [],
    [],
    ['Ġartificial'],
    ['Ġintelligence']],
   'anomalies': [],
   'potential': 2.4999995705227795,
   'evidence': 0.5,
   'similarity': 0.5,
   'fragmentation': 0.0,
   'snr': 0.3333333333333333,
   'confluence': 1.2433375784310582e-06,
   'predictive_information': 0.5,
   'sequence': [['Ġis'],
    ['Ġa'],
    ['Ġfield'],
    ['Ġof'],
    ['Ġcharacteristic'],
    ['Ġp'],
    ['Ġ,'],
    ['Ġand']],
   'future': [['Ġcharacteristic'], ['Ġp'], ['Ġ,'], ['Ġand']],
   'confidence': 1.0,
   'entropy': 2.0,
   'normalized_entropy': 0.19564463703936721,
   'global_normalized_entropy': 0.06486950548220455,
   'itfdf_similarity': 0.9999995705

## Cleanup

Close all connections when done.

In [None]:
# Close KATO client connections.
# node0 is the only client this notebook creates, so the list of clients to
# close is built here rather than relying on a `nodes` variable that no
# earlier cell defines.
nodes = [node0]
for i, node in enumerate(nodes):
    node.close()
    print(f"✓ Closed node{i}")


print("\n✓ All connections closed")