# Storage System Comprehensive Tests

This notebook tests the new hybrid storage architecture (SQLite + JSONL + NumPy).

Test coverage:
1. Database initialization
2. Node creation from existing data
3. CRUD operations
4. JSONL and NumPy file operations
5. Graph edge operations
6. Usage statistics updates
7. Data consistency verification


In [None]:
# Setup and imports
import sys
import os
import json
import numpy as np
from pathlib import Path
from datetime import datetime

# Import memory modules
from memory import (
    init_db,
    create_node,
    get_node,
    update_usage,
    add_graph_edge,
    load_sentence,
    load_embedding,
    load_edge_arrays,
    count_sentences,
    get_embedding_shape,
    NodesContext,
    Neighbors,
    NodesUsage,
)

print("All imports successful")


All imports successful


## Test 1: Database Initialization


In [2]:
# Initialize database
test_db_path = "test_memory.db"
if os.path.exists(test_db_path):
    os.remove(test_db_path)

db = init_db(test_db_path)
print(f"Database initialized: {test_db_path}")
print(f"Tables created: {db.get_tables()}")

# Verify tables exist
assert "nodes_context" in db.get_tables()
assert "neighbors" in db.get_tables()
assert "nodes_usage" in db.get_tables()
print("✓ All tables created successfully")


Database initialized: test_memory.db
Tables created: ['neighbors', 'nodes_context', 'nodes_usage']
✓ All tables created successfully


## Test 2: Load Existing Data and Create Nodes


In [3]:
# Check if existing data exists
sentences_path = "memory_bacon/core/sentences.jsonl"
embeddings_path = "memory_bacon/core/embeddings.npy"

if not os.path.exists(sentences_path):
    print("⚠ sentences.jsonl not found. Run parse_essay.py first.")
    print("Running parse_essay.py...")
    import subprocess
    result = subprocess.run(["python", "parse_essay.py"], capture_output=True, text=True)
    print(result.stdout)
    if result.returncode != 0:
        print(f"Error: {result.stderr}")

if not os.path.exists(embeddings_path):
    print("⚠ embeddings.npy not found. Run generate_embeddings.py first.")
    print("Note: This requires OpenAI API key. Skipping embedding generation in test.")

# Load existing sentences
if os.path.exists(sentences_path):
    num_sentences = count_sentences(sentences_path)
    print(f"Found {num_sentences} sentences in {sentences_path}")
    
    # Load first 3 sentences as sample
    sample_sentences = []
    for i in range(min(3, num_sentences)):
        sent = load_sentence(sentences_path, i)
        if sent:
            sample_sentences.append(sent)
            print(f"  [{i}] {sent.get('id', 'N/A')}: {sent.get('text', '')[:50]}...")
else:
    print("⚠ No existing sentences found")
    sample_sentences = []


Found 99 sentences in memory_bacon/core/sentences.jsonl
  [0] bacon_001: What is truth? said jesting Pilate, and would not ...
  [1] bacon_002: Certainly there be, that delight in giddiness, and...
  [2] bacon_003: And though the sects of philosophers of that kind ...


In [4]:
# Check embeddings
if os.path.exists(embeddings_path):
    shape = get_embedding_shape(embeddings_path)
    if shape:
        print(f"Embeddings shape: {shape[0]} sentences × {shape[1]} dimensions")
        
        # Load first embedding as sample
        sample_embedding = load_embedding(embeddings_path, 0)
        if sample_embedding is not None:
            print(f"Sample embedding norm: {np.linalg.norm(sample_embedding):.4f}")
    else:
        print("⚠ Could not read embeddings shape")
else:
    print("⚠ embeddings.npy not found")


Embeddings shape: 99 sentences × 1536 dimensions
Sample embedding norm: 1.0000


## Test 3: Create Nodes from Existing Data


In [None]:
# Create nodes from existing data (if available)
# Use direct database operations to avoid modifying original files
created_node_ids = []

if os.path.exists(sentences_path) and os.path.exists(embeddings_path):
    # Load all sentences and embeddings
    sentences = []
    with open(sentences_path, 'r', encoding='utf-8') as f:
        for line in f:
            sentences.append(json.loads(line))
    
    embeddings = np.load(embeddings_path, mmap_mode='r')
    
    # Create nodes for first 5 sentences (or all if less than 5)
    # Use direct DB operations to point to existing file indices (don't modify files)
    num_to_create = min(5, len(sentences), len(embeddings))
    print(f"Creating {num_to_create} node records from existing data...")
    print("(Using direct DB operations to avoid modifying original files)")
    
    from memory.database import add_node_context, add_node_usage
    from memory.npy_utils import compute_embedding_norm
    
    for i in range(num_to_create):
        sent = sentences[i]
        embedding = embeddings[i]
        node_id = i + 1  # Use 1-based node IDs
        
        # Compute embedding norm
        embedding_norm = compute_embedding_norm(embedding)
        
        # Add node context (pointing to existing file indices)
        add_node_context(
            node_id=node_id,
            sentence_offset=i,  # Index into existing sentences.jsonl
            embedding_index=i,  # Index into existing embeddings.npy
            source='3essay.txt',
            tag=f"chapter_{sent.get('chapter', 1)}",
            language='en',
            initial_context=sent.get('text', '')[:100],
            embedding_norm=embedding_norm,
        )
        
        # Add node usage (default values)
        add_node_usage(node_id=node_id)
        
        created_node_ids.append(node_id)
        print(f"  Created node {node_id}: {sent.get('id', 'N/A')}")
    
    print(f"✓ Created {len(created_node_ids)} node records")
else:
    print("⚠ Skipping node creation (missing data files)")
    # Create a test node with dummy data using create_node (this will create new files)
    print("Creating test node with dummy data (using create_node)...")
    test_sentences_path = "test_sentences.jsonl"
    test_embeddings_path = "test_embeddings.npy"
    
    # Clean up test files if they exist
    if os.path.exists(test_sentences_path):
        os.remove(test_sentences_path)
    if os.path.exists(test_embeddings_path):
        os.remove(test_embeddings_path)
    
    dummy_embedding = np.random.rand(1536).astype(np.float32)
    node_id = create_node(
        sentence_text="Test sentence for storage system.",
        embedding_vector=dummy_embedding,
        metadata_dict={
            'id': 1,
            'source': 'test',
            'tag': 'test',
            'language': 'en',
        },
        sentences_path=test_sentences_path,
        embeddings_path=test_embeddings_path,
        db_path=test_db_path,
    )
    created_node_ids.append(node_id)
    print(f"✓ Created test node {node_id}")
    
    # Update paths for subsequent tests
    sentences_path = test_sentences_path
    embeddings_path = test_embeddings_path


Creating 5 node records from existing data...
(Using direct DB operations to avoid modifying original files)
  Created node 1: bacon_001
  Created node 2: bacon_002
  Created node 3: bacon_003
  Created node 4: bacon_004
  Created node 5: bacon_005
✓ Created 5 node records


## Test 4: Retrieve Nodes


In [6]:
# Test get_node for each created node
print("Testing get_node()...")
for node_id in created_node_ids:
    node = get_node(
        node_id,
        sentences_path=sentences_path,
        embeddings_path=embeddings_path,
        db_path=test_db_path,
    )
    
    if node:
        print(f"\nNode {node_id}:")
        print(f"  Context: id={node['context']['id']}, sentence_offset={node['context']['sentence_offset']}")
        print(f"  Usage: access_count={node['usage']['access_count']}, popularity={node['usage']['popularity']}")
        print(f"  Neighbors: {len(node['neighbors'])} edges")
        if node['sentence']:
            print(f"  Sentence: {node['sentence'].get('text', '')[:60]}...")
        if node['embedding']:
            print(f"  Embedding: {len(node['embedding'])} dimensions")
        
        # Verify data consistency
        assert node['context'] is not None, "Context should not be None"
        assert node['usage'] is not None, "Usage should not be None"
        assert node['sentence'] is not None, "Sentence should not be None"
        assert node['embedding'] is not None, "Embedding should not be None"
        print(f"  ✓ Data consistency verified")
    else:
        print(f"  ✗ Node {node_id} not found")

print("\n✓ All nodes retrieved successfully")


Testing get_node()...

Node 1:
  Context: id=1, sentence_offset=0
  Usage: access_count=0, popularity=0.0
  Neighbors: 0 edges
  Sentence: What is truth? said jesting Pilate, and would not stay for a...
  Embedding: 1536 dimensions
  ✓ Data consistency verified

Node 2:
  Context: id=2, sentence_offset=1
  Usage: access_count=0, popularity=0.0
  Neighbors: 0 edges
  Sentence: Certainly there be, that delight in giddiness, and count it ...
  Embedding: 1536 dimensions
  ✓ Data consistency verified

Node 3:
  Context: id=3, sentence_offset=2
  Usage: access_count=0, popularity=0.0
  Neighbors: 0 edges
  Sentence: And though the sects of philosophers of that kind be gone, y...
  Embedding: 1536 dimensions
  ✓ Data consistency verified

Node 4:
  Context: id=4, sentence_offset=3
  Usage: access_count=0, popularity=0.0
  Neighbors: 0 edges
  Sentence: But it is not only the difficulty and labor, which men take ...
  Embedding: 1536 dimensions
  ✓ Data consistency verified

Node 5:
  Context

## Test 5: Update Usage Statistics


In [7]:
# Test update_usage
print("Testing update_usage()...")
test_node_id = created_node_ids[0]

# Get initial usage
initial_node = get_node(test_node_id, db_path=test_db_path)
initial_access = initial_node['usage']['access_count']
print(f"Initial access_count: {initial_access}")

# Update usage
success = update_usage(
    node_id=test_node_id,
    access_count=initial_access + 1,
    last_access_time=datetime.now(),
    recent_hit_count=3,
    decay_score=0.85,
    popularity=0.92,
    db_path=test_db_path,
)

assert success, "update_usage should return True"
print(f"✓ update_usage returned: {success}")

# Verify update
updated_node = get_node(test_node_id, db_path=test_db_path)
updated_access = updated_node['usage']['access_count']
print(f"Updated access_count: {updated_access}")
assert updated_access == initial_access + 1, "access_count should be incremented"
print("✓ Usage statistics updated correctly")


Testing update_usage()...
Initial access_count: 0
✓ update_usage returned: True
Updated access_count: 1
✓ Usage statistics updated correctly


## Test 6: Graph Edge Operations


In [8]:
# Test add_graph_edge
print("Testing add_graph_edge()...")

if len(created_node_ids) >= 2:
    # Add edges between nodes
    edge_types = {
        0: "adjacent",
        1: "same_chapter",
        2: "question_context",
        3: "definition_context",
    }
    
    # Add a few edges
    edges_added = []
    for i in range(min(3, len(created_node_ids) - 1)):
        u = created_node_ids[i]
        v = created_node_ids[i + 1]
        edge_type = i % 4
        weight = 1.0 - (i * 0.1)
        
        success = add_graph_edge(u, v, weight, edge_type, db_path=test_db_path)
        assert success, f"add_graph_edge should return True for ({u}, {v})"
        edges_added.append((u, v, weight, edge_type))
        print(f"  Added edge: {u} -> {v}, type={edge_types[edge_type]}, weight={weight:.2f}")
    
    print(f"✓ Added {len(edges_added)} edges")
    
    # Verify edges by querying neighbors
    print("\nVerifying edges...")
    for u, v, weight, edge_type in edges_added:
        node = get_node(u, db_path=test_db_path)
        neighbors = node['neighbors']
        
        # Check if edge exists
        found = any(n['v'] == v and n['edge_type'] == edge_type for n in neighbors)
        assert found, f"Edge ({u}, {v}) should exist in neighbors"
        print(f"  ✓ Edge {u} -> {v} verified")
    
    print("✓ All edges verified")
else:
    print("⚠ Need at least 2 nodes to test edges. Skipping.")


Testing add_graph_edge()...
  Added edge: 1 -> 2, type=adjacent, weight=1.00
  Added edge: 2 -> 3, type=same_chapter, weight=0.90
  Added edge: 3 -> 4, type=question_context, weight=0.80
✓ Added 3 edges

Verifying edges...
  ✓ Edge 1 -> 2 verified
  ✓ Edge 2 -> 3 verified
  ✓ Edge 3 -> 4 verified
✓ All edges verified


## Test 7: Load Edge Arrays from NumPy


In [9]:
# Test load_edge_arrays
graph_dir = "memory_bacon/graph"
print(f"Testing load_edge_arrays() from {graph_dir}...")

edge_arrays = load_edge_arrays(graph_dir)
if edge_arrays:
    edge_index, edge_weight, edge_type = edge_arrays
    print(f"  edge_index shape: {edge_index.shape}")
    print(f"  edge_weight shape: {edge_weight.shape}")
    print(f"  edge_type shape: {edge_type.shape}")
    
    # Verify consistency
    assert len(edge_weight) == len(edge_type), "edge_weight and edge_type should have same length"
    assert edge_index.shape[0] == 2, "edge_index should have 2 rows (source, target)"
    assert edge_index.shape[1] == len(edge_weight), "edge_index columns should match edge count"
    
    print(f"  Total edges: {len(edge_weight)}")
    print(f"  Edge types: {np.unique(edge_type)}")
    print("✓ Edge arrays loaded successfully")
else:
    print("⚠ Edge arrays not found. Run convert_edges_to_numpy.py to generate them.")


Testing load_edge_arrays() from memory_bacon/graph...
  edge_index shape: (2, 201)
  edge_weight shape: (201,)
  edge_type shape: (201,)
  Total edges: 201
  Edge types: [0 2 3]
✓ Edge arrays loaded successfully


## Test 8: Database Query Operations


In [10]:
# Test direct database queries
print("Testing direct database queries...")

# Count nodes
node_count = NodesContext.select().count()
print(f"Total nodes in database: {node_count}")
assert node_count == len(created_node_ids), "Node count should match created nodes"

# Count edges
edge_count = Neighbors.select().count()
print(f"Total edges in database: {edge_count}")

# Count usage records
usage_count = NodesUsage.select().count()
print(f"Total usage records: {usage_count}")
assert usage_count == len(created_node_ids), "Usage count should match node count"

# Query nodes with specific criteria
# Use is_null(False) instead of is_not(None) in Peewee
nodes_with_tag = NodesContext.select().where(NodesContext.tag.is_null(False)).count()
print(f"Nodes with tags: {nodes_with_tag}")

# Query nodes with specific tag value
if nodes_with_tag > 0:
    test_tag = NodesContext.select().where(NodesContext.tag.is_null(False)).first()
    if test_tag:
        nodes_with_specific_tag = NodesContext.select().where(NodesContext.tag == test_tag.tag).count()
        print(f"Nodes with tag '{test_tag.tag}': {nodes_with_specific_tag}")

# Query nodes with source
nodes_with_source = NodesContext.select().where(NodesContext.source.is_null(False)).count()
print(f"Nodes with source: {nodes_with_source}")

# Query neighbors for a specific node
if len(created_node_ids) > 0:
    test_node_id = created_node_ids[0]
    neighbor_count = Neighbors.select().where(Neighbors.u == test_node_id).count()
    print(f"Neighbors of node {test_node_id}: {neighbor_count}")
    
    # Query outgoing edges
    outgoing_edges = Neighbors.select().where(Neighbors.u == test_node_id)
    print(f"  Outgoing edges from node {test_node_id}:")
    for edge in outgoing_edges:
        print(f"    -> {edge.v} (weight={edge.weight:.2f}, type={edge.edge_type})")

# Query nodes ordered by creation time
oldest_node = NodesContext.select().order_by(NodesContext.created_at.asc()).first()
newest_node = NodesContext.select().order_by(NodesContext.created_at.desc()).first()
if oldest_node and newest_node:
    print(f"Oldest node: {oldest_node.id} (created: {oldest_node.created_at})")
    print(f"Newest node: {newest_node.id} (created: {newest_node.created_at})")

print("✓ Database queries successful")


Testing direct database queries...
Total nodes in database: 5
Total edges in database: 3
Total usage records: 5
Nodes with tags: 5
Nodes with tag 'chapter_1': 5
Nodes with source: 5
Neighbors of node 1: 1
  Outgoing edges from node 1:
    -> 2 (weight=1.00, type=0)
Oldest node: 1 (created: 2025-11-13 15:18:53)
Newest node: 1 (created: 2025-11-13 15:18:53)
✓ Database queries successful


## Test 9: Data Consistency Verification


In [11]:
# Comprehensive consistency check
print("Running data consistency checks...")

# Ensure database is initialized
init_db(test_db_path)

errors = []

# Check 1: All nodes have usage records
print("  Checking node-usage consistency...")
for node_id in created_node_ids:
    try:
        context = NodesContext.get_by_id(node_id)
        try:
            usage = NodesUsage.get_by_id(node_id)
        except NodesUsage.DoesNotExist:
            errors.append(f"Node {node_id} missing usage record")
    except NodesContext.DoesNotExist:
        errors.append(f"Node {node_id} does not exist in context table")

# Check 2: Sentence offsets are valid
print("  Checking sentence offset validity...")
if os.path.exists(sentences_path):
    max_sentence_count = count_sentences(sentences_path)
    for node_id in created_node_ids:
        try:
            context = NodesContext.get_by_id(node_id)
            if context.sentence_offset < 0:
                errors.append(f"Node {node_id} has negative sentence_offset {context.sentence_offset}")
            elif context.sentence_offset >= max_sentence_count:
                errors.append(f"Node {node_id} has invalid sentence_offset {context.sentence_offset} (max: {max_sentence_count - 1})")
        except NodesContext.DoesNotExist:
            errors.append(f"Node {node_id} does not exist (cannot check sentence_offset)")

# Check 3: Embedding indices are valid
print("  Checking embedding index validity...")
if os.path.exists(embeddings_path):
    shape = get_embedding_shape(embeddings_path)
    if shape:
        max_embedding_count = shape[0]
        for node_id in created_node_ids:
            try:
                context = NodesContext.get_by_id(node_id)
                if context.embedding_index < 0:
                    errors.append(f"Node {node_id} has negative embedding_index {context.embedding_index}")
                elif context.embedding_index >= max_embedding_count:
                    errors.append(f"Node {node_id} has invalid embedding_index {context.embedding_index} (max: {max_embedding_count - 1})")
            except NodesContext.DoesNotExist:
                errors.append(f"Node {node_id} does not exist (cannot check embedding_index)")
    else:
        print("    ⚠ Could not read embedding shape")
else:
    print("    ⚠ Embeddings file not found, skipping embedding index check")

# Check 4: Neighbor references are valid
print("  Checking neighbor reference validity...")
edge_count = 0
for edge in Neighbors.select():
    edge_count += 1
    try:
        NodesContext.get_by_id(edge.u)
    except NodesContext.DoesNotExist:
        errors.append(f"Edge ({edge.u}, {edge.v}) references non-existent source node {edge.u}")
    try:
        NodesContext.get_by_id(edge.v)
    except NodesContext.DoesNotExist:
        errors.append(f"Edge ({edge.u}, {edge.v}) references non-existent target node {edge.v}")

print(f"    Checked {edge_count} edges")

# Check 5: Data integrity - verify get_node works for all nodes
print("  Checking get_node() integrity...")
for node_id in created_node_ids:
    try:
        node = get_node(node_id, sentences_path=sentences_path, embeddings_path=embeddings_path, db_path=test_db_path)
        if node is None:
            errors.append(f"get_node({node_id}) returned None")
        elif node['context'] is None:
            errors.append(f"get_node({node_id}) has None context")
        elif node['usage'] is None:
            errors.append(f"get_node({node_id}) has None usage")
    except Exception as e:
        errors.append(f"get_node({node_id}) raised exception: {e}")

if errors:
    print(f"\n✗ Found {len(errors)} consistency errors:")
    for error in errors:
        print(f"  - {error}")
    raise AssertionError(f"Data consistency check failed with {len(errors)} errors")
else:
    print("\n✓ All consistency checks passed")


Running data consistency checks...
  Checking node-usage consistency...
  Checking sentence offset validity...
  Checking embedding index validity...
  Checking neighbor reference validity...
    Checked 3 edges
  Checking get_node() integrity...

✓ All consistency checks passed


## Test 10: Performance and Edge Cases


In [12]:
# Test edge cases
print("Testing edge cases...")

# Test 1: Get non-existent node
non_existent = get_node(99999, db_path=test_db_path)
assert non_existent is None, "Non-existent node should return None"
print("✓ Non-existent node handling: OK")

# Test 2: Update non-existent usage
success = update_usage(99999, access_count=1, db_path=test_db_path)
assert not success, "Updating non-existent node should return False"
print("✓ Non-existent usage update handling: OK")

# Test 3: Add duplicate edge (should update, not fail)
if len(created_node_ids) >= 2:
    u, v = created_node_ids[0], created_node_ids[1]
    success1 = add_graph_edge(u, v, 0.5, 0, db_path=test_db_path)
    success2 = add_graph_edge(u, v, 0.7, 0, db_path=test_db_path)  # Same edge, different weight
    assert success1 and success2, "Adding duplicate edge should succeed (update)"
    
    # Verify weight was updated
    node = get_node(u, db_path=test_db_path)
    edge = next((n for n in node['neighbors'] if n['v'] == v), None)
    assert edge is not None, "Edge should exist"
    assert abs(edge['weight'] - 0.7) < 0.01, "Edge weight should be updated"
    print("✓ Duplicate edge handling (update): OK")

# Test 4: Load sentence with invalid offset
invalid_sentence = load_sentence(sentences_path, 99999) if os.path.exists(sentences_path) else None
assert invalid_sentence is None, "Invalid sentence offset should return None"
print("✓ Invalid sentence offset handling: OK")

# Test 5: Load embedding with invalid index
invalid_embedding = load_embedding(embeddings_path, 99999) if os.path.exists(embeddings_path) else None
assert invalid_embedding is None, "Invalid embedding index should return None"
print("✓ Invalid embedding index handling: OK")

print("\n✓ All edge case tests passed")


Testing edge cases...
✓ Non-existent node handling: OK
✓ Non-existent usage update handling: OK
✓ Duplicate edge handling (update): OK
✓ Invalid sentence offset handling: OK
✓ Invalid embedding index handling: OK

✓ All edge case tests passed


## Summary

All storage system tests completed. The hybrid architecture (SQLite + JSONL + NumPy) is working correctly.


In [13]:
# Final summary
print("=" * 60)
print("STORAGE SYSTEM TEST SUMMARY")
print("=" * 60)
print(f"Database: {test_db_path}")
print(f"Nodes created: {len(created_node_ids)}")
print(f"Edges added: {Neighbors.select().count()}")
print(f"Usage records: {NodesUsage.select().count()}")
print("\nComponents tested:")
print("  ✓ Database initialization")
print("  ✓ Node creation")
print("  ✓ Node retrieval")
print("  ✓ Usage statistics updates")
print("  ✓ Graph edge operations")
print("  ✓ JSONL file operations")
print("  ✓ NumPy array operations")
print("  ✓ Data consistency")
print("  ✓ Edge cases")
print("\n✓ All tests passed!")


STORAGE SYSTEM TEST SUMMARY
Database: test_memory.db
Nodes created: 5
Edges added: 3
Usage records: 5

Components tested:
  ✓ Database initialization
  ✓ Node creation
  ✓ Node retrieval
  ✓ Usage statistics updates
  ✓ Graph edge operations
  ✓ JSONL file operations
  ✓ NumPy array operations
  ✓ Data consistency
  ✓ Edge cases

✓ All tests passed!


## Test 11: Test create_node Function (Full Workflow)


In [14]:
# Test create_node with new files (full workflow test)
print("Testing create_node() with new files...")

# Use separate test files for this test
test_create_sentences = "test_create_sentences.jsonl"
test_create_embeddings = "test_create_embeddings.npy"

# Clean up if exists
if os.path.exists(test_create_sentences):
    os.remove(test_create_sentences)
    print("  Cleaned up existing test_sentences.jsonl")
if os.path.exists(test_create_embeddings):
    os.remove(test_create_embeddings)
    print("  Cleaned up existing test_embeddings.npy")

# Verify files don't exist before creation
assert not os.path.exists(test_create_sentences), "Test sentences file should not exist"
assert not os.path.exists(test_create_embeddings), "Test embeddings file should not exist"

# Create a new node using create_node (this will create new files)
print("  Creating node with create_node()...")
test_embedding = np.random.rand(1536).astype(np.float32)
test_embedding_norm = np.linalg.norm(test_embedding)

new_node_id = create_node(
    sentence_text="This is a test sentence created by create_node function.",
    embedding_vector=test_embedding,
    metadata_dict={
        'id': 100,  # Use a high ID to avoid conflicts
        'source': 'test_create',
        'tag': 'test',
        'language': 'en',
        'initial_context': 'Test context',
    },
    sentences_path=test_create_sentences,
    embeddings_path=test_create_embeddings,
    db_path=test_db_path,
)

print(f"  ✓ Created new node {new_node_id}")

# Verify files were created
assert os.path.exists(test_create_sentences), "Test sentences file should be created"
assert os.path.exists(test_create_embeddings), "Test embeddings file should be created"
print("  ✓ Files created successfully")

# Verify file contents
sentence_count = count_sentences(test_create_sentences)
assert sentence_count == 1, f"Should have 1 sentence, got {sentence_count}"

emb_shape = get_embedding_shape(test_create_embeddings)
assert emb_shape is not None, "Embedding shape should be readable"
assert emb_shape[0] == 1, f"Should have 1 embedding, got {emb_shape[0]}"
assert emb_shape[1] == 1536, f"Should have 1536 dimensions, got {emb_shape[1]}"
print(f"  ✓ File contents verified (1 sentence, 1 embedding of {emb_shape[1]} dims)")

# Verify the node was created correctly
print("  Retrieving node with get_node()...")
new_node = get_node(
    new_node_id,
    sentences_path=test_create_sentences,
    embeddings_path=test_create_embeddings,
    db_path=test_db_path,
)

assert new_node is not None, "New node should exist"
assert new_node['context'] is not None, "Context should not be None"
assert new_node['usage'] is not None, "Usage should not be None"
assert new_node['context']['id'] == 100, f"Node ID should be 100, got {new_node['context']['id']}"
assert new_node['sentence'] is not None, "Sentence should be loaded"
assert new_node['embedding'] is not None, "Embedding should be loaded"
assert len(new_node['embedding']) == 1536, f"Embedding dimension should be 1536, got {len(new_node['embedding'])}"

# Verify embedding values match
retrieved_embedding = np.array(new_node['embedding'])
assert np.allclose(retrieved_embedding, test_embedding), "Embedding values should match"
print("  ✓ Embedding values match")

# Verify embedding norm
assert abs(new_node['context']['embedding_norm'] - test_embedding_norm) < 1e-5, \
    f"Embedding norm should match: {new_node['context']['embedding_norm']} vs {test_embedding_norm}"

# Verify metadata
assert new_node['context']['source'] == 'test_create', "Source should match"
assert new_node['context']['tag'] == 'test', "Tag should match"
assert new_node['context']['language'] == 'en', "Language should match"
assert new_node['context']['initial_context'] == 'Test context', "Initial context should match"

print(f"  Node ID: {new_node['context']['id']}")
print(f"  Sentence: {new_node['sentence'].get('text', '')[:50]}...")
print(f"  Embedding dim: {len(new_node['embedding'])}")
print(f"  Embedding norm: {new_node['context']['embedding_norm']:.6f}")
print(f"  Source: {new_node['context']['source']}")
print(f"  Tag: {new_node['context']['tag']}")
print("✓ create_node full workflow test passed")

# Test appending a second node
print("\n  Testing append functionality...")
second_embedding = np.random.rand(1536).astype(np.float32)
second_node_id = create_node(
    sentence_text="Second test sentence for append test.",
    embedding_vector=second_embedding,
    metadata_dict={
        'id': 101,
        'source': 'test_create',
        'tag': 'test_append',
    },
    sentences_path=test_create_sentences,
    embeddings_path=test_create_embeddings,
    db_path=test_db_path,
)

# Verify second node
assert second_node_id == 101, f"Second node ID should be 101, got {second_node_id}"
sentence_count_after = count_sentences(test_create_sentences)
assert sentence_count_after == 2, f"Should have 2 sentences after append, got {sentence_count_after}"

emb_shape_after = get_embedding_shape(test_create_embeddings)
assert emb_shape_after[0] == 2, f"Should have 2 embeddings after append, got {emb_shape_after[0]}"

second_node = get_node(101, sentences_path=test_create_sentences, 
                       embeddings_path=test_create_embeddings, db_path=test_db_path)
assert second_node is not None, "Second node should exist"
assert second_node['context']['sentence_offset'] == 1, "Second node should have offset 1"
assert second_node['context']['embedding_index'] == 1, "Second node should have embedding_index 1"
print("  ✓ Append functionality verified")

# Clean up test files
print("\n  Cleaning up test files...")
if os.path.exists(test_create_sentences):
    os.remove(test_create_sentences)
if os.path.exists(test_create_embeddings):
    os.remove(test_create_embeddings)
print("✓ Test files cleaned up")


Testing create_node() with new files...
  Creating node with create_node()...
  ✓ Created new node 100
  ✓ Files created successfully
  ✓ File contents verified (1 sentence, 1 embedding of 1536 dims)
  Retrieving node with get_node()...
  ✓ Embedding values match
  Node ID: 100
  Sentence: This is a test sentence created by create_node fun...
  Embedding dim: 1536
  Embedding norm: 22.553902
  Source: test_create
  Tag: test
✓ create_node full workflow test passed

  Testing append functionality...
  ✓ Append functionality verified

  Cleaning up test files...
✓ Test files cleaned up
