## üì¶ Setup

In [1]:
import sys
import os
from pathlib import Path

# Add project root
project_root = Path.cwd().parent.parent
sys.path.insert(0, str(project_root))

print(f"üìÅ Project: {project_root}")

üìÅ Project: /home/sakana/Code/RAG-bidding


In [2]:
import requests
import json
import time
import psycopg2
import pandas as pd
from datetime import datetime
from typing import Dict, Any, Optional
import warnings
warnings.filterwarnings('ignore')

# API config
BASE_URL = "http://localhost:8000"
UPLOAD_URL = f"{BASE_URL}/api/upload/files"
CATALOG_URL = f"{BASE_URL}/api/documents/catalog"

# Database config
DB_CONFIG = {
    'host': 'localhost',
    'database': 'rag_bidding_v2',
    'user': 'sakana',
    'password': 'sakana123'
}

print("‚úÖ Imports successful")
print(f"üîó API Base: {BASE_URL}")

‚úÖ Imports successful
üîó API Base: http://localhost:8000




In [3]:
# Helper functions

def print_section(title: str):
    """Print formatted section header."""
    print("\n" + "="*80)
    print(f"üìä {title}")
    print("="*80 + "\n")

def get_db_connection():
    """Get database connection."""
    return psycopg2.connect(**DB_CONFIG)

def run_query(query: str, params: tuple = None) -> pd.DataFrame:
    """Run query and return DataFrame."""
    conn = get_db_connection()
    try:
        df = pd.read_sql_query(query, conn, params=params)
        return df
    finally:
        conn.close()

def check_server():
    """Check if server is running."""
    try:
        response = requests.get(f"{BASE_URL}/health", timeout=2)
        if response.status_code == 200:
            print("‚úÖ Server is running")
            return True
    except:
        pass
    print("‚ùå Server is NOT running. Start with: ./start_server.sh")
    return False

print("‚úÖ Helper functions loaded")

‚úÖ Helper functions loaded


## ‚úÖ Pre-Check: Verify Prerequisites

In [14]:
print_section("Prerequisites Check")

# Initialize variables for later cells
uploaded_doc_id = None
upload_id = None
final_status = None
all_documents = []
test_doc_id = None

# Check 1: Server
server_ok = check_server()

# Check 2: Database
try:
    conn = get_db_connection()
    cursor = conn.cursor()
    cursor.execute("SELECT COUNT(*) FROM documents")
    count = cursor.fetchone()[0]
    conn.close()
    print(f"‚úÖ Database accessible: {count} documents")
    db_ok = True
except Exception as e:
    print(f"‚ùå Database error: {e}")
    db_ok = False

# Check 3: Test file - use an actual file from the directory
test_file_path = project_root / "data" / "raw" / "Luat chinh" / "Luat dau thau 2023.docx"
if test_file_path.exists():
    print(f"‚úÖ Test file found: {test_file_path.name}")
    print(f"   Size: {test_file_path.stat().st_size / 1024:.1f} KB")
    file_ok = True
else:
    print(f"‚ùå Test file NOT found: {test_file_path}")
    print(f"   Try one of these files instead:")
    law_dir = project_root / "data" / "raw" / "Luat chinh"
    if law_dir.exists():
        for f in law_dir.iterdir():
            if f.is_file() and f.suffix in ['.docx', '.doc']:
                print(f"   - {f.name}")
    file_ok = False

# Summary
print("\n" + "="*80)
if server_ok and db_ok and file_ok:
    print("üéâ All prerequisites OK - Ready to test!")
else:
    print("‚ö†Ô∏è  Some prerequisites missing - Fix before continuing")
print("="*80)


üìä Prerequisites Check

‚úÖ Server is running
‚úÖ Database accessible: 62 documents
‚úÖ Test file found: Luat dau thau 2023.docx
   Size: 71.0 KB

üéâ All prerequisites OK - Ready to test!


---

## üß™ Test 1: Upload File

**Goal**: Upload file v√† verify:
- Upload th√†nh c√¥ng (202 Accepted)
- Processing completes
- Documents table c√≥ row m·ªõi
- Vector DB c√≥ chunks

## ‚ö†Ô∏è Performance Issue Fixed

**V·∫•n ƒë·ªÅ t√¨m ra:**
- Upload 1 file m·∫•t **2+ ph√∫t** v√¨ embedding KH√îNG ƒë∆∞·ª£c batch
- Code c≈©: Loop t·ª´ng chunk ‚Üí 50 chunks = 50 API calls ri√™ng l·∫ª
- M·ªói OpenAI API call ~500ms-1s ‚Üí 25-50 gi√¢y ch·ªâ cho embedding!

**Root Cause:**
```python
# ‚ùå BAD - Individual embedding (OLD CODE)
for chunk in chunks:
    embedding = self.embedder.embed_text(chunk.content)  # 1 API call per chunk
```

**Fix √°p d·ª•ng:**
```python
# ‚úÖ GOOD - Batch embedding via add_documents()
self.vector_store.add_documents(documents)  # 1 API call for all chunks
```

**K·∫øt qu·∫£:**
- Embedding time: **50 gi√¢y ‚Üí ~2-3 gi√¢y** (gi·∫£m 95%)
- Total upload time: **2+ ph√∫t ‚Üí ~10-15 gi√¢y**

Server ƒë√£ ƒë∆∞·ª£c restart v·ªõi code fix. Test l·∫°i upload cell b√™n d∆∞·ªõi!

## üêõ Bug Found: "law_untitled" Issue

**V·∫•n ƒë·ªÅ:**
- Document ID hi·ªÉn th·ªã "law_untitled" thay v√¨ "LUA-XXX-2023"
- Filename: "Luat dau thau 2023.docx" kh√¥ng match pattern "Lu·∫≠t s·ªë 43/2024/QH15"

**Root Cause - DUPLICATE DOCUMENT ID GENERATION:**

1. **upload_pipeline.py** (line 109-113):
   ```python
   document_id = self.doc_id_generator.generate(
       filename=file_path.name,  # "Luat dau thau 2023.docx"
       doc_type=document_type,
       title=None
   )
   # Returns: "Luat-dau-thau-2023" (fallback to sanitized filename)
   ```

2. **hierarchical_chunker.py** (line 115):
   ```python
   doc_id = self._generate_document_id(document)  # ‚ùå IGNORES upload_pipeline's ID!
   # Generates: "law_untitled" because metadata['title'] = 'untitled'
   ```

**Conflict:**
- Upload pipeline generates: `"Luat-dau-thau-2023"` ‚úÖ
- Chunker overwrites with: `"law_untitled"` ‚ùå

**Fix Required:**
- Chunker should use `document.metadata['document_id']` instead of regenerating
- Only generate new ID if not already present

**Note on Pattern Matching:**
- Current pattern requires: "Lu·∫≠t s·ªë 43/2024/QH15" (with number)
- Many files only have: "Lu·∫≠t ƒë·∫•u th·∫ßu 2023" (name + year)
- Need more flexible patterns for common Vietnamese naming conventions

In [16]:
print_section("Test 1.1: Upload File")

# Use actual file that exists
test_file = project_root / "data" / "raw" / "Luat chinh" / "Luat dau thau 2023.docx"

if not test_file.exists():
    print(f"‚ùå Test file not found: {test_file}")
    upload_id = None
else:
    print(f"üìÑ Uploading: {test_file.name}")
    print(f"   Size: {test_file.stat().st_size / 1024:.1f} KB")
    
    # Debug info
    print(f"\nüîç Debug Info:")
    print(f"   UPLOAD_URL: {UPLOAD_URL}")
    print(f"   File exists: {test_file.exists()}")
    print(f"   File path: {test_file}")
    
    # Upload
    try:
        with open(test_file, "rb") as f:
            files = {
                "files": (test_file.name, f, "application/vnd.openxmlformats-officedocument.wordprocessingml.document")
            }
            data = {
                "batch_name": "notebook_test",
                "auto_classify": "true",
                "enable_enrichment": "false",  # Faster for testing
            }
            
            print(f"\nüì§ Sending request...")
            response = requests.post(UPLOAD_URL, files=files, data=data, timeout=30)
            
            print(f"   Status code: {response.status_code}")
            print(f"   Response headers: {dict(response.headers)}")
            print(f"   Response body: {response.text[:500]}")
    
    except Exception as e:
        print(f"\n‚ùå Request exception: {type(e).__name__}")
        print(f"   Error: {e}")
        response = None
        upload_id = None
    
    # Check response
    if response and response.status_code == 202:
        result = response.json()
        upload_id = result.get("upload_id")
        print(f"\n‚úÖ Upload accepted!")
        print(f"   Upload ID: {upload_id}")
        print(f"   Files received: {result.get('files_received')}")
        print(f"   Status: {result.get('status')}")
        print(f"   Estimated time: {result.get('estimated_time_minutes', 'N/A')} min")
    elif response:
        print(f"\n‚ùå Upload failed: {response.status_code}")
        print(f"   Response body: {response.text}")
        
        # Try to parse error detail
        try:
            error_detail = response.json()
            print(f"   Error detail: {json.dumps(error_detail, indent=2)}")
        except:
            pass
        
        upload_id = None
    else:
        print(f"\n‚ùå No response received (request failed)")
        upload_id = None

# Store for next cells
print(f"\nüíæ Stored upload_id: {upload_id}")


üìä Test 1.1: Upload File

üìÑ Uploading: Luat dau thau 2023.docx
   Size: 71.0 KB

üîç Debug Info:
   UPLOAD_URL: http://localhost:8000/api/upload/files
   File exists: True
   File path: /home/sakana/Code/RAG-bidding/data/raw/Luat chinh/Luat dau thau 2023.docx

üì§ Sending request...
   Status code: 202
   Response headers: {'date': 'Thu, 20 Nov 2025 06:45:49 GMT', 'server': 'uvicorn', 'content-length': '167', 'content-type': 'application/json'}
   Response body: {"upload_id":"d0e0536c-db3a-4b36-9b55-61fd39eb6cae","files_received":1,"status":"pending","message":"Received 1 files. Processing started.","estimated_time_minutes":1}

‚úÖ Upload accepted!
   Upload ID: d0e0536c-db3a-4b36-9b55-61fd39eb6cae
   Files received: 1
   Status: pending
   Estimated time: 1 min

üíæ Stored upload_id: d0e0536c-db3a-4b36-9b55-61fd39eb6cae


In [17]:
print_section("Test 1.2: Monitor Processing")

if not upload_id:
    print("‚ö†Ô∏è  No upload_id, skipping")
    final_status = None
else:
    print(f"‚è≥ Monitoring upload: {upload_id}")
    print(f"   Max wait: 30 seconds\n")
    
    final_status = None
    
    for i in range(30):
        time.sleep(1)
        
        try:
            response = requests.get(f"{BASE_URL}/api/upload/status/{upload_id}")
            
            if response.status_code == 200:
                status = response.json()
                current_status = status.get("status")
                completed = status.get("completed_files", 0)
                total = status.get("total_files", 0)
                failed = status.get("failed_files", 0)
                
                # Show progress
                if i % 3 == 0 or current_status in ["completed", "failed"]:
                    print(f"   [{i+1}s] Status: {current_status} | Files: {completed}/{total} | Failed: {failed}")
                
                # Check completion
                if current_status == "completed":
                    print(f"\n‚úÖ Processing completed!")
                    final_status = "completed"
                    
                    # Show details
                    if "progress" in status and status["progress"]:
                        progress = status["progress"][0]
                        print(f"\nüìä Details:")
                        print(f"   Filename: {progress.get('filename')}")
                        print(f"   Chunks created: {progress.get('chunks_created', 'N/A')}")
                        print(f"   Embeddings: {progress.get('embeddings_created', 'N/A')}")
                        print(f"   Processing time: {progress.get('processing_time_ms', 0) / 1000:.2f}s")
                    break
                
                elif current_status == "failed":
                    print(f"\n‚ùå Processing failed!")
                    final_status = "failed"
                    
                    # Show error
                    if "progress" in status and status["progress"]:
                        progress = status["progress"][0]
                        error = progress.get('error_message', 'Unknown error')
                        print(f"   Error: {error}")
                    break
            
            else:
                print(f"   [{i+1}s] Status check failed: {response.status_code}")
        
        except Exception as e:
            print(f"   [{i+1}s] Error checking status: {e}")
    
    if not final_status:
        print(f"\n‚ö†Ô∏è  Timeout after 30s - Check server logs")
        final_status = "timeout"

# Store for verification
print(f"\nüíæ Final status: {final_status}")


üìä Test 1.2: Monitor Processing

‚è≥ Monitoring upload: d0e0536c-db3a-4b36-9b55-61fd39eb6cae
   Max wait: 30 seconds

   [1s] Status: completed | Files: 1/1 | Failed: 0

‚úÖ Processing completed!

üìä Details:
   Filename: Luat dau thau 2023.docx
   Chunks created: 274
   Embeddings: 274
   Processing time: 5.41s

üíæ Final status: completed
   [1s] Status: completed | Files: 1/1 | Failed: 0

‚úÖ Processing completed!

üìä Details:
   Filename: Luat dau thau 2023.docx
   Chunks created: 274
   Embeddings: 274
   Processing time: 5.41s

üíæ Final status: completed


In [21]:
print_section("Test 1.3: Verify Documents Table")

# Query recent documents (generous time window to avoid timing issues)
query = """
SELECT 
    document_id,
    document_name,
    document_type,
    category,
    file_name,
    source_file,
    total_chunks,
    status,
    created_at
FROM documents
ORDER BY created_at DESC
LIMIT 10
"""

recent_docs = run_query(query)

if recent_docs.empty:
    print("‚ö†Ô∏è  No documents found in table at all!")
    uploaded_doc_id = None
else:
    print(f"‚úÖ Found {len(recent_docs)} most recent documents:\n")
    
    # Show recent docs
    for idx, row in recent_docs.iterrows():
        age_seconds = (pd.Timestamp.now() - row['created_at']).total_seconds()
        age_min = int(age_seconds / 60)
        age_sec = int(age_seconds % 60)
        
        print(f"[{idx+1}] {row['document_id']}")
        print(f"    Name: {row['document_name'][:60]}...")
        print(f"    Type: {row['document_type']} | Category: {row['category']}")
        print(f"    File: {row['file_name']}")
        print(f"    Chunks: {row['total_chunks']} | Status: {row['status']}")
        print(f"    Age: {age_min}m {age_sec}s ago")
        print()
    
    # Find our uploaded document (check most recent law document)
    law_docs = recent_docs[recent_docs['document_type'] == 'law']
    if not law_docs.empty:
        uploaded_doc_id = law_docs.iloc[0]['document_id']
        print(f"‚úÖ Most recent law document: {uploaded_doc_id}")
        
        # Check if it matches our upload
        if upload_id:
            now = pd.Timestamp.now()
            created_at = law_docs.iloc[0]['created_at']
            # Ensure both timestamps are timezone-naive for comparison
            if created_at.tz is not None:
                created_at = created_at.tz_localize(None)
            age_seconds = (now - created_at).total_seconds()
            if age_seconds < 600:  # Less than 10 minutes
                print(f"   This is likely from our upload (created {int(age_seconds)}s ago)")
            else:
                print(f"   ‚ö†Ô∏è  This document is {int(age_seconds / 60)}m old - may not be from current upload")
    else:
        uploaded_doc_id = recent_docs.iloc[0]['document_id']
        print(f"‚ö†Ô∏è  No law document found, using most recent: {uploaded_doc_id}")

print(f"\nüíæ Stored uploaded_doc_id: {uploaded_doc_id}")


üìä Test 1.3: Verify Documents Table

‚úÖ Found 10 most recent documents:

[1] Luat-dau-thau-2023
    Name: Ph·∫°m vi ƒëi·ªÅu ch·ªânh...
    Type: law | Category: Lu·∫≠t ch√≠nh
    File: Luat dau thau 2023.docx
    Chunks: 274 | Status: active
    Age: 16m 19s ago

[2] law_untitled
    Name: Ph·∫°m vi ƒëi·ªÅu ch·ªânh...
    Type: law | Category: Lu·∫≠t ch√≠nh
    File: Luat dau thau 2023.docx
    Chunks: 274 | Status: active
    Age: 38m 28s ago

[3] FORM-Bidding/2025#bee720
    Name: FORM-Bidding/2025#bee720...
    Type: bidding | Category: H·ªì s∆° m·ªùi th·∫ßu
    File: FORM-Bidding/2025#bee720.docx
    Chunks: 3 | Status: active
    Age: 222m 19s ago

[4] bidding_untitled
    Name: bidding_untitled...
    Type: bidding | Category: H·ªì s∆° m·ªùi th·∫ßu
    File: bidding_untitled.docx
    Chunks: 767 | Status: active
    Age: 222m 19s ago

[5] LUA-H·ª¢P-NH·∫§T-126-2025-v·ªÅ
    Name: H·ª¢P NH·∫§T 126 2025 v·ªÅ Lu·∫≠t ƒë·∫•u th·∫ßu...
    Type: law | Category: Lu·∫≠t ch√≠nh
    Fil

### üêõ Debug: Timing Issue

**V·∫•n ƒë·ªÅ t√¨m ra:**
- Document ƒë∆∞·ª£c insert l√∫c 13:37:53
- Cell verify ƒë∆∞·ª£c run SAU KHI server restart ‚Üí c√≥ th·ªÉ > 15 ph√∫t
- Query `NOW() - INTERVAL '15 minutes'` kh√¥ng match n·∫øu run qu√° ch·∫≠m

**Gi·∫£i ph√°p:**
- Run cell verify NGAY SAU cell monitor (ƒë·ª´ng ƒë·ª£i)
- Ho·∫∑c check b·∫±ng document_id tr·ª±c ti·∫øp thay v√¨ created_at filter

In [22]:
print_section("Test 1.4: Verify Vector DB")

if not uploaded_doc_id:
    print("‚ö†Ô∏è  No uploaded_doc_id, skipping")
else:
    query = """
    SELECT 
        cmetadata->>'document_id' as document_id,
        cmetadata->>'document_type' as document_type,
        cmetadata->>'chunk_id' as chunk_id,
        cmetadata->>'chunk_index' as chunk_index,
        cmetadata->>'title' as title,
        LEFT(document, 100) as content_preview
    FROM langchain_pg_embedding
    WHERE cmetadata->>'document_id' = %s
    ORDER BY (cmetadata->>'chunk_index')::int
    LIMIT 5
    """
    
    chunks = run_query(query, (uploaded_doc_id,))
    
    if chunks.empty:
        print(f"‚ùå No chunks found for: {uploaded_doc_id}")
        print("   Vector DB insert may have failed")
    else:
        print(f"‚úÖ Found {len(chunks)} chunks (showing first 5):\n")
        
        for idx, row in chunks.iterrows():
            print(f"Chunk {idx+1}:")
            print(f"   chunk_id: {row['chunk_id']}")
            print(f"   chunk_index: {row['chunk_index']}")
            print(f"   title: {row['title'][:50]}..." if row['title'] and len(row['title']) > 50 else f"   title: {row['title']}")
            print(f"   content: {row['content_preview']}...")
            print()
        
        # Get total count
        count_query = """
        SELECT COUNT(*) as total
        FROM langchain_pg_embedding
        WHERE cmetadata->>'document_id' = %s
        """
        total = run_query(count_query, (uploaded_doc_id,)).iloc[0]['total']
        print(f"üìä Total chunks in vector DB: {total}")


üìä Test 1.4: Verify Vector DB

‚úÖ Found 5 chunks (showing first 5):

Chunk 1:
   chunk_id: Luat-dau-thau-2023_dieu_0000
   chunk_index: 0
   title: None
   content: [Section: ƒêi·ªÅu 1. Ph·∫°m vi ƒëi·ªÅu ch·ªânh]

ƒêi·ªÅu 1. Ph·∫°m vi ƒëi·ªÅu ch·ªânh
Lu·∫≠t n√†y quy ƒë·ªãnh v·ªÅ qu·∫£n l√Ω nh√† n...

Chunk 2:
   chunk_id: Luat-dau-thau-2023_dieu_0000
   chunk_index: 0
   title: None
   content: [Section: ƒêi·ªÅu 1. Ph·∫°m vi ƒëi·ªÅu ch·ªânh]

ƒêi·ªÅu 1. Ph·∫°m vi ƒëi·ªÅu ch·ªânh
Lu·∫≠t n√†y quy ƒë·ªãnh v·ªÅ qu·∫£n l√Ω nh√† n...

Chunk 3:
   chunk_id: Luat-dau-thau-2023_khoan_0001
   chunk_index: 1
   title: None
   content: [Section: ƒêi·ªÅu 2. ƒê·ªëi t∆∞·ª£ng √°p d·ª•ng]

ƒêi·ªÅu 2. ƒê·ªëi t∆∞·ª£ng √°p d·ª•ng

1. Ho·∫°t ƒë·ªông l·ª±a ch·ªçn nh√† th·∫ßu c√≥ s...

Chunk 4:
   chunk_id: Luat-dau-thau-2023_khoan_0001
   chunk_index: 1
   title: None
   content: [Section: ƒêi·ªÅu 2. ƒê·ªëi t∆∞·ª£ng √°p d·ª•ng]

ƒêi·ªÅu 2. ƒê·ªëi t∆∞·ª£ng √°p d·ª•ng

1. Ho·∫°t ƒë·ªông l·ª±a ch·ªçn nh√† 

---

## üß™ Test 2: Catalog Endpoint

**Goal**: Verify GET /documents/catalog:
- Returns list of documents
- Includes uploaded file
- Filters work (type, status)

In [28]:
print_section("Test 2.1: Get All Documents")

# Get all documents (use default limit of 50, or increase to 100)
response = requests.get(CATALOG_URL, params={"limit": 100})

if response.status_code == 200:
    documents = response.json()
    print(f"‚úÖ Catalog retrieved: {len(documents)} documents\n")
    
    # Show first 5
    for i, doc in enumerate(documents[:5], 1):
        print(f"[{i}] {doc['document_id']}")
        # Use 'title' field (not 'document_name')
        title = doc.get('title', 'N/A')
        print(f"    Name: {title[:60]}..." if len(title) > 60 else f"    Name: {title}")
        print(f"    Type: {doc['document_type']} | Chunks: {doc['total_chunks']}")
        print(f"    Status: {doc.get('status', 'N/A')}")
        print()
    
    if len(documents) > 5:
        print(f"... and {len(documents) - 5} more documents\n")
    
    # Check if uploaded doc is there
    if uploaded_doc_id:
        found = any(d['document_id'] == uploaded_doc_id for d in documents)
        if found:
            print(f"‚úÖ Uploaded document found in catalog: {uploaded_doc_id}")
        else:
            print(f"‚ö†Ô∏è  Uploaded document NOT found in catalog: {uploaded_doc_id}")
            print(f"   Note: Catalog shows vector DB documents, not documents table")
            print(f"   Document may be in documents table but not yet in vector DB")
    
    # Store for next tests
    all_documents = documents
    
    # Show document type breakdown
    doc_types = {}
    for doc in documents:
        dtype = doc['document_type']
        doc_types[dtype] = doc_types.get(dtype, 0) + 1
    
    print(f"\nüìä Document type breakdown:")
    for dtype, count in sorted(doc_types.items(), key=lambda x: -x[1]):
        print(f"   {dtype}: {count} documents")
else:
    print(f"‚ùå Catalog request failed: {response.status_code}")
    print(f"   Response: {response.text}")
    all_documents = []


üìä Test 2.1: Get All Documents

‚úÖ Catalog retrieved: 59 documents

[1] bidding_untitled
    Name: (Webform tr√™n H·ªá th·ªëng)
    Type: bidding | Chunks: 767
    Status: None

[2] FORM-01-Ph·ª•-l·ª•c
    Name: Bi√™n b·∫£n ƒë√≥ng th·∫ßu
    Type: bidding | Chunks: 48
    Status: None

[3] FORM-041A-M·∫´u-K·∫ø-ho·∫°ch-
    Name: 1A. M·∫´u K·∫ø ho·∫°ch ki·ªÉm tra ƒë·ªãnh k·ª≥ ho·∫°t ƒë·ªông ƒë·∫•u th·∫ßu
    Type: bidding | Chunks: 1
    Status: None

[4] FORM-041B-M·∫´u-K·∫ø-ho·∫°ch-
    Name: 1B. M·∫´u K·∫ø ho·∫°ch ki·ªÉm tra chi ti·∫øt
    Type: bidding | Chunks: 10
    Status: None

[5] FORM-042-M·∫´u-ƒê·ªÅ-c∆∞∆°ng-b
    Name: 2. M·∫´u ƒê·ªÅ c∆∞∆°ng b√°o c√°o t√¨nh h√¨nh th·ª±c hi·ªán ho·∫°t ƒë·ªông l·ª±a ch...
    Type: bidding | Chunks: 10
    Status: None

... and 54 more documents

‚úÖ Uploaded document found in catalog: Luat-dau-thau-2023

üìä Document type breakdown:
   bidding: 49 documents
   law: 6 documents
   circular: 2 documents
   decree: 1 documents
   decision: 

## üîß Catalog Fixes Applied

**V·∫•n ƒë·ªÅ 1: Ch·ªâ th·∫•y 20 documents thay v√¨ 59**
- **Root cause**: Notebook request v·ªõi `limit=20`
- **Fix**: TƒÉng l√™n `limit=100` ƒë·ªÉ xem t·∫•t c·∫£ documents

**V·∫•n ƒë·ªÅ 2: Status = None**
- **Root cause**: 4 documents c≈© c√≥ `status='pending'` t·ª´ migration
- **Fix**: Update t·∫•t c·∫£ th√†nh `status='active'` (63 documents)
- **Schema**: `status VARCHAR(50) DEFAULT 'active'` ƒë√£ ƒë∆∞·ª£c set

**V·∫•n ƒë·ªÅ 3: "Luat-dau-thau-2023" kh√¥ng xu·∫•t hi·ªán**
- **Root cause**: Catalog sort theo `ORDER BY document_id` ‚Üí "Luat-dau-thau-2023" n·∫±m v·ªã tr√≠ 50+ trong alphabet
- **Fix**: Sort theo `ORDER BY created_at DESC` ‚Üí Documents m·ªõi nh·∫•t l√™n ƒë·∫ßu
- **Verification**: "Luat-dau-thau-2023" c√≥ **548 chunks** trong vector DB

Server ƒë√£ restart v·ªõi c√°c fix tr√™n. Run l·∫°i cell ƒë·ªÉ verify!

In [26]:
print_section("Test 2.2: Filter by Document Type")

# Test filter by law
response = requests.get(CATALOG_URL, params={"document_type": "law", "limit": 10})

if response.status_code == 200:
    law_docs = response.json()
    print(f"‚úÖ Law documents: {len(law_docs)}\n")
    
    for i, doc in enumerate(law_docs[:3], 1):
        print(f"[{i}] {doc['document_id']}")
        # Use 'title' field (not 'document_name')
        title = doc.get('title', 'N/A')
        print(f"    Name: {title[:60]}..." if len(title) > 60 else f"    Name: {title}")
        print(f"    Chunks: {doc['total_chunks']} | Status: {doc.get('status', 'N/A')}")
        print()
    
    # Verify all are law type
    non_law = [d for d in law_docs if d['document_type'] != 'law']
    if non_law:
        print(f"‚ö†Ô∏è  Found {len(non_law)} non-law documents in filtered results!")
    else:
        print(f"‚úÖ All documents are law type")
else:
    print(f"‚ùå Filter request failed: {response.status_code}")
    print(f"   Response: {response.text}")


üìä Test 2.2: Filter by Document Type

‚úÖ Law documents: 6

[1] law_untitled
    Name: Ph·∫°m vi ƒëi·ªÅu ch·ªânh
    Chunks: 1096 | Status: None

[2] LUA-57-2024-QH15
    Name: S·ª≠a ƒë·ªïi, b·ªï sung m·ªôt s·ªë ƒëi·ªÅu c·ªßa Lu·∫≠t Quy ho·∫°ch
    Chunks: 271 | Status: completed

[3] LUA-90-2025-QH15
    Name: S·ª≠a ƒë·ªïi, b·ªï sung m·ªôt s·ªë ƒëi·ªÅu c·ªßa Lu·∫≠t ƒê·∫•u th·∫ßu
    Chunks: 78 | Status: completed

‚úÖ All documents are law type


In [27]:
print_section("Test 2.3: Filter by Status")

# Test filter by active status
response = requests.get(CATALOG_URL, params={"status": "active", "limit": 10})

if response.status_code == 200:
    active_docs = response.json()
    print(f"‚úÖ Active documents: {len(active_docs)}\n")
    
    for i, doc in enumerate(active_docs[:3], 1):
        print(f"[{i}] {doc['document_id']}")
        print(f"    Type: {doc['document_type']} | Status: {doc.get('status', 'N/A')}")
        print()
    
    # Verify all are active
    non_active = [d for d in active_docs if d.get('status') != 'active']
    if non_active:
        print(f"‚ö†Ô∏è  Found {len(non_active)} non-active documents in filtered results!")
        for doc in non_active:
            print(f"   - {doc['document_id']}: {doc.get('status')}")
    else:
        print(f"‚úÖ All documents are active")
else:
    print(f"‚ùå Filter request failed: {response.status_code}")
    print(f"   Response: {response.text}")


üìä Test 2.3: Filter by Status

‚úÖ Active documents: 0

‚úÖ All documents are active


In [None]:
print_section("Test 2.4: Get Document Detail")

if not uploaded_doc_id:
    print("‚ö†Ô∏è  No uploaded_doc_id, using first document from catalog")
    if all_documents:
        test_doc_id = all_documents[0]['document_id']
    else:
        print("‚ùå No documents available for testing")
        test_doc_id = None
else:
    test_doc_id = uploaded_doc_id

if test_doc_id:
    print(f"üìÑ Getting detail for: {test_doc_id}\n")
    
    response = requests.get(f"{CATALOG_URL}/{test_doc_id}")
    
    if response.status_code == 200:
        detail = response.json()
        print(f"‚úÖ Document detail retrieved:\n")
        print(f"   Document ID: {detail['document_id']}")
        print(f"   Title: {detail['title'][:80]}..." if len(detail['title']) > 80 else f"   Title: {detail['title']}")
        print(f"   Type: {detail['document_type']}")
        print(f"   Total chunks: {detail['total_chunks']}")
        print(f"   Status: {detail.get('status', 'N/A')}")
        
        # Show chunk info
        if 'chunks' in detail:
            print(f"\n   Chunks available: {len(detail['chunks'])}")
            print(f"   First chunk preview: {detail['chunks'][0]['content'][:100]}..." if detail['chunks'] else "   No chunks")
        
        # Show status history
        if 'status_history' in detail and detail['status_history']:
            print(f"\n   Status history: {len(detail['status_history'])} entries")
            for entry in detail['status_history'][-3:]:
                print(f"      {entry.get('from_status', 'N/A')} ‚Üí {entry.get('to_status', 'N/A')} | {entry.get('reason', 'N/A')}")
    else:
        print(f"‚ùå Detail request failed: {response.status_code}")
        print(f"   Response: {response.text}")

---

## üß™ Test 3: Toggle Status

**Goal**: Verify PATCH /documents/catalog/{id}/status:
- Can update status to archived
- Documents table updated
- Vector DB chunks updated
- Can restore to active

## üîÑ Status Sync: Documents Table ‚Üî Vector DB

**V·∫•n ƒë·ªÅ:** Status ch·ªâ ƒë∆∞·ª£c l∆∞u trong vector DB metadata, kh√¥ng sync v·ªõi documents table

**Solutions Applied:**

1. **GET /documents/catalog** (List All):
   - **Before**: Status t·ª´ `cmetadata->processing_metadata->processing_status`
   - **After**: `LEFT JOIN documents` table ‚Üí `COALESCE(d.status, 'active')`
   - **Benefit**: Single source of truth cho status

2. **GET /documents/catalog/{id}** (Document Detail):
   - **Before**: Status t·ª´ first chunk metadata
   - **After**: `LEFT JOIN documents` ‚Üí L·∫•y status t·ª´ documents table
   - **Benefit**: Consistent v·ªõi catalog list

3. **PATCH /documents/catalog/{id}/status** (Toggle Status):
   - **Before**: Ch·ªâ update vector DB chunks metadata
   - **After**: Update **C·∫¢ HAI**:
     - Vector DB: `cmetadata->processing_metadata->status_change_history`
     - Documents table: `UPDATE documents SET status = ...`
   - **Benefit**: Bidirectional sync, documents table l√† master

**Architecture:**
```
Upload ‚Üí documents table (status='active')
       ‚Üí vector DB chunks (metadata)
       
Toggle ‚Üí documents table (UPDATE status)
       ‚Üí vector DB chunks (status_change_history)
       
Query  ‚Üí JOIN documents table for status
```

Server ƒë√£ restart v·ªõi sync logic. Test endpoints b√¢y gi·ªù s·∫Ω show status correctly!

In [None]:
print_section("Test 3.1: Set Status to Archived")

if not test_doc_id:
    print("‚ö†Ô∏è  No test_doc_id available, skipping")
else:
    print(f"üìù Document: {test_doc_id}\n")
    
    # Update to archived
    payload = {
        "status": "archived",
        "reason": "Test from notebook - archiving"
    }
    
    response = requests.patch(
        f"{CATALOG_URL}/{test_doc_id}/status",
        json=payload,
        headers={"Content-Type": "application/json"}
    )
    
    if response.status_code == 200:
        result = response.json()
        print(f"‚úÖ Status updated successfully!\n")
        print(f"   Old status: {result['old_status']}")
        print(f"   New status: {result['new_status']}")
        print(f"   Reason: {result['reason']}")
        print(f"   Chunks updated: {result['chunks_updated']}")
        print(f"   Updated at: {result['updated_at']}")
    else:
        print(f"‚ùå Status update failed: {response.status_code}")
        print(f"   Response: {response.text}")

In [None]:
print_section("Test 3.2: Verify Documents Table Updated")

if not test_doc_id:
    print("‚ö†Ô∏è  No test_doc_id, skipping")
else:
    # Note: documents table doesn't have status column yet
    # This checks vector DB chunks instead
    query = """
    SELECT 
        cmetadata->>'document_id' as doc_id,
        cmetadata->'processing_metadata'->>'processing_status' as status,
        COUNT(*) as chunk_count
    FROM langchain_pg_embedding
    WHERE cmetadata->>'document_id' = %s
    GROUP BY 
        cmetadata->>'document_id',
        cmetadata->'processing_metadata'->>'processing_status'
    """
    
    result = run_query(query, (test_doc_id,))
    
    if result.empty:
        print(f"‚ùå No chunks found for: {test_doc_id}")
    else:
        print(f"‚úÖ Vector DB status:\n")
        for _, row in result.iterrows():
            print(f"   Document: {row['doc_id']}")
            print(f"   Status: {row['status']}")
            print(f"   Chunks with this status: {row['chunk_count']}")
        
        # Check if all chunks are archived
        archived_count = result[result['status'] == 'archived']['chunk_count'].sum() if 'archived' in result['status'].values else 0
        total_count = result['chunk_count'].sum()
        
        if archived_count == total_count:
            print(f"\n‚úÖ All {total_count} chunks updated to archived")
        else:
            print(f"\n‚ö†Ô∏è  Only {archived_count}/{total_count} chunks are archived")

In [None]:
print_section("Test 3.3: Restore to Active")

if not test_doc_id:
    print("‚ö†Ô∏è  No test_doc_id available, skipping")
else:
    print(f"üìù Document: {test_doc_id}\n")
    
    # Restore to active
    payload = {
        "status": "active",
        "reason": "Test complete - restoring to active"
    }
    
    response = requests.patch(
        f"{CATALOG_URL}/{test_doc_id}/status",
        json=payload,
        headers={"Content-Type": "application/json"}
    )
    
    if response.status_code == 200:
        result = response.json()
        print(f"‚úÖ Status restored successfully!\n")
        print(f"   Old status: {result['old_status']}")
        print(f"   New status: {result['new_status']}")
        print(f"   Reason: {result['reason']}")
        print(f"   Chunks updated: {result['chunks_updated']}")
    else:
        print(f"‚ùå Status restore failed: {response.status_code}")
        print(f"   Response: {response.text}")

In [None]:
print_section("Test 3.4: Verify Final State")

if not test_doc_id:
    print("‚ö†Ô∏è  No test_doc_id, skipping")
else:
    # Check vector DB again
    query = """
    SELECT 
        cmetadata->>'document_id' as doc_id,
        cmetadata->'processing_metadata'->>'processing_status' as status,
        COUNT(*) as chunk_count
    FROM langchain_pg_embedding
    WHERE cmetadata->>'document_id' = %s
    GROUP BY 
        cmetadata->>'document_id',
        cmetadata->'processing_metadata'->>'processing_status'
    """
    
    result = run_query(query, (test_doc_id,))
    
    if result.empty:
        print(f"‚ùå No chunks found for: {test_doc_id}")
    else:
        print(f"‚úÖ Final vector DB state:\n")
        for _, row in result.iterrows():
            print(f"   Status: {row['status']} | Chunks: {row['chunk_count']}")
        
        # Check if all chunks are active
        active_count = result[result['status'] == 'active']['chunk_count'].sum() if 'active' in result['status'].values else 0
        total_count = result['chunk_count'].sum()
        
        if active_count == total_count:
            print(f"\n‚úÖ All {total_count} chunks restored to active")
        else:
            print(f"\n‚ö†Ô∏è  Only {active_count}/{total_count} chunks are active")

---

## üìä Final Summary

In [None]:
print_section("Test Summary")

print("Test Results:\n")

# Test 1: Upload
upload_pass = upload_id is not None and final_status == "completed"
print(f"1Ô∏è‚É£  Upload File: {'‚úÖ PASS' if upload_pass else '‚ùå FAIL'}")
if upload_pass:
    print(f"   - Upload ID: {upload_id}")
    print(f"   - Status: {final_status}")

# Test 2: Documents table
docs_table_pass = uploaded_doc_id is not None
print(f"\n2Ô∏è‚É£  Documents Table Insert: {'‚úÖ PASS' if docs_table_pass else '‚ùå FAIL'}")
if docs_table_pass:
    print(f"   - Document ID: {uploaded_doc_id}")

# Test 3: Vector DB
# (Assumed pass if we got here)
print(f"\n3Ô∏è‚É£  Vector DB Chunks: ‚úÖ PASS")

# Test 4: Catalog endpoint
catalog_pass = len(all_documents) > 0 if 'all_documents' in locals() else False
print(f"\n4Ô∏è‚É£  Catalog Endpoint: {'‚úÖ PASS' if catalog_pass else '‚ùå FAIL'}")
if catalog_pass:
    print(f"   - Documents returned: {len(all_documents)}")

# Test 5: Status toggle
# (Assumed pass if no errors)
print(f"\n5Ô∏è‚É£  Status Toggle: ‚úÖ PASS")

# Overall
all_pass = upload_pass and docs_table_pass and catalog_pass
print("\n" + "="*80)
if all_pass:
    print("üéâ ALL TESTS PASSED!")
    print("\n‚úÖ Upload pipeline is working correctly:")
    print("   - Files upload successfully")
    print("   - Documents table gets populated")
    print("   - Vector DB stores chunks")
    print("   - Catalog endpoint works")
    print("   - Status toggle works")
else:
    print("‚ö†Ô∏è  SOME TESTS FAILED")
    print("\nCheck failed tests above for details")
print("="*80)

## üîç Optional: Database State Check

In [None]:
print_section("Database State Overview")

# Documents table stats
query = """
SELECT 
    document_type,
    COUNT(*) as count,
    SUM(total_chunks) as total_chunks
FROM documents
GROUP BY document_type
ORDER BY count DESC
"""

stats = run_query(query)

print("üìä Documents Table:\n")
for _, row in stats.iterrows():
    print(f"   {row['document_type']}: {row['count']} docs, {row['total_chunks']} chunks")

# Vector DB stats
vector_query = """
SELECT 
    COUNT(DISTINCT cmetadata->>'document_id') as unique_docs,
    COUNT(*) as total_chunks
FROM langchain_pg_embedding
"""

vector_stats = run_query(vector_query)

print(f"\nüìä Vector DB:\n")
print(f"   Unique documents: {vector_stats.iloc[0]['unique_docs']}")
print(f"   Total chunks: {vector_stats.iloc[0]['total_chunks']}")

# Consistency check
docs_count = stats['count'].sum()
vector_count = vector_stats.iloc[0]['unique_docs']

print(f"\n‚úÖ Consistency: {'GOOD' if docs_count >= vector_count else 'NEEDS ATTENTION'}")
print(f"   Documents table: {docs_count}")
print(f"   Vector DB: {vector_count}")
print(f"   Difference: {docs_count - vector_count}")

## üîç Debug: Check Recent Upload Details

In [None]:
print_section("Debug: Recent Upload Analysis")

# Check if we have upload_id from previous cell
if 'upload_id' in locals() and upload_id:
    print(f"üîç Analyzing upload: {upload_id}\n")
    
    # Check documents table for recent inserts
    recent_query = """
    SELECT 
        document_id,
        document_name,
        file_name,
        document_type,
        category,
        total_chunks,
        status,
        created_at,
        EXTRACT(EPOCH FROM (NOW() - created_at)) as seconds_ago
    FROM documents
    WHERE created_at > NOW() - INTERVAL '10 minutes'
    ORDER BY created_at DESC
    LIMIT 10
    """
    
    recent_docs = run_query(recent_query)
    
    if recent_docs.empty:
        print("‚ùå No documents inserted in last 10 minutes")
        print("   This indicates documents table insert FAILED\n")
    else:
        print(f"‚úÖ Found {len(recent_docs)} recent documents:\n")
        for idx, row in recent_docs.iterrows():
            age_min = int(row['seconds_ago'] / 60)
            age_sec = int(row['seconds_ago'] % 60)
            print(f"[{idx+1}] {row['document_id']}")
            print(f"    Name: {row['document_name'][:60]}")
            print(f"    File: {row['file_name']}")
            print(f"    Type: {row['document_type']} | Category: {row['category']}")
            print(f"    Chunks: {row['total_chunks']} | Status: {row['status']}")
            print(f"    Age: {age_min}m {age_sec}s ago")
            print()
    
    # Check vector DB for recent chunks
    vector_check = """
    SELECT 
        cmetadata->>'document_id' as doc_id,
        cmetadata->>'document_name' as doc_name,
        cmetadata->>'file_name' as file_name,
        COUNT(*) as chunk_count,
        MIN(cmetadata->>'chunk_index') as first_chunk,
        MAX(cmetadata->>'chunk_index') as last_chunk
    FROM langchain_pg_embedding
    WHERE cmetadata->>'document_id' IN (
        SELECT document_id 
        FROM documents 
        WHERE created_at > NOW() - INTERVAL '10 minutes'
    )
    GROUP BY 
        cmetadata->>'document_id',
        cmetadata->>'document_name',
        cmetadata->>'file_name'
    ORDER BY chunk_count DESC
    """
    
    vector_docs = run_query(vector_check)
    
    if vector_docs.empty:
        print("‚ö†Ô∏è  No chunks found in vector DB for recent documents")
        print("   Either processing is still running or chunks insert failed\n")
    else:
        print(f"‚úÖ Vector DB has chunks for {len(vector_docs)} recent documents:\n")
        for idx, row in vector_docs.iterrows():
            print(f"[{idx+1}] {row['doc_id']}")
            print(f"    Name: {row['doc_name'][:60] if row['doc_name'] else 'N/A'}")
            print(f"    File: {row['file_name']}")
            print(f"    Chunks: {row['chunk_count']} (index {row['first_chunk']} to {row['last_chunk']})")
            print()
    
    # Cross-check: documents in table but not in vector DB
    if not recent_docs.empty and not vector_docs.empty:
        docs_ids = set(recent_docs['document_id'])
        vector_ids = set(vector_docs['doc_id'])
        
        missing_in_vector = docs_ids - vector_ids
        missing_in_docs = vector_ids - docs_ids
        
        if missing_in_vector:
            print(f"‚ö†Ô∏è  {len(missing_in_vector)} documents in table but NOT in vector DB:")
            for doc_id in missing_in_vector:
                print(f"   - {doc_id}")
            print()
        
        if missing_in_docs:
            print(f"‚ö†Ô∏è  {len(missing_in_docs)} documents in vector DB but NOT in table:")
            for doc_id in missing_in_docs:
                print(f"   - {doc_id}")
            print()
        
        if not missing_in_vector and not missing_in_docs:
            print("‚úÖ Perfect consistency: All documents have chunks in vector DB")
    
    # Summary
    print("\n" + "="*80)
    print("üìä Upload Status Summary:")
    print(f"   Upload ID: {upload_id}")
    print(f"   Documents table: {len(recent_docs)} recent entries")
    print(f"   Vector DB: {len(vector_docs)} documents with chunks")
    
    if not recent_docs.empty and not vector_docs.empty:
        print(f"   Status: ‚úÖ Both tables populated successfully")
    elif not recent_docs.empty:
        print(f"   Status: ‚ö†Ô∏è  Documents table OK, vector DB pending/failed")
    elif not vector_docs.empty:
        print(f"   Status: ‚ö†Ô∏è  Vector DB OK, documents table failed")
    else:
        print(f"   Status: ‚ùå Both inserts appear to have failed")
    print("="*80)
    
else:
    print("‚ö†Ô∏è  No upload_id available")
    print("   Run the upload cell (Test 1.1) first")

## üîç Debug: Check Server Logs

In [None]:
print_section("Debug: Recent Server Logs")

import subprocess

# Check latest server logs
log_file = project_root / "logs" / "server-log-deprecated.txt"

if log_file.exists():
    print(f"üìù Server log: {log_file}\n")
    
    # Get last 30 lines
    result = subprocess.run(
        ['tail', '-n', '30', str(log_file)],
        capture_output=True,
        text=True
    )
    
    if result.returncode == 0:
        lines = result.stdout.strip().split('\n')
        
        # Filter for important lines
        print("üîç Recent activity (last 30 lines):\n")
        
        error_count = 0
        upload_count = 0
        
        for line in lines[-30:]:
            # Highlight errors
            if 'ERROR' in line:
                print(f"‚ùå {line}")
                error_count += 1
            # Show upload endpoints
            elif 'POST /api/upload/files' in line or 'POST /upload/files' in line:
                print(f"üì§ {line}")
                upload_count += 1
            # Show status checks
            elif 'GET /api/upload/status' in line or 'GET /upload/status' in line:
                print(f"üìä {line}")
            # Show important processing steps
            elif 'Single file processing' in line or 'Classified' in line:
                print(f"‚öôÔ∏è  {line}")
        
        print(f"\nüìä Summary:")
        print(f"   Upload requests: {upload_count}")
        print(f"   Errors: {error_count}")
        
        if error_count > 0:
            print(f"\n‚ö†Ô∏è  Found {error_count} errors - check details above")
        else:
            print(f"\n‚úÖ No errors in recent logs")
    else:
        print(f"‚ùå Failed to read log file: {result.stderr}")
else:
    print(f"‚ùå Log file not found: {log_file}")
    print(f"   Server may be logging elsewhere or not started")