# Service Diagnostics

Run this notebook FIRST to verify all services are up and accessible.

In [None]:
# Connection settings shared by the diagnostic cells in this notebook.
# In-cluster hostnames follow the pattern <service>.<namespace>.svc.cluster.local,
# so both endpoints are built from NAMESPACE.
NAMESPACE = "servicenow-ai-poc"
DB_HOST = f"postgres-pgvector.{NAMESPACE}.svc.cluster.local"
DB_PORT = "5432"  # stored as a string; convert with int() where a numeric port is required
SERVICE_URL = f"http://vector-search-service.{NAMESPACE}.svc.cluster.local:8000"

## Test 1: DNS Resolution

In [None]:
import os
import socket
from urllib.parse import urlsplit

# For each backing service: resolve its hostname, then attempt a TCP connection
# to the resolved address. Results are printed; nothing is raised.
print("Testing DNS resolution...\n")

# Derive the vector-search endpoint from SERVICE_URL so this test probes the
# same host and port that the HTTP tests use, instead of a second hard-coded copy.
service_endpoint = urlsplit(SERVICE_URL)
service_port = service_endpoint.port or (443 if service_endpoint.scheme == "https" else 80)

services_to_test = [
    ("PostgreSQL", DB_HOST, int(DB_PORT)),
    ("Vector Search", service_endpoint.hostname, service_port),
]

for name, host, port in services_to_test:
    print(f"Testing {name}: {host}:{port}")
    try:
        # Step 1: DNS resolution (IPv4, matching the AF_INET socket below).
        ip = socket.gethostbyname(host)
        print(f"  DNS: SUCCESS - Resolved to {ip}")

        # Step 2: TCP connection. The `with` block guarantees the socket is
        # closed even if connect_ex raises. Connecting to the already-resolved
        # IP avoids a second DNS lookup.
        with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as sock:
            sock.settimeout(5)
            result = sock.connect_ex((ip, port))  # returns 0 on success, an errno otherwise

        if result == 0:
            print(f"  TCP:  SUCCESS - Port {port} is open")
        else:
            print(f"  TCP:  FAILED - Port {port} is closed or filtered")
            # The errno distinguishes e.g. a refused connection from a timeout.
            print(f"        errno {result}: {os.strerror(result)}")

    except socket.gaierror as e:
        # Name resolution failed; the TCP step was never attempted.
        print(f"  DNS:  FAILED - {e}")
    except Exception as e:
        print(f"  ERROR: {e}")
    print()

## Test 2: Check PostgreSQL (if DNS works)

In [None]:
import getpass
import os
import traceback
from contextlib import closing

# Connect to PostgreSQL, print the server version, list the tables in the
# `public` schema and report whether the required tables are present.
# Any failure is printed with a traceback instead of being raised.
try:
    import psycopg2

    DB_USER = "raguser"
    DB_NAME = "ragdb"
    # The password is never stored in the notebook: it is read from the
    # DB_PASSWORD environment variable, or prompted for if that is unset/empty.
    DB_PASSWORD = os.environ.get("DB_PASSWORD") or getpass.getpass(
        f"PostgreSQL password for {DB_USER}: "
    )

    print(f"Connecting to PostgreSQL: {DB_HOST}:{DB_PORT}/{DB_NAME}")

    # closing() guarantees the cursor and connection are closed even when a
    # query fails. (psycopg2's own `with conn:` only ends the transaction; it
    # does not close the connection.)
    with closing(
        psycopg2.connect(
            host=DB_HOST,
            port=DB_PORT,
            user=DB_USER,
            password=DB_PASSWORD,
            database=DB_NAME,
            connect_timeout=10,
        )
    ) as conn, closing(conn.cursor()) as cur:
        cur.execute("SELECT version()")
        version = cur.fetchone()[0]

        print("SUCCESS: Connected to PostgreSQL")
        print(f"Version: {version}")

        # Check for required tables
        cur.execute("""
            SELECT table_name
            FROM information_schema.tables
            WHERE table_schema = 'public'
            ORDER BY table_name
        """)
        tables = [row[0] for row in cur.fetchall()]

        print("\nTables in database:")
        for table in tables:
            print(f"  - {table}")

        required_tables = ['collections', 'documents', 'embeddings']
        missing = [t for t in required_tables if t not in tables]

        if missing:
            print(f"\nWARNING: Missing required tables: {missing}")
        else:
            print("\nSUCCESS: All required tables exist")

except Exception as e:
    # Covers a missing psycopg2 install, an unavailable password prompt,
    # connection/authentication errors and query failures.
    print(f"FAILED: {e}")
    traceback.print_exc()

## Test 3: Check Vector Search Service Health

In [None]:
import traceback

import requests

# Call the service's health endpoint and report the HTTP status, the raw body
# and (when the body is JSON) the parsed payload. Failures are printed, not raised.
print(f"Testing vector-search-service at: {SERVICE_URL}")

try:
    # Test CORRECT health endpoint (with /api/v1 prefix)
    response = requests.get(f"{SERVICE_URL}/api/v1/health", timeout=10)
    print(f"\nHealth check status: {response.status_code}")
    print(f"Response: {response.text}")

    if response.status_code == 200:
        print("SUCCESS: Service is up")
        # The body is not guaranteed to be JSON. response.json() raises a
        # ValueError subclass on a parse failure, so catch only that rather
        # than hiding every possible error.
        try:
            health_data = response.json()
        except ValueError:
            print("NOTE: Health response body is not valid JSON")
        else:
            print(f"Health data: {health_data}")
    else:
        print("WARNING: Unexpected status code")

except requests.exceptions.ConnectionError as e:
    print("FAILED: Cannot connect to service")
    print(f"Error: {e}")
    print("\nPossible causes:")
    print("  1. Service is not running")
    print("  2. Service crashed during startup")
    print("  3. Service can't connect to database")

except requests.exceptions.Timeout as e:
    # Connect timeouts are also ConnectionErrors and are handled above; this
    # branch covers a service that accepted the connection but never answered.
    print("FAILED: Service did not respond within 10 seconds")
    print(f"Error: {e}")

except Exception as e:
    print(f"ERROR: {e}")
    traceback.print_exc()

## Test 4: Check if collections exist

In [None]:
import requests

# Ask the service for its collections. On HTTP 200 the parsed JSON body is
# printed; on any other status the status code and raw body are printed.
# Any exception (network, JSON parsing, ...) is reported as a single line.
try:
    response = requests.get(f"{SERVICE_URL}/api/v1/collections", timeout=10)

    if response.status_code != 200:
        print(f"Status: {response.status_code}")
        print(f"Response: {response.text}")
    else:
        collections = response.json()
        print(f"Collections: {collections}")

except Exception as exc:
    print(f"Cannot list collections: {exc}")

## Test 5: Use kubectl/oc to check pods (if available)

In [None]:
import shutil
import subprocess

# List the pods in NAMESPACE using whichever cluster CLI is installed.
# `oc` is preferred; `kubectl` is used as a fallback, since both accept
# `get pods -n <namespace>`.
cli = next((name for name in ("oc", "kubectl") if shutil.which(name)), None)

if cli is None:
    print("oc/kubectl command not available in this environment")
else:
    try:
        result = subprocess.run(
            [cli, "get", "pods", "-n", NAMESPACE],
            capture_output=True,
            text=True,
            timeout=10
        )

        if result.returncode == 0:
            print("Pods in namespace:")
            print(result.stdout)
        else:
            # The CLI exists but the call failed (not logged in, no permission, ...);
            # stderr carries the reason.
            print(f"{cli} command failed (exit code {result.returncode})")
            print(result.stderr)

    except FileNotFoundError:
        # The executable disappeared between the lookup and the call.
        print("oc/kubectl command not available in this environment")
    except subprocess.TimeoutExpired:
        print(f"{cli} did not finish within 10 seconds")
    except Exception as e:
        print(f"Error: {e}")

## Summary

Based on the tests above:

**If DNS fails**: The service that failed to resolve (PostgreSQL or vector-search) doesn't exist or is in a different namespace

**If DNS works but TCP connection fails**: The service exists, but nothing is listening on that port or traffic to it is being filtered

**If PostgreSQL connection fails**: Wrong credentials, or database not fully initialized

**If vector-search health check fails**: Service crashed during startup (usually due to database connection issues)

**Next steps:**
1. Fix any DNS/network issues first
2. Ensure PostgreSQL is running and has the correct schema
3. Restart vector-search-service if needed
4. Then try the ingestion notebook again

## Test 6: Check Pod Logs

This shows the most recent vector-search-service log lines, which should help identify the error if the service crashed.

In [None]:
import subprocess

# Fetch the last 100 log lines of the vector-search-service deployment via `oc`
# and print them between two separator rules. If the output mentions "error"
# or "exception" (case-insensitive), an extra warning line is printed.
try:
    result = subprocess.run(
        ["oc", "logs", "deployment/vector-search-service", "-n", NAMESPACE, "--tail=100"],
        capture_output=True,
        text=True,
        timeout=30,
    )

    if result.returncode != 0:
        # `oc` ran but reported a failure; its stderr holds the reason.
        print("Failed to get logs")
        print(result.stderr)
    else:
        divider = "=" * 80
        print("Recent vector-search-service logs:")
        print(divider)
        print(result.stdout)
        print(divider)

        # Look for error keywords
        lowered_logs = result.stdout.lower()
        if any(keyword in lowered_logs for keyword in ("error", "exception")):
            print("\n⚠️  ERRORS/EXCEPTIONS FOUND IN LOGS")

except FileNotFoundError:
    # Raised by subprocess.run when the `oc` executable cannot be found.
    print("oc command not available in this environment")
except Exception as exc:
    print(f"Error getting logs: {exc}")