In [4]:
# Cell 1: Setup - Fixed imports
import sys
from pathlib import Path

# Add src directory to path for imports
notebook_dir = Path().resolve()
src_dir = notebook_dir.parent / "src"
sys.path.insert(0, str(src_dir))

print(f"Notebook directory: {notebook_dir}")
print(f"Source directory: {src_dir}")
print(f"Source exists: {src_dir.exists()}")

import pandas as pd
import numpy as np

# Import our modules
from database.chromadb_manager import ChromaDBManager, load_publications_from_json
from utils.logger import setup_logger, get_logger

# Setup logging
setup_logger(level="INFO")
logger = get_logger(__name__)

print("🚀 ChromaDB Setup Test")
print("✅ All imports successful!")

Notebook directory: /home/santi/Projects/UBMI-IFC-Podcast/notebooks
Source directory: /home/santi/Projects/UBMI-IFC-Podcast/src
Source exists: True


ImportError: attempted relative import beyond top-level package

In [None]:
# Cell 2: Check if data exists
data_file = Path("../data/raw/all_ifc_publications.json")

if data_file.exists():
    print(f"✅ Found publication data: {data_file}")
    
    # Load publications
    publications = load_publications_from_json(str(data_file))
    print(f"📚 Loaded {len(publications)} publications")
    
    # Quick stats
    years = [pub.year for pub in publications]
    print(f"📅 Year range: {min(years)} - {max(years)}")
    print(f"📄 With abstracts: {sum(1 for pub in publications if pub.abstract)}")
else:
    print("❌ No publication data found. Please run the scraper first.")
    publications = []

In [None]:
# Cell 3: Initialize ChromaDB
if publications:
    print("🔧 Initializing ChromaDB Manager...")
    
    # Create ChromaDB manager
    db_manager = ChromaDBManager()
    
    # Create fresh collection
    print("📦 Creating collection...")
    db_manager.create_collection(reset=True)
    
    print("✅ ChromaDB initialized successfully!")
else:
    print("⚠️ Skipping ChromaDB setup - no publications loaded")

In [None]:
# Cell 4: Add publications to ChromaDB
if publications:
    print("🔄 Adding publications to ChromaDB...")
    print("This will take a few minutes to generate embeddings...")
    
    # Add publications (this generates embeddings)
    db_manager.add_publications(publications)
    
    # Check collection stats
    stats = db_manager.get_collection_stats()
    print("\n📊 Collection Statistics:")
    for key, value in stats.items():
        print(f"   {key}: {value}")
        
    print("\n✅ Publications successfully added to ChromaDB!")
else:
    print("⚠️ Skipping - no publications to add")

In [None]:
# Cell 5: Test similarity search
if publications:
    print("🔍 Testing similarity search...")
    
    # Test queries
    test_queries = [
        "neural mechanisms",
        "cardiac physiology",
        "protein structure",
        "gene expression",
        "brain activity"
    ]
    
    for query in test_queries:
        print(f"\n🔎 Query: '{query}'")
        results = db_manager.search_similar(query, n_results=3)
        
        if results['count'] > 0:
            for i, (doc, meta) in enumerate(zip(
                results['results']['documents'][0],
                results['results']['metadatas'][0]
            )):
                print(f"   {i+1}. {meta['title'][:60]}...")
                print(f"      Authors: {meta['authors'][:40]}...")
                print(f"      Year: {meta['year']}")
        else:
            print("   No results found")
    
    print("\n✅ Similarity search test completed!")
else:
    print("⚠️ Skipping search test - no data in ChromaDB")

In [None]:
# Cell 6: Export data for visualization
if publications:
    print("💾 Exporting embeddings for visualization...")
    
    output_path = db_manager.export_embeddings()
    print(f"📁 Exported to: {output_path}")
    
    # Load and display sample of exported data
    df = pd.read_csv(Path(output_path) / "publications_metadata.csv")
    print(f"\n📊 Exported DataFrame shape: {df.shape}")
    print("\nSample columns:")
    print(df.columns.tolist())
    print("\nFirst few rows:")
    print(df[['title', 'authors', 'year', 'has_abstract']].head())
    
    print("\n✅ Export completed!")
else:
    print("⚠️ Skipping export - no data available")