# Document Ingestion

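Ingest one or more files into PostgreSQL and vector storage using the project chunker. The notebook locates the project root, initializes the PostgreSQL and Neo4j connections, selects the files to ingest (an explicit list, or every `.pdf`, `.txt`, and `.docx` file found under `Data/`), processes each file, and finally closes the database connections.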

In [1]:
from pathlib import Path
import os
import sys

def _find_project_root(start: Path) -> Path:
    """Return the nearest directory at or above ``start`` that contains ``backend/``.

    Args:
        start: Absolute directory to begin the search from.

    Returns:
        ``start`` itself or the closest ancestor holding a ``backend`` directory.

    Raises:
        FileNotFoundError: If neither ``start`` nor any ancestor contains a
            ``backend`` directory.
    """
    for candidate in (start, *start.parents):
        if (candidate / 'backend').is_dir():
            return candidate
    raise FileNotFoundError(
        f"Could not find a 'backend' directory in {start} or any of its parents"
    )


ROOT = Path.cwd().resolve()

# Search upwards instead of checking only the cwd and a 'notebook' folder.
# The os.chdir() below moves the kernel into backend/, so on a re-run of this
# cell the cwd is backend/ itself; the upward walk still lands on the same
# project root, which keeps the cell idempotent.
PROJECT_ROOT = _find_project_root(ROOT)

BACKEND_DIR = PROJECT_ROOT / 'backend'
DATA_DIR = PROJECT_ROOT / 'Data'

# Put backend/ at the front of sys.path (once) so packages under it are importable.
if str(BACKEND_DIR) not in sys.path:
    sys.path.insert(0, str(BACKEND_DIR))

# Make backend/ the working directory for the rest of the notebook.
os.chdir(BACKEND_DIR)
print('Data directory:', DATA_DIR)

Data directory: C:\Researchaiagent\Data


In [2]:
# Project-local imports; they resolve only after backend/ has been put on
# sys.path by the setup cell above.
from app.core.database import db_manager
from app.services.ingestion import chunker

# Initialise db_manager before any ingestion runs. init_postgres is awaited
# (top-level await relies on IPython's autoawait), while init_neo4j is invoked
# as a plain synchronous call.
# NOTE(review): presumably these open the PostgreSQL and Neo4j connections —
# confirm in app.core.database.
await db_manager.init_postgres()
db_manager.init_neo4j()

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# Option A: set explicit file paths
FILES_TO_INGEST = [
    # str(DATA_DIR / 'sample.pdf'),
]

# File extensions picked up by auto-discovery (matched case-insensitively).
SUPPORTED_SUFFIXES = {'.pdf', '.txt', '.docx'}

# Option B: auto-discover supported files in Data/
if not FILES_TO_INGEST:
    FILES_TO_INGEST = [
        str(path)
        # Sort so the ingestion order (and therefore the assigned document ids)
        # does not depend on the order the filesystem happens to list entries.
        for path in sorted(DATA_DIR.rglob('*'))
        # Skip directories whose names merely end in a supported suffix.
        if path.is_file() and path.suffix.lower() in SUPPORTED_SUFFIXES
    ]

print(f'Files selected: {len(FILES_TO_INGEST)}')
# Preview at most the first ten selected paths as the cell output.
FILES_TO_INGEST[:10]

Files selected: 10


['C:\\Researchaiagent\\Data\\A_New_Swarm_Intelligence_Coordination_Model_Inspir.pdf',
 'C:\\Researchaiagent\\Data\\Graph_databases.pdf',
 'C:\\Researchaiagent\\Data\\Holographic_data_storage_technology.pdf',
 'C:\\Researchaiagent\\Data\\Holographic_Storage.pdf',
 'C:\\Researchaiagent\\Data\\Large_language_Model.pdf',
 'C:\\Researchaiagent\\Data\\Neuromorphic_Memory.pdf',
 'C:\\Researchaiagent\\Data\\quantum inspired Retrival,searching.pdf',
 'C:\\Researchaiagent\\Data\\Speculative_RAG.pdf',
 'C:\\Researchaiagent\\Data\\Survey_SwarmIntelligence.pdf',
 'C:\\Researchaiagent\\Data\\Temporial_Knowledge_graph.pdf']

In [4]:
ingestion_results = []

for fp in FILES_TO_INGEST:
    try:
        doc_id = await chunker.process_document(fp)
        ingestion_results.append({'file': fp, 'status': 'ok', 'document_id': doc_id})
        print(f'OK: {fp} -> document_id={doc_id}')
    except Exception as exc:
        ingestion_results.append({'file': fp, 'status': 'failed', 'error': str(exc)})
        print(f'FAILED: {fp} -> {exc}')

ingestion_results

OK: C:\Researchaiagent\Data\A_New_Swarm_Intelligence_Coordination_Model_Inspir.pdf -> document_id=41
OK: C:\Researchaiagent\Data\Graph_databases.pdf -> document_id=42
OK: C:\Researchaiagent\Data\Holographic_data_storage_technology.pdf -> document_id=43
OK: C:\Researchaiagent\Data\Holographic_Storage.pdf -> document_id=44
OK: C:\Researchaiagent\Data\Large_language_Model.pdf -> document_id=45
OK: C:\Researchaiagent\Data\Neuromorphic_Memory.pdf -> document_id=46
OK: C:\Researchaiagent\Data\quantum inspired Retrival,searching.pdf -> document_id=47
OK: C:\Researchaiagent\Data\Speculative_RAG.pdf -> document_id=48
OK: C:\Researchaiagent\Data\Survey_SwarmIntelligence.pdf -> document_id=49
FAILED: C:\Researchaiagent\Data\Temporial_Knowledge_graph.pdf -> invalid byte sequence for encoding "UTF8": 0x00


[{'file': 'C:\\Researchaiagent\\Data\\A_New_Swarm_Intelligence_Coordination_Model_Inspir.pdf',
  'status': 'ok',
  'document_id': 41},
 {'file': 'C:\\Researchaiagent\\Data\\Graph_databases.pdf',
  'status': 'ok',
  'document_id': 42},
 {'file': 'C:\\Researchaiagent\\Data\\Holographic_data_storage_technology.pdf',
  'status': 'ok',
  'document_id': 43},
 {'file': 'C:\\Researchaiagent\\Data\\Holographic_Storage.pdf',
  'status': 'ok',
  'document_id': 44},
 {'file': 'C:\\Researchaiagent\\Data\\Large_language_Model.pdf',
  'status': 'ok',
  'document_id': 45},
 {'file': 'C:\\Researchaiagent\\Data\\Neuromorphic_Memory.pdf',
  'status': 'ok',
  'document_id': 46},
 {'file': 'C:\\Researchaiagent\\Data\\quantum inspired Retrival,searching.pdf',
  'status': 'ok',
  'document_id': 47},
 {'file': 'C:\\Researchaiagent\\Data\\Speculative_RAG.pdf',
  'status': 'ok',
  'document_id': 48},
 {'file': 'C:\\Researchaiagent\\Data\\Survey_SwarmIntelligence.pdf',
  'status': 'ok',
  'document_id': 49},
 {'

In [5]:
# Shut down db_manager once ingestion is finished. The call is awaited, so
# close() is asynchronous.
# NOTE(review): presumably this releases the connections initialised earlier
# in the notebook — confirm in app.core.database.
await db_manager.close()