In [None]:
#!pip install chromadb

Collecting chromadb
  Downloading chromadb-0.6.3-py3-none-any.whl (611 kB)
     ---------------------------------------- 0.0/611.1 kB ? eta -:--:--
     - ------------------------------------- 30.7/611.1 kB 1.4 MB/s eta 0:00:01
     - ------------------------------------- 30.7/611.1 kB 1.4 MB/s eta 0:00:01
     ----- ------------------------------- 92.2/611.1 kB 751.6 kB/s eta 0:00:01
     ----- ------------------------------- 92.2/611.1 kB 751.6 kB/s eta 0:00:01
     --------- -------------------------- 163.8/611.1 kB 756.6 kB/s eta 0:00:01
     ------------- ---------------------- 225.3/611.1 kB 986.4 kB/s eta 0:00:01
     ----------------------------- -------- 481.3/611.1 kB 1.6 MB/s eta 0:00:01
     ---------------------------------- --- 553.0/611.1 kB 1.5 MB/s eta 0:00:01
     -------------------------------------- 611.1/611.1 kB 1.6 MB/s eta 0:00:00
Collecting opentelemetry-instrumentation-fastapi>=0.41b0
  Downloading opentelemetry_instrumentation_fastapi-0.52b1-py3-none-any.whl


[notice] A new release of pip is available: 23.0.1 -> 25.0.1
[notice] To update, run: python.exe -m pip install --upgrade pip


# Just do the required functions

In [None]:
import json
import chromadb
from typing import Dict, Any


def load_json_from_file(file_path: str) -> Dict[str, Any]:
    """Load JSON data from a file."""
    try:
        with open(file_path, 'r', encoding='utf-8') as f:
            return json.load(f)
    except FileNotFoundError:
        raise Exception(f"File not found: {file_path}")
    except json.JSONDecodeError:
        raise Exception(f"Invalid JSON format: {file_path}")


class VectorDatabase:
    """Handles operations with Chroma vector database."""

    def __init__(self, storage_path: str = "./vector_storage"):
        self.client = chromadb.PersistentClient(path=storage_path)
        self.collection = self.client.get_or_create_collection(name="media_vectors")

    def add_data(self, data: Dict[str, Any]) -> Dict[str, Any]:
        """Add structured chunked data (with text + vector) to the DB."""
        try:
            base_id = f"{data['type']}_{data['scope']}_{hash(data['original_data'])}"

            for idx, chunk in enumerate(data.get("chunks", [])):
                chunk_text = chunk.get("text", "")
                chunk_embedding = chunk.get("embedding", [])

                if not chunk_embedding or not isinstance(chunk_embedding, list):
                    continue

                chunk_id = f"{base_id}_chunk_{idx}"
                chunk_metadata = {
                    "type": data["type"],
                    "scope": data["scope"],
                    "original_data": data["original_data"],
                    "timestamp": chunk.get("timestamp", ""),
                    "chunk_index": idx
                }

                self.collection.add(
                    embeddings=[chunk_embedding],
                    documents=[chunk_text],
                    metadatas=[chunk_metadata],
                    ids=[chunk_id]
                )

            return {"status": "success", "document_id": base_id}

        except Exception as e:
            return {"status": "error", "message": str(e)}


def main():
    db = VectorDatabase()
    file_path = 'json_output/Computational Intelligence and Neuroscience - 2018 - Voulodimos - Deep Learning for Computer Vision A Brief Review_20250331_203813.json'

    try:
        data = load_json_from_file(file_path)
        result = db.add_data(data)
        print(json.dumps({
            "status": result.get("status"),
            "document_id": result.get("document_id")
        }))
    except Exception as e:
        print(json.dumps({
            "status": "error",
            "message": str(e)
        }))


if __name__ == "__main__":
    main()


# Show the information in the database

In [2]:
import json
import chromadb
from typing import Dict, Any


def load_json_from_file(file_path: str) -> Dict[str, Any]:
    """Load JSON data from a file."""
    try:
        with open(file_path, 'r', encoding='utf-8') as f:
            return json.load(f)
    except FileNotFoundError:
        raise Exception(f"File not found: {file_path}")
    except json.JSONDecodeError:
        raise Exception(f"Invalid JSON format: {file_path}")


class VectorDatabase:
    """Handles operations with Chroma vector database."""

    def __init__(self, storage_path: str = "./vector_storage"):
        self.client = chromadb.PersistentClient(path=storage_path)
        self.collection = self.client.get_or_create_collection(name="media_vectors")

    def add_data(self, data: Dict[str, Any]) -> Dict[str, Any]:
        """Add structured chunked data (with text + vector) to the DB."""
        try:
            base_id = f"{data['type']}_{data['scope']}_{hash(data['original_data'])}"

            for idx, chunk in enumerate(data.get("chunks", [])):
                chunk_text = chunk.get("text", "")
                chunk_embedding = chunk.get("embedding", [])

                if not chunk_embedding or not isinstance(chunk_embedding, list):
                    continue

                chunk_id = f"{base_id}_chunk_{idx}"
                chunk_metadata = {
                    "type": data["type"],
                    "scope": data["scope"],
                    "original_data": data["original_data"],
                    "timestamp": chunk.get("timestamp", ""),
                    "chunk_index": idx
                }

                self.collection.add(
                    embeddings=[chunk_embedding],
                    documents=[chunk_text],
                    metadatas=[chunk_metadata],
                    ids=[chunk_id]
                )

            return {"status": "success", "document_id": base_id}

        except Exception as e:
            return {"status": "error", "message": str(e)}

    def show_contents(self) -> None:
        """Print all stored documents in the DB, including their full embedding vectors."""
        results = self.collection.get(include=['documents', 'metadatas', 'embeddings'])
        total = len(results["ids"])
        print(f"\nTotal items in database: {total}")
        for i, (id_, doc, meta, embed) in enumerate(zip(
            results["ids"], results["documents"], results["metadatas"], results["embeddings"]
        )):
            print(f"\nItem {i + 1}:")
            print(f"  ID: {id_}")
            print(f"  Text: {doc}")
            print(f"  Metadata: {json.dumps(meta)}")
            print(f"  Embedding Vector: {embed}")


def main():
    db = VectorDatabase()

    file_path = 'json_output/Computational Intelligence and Neuroscience - 2018 - Voulodimos - Deep Learning for Computer Vision A Brief Review_20250331_203813.json'

    try:
        data = load_json_from_file(file_path)
        result = db.add_data(data)
        print(json.dumps(result, indent=2))
        db.show_contents()
    except Exception as e:
        print(json.dumps({"status": "error", "message": str(e)}))


if __name__ == "__main__":
    main()


Add of existing embedding ID: pdf_research_paper_2467419777865860053_chunk_0
Insert of existing embedding ID: pdf_research_paper_2467419777865860053_chunk_0
Add of existing embedding ID: pdf_research_paper_2467419777865860053_chunk_1
Insert of existing embedding ID: pdf_research_paper_2467419777865860053_chunk_1
Add of existing embedding ID: pdf_research_paper_2467419777865860053_chunk_2
Insert of existing embedding ID: pdf_research_paper_2467419777865860053_chunk_2
Add of existing embedding ID: pdf_research_paper_2467419777865860053_chunk_3
Insert of existing embedding ID: pdf_research_paper_2467419777865860053_chunk_3
Add of existing embedding ID: pdf_research_paper_2467419777865860053_chunk_4
Insert of existing embedding ID: pdf_research_paper_2467419777865860053_chunk_4
Add of existing embedding ID: pdf_research_paper_2467419777865860053_chunk_5
Insert of existing embedding ID: pdf_research_paper_2467419777865860053_chunk_5
Add of existing embedding ID: pdf_research_paper_246741977

{
  "status": "success",
  "document_id": "pdf_research_paper_2467419777865860053"
}

Total items in database: 46

Item 1:
  ID: pdf_research_paper_2467419777865860053_chunk_0
  Text: Review Article Deep Learning for Computer Vision: A Brief Review Athanasios Voulodimos ,1,2 Nikolaos Doulamis,2 Anastasios Doulamis,2 and Eftychios Protopapadakis2 1Department of Informatics, Technological Educational Institute of Athens, 12210 Athens, Greece 2National Technical University of Athens, 15780 Athens, Greece Correspondence should be addressed to Athanasios Voulodimos; thanosv@mail.ntua.gr Received 17 June 2017; Accepted 27 November 2017; Published 1 February 2018 Academic Editor: Diego Andina Copyright © 2018 Athanasios Voulodimos et al. This is an open access article distributed under the Creative Commons Attribution License, which permits unrestricted use, distribution, and reproduction in any medium, provided the original work is properly cited. Over the last years deep learning methods ha