Because concepts and beliefs were extracted from individual documents, the graph contains duplicate Concept nodes that share a name but have separate origins and meanings.
We're going to restructure the graph by combining same-named Concept nodes into a new node label (Overloaded_Concept), kept apart from Belief and Concept.

In [None]:
import os
import logging
import sys
import pickle
import json
import time
from tqdm import tqdm
from neo4j import GraphDatabase
from dotenv import load_dotenv
from pinecone import Pinecone



# Load environment variables
# (must run before the os.getenv() calls below so values from .env are visible)
load_dotenv()

# Neo4j Config
# URI and password come from the environment; the user name is hard-coded.
NEO4J_URI = os.getenv("NEO4J_URI")
NEO4J_USER = "neo4j"
NEO4J_PASSWORD = os.getenv("NEO4J_PASSWORD")

# Pinecone setup
PINECONE_API_KEY = os.getenv("PINECONE_API_KEY")
# Falls back to "us-east1-gcp" when unset; not passed to the client created below.
PINECONE_ENVIRONMENT = os.getenv("PINECONE_ENVIRONMENT", "us-east1-gcp")
PINECONE_INDEX_NAME = "belief-embeddings"

# Initialize Pinecone
# Runs at cell/import time; `pc` and `index` become module-level globals.
pc = Pinecone(api_key=PINECONE_API_KEY)
index = pc.Index(PINECONE_INDEX_NAME)

# Script Constants
CHECKPOINT_DIR = "checkpoints"
# When True, reset_overloaded_nodes() deletes every Overloaded_Concept node.
RESET_OVERLOADED_NODES = False

# Create checkpoint directory
# exist_ok=True makes this safe to re-run.
os.makedirs(CHECKPOINT_DIR, exist_ok=True)

# Logging setup
# Log records go both to stdout and to LOG_FILE.
LOG_FILE = "concept_processing.log"
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s - %(levelname)s - %(message)s",
    handlers=[
        logging.StreamHandler(sys.stdout),
        logging.FileHandler(LOG_FILE)
    ]
)
logger = logging.getLogger(__name__)
logger.info("✅ Logging initialized.")

# Initialize Neo4j Connection
def init_neo4j():
    """Open a Neo4j driver and verify it with a trivial ``RETURN 1`` query.

    Returns:
        The connected driver. The caller owns it and is responsible for
        calling ``close()`` on it.

    Raises:
        ValueError: If ``NEO4J_URI`` is not set.
        Exception: If the probe query returns no row. Errors raised by the
            driver during the probe propagate unchanged.
    """
    if not NEO4J_URI:
        raise ValueError("⚠️ NEO4J_URI is not set. Check your .env file.")

    driver = GraphDatabase.driver(
        NEO4J_URI,
        auth=(NEO4J_USER, NEO4J_PASSWORD),
        max_connection_lifetime=3600
    )
    try:
        with driver.session() as session:
            connected = bool(session.run("RETURN 1").single())
    except Exception:
        # Release the driver before propagating so a failed probe does not
        # leave an open driver behind.
        driver.close()
        raise

    if not connected:
        driver.close()
        raise Exception("❌ Failed to connect to Neo4J.")

    logger.info("✅ Successfully connected to Neo4j")
    return driver

In [None]:
def reset_overloaded_nodes(driver):
    """Delete all Overloaded_Concept nodes when RESET_OVERLOADED_NODES is on.

    Does nothing when the toggle is off. Otherwise removes every
    ``Overloaded_Concept`` node together with its relationships and logs how
    many rows the delete query reported.

    Args:
        driver: An open Neo4j driver.
    """
    if not RESET_OVERLOADED_NODES:
        return

    purge_query = "MATCH (oc:Overloaded_Concept) DETACH DELETE oc RETURN count(*) as deleted"
    with driver.session() as session:
        removed = session.run(purge_query).single()["deleted"]
        logger.info(f"❌ Deleted {removed} Overloaded_Concept nodes.")

class ProcessTracker:
    """In-memory record of the IDs that have been handled so far."""

    def __init__(self):
        # Set of every ID passed to mark_processed().
        self.processed_ids = set()

    def mark_processed(self, vec_ids):
        """Add ``vec_ids`` to the processed set.

        Empty or ``None`` input is ignored.
        """
        if not vec_ids:
            return
        self.processed_ids |= set(vec_ids)

def find_duplicate_concepts(driver):
    """Find Concept nodes whose ``concept`` value occurs more than once.

    Args:
        driver: An open Neo4j driver.

    Returns:
        dict: Maps each duplicated concept name to a list of maps, one per
        Concept node, each carrying ``id``, ``sep_id`` and ``document_title``.
    """
    with driver.session() as session:
        rows = session.run(
            """
            MATCH (c:Concept)
            WITH c.concept as concept_name, collect(c) as concepts
            WHERE size(concepts) > 1
            RETURN concept_name, [c IN concepts | {
                id: c.id, 
                sep_id: c.sep_id, 
                document_title: c.document_title
            }] as concept_instances
            """
        )
        # Consume the result while the session is still open.
        grouped = {row["concept_name"]: row["concept_instances"] for row in rows}

    logger.info(f"🔍 Found {len(grouped)} concepts with duplicates")
    return grouped

def merge_duplicate_concepts(driver, duplicate_concepts, tracker):
    """Replace each group of same-named Concept nodes with one Overloaded_Concept.

    For every concept name, an ``Overloaded_Concept`` node is merged by name
    and given the collected ``sep_ids``, ``document_titles`` and ``vector_ids``
    of the originals; the original Concept nodes are then removed with
    ``DETACH DELETE`` (their relationships are deleted with them; this
    function does not re-create them on the new node).

    The create and the delete run inside a single explicit transaction per
    concept, so a failure in either step rolls back both and the graph never
    ends up with the merged node *and* the originals.

    Args:
        driver: An open Neo4j driver.
        duplicate_concepts: Mapping of concept name to a list of instance
            maps with ``id``, ``sep_id`` and ``document_title`` keys.
        tracker: Object with a ``mark_processed(ids)`` method; it receives the
            IDs of all successfully merged Concept nodes once at the end.

    Returns:
        int: Number of concept names that were merged successfully.
    """
    total_merged = 0
    processed_ids = set()
    with tqdm(total=len(duplicate_concepts), desc="Merging duplicate concepts") as pbar:
        with driver.session() as session:
            for concept_name, instances in duplicate_concepts.items():
                try:
                    ids = [instance["id"] for instance in instances]
                    sep_ids = [instance["sep_id"] for instance in instances]
                    doc_titles = [instance["document_title"] for instance in instances]

                    # Leaving the block without reaching commit() (i.e. on an
                    # exception) discards both statements.
                    with session.begin_transaction() as tx:
                        tx.run(
                            """
                            MERGE (oc:Overloaded_Concept {name: $name})
                            SET oc.type = "overloaded_concept",
                                oc.sep_ids = $sep_ids,
                                oc.document_titles = $doc_titles,
                                oc.vector_ids = $vector_ids
                            """,
                            name=concept_name,
                            sep_ids=sep_ids,
                            doc_titles=doc_titles,
                            vector_ids=ids
                        )
                        delete_result = tx.run(
                            """
                            MATCH (c:Concept)
                            WHERE c.id IN $ids
                            DETACH DELETE c
                            RETURN count(*) as deleted
                            """,
                            ids=ids
                        )
                        # Read the result before committing; it is tied to the transaction.
                        deleted = delete_result.single()["deleted"]
                        tx.commit()

                    logger.info(f"🔄 Merged '{concept_name}' from {len(ids)} concepts, deleted {deleted} nodes")
                    processed_ids.update(ids)
                    total_merged += 1
                except Exception as e:
                    # Best effort: one failing concept must not stop the rest.
                    logger.error(f"❌ Error processing '{concept_name}': {str(e)}")
                pbar.update(1)
    tracker.mark_processed(list(processed_ids))
    logger.info(f"✅ Successfully merged {total_merged} duplicate concepts")
    return total_merged

In [None]:
def main():
    """Run the Neo4j de-duplication: optional reset, find duplicates, merge them.

    Everything that uses the driver runs inside the ``try`` so the driver is
    closed even if the optional reset step fails. Errors are logged rather
    than re-raised, matching the existing handling for the merge step.
    """
    driver = init_neo4j()
    tracker = ProcessTracker()
    try:
        reset_overloaded_nodes(driver)
        logger.info("🔍 Searching for duplicate concepts across the database...")
        duplicate_concepts = find_duplicate_concepts(driver)
        if duplicate_concepts:
            merge_duplicate_concepts(driver, duplicate_concepts, tracker)
        else:
            logger.info("ℹ️ No duplicate concepts found in the database")
    except Exception as e:
        logger.error(f"❌ Unexpected error: {str(e)}")
    finally:
        driver.close()
        logger.info("✅ Script execution completed")

if __name__ == "__main__":
    main()


We need to do the same thing for Pinecone: for each Overloaded_Concept node created above, merge its original vectors into a single vector.

In [None]:
# Cell 1: Imports & Configurations

import os
import logging
import sys
import pickle
import json
import time
from tqdm import tqdm
from pinecone import Pinecone
from neo4j import GraphDatabase
from dotenv import load_dotenv

# Load environment variables
# (must run before the os.getenv() calls below so values from .env are visible)
load_dotenv()

# Pinecone Config
PINECONE_API_KEY = os.getenv("PINECONE_API_KEY")
# Falls back to "us-east1-gcp" when the variable is unset.
PINECONE_ENVIRONMENT = os.getenv("PINECONE_ENVIRONMENT", "us-east1-gcp")
PINECONE_INDEX_NAME = "belief-embeddings"

# Neo4j Config
# URI and password come from the environment; the user name is hard-coded.
NEO4J_URI = os.getenv("NEO4J_URI")
NEO4J_USER = "neo4j"
NEO4J_PASSWORD = os.getenv("NEO4J_PASSWORD")

# Script Constants
FETCH_BATCH_SIZE = 100
CHECKPOINT_DIR = "checkpoints"
FAILED_UPLOADS_FILE = "failed_uploads.json"

# Toggles
RESET_PROCESSED_IDS = False  # Toggle to reset the processed IDs file

# Create checkpoint directory
# exist_ok=True makes this safe to re-run.
os.makedirs(CHECKPOINT_DIR, exist_ok=True)

# Logging setup
LOG_FILE = "pinecone_overloaded_processing.log"

# With the reset toggle on, remove the pickled checkpoint so the tracker
# starts from an empty processed set.
if RESET_PROCESSED_IDS:
    processed_path = os.path.join(CHECKPOINT_DIR, "processed_ids.pkl")
    if os.path.exists(processed_path):
        os.remove(processed_path)

# NOTE(review): logging.basicConfig() is a no-op when the root logger already
# has handlers (e.g. an earlier cell configured logging in the same kernel),
# in which case LOG_FILE below is not used — confirm this is acceptable.
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s - %(levelname)s - %(message)s",
    handlers=[
        logging.StreamHandler(sys.stdout),
        logging.FileHandler(LOG_FILE)
    ]
)
logger = logging.getLogger(__name__)
logger.info("✅ Logging initialized.")



In [None]:

# Cell 2: Initialize Pinecone & Neo4j Connections

def init_pinecone():
    """
    Create the Pinecone client, open the configured index and log its size.

    Returns:
        tuple: ``(index, pc)`` — the index handle first, then the client.
    """
    client = Pinecone(api_key=PINECONE_API_KEY, environment=PINECONE_ENVIRONMENT)
    handle = client.Index(PINECONE_INDEX_NAME)
    # The stats call doubles as a connectivity check.
    vector_total = handle.describe_index_stats().get('total_vector_count', 0)
    logger.info(f"✅ Connected to Pinecone. Total vectors: {vector_total}")
    return handle, client

def init_neo4j():
    """Initialize Neo4j connection with retry logic.

    Makes up to three attempts, each creating a driver and probing it with
    ``RETURN 1``. A driver whose probe fails is closed before the next
    attempt so failed attempts do not leave open drivers behind. The wait
    between attempts grows exponentially (1s, 2s); there is no wait after the
    final attempt.

    Returns:
        The connected driver. The caller owns it and must ``close()`` it.

    Raises:
        ValueError: If ``NEO4J_URI`` is not set.
        Exception: If every attempt fails; the last underlying error, if any,
            is attached as the cause.
    """
    if not NEO4J_URI:
        raise ValueError("⚠️ NEO4J_URI is not set. Check your .env file.")

    max_attempts = 3
    last_error = None
    for attempt in range(max_attempts):
        driver = None
        try:
            driver = GraphDatabase.driver(
                NEO4J_URI,
                auth=(NEO4J_USER, NEO4J_PASSWORD),
                max_connection_lifetime=3600
            )
            with driver.session() as session:
                if session.run("RETURN 1").single():
                    logger.info("✅ Successfully connected to Neo4j")
                    return driver
            # Probe returned no row: treat as a failed attempt.
            driver.close()
        except Exception as e:
            last_error = e
            if driver is not None:
                driver.close()
            logger.warning(f"⚠️ Connection attempt {attempt + 1} failed: {str(e)}")

        # Back off only when another attempt will follow.
        if attempt < max_attempts - 1:
            time.sleep(2 ** attempt)

    raise Exception("❌ Failed to connect to Neo4J after multiple attempts.") from last_error

# Module-level handles used as globals by the functions in the cells below.
# Both calls perform network I/O when this cell runs.
index, pc = init_pinecone()
driver = init_neo4j()


In [None]:

# Cell 3: Process Tracker (Checkpoints)

class ProcessTracker:
    """Tracks processed vector IDs and persists them as a pickle checkpoint.

    The checkpoint lives at ``<CHECKPOINT_DIR>/processed_ids.pkl``. It is
    loaded on construction (unless ``RESET_PROCESSED_IDS`` is set) and
    rewritten on every ``mark_processed`` call that adds IDs.
    """

    def __init__(self):
        self.processed_ids = set()
        # Not read or written by any method of this class.
        self.failed_ids = set()
        self.load_checkpoint()

    def load_checkpoint(self):
        """Load processed IDs from disk unless reset is enabled.

        Load errors are logged and leave ``processed_ids`` unchanged.
        """
        processed_path = os.path.join(CHECKPOINT_DIR, "processed_ids.pkl")

        if RESET_PROCESSED_IDS or not os.path.exists(processed_path):
            return

        try:
            # NOTE: unpickling can execute arbitrary code; only load
            # checkpoint files this script wrote itself.
            with open(processed_path, "rb") as f:
                loaded = pickle.load(f)
            # Normalise to a set so membership tests and update() always work.
            self.processed_ids = set(loaded)
            logger.info(f"🔄 Loaded {len(self.processed_ids)} processed IDs.")
        except Exception as e:
            logger.error(f"⚠️ Error loading processed IDs: {str(e)}")

    def save_checkpoint(self):
        """Save processed IDs to disk.

        The data is written to a temporary sibling file and then renamed over
        the checkpoint, so an interrupted write cannot leave a truncated
        pickle behind. Errors are logged, not raised.
        """
        processed_path = os.path.join(CHECKPOINT_DIR, "processed_ids.pkl")
        tmp_path = processed_path + ".tmp"
        try:
            with open(tmp_path, "wb") as f:
                pickle.dump(self.processed_ids, f)
            os.replace(tmp_path, processed_path)
            logger.info(f"✅ Saved {len(self.processed_ids)} processed IDs.")
        except Exception as e:
            logger.error(f"⚠️ Error saving processed IDs: {str(e)}")

    def mark_processed(self, vec_ids):
        """Record ``vec_ids`` as processed and persist the checkpoint.

        Empty or ``None`` input is ignored (nothing is written).
        """
        if not vec_ids:
            return
        self.processed_ids.update(vec_ids)
        self.save_checkpoint()

    def get_unprocessed(self, all_concepts, resume_fraction=0.95):
        """
        Returns only unprocessed overloaded concepts.

        WARNING: the first ``resume_fraction`` of ``all_concepts`` is skipped
        unconditionally, whether or not those concepts are in the checkpoint.
        Pass ``resume_fraction=0`` to consider every concept.

        Args:
            all_concepts: List of dicts, each with a ``"vector_ids"`` entry.
            resume_fraction: Fraction (clamped to 0..1) of the list to skip
                from the front. Defaults to 0.95.

        Returns:
            list: Concepts at or after the resume point with at least one
            vector ID that is not in ``processed_ids``. Concepts whose
            ``vector_ids`` is empty or ``None`` have nothing left to process
            and are omitted.
        """
        fraction = min(max(resume_fraction, 0.0), 1.0)
        start_index = int(len(all_concepts) * fraction)
        return [
            concept for concept in all_concepts[start_index:]
            if any(vid not in self.processed_ids for vid in (concept["vector_ids"] or ()))
        ]


# Module-level tracker used by main(); constructing it loads any existing checkpoint.
tracker = ProcessTracker()

In [None]:

# Cell 4: Fetch Overloaded Concepts from Neo4j

def fetch_overloaded_concepts():
    """
    Read every Overloaded_Concept node from Neo4j via the module-level driver.

    Returns:
        list: One dict per node with the keys ``name``, ``sep_ids``,
        ``document_titles`` and ``vector_ids``.
    """
    fields = ("name", "sep_ids", "document_titles", "vector_ids")
    with driver.session() as session:
        rows = session.run(
            """
            MATCH (oc:Overloaded_Concept)
            RETURN oc.name AS name, oc.sep_ids AS sep_ids, oc.document_titles AS document_titles, oc.vector_ids AS vector_ids
            """
        )
        # Materialise while the session is still open.
        return [{field: row[field] for field in fields} for row in rows]



In [None]:
import numpy as np 
# Cell 5: Process Overloaded Concepts in Pinecone

def _sep_id_to_str(sep_id):
    """Render one sep_id as a string, using its integer form when it has one."""
    try:
        return str(int(sep_id))
    except (TypeError, ValueError):
        return str(sep_id)


def process_overloaded_concepts(overloaded_concepts):
    """
    Process Overloaded_Concepts by selecting a master vector, creating a new merged vector,
    and deleting old redundant vectors.

    For each concept the original vectors are fetched from Pinecone, the
    first one returned supplies the embedding for a new "overloaded" vector,
    and the originals are queued for deletion. Per batch of 100 concepts the
    new vectors are upserted first and the old ones deleted afterwards, so an
    upsert failure (which propagates) never loses data. After a batch has
    been uploaded and cleaned up, its vector IDs are recorded on the
    module-level ``tracker`` so a later run can skip them.

    Uses the module-level ``index``, ``tracker`` and ``logger``.

    Args:
        overloaded_concepts: List of dicts with ``name``, ``sep_ids``,
            ``document_titles`` and ``vector_ids`` keys. May be empty/None.

    Returns:
        None
    """
    # Function-scope import keeps this cell self-contained.
    import hashlib

    if not overloaded_concepts:
        return

    # Separate names: reusing one variable for both sizes made later concept
    # slices 1000 wide, overlapping with the following iterations.
    concept_batch_size = 100
    delete_batch_size = 1000
    processed_total = 0
    batch_counter = 0

    for i in range(0, len(overloaded_concepts), concept_batch_size):
        batch = overloaded_concepts[i : i + concept_batch_size]
        new_vectors = []
        delete_vectors = []
        merged_source_ids = []

        for concept in batch:
            name = concept["name"]
            vector_ids = concept["vector_ids"] or []
            sep_ids = concept["sep_ids"] or []
            document_titles = concept["document_titles"]

            if not vector_ids:
                logger.warning(f"⚠️ No valid vectors found for Overloaded_Concept: {name}")
                continue

            # Fetch existing vectors from Pinecone
            response = index.fetch(vector_ids)
            valid_vectors = dict(response.vectors)

            if not valid_vectors:
                logger.warning(f"⚠️ No valid vectors found for Overloaded_Concept: {name}")
                continue

            # Select the first returned vector as the master vector
            master_embedding = next(iter(valid_vectors.values())).values

            # Built-in hash() of a str is salted per process, so it would give
            # a different ID on every run; a digest keeps the ID reproducible.
            name_digest = int(hashlib.sha256(str(name).encode("utf-8")).hexdigest(), 16) % 1000000
            id_prefix = sep_ids[0] if sep_ids else "unknown"
            new_vector_id = f"{id_prefix}_overloaded_concept_{name_digest}"

            new_metadata = {
                "concept": name,
                # List of strings; missing (None) entries are dropped.
                "sep_ids": [_sep_id_to_str(s) for s in sep_ids if s is not None],
                "document_titles": document_titles,
                "original_vector_ids": vector_ids
            }

            # Accept either a plain list or an array-like exposing tolist().
            if hasattr(master_embedding, "tolist"):
                embedding_values = master_embedding.tolist()
            else:
                embedding_values = list(master_embedding)

            new_vectors.append((new_vector_id, embedding_values, new_metadata))

            # Mark old vectors for deletion (only delete after successful upload)
            delete_vectors.extend(valid_vectors.keys())
            merged_source_ids.extend(vector_ids)

        # Upload new vectors in batch
        if new_vectors:
            index.upsert(vectors=new_vectors)
            logger.info(f"✅ Uploaded {len(new_vectors)} new overloaded vectors.")

        # Delete old vectors in batches of 1000
        for j in range(0, len(delete_vectors), delete_batch_size):
            batch_delete = delete_vectors[j : j + delete_batch_size]
            index.delete(ids=batch_delete)
            logger.info(f"❌ Deleted {len(batch_delete)} redundant vectors. Total so far: {j + len(batch_delete)}/{len(delete_vectors)}")

        # Checkpoint only after both the upload and the cleanup succeeded.
        if merged_source_ids:
            tracker.mark_processed(merged_source_ids)

        processed_total += len(batch)
        batch_counter += 1
        if batch_counter % 2 == 0:  # Log every 200 concepts
            logger.info(f"📢 Processed {processed_total} overloaded concepts so far.")

In [None]:

# Cell 6: Main Execution

def main():
    """Fetch the overloaded concepts, filter to the pending ones, and process them.

    Note: ``resume_fraction=5/6`` makes the tracker consider only the last
    sixth of the fetched list.
    """
    concepts = fetch_overloaded_concepts()
    logger.info(f"📌 Found {len(concepts)} overloaded concepts in Neo4j.")

    pending = tracker.get_unprocessed(concepts, resume_fraction=5/6)
    logger.info(f"🔎 {len(pending)} overloaded concepts remain unprocessed.")

    process_overloaded_concepts(pending)


if __name__ == "__main__":
    main()


In [None]:
# Debug probe: ask for the single nearest neighbour of a constant 768-dim vector.
# NOTE(review): the query has no filter, so the hit is not necessarily an
# "overloaded" vector despite the message below — confirm intent.
response = index.query(vector=[0.1] * 768, top_k=1, include_metadata=True)

if not response.matches:
    print("No overloaded vectors found.")
else:
    print("Found overloaded concept vector:", response.matches[0].id)


Some debug stuff. I need to get rid of some remaining redundant Pinecone vectors.

In [None]:
import os
import pickle
import logging
from tqdm import tqdm

# Set up logging
# NOTE(review): basicConfig() does nothing if the root logger was already
# configured by an earlier cell in the same kernel.
logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s")
logger = logging.getLogger(__name__)

# Pinecone Index (assuming `index` is already initialized)
# Pickle file that receives the flat list of vector IDs.
VECTOR_IDS_FILE = "vector_ids.pkl"

def fetch_and_save_vector_ids():
    """List every vector ID in the Pinecone index and pickle the flat list.

    Uses the module-level ``index``. The result is written to
    ``VECTOR_IDS_FILE``; nothing is returned.
    """
    logger.info("🔍 Fetching all vector IDs from Pinecone...")

    # Reported total, logged for comparison with the number of IDs retrieved.
    stats = index.describe_index_stats()
    total_vectors = stats["total_vector_count"]
    logger.info(f"📌 Total vectors in Pinecone: {total_vectors}")

    # index.list() may yield either single IDs or lists of IDs; flatten both.
    vector_ids = []
    for entry in index.list():
        if isinstance(entry, list):
            vector_ids.extend(entry)
        else:
            vector_ids.append(entry)

    logger.info(f"✅ Retrieved {len(vector_ids)} vector IDs.")

    with open(VECTOR_IDS_FILE, "wb") as out_file:
        pickle.dump(vector_ids, out_file)
    logger.info(f"💾 Saved vector IDs to {VECTOR_IDS_FILE}.")

# Run first cell
# Executes immediately when the cell runs: queries Pinecone and writes VECTOR_IDS_FILE.
fetch_and_save_vector_ids()


In [None]:
import pickle
import time
from collections import defaultdict

# Pinecone Index (assuming `index` is already initialized)
# Pickle holding the set of vector IDs already checked (used to resume).
CHECKPOINT_FILE = "processed_vector_ids.pkl"
# Pickle holding the list of all vector IDs, read by load_vector_ids().
VECTOR_IDS_FILE = "vector_ids.pkl"
# Number of IDs sent per index.fetch() call.
BATCH_SIZE = 500  # 🔧 Reduced batch size to avoid URI limits

# Load vector IDs from file
def load_vector_ids():
    """Read and return the pickled vector IDs stored in ``VECTOR_IDS_FILE``."""
    with open(VECTOR_IDS_FILE, "rb") as source:
        stored_ids = pickle.load(source)
    return stored_ids

# Load or initialize processed vector IDs
def load_checkpoint():
    """Return the previously processed vector IDs, or an empty set if none were saved.

    Reads ``CHECKPOINT_FILE`` when it exists so a run can resume without
    repeating work.
    """
    if not os.path.exists(CHECKPOINT_FILE):
        return set()

    with open(CHECKPOINT_FILE, "rb") as source:
        restored = pickle.load(source)
    return restored

def save_checkpoint(processed_vectors):
    """Save processed vector IDs.

    The data is pickled to a temporary sibling file that is then renamed over
    ``CHECKPOINT_FILE``. This is called after every batch, so an interrupted
    write must not leave a truncated pickle that ``load_checkpoint`` would
    fail to read on the next run.

    Args:
        processed_vectors: Collection of processed vector IDs to persist.
    """
    tmp_path = f"{CHECKPOINT_FILE}.tmp"
    with open(tmp_path, "wb") as f:
        pickle.dump(processed_vectors, f)
    os.replace(tmp_path, CHECKPOINT_FILE)

def count_redundant_vector_ids():
    """
    Identify and count redundant vector IDs in Pinecone.

    Loads the saved ID list, drops IDs already recorded in the checkpoint,
    and fetches the rest from the module-level ``index`` in batches of
    ``BATCH_SIZE``. Each failing fetch is retried a bounded number of times;
    a batch that still fails is logged and left out of the checkpoint so a
    later run picks it up again.

    NOTE(review): the fetch response is a mapping keyed by ID, so an ID is
    counted at most once per batch. A count above 1 therefore only arises
    when the same ID appears in more than one batch of the saved list, and
    IDs checkpointed by an earlier run are not counted at all — confirm this
    matches the intended meaning of "redundant".

    Returns:
        dict: Vector ID -> occurrence count, for IDs counted more than once.
    """
    max_attempts = 3
    retry_delay_seconds = 2

    vector_counts = defaultdict(int)  # Track occurrences of each vector ID
    processed_vectors = load_checkpoint()  # Load already processed IDs
    all_vector_ids = load_vector_ids()  # Load saved vector IDs

    logger.info(f"🔎 Total vector IDs: {len(all_vector_ids)}")

    # Filter out already processed vectors
    unprocessed_vector_ids = [vid for vid in all_vector_ids if vid not in processed_vectors]
    logger.info(f"🔎 {len(unprocessed_vector_ids)} vector IDs left to process.")

    # Process in batches
    for i in range(0, len(unprocessed_vector_ids), BATCH_SIZE):
        batch_ids = unprocessed_vector_ids[i : i + BATCH_SIZE]
        batch_number = i // BATCH_SIZE + 1

        response = None
        for attempt in range(1, max_attempts + 1):
            try:
                response = index.fetch(ids=batch_ids)
                break
            except Exception as e:
                logger.error(f"❌ Error processing batch {batch_number}: {str(e)}")
                if attempt < max_attempts:
                    time.sleep(retry_delay_seconds)  # Pause before retrying

        if response is None:
            # Not checkpointed, so the batch is retried on the next run.
            logger.error(f"❌ Giving up on batch {batch_number} after {max_attempts} attempts.")
            continue

        for vector_id in response.vectors:
            vector_counts[vector_id] += 1  # Count occurrences of each vector ID

        # Save progress; a failed checkpoint write must not abort the scan.
        processed_vectors.update(batch_ids)
        try:
            save_checkpoint(processed_vectors)
        except Exception as e:
            logger.error(f"❌ Error processing batch {batch_number}: {str(e)}")

        logger.info(f"✅ Processed {i + len(batch_ids)}/{len(unprocessed_vector_ids)} vector IDs...")

    # Identify redundant vector IDs (appearing more than once)
    redundant_vector_ids = {k: v for k, v in vector_counts.items() if v > 1}
    logger.info(f"🔎 Found {len(redundant_vector_ids)} redundant vector IDs.")

    return redundant_vector_ids

# Run second cell
# Executes immediately when the cell runs; the result maps each redundant ID to its count.
redundant_vector_ids = count_redundant_vector_ids()
