Constructs an undirected similarity graph.
A second LLM pass will add direction.

In [None]:
import os
import logging
import sys
import pickle
import time
from concurrent.futures import ThreadPoolExecutor, as_completed

from tqdm import tqdm
from neo4j import GraphDatabase
from pinecone import Pinecone
from dotenv import load_dotenv

# Load environment variables (python-dotenv) so the os.getenv() lookups
# below can see values kept in a local .env file.
load_dotenv()

# Neo4j Config
NEO4J_URI = os.getenv("NEO4J_URI")
NEO4J_USER = "neo4j"
NEO4J_PASSWORD = os.getenv("NEO4J_PASSWORD")

# Pinecone setup
PINECONE_API_KEY = os.getenv("PINECONE_API_KEY")
PINECONE_INDEX_NAME = "belief-embeddings"

# Script Constants
CHECKPOINT_DIR = "checkpoints"  # Created below; not otherwise referenced in this cell.
RESET_LOGS = False  # True: delete the previous log file before logging starts.
RESET_CONNECTIONS = False  # True: reset_connections() deletes every SIMILAR_TO relationship.
BATCH_SIZE = 100  # Not referenced in this cell.
MAX_RETRIES = 5  # Maximum attempts for one Neo4j relationship write.
RETRY_DELAY = 5  # Not referenced in this cell.

# Create checkpoint directory
os.makedirs(CHECKPOINT_DIR, exist_ok=True)

# Logging setup
LOG_FILE = "neo4j_pinecone_similarity.log"
if RESET_LOGS and os.path.exists(LOG_FILE):
    os.remove(LOG_FILE)

logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s - %(levelname)s - %(message)s",
    handlers=[
        logging.StreamHandler(sys.stdout),
        logging.FileHandler(LOG_FILE),
    ],
    # Without force=True, basicConfig() does nothing when the root logger
    # already has handlers (e.g. the cell is re-run, or another cell in the
    # same kernel configured logging first), so this file handler would
    # never be installed.
    force=True,
)
logger = logging.getLogger(__name__)
logger.info("✅ Logging initialized.")


def init_neo4j():
    """Create a Neo4j driver and verify it with a trivial ``RETURN 1`` query.

    Returns:
        The connected driver. The caller is responsible for closing it.

    Raises:
        Exception: If the probe query yields no record. Errors raised by the
            driver itself (bad URI, authentication, network) propagate
            unchanged.
    """
    driver = GraphDatabase.driver(
        NEO4J_URI, auth=(NEO4J_USER, NEO4J_PASSWORD), max_connection_lifetime=3600
    )
    try:
        with driver.session() as session:
            connected = bool(session.run("RETURN 1").single())
    except Exception:
        # Do not leak the driver's connection pool when the probe fails.
        driver.close()
        raise

    if not connected:
        driver.close()
        raise Exception("❌ Failed to connect to Neo4j.")

    logger.info("✅ Connected to Neo4j")
    return driver

def reset_connections(driver):
    """Delete all SIMILAR_TO relationships when RESET_CONNECTIONS is enabled.

    Relationships are removed in batches of up to 10000 per query; the loop
    ends once a batch reports zero deletions. Does nothing when the
    RESET_CONNECTIONS flag is false.

    Args:
        driver: Open Neo4j driver.
    """
    if not RESET_CONNECTIONS:
        return

    with driver.session() as session:
        while True:
            batch = session.run("""
                    MATCH ()-[r:SIMILAR_TO]->() 
                    WITH r LIMIT 10000
                    DELETE r 
                    RETURN count(r) as deleted
                """)
            deleted = batch.single()["deleted"]
            if deleted != 0:
                logger.info(f"❌ Deleted {deleted} relationships...")
                continue
            # An empty batch means nothing is left to delete.
            break

def init_pinecone():
    """Open the Pinecone index named by PINECONE_INDEX_NAME.

    Index statistics are fetched once and the total vector count is logged.

    Returns:
        The Pinecone index handle.
    """
    index = Pinecone(api_key=PINECONE_API_KEY).Index(PINECONE_INDEX_NAME)
    stats = index.describe_index_stats()
    logger.info(f"✅ Connected to Pinecone. Total vectors: {stats.get('total_vector_count', 0)}")
    return index

def fetch_nodes(driver, node_type):
    """Return ``(id, sep_id)`` pairs for every node with the given label.

    Args:
        driver: Open Neo4j driver.
        node_type: Node label; it is interpolated directly into the Cypher
            text, so it must be a trusted label name.

    Returns:
        A list of ``(id, sep_id)`` tuples, one per matching node.
    """
    cypher = f"MATCH (n:{node_type}) RETURN n.id AS id, n.sep_id AS sep_id"
    pairs = []
    with driver.session() as session:
        for record in session.run(cypher):
            pairs.append((record["id"], record["sep_id"]))
    return pairs

def fetch_all_concepts(driver):
    """Return ``(id, sep_id)`` pairs for all Concept and Overloaded_Concept nodes.

    Both labels are fetched in a single query joined with ``UNION``.

    Args:
        driver: Open Neo4j driver.

    Returns:
        A list of ``(id, sep_id)`` tuples.
    """
    concepts = []
    with driver.session() as session:
        records = session.run("""
            MATCH (c:Concept) RETURN c.id AS id, c.sep_id AS sep_id
            UNION
            MATCH (c:Overloaded_Concept) RETURN c.id AS id, c.sep_id AS sep_id
        """)
        for record in records:
            concepts.append((record["id"], record["sep_id"]))
    return concepts

def _best_match(index, node_id, metadata_filter):
    """Return the top-scoring ``(id, score)`` match as a 0- or 1-element list."""
    response = index.query(
        id=node_id,
        top_k=20,
        include_values=False,
        include_metadata=True,
        filter=metadata_filter,
    )
    return [
        (match["id"], match["score"]) for match in response.get("matches", [])
    ][:1]


def query_pinecone(index, node_id, sep_id):
    """Find the most similar belief and the most similar concept for a node.

    Two filtered similarity queries are issued against the vector stored
    under ``node_id``: one restricted to ``type == "belief"`` and one to
    ``type`` in ``{"concept", "overloaded_concept"}``. Both exclude vectors
    whose ``sep_id`` metadata equals ``sep_id``.

    A query that already produced a match is not repeated on later attempts.
    Query errors are logged and retried with exponential backoff instead of
    propagating.

    Args:
        index: Pinecone index handle.
        node_id: Id of the vector to use as the query.
        sep_id: ``sep_id`` value whose vectors must be excluded.

    Returns:
        A list of ``(match_id, score)`` tuples: the best belief (if any)
        followed by the best concept (if any). May hold fewer than two
        entries, or be empty, when all attempts fail.
    """
    retries = 3  # Number of attempts
    belief_matches = []
    concept_matches = []

    for attempt in range(retries):
        try:
            if not belief_matches:
                # Only beliefs, exclude same SEP
                belief_matches = _best_match(
                    index, node_id, {"type": "belief", "sep_id": {"$ne": sep_id}}
                )
            if not concept_matches:
                # Concepts and overloaded concepts, exclude same SEP
                concept_matches = _best_match(
                    index,
                    node_id,
                    {"type": {"$in": ["concept", "overloaded_concept"]}, "sep_id": {"$ne": sep_id}},
                )
        except Exception as e:
            logger.error(f"❌ Error querying Pinecone for {node_id}: {str(e)}")

        if belief_matches and concept_matches:
            return belief_matches + concept_matches

        logger.warning(f"⚠️ Retry {attempt + 1}/{retries}: No valid belief or concept found for {node_id}")
        if attempt < retries - 1:
            # Back off before retrying; no point sleeping after the last attempt.
            time.sleep(2 ** attempt)

    logger.error(f"❌ Failed to find valid connections for {node_id} after {retries} attempts")
    return belief_matches + concept_matches  # Return whatever was found (could be empty)



def add_relationships(driver, node_id, relationships):
    """Write SIMILAR_TO relationships from ``node_id`` to its matches.

    The write is skipped (and an error logged) unless at least two matches
    are supplied. Failed writes are retried up to MAX_RETRIES times with
    exponential backoff.

    Args:
        driver: Open Neo4j driver.
        node_id: ``id`` property of the source node.
        relationships: List of ``(target_id, score)`` tuples; ``None`` is
            treated as an empty list.

    Returns:
        True if the write committed; False if it was skipped or every
        attempt failed.
    """
    relationships = relationships or []
    if len(relationships) < 2:
        logger.error(f"❌ Node {node_id} only received {len(relationships)} connections!")
        return False

    formatted_relationships = [
        {"idA": node_id, "idB": rel[0], "score": rel[1]} for rel in relationships
    ]

    def write_relationships(tx):
        # MERGE on the bare pattern and SET the weight afterwards. Merging on
        # {weight: ...} only matches an edge with the exact same score, so a
        # re-run with a slightly different score would add a duplicate edge.
        # NOTE(review): the MATCH clauses carry no label, so they cannot use
        # a label-scoped index on `id` - confirm this is acceptable.
        tx.run(
            """
            UNWIND $data AS rel
            MATCH (a {id: rel.idA})
            MATCH (b {id: rel.idB})
            MERGE (a)-[r:SIMILAR_TO]->(b)
            SET r.weight = rel.score
            """,
            data=formatted_relationships,
        ).consume()

    for attempt in range(1, MAX_RETRIES + 1):
        try:
            with driver.session() as session:
                session.execute_write(write_relationships)
            return True
        except Exception as e:
            logger.error(f"⚠️ Error inserting relationships (attempt {attempt}): {e}")
            if attempt < MAX_RETRIES:
                time.sleep(2 ** attempt)

    logger.error(f"❌ Giving up on {node_id} after {MAX_RETRIES} failed insert attempts")
    return False

def process_nodes(driver, index, node_type):
    """Query Pinecone and write relationships for every node of one kind.

    Nodes are processed concurrently on a pool of 8 worker threads.

    Args:
        driver: Open Neo4j driver.
        index: Pinecone index handle.
        node_type: ``"Belief"`` loads nodes through ``fetch_nodes``; any
            other value loads Concept and Overloaded_Concept nodes through
            ``fetch_all_concepts``.
    """
    logger.info(f"🔄 Fetching all {node_type} nodes...")
    node_data = fetch_nodes(driver, node_type) if node_type == "Belief" else fetch_all_concepts(driver)

    def process_node(node_id, sep_id):
        connections = query_pinecone(index, node_id, sep_id)
        add_relationships(driver, node_id, connections)
        return node_id

    with ThreadPoolExecutor(max_workers=8) as executor:
        futures = {executor.submit(process_node, node, sep): node for node, sep in node_data}

        for future in as_completed(futures):
            node_id = futures[future]
            try:
                # result() re-raises any exception from the worker; without
                # this call a failed node would be silently reported as
                # processed.
                future.result()
            except Exception as e:
                logger.error(f"❌ Error processing {node_id}: {str(e)}")
            else:
                logger.info(f"📢 Processed node: {node_id}")

def main():
    """Build SIMILAR_TO relationships for all beliefs, then for all concepts."""
    driver = init_neo4j()
    try:
        index = init_pinecone()
        reset_connections(driver)

        process_nodes(driver, index, "Belief")
        process_nodes(driver, index, "Concept")
    finally:
        # Release the connection pool even if setup or processing raises.
        driver.close()
    logger.info("✅ Processing complete.")


if __name__ == "__main__":
    main()

In [None]:
import os
import logging
import sys
from neo4j import GraphDatabase
from dotenv import load_dotenv

# Load environment variables (python-dotenv) so the os.getenv() lookups
# below can see values kept in a local .env file.
load_dotenv()

# Neo4j Config
NEO4J_URI = os.getenv("NEO4J_URI")
NEO4J_USER = "neo4j"
NEO4J_PASSWORD = os.getenv("NEO4J_PASSWORD")

# Logging setup: start every run with a fresh log file.
LOG_FILE = "unconnected_nodes.log"
if os.path.exists(LOG_FILE):
    os.remove(LOG_FILE)

logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s - %(levelname)s - %(message)s",
    handlers=[
        logging.StreamHandler(sys.stdout),
        logging.FileHandler(LOG_FILE),
    ],
    # Without force=True, basicConfig() does nothing when the root logger
    # already has handlers (e.g. another cell in the same kernel configured
    # logging first); the log file removed above would then never be
    # recreated.
    force=True,
)
logger = logging.getLogger(__name__)
logger.info("✅ Logging initialized.")

def init_neo4j():
    """Create a Neo4j driver and verify it with a trivial ``RETURN 1`` query.

    Returns:
        The connected driver. The caller is responsible for closing it.

    Raises:
        Exception: If the probe query yields no record. Errors raised by the
            driver itself (bad URI, authentication, network) propagate
            unchanged.
    """
    driver = GraphDatabase.driver(
        NEO4J_URI, auth=(NEO4J_USER, NEO4J_PASSWORD), max_connection_lifetime=3600
    )
    try:
        with driver.session() as session:
            connected = bool(session.run("RETURN 1").single())
    except Exception:
        # Do not leak the driver's connection pool when the probe fails.
        driver.close()
        raise

    if not connected:
        driver.close()
        raise Exception("❌ Failed to connect to Neo4j.")

    logger.info("✅ Connected to Neo4j")
    return driver

def fetch_unconnected_nodes(driver):
    """Return the ``id`` property of every node that has no relationships.

    A node counts as unconnected when it has no relationship of any type in
    either direction. The number of nodes found is logged.

    Args:
        driver: Open Neo4j driver.

    Returns:
        A list of ``id`` values, one per unconnected node.
    """
    unconnected_nodes = []
    with driver.session() as session:
        records = session.run("""
            MATCH (n)
            WHERE NOT (n)-[]-()
            RETURN n.id AS id
        """)
        for record in records:
            unconnected_nodes.append(record["id"])

    logger.info(f"🔎 Found {len(unconnected_nodes)} unconnected nodes.")
    return unconnected_nodes

def main():
    """Find unconnected nodes, preview ten of them and save the full list.

    All ids are written, one per line, to ``unconnected_nodes.txt`` in the
    current working directory.
    """
    driver = init_neo4j()
    try:
        unconnected_nodes = fetch_unconnected_nodes(driver)
    finally:
        # Release the connection pool even if the query raises.
        driver.close()

    # Print some unconnected nodes
    for node_id in unconnected_nodes[:10]:  # Preview first 10
        print(f"Unconnected Node: {node_id}")

    # Save all unconnected nodes to a file. The encoding is explicit so that
    # non-ASCII ids do not depend on the platform's default encoding.
    with open("unconnected_nodes.txt", "w", encoding="utf-8") as f:
        f.writelines(f"{node_id}\n" for node_id in unconnected_nodes)

    logger.info("✅ Finished checking for unconnected nodes.")


if __name__ == "__main__":
    main()


In [None]:
import os
import logging
import sys
import time
from concurrent.futures import ThreadPoolExecutor, as_completed

from neo4j import GraphDatabase
from pinecone import Pinecone
from dotenv import load_dotenv

# Load environment variables (python-dotenv) so the os.getenv() lookups
# below can see values kept in a local .env file.
load_dotenv()

# Neo4j Config
NEO4J_URI = os.getenv("NEO4J_URI")
NEO4J_USER = "neo4j"
NEO4J_PASSWORD = os.getenv("NEO4J_PASSWORD")

# Pinecone setup
PINECONE_API_KEY = os.getenv("PINECONE_API_KEY")
PINECONE_INDEX_NAME = "belief-embeddings"

# Script Constants
MAX_RETRIES = 5  # Maximum attempts for one Neo4j relationship write.
RETRY_DELAY = 5  # Not referenced in this cell.

# Logging setup
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s - %(levelname)s - %(message)s",
    handlers=[logging.StreamHandler(sys.stdout)],
    # Without force=True, basicConfig() does nothing when the root logger
    # already has handlers (e.g. another cell in the same kernel configured
    # logging first), leaving that earlier configuration in effect.
    force=True,
)
logger = logging.getLogger(__name__)
logger.info("✅ Logging initialized.")


def init_neo4j():
    """Create a Neo4j driver and verify it with a trivial ``RETURN 1`` query.

    Returns:
        The connected driver. The caller is responsible for closing it.

    Raises:
        Exception: If the probe query yields no record. Errors raised by the
            driver itself (bad URI, authentication, network) propagate
            unchanged.
    """
    driver = GraphDatabase.driver(
        NEO4J_URI, auth=(NEO4J_USER, NEO4J_PASSWORD), max_connection_lifetime=3600
    )
    try:
        with driver.session() as session:
            connected = bool(session.run("RETURN 1").single())
    except Exception:
        # Do not leak the driver's connection pool when the probe fails.
        driver.close()
        raise

    if not connected:
        driver.close()
        raise Exception("❌ Failed to connect to Neo4j.")

    logger.info("✅ Connected to Neo4j")
    return driver


def init_pinecone():
    """Open the Pinecone index named by PINECONE_INDEX_NAME.

    Index statistics are fetched once and the total vector count is logged.

    Returns:
        The Pinecone index handle.
    """
    index = Pinecone(api_key=PINECONE_API_KEY).Index(PINECONE_INDEX_NAME)
    stats = index.describe_index_stats()
    logger.info(f"✅ Connected to Pinecone. Total vectors: {stats.get('total_vector_count', 0)}")
    return index


def fetch_unconnected_concepts(driver):
    """Fetch Concept / Overloaded_Concept nodes that have no SIMILAR_TO edge.

    Args:
        driver: Open Neo4j driver.

    Returns:
        A list of ``(id, sep_ids, type)`` tuples read from the node
        properties of the same names.
    """
    query = """
        MATCH (c)
        WHERE (c:Concept OR c:Overloaded_Concept) AND NOT (c)-[:SIMILAR_TO]-()
        RETURN c.id AS id, c.sep_ids AS sep_ids, c.type AS type
    """
    nodes = []
    with driver.session() as session:
        for record in session.run(query):
            nodes.append((record["id"], record["sep_ids"], record["type"]))

    logger.info(f"🔍 Found {len(nodes)} unconnected concepts and overloaded concepts.")
    return nodes


def query_pinecone(index, node_id, sep_ids, node_type):
    """Query Pinecone for the concepts most similar to a node.

    The query uses the vector stored under ``node_id`` and is restricted to
    ``type`` in ``{"concept", "overloaded_concept"}`` whose ``sep_id``
    metadata is not in ``sep_ids``. Empty results and query errors are both
    retried with exponential backoff.

    Args:
        index: Pinecone index handle.
        node_id: Id of the vector to use as the query.
        sep_ids: Non-empty list of ``sep_id`` values to exclude. Any other
            value causes the node to be skipped with a warning.
            NOTE(review): nodes that only carry a scalar ``sep_id`` property
            would always be skipped here - confirm against the data model.
        node_type: Currently unused; kept for interface compatibility.

    Returns:
        Up to two ``(match_id, score)`` tuples taken from the front of the
        match list, or an empty list if nothing was found.
    """
    if not sep_ids or not isinstance(sep_ids, list):
        logger.warning(f"⚠️ Node {node_id} has invalid sep_ids: {sep_ids}. Skipping query.")
        return []

    retries = 3
    for attempt in range(retries):
        try:
            # Query Pinecone for similar concepts (including overloaded concepts)
            response = index.query(
                id=node_id,
                top_k=10,
                include_values=False,
                include_metadata=True,
                filter={
                    "type": {"$in": ["concept", "overloaded_concept"]},
                    "sep_id": {"$nin": sep_ids},  # Exclude every sep_id of an overloaded concept
                },
            )

            matches = [
                (match["id"], match["score"]) for match in response.get("matches", [])
                if "id" in match and "score" in match and match["id"] != node_id
            ]

            if matches:
                logger.info(f"🔗 Found {len(matches)} connections for {node_id}: {matches}")
                return matches[:2]  # Keep at most the first two matches

            logger.warning(f"⚠️ Retry {attempt + 1}/{retries}: No valid connections found for {node_id}")
        except Exception as e:
            logger.error(f"❌ Error querying Pinecone for {node_id}: {str(e)}")

        # Back off after an empty result AND after an error, so a failing
        # service is not hit again immediately; skip the sleep once no
        # attempts remain.
        if attempt < retries - 1:
            time.sleep(2 ** attempt)

    logger.error(f"❌ Failed to find connections for {node_id} after {retries} attempts")
    return []


def add_relationships(driver, node_id, relationships):
    """Add SIMILAR_TO relationships in Neo4j.

    Failed writes are retried up to MAX_RETRIES times with exponential
    backoff.

    Args:
        driver: Open Neo4j driver.
        node_id: ``id`` property of the source node.
        relationships: List of ``(target_id, score)`` tuples.

    Returns:
        True if the write committed; False if there was nothing to write or
        every attempt failed.
    """
    if not relationships:
        logger.error(f"❌ Node {node_id} received no connections! Skipping.")
        return False

    formatted_relationships = [{"idA": node_id, "idB": rel[0], "score": rel[1]} for rel in relationships]

    def write_relationships(tx):
        # MERGE on the bare pattern and SET the weight afterwards. Merging on
        # {weight: ...} only matches an edge with the exact same score, so a
        # re-run with a slightly different score would add a duplicate edge.
        # NOTE(review): the MATCH clauses carry no label, so they cannot use
        # a label-scoped index on `id` - confirm this is acceptable.
        tx.run(
            """
            UNWIND $data AS rel
            MATCH (a {id: rel.idA})
            MATCH (b {id: rel.idB})
            MERGE (a)-[r:SIMILAR_TO]->(b)
            SET r.weight = rel.score
            """,
            data=formatted_relationships,
        ).consume()

    for attempt in range(1, MAX_RETRIES + 1):
        try:
            with driver.session() as session:
                session.execute_write(write_relationships)
            logger.info(f"✅ Added {len(relationships)} connections for {node_id}")
            return True
        except Exception as e:
            logger.error(f"⚠️ Error inserting relationships (attempt {attempt}): {e}")
            if attempt < MAX_RETRIES:
                time.sleep(2 ** attempt)

    logger.error(f"❌ Giving up on {node_id} after {MAX_RETRIES} failed insert attempts")
    return False


def process_unconnected_concepts(driver, index):
    """Process all unconnected concepts and overloaded concepts and create new connections.

    Each node is handled on a pool of 8 worker threads: Pinecone is queried
    for similar concepts and the matches are written to Neo4j. A node counts
    as a success only when the write actually committed. A summary with the
    success and failure counts is logged at the end.

    Args:
        driver: Open Neo4j driver.
        index: Pinecone index handle.
    """
    nodes = fetch_unconnected_concepts(driver)

    def process_node(node_id, sep_ids, node_type):
        try:
            logger.info(f"🔍 Starting to process node: {node_id}")
            connections = query_pinecone(index, node_id, sep_ids, node_type)
            if not connections:
                logger.warning(f"⚠️ No connections found for {node_id}")
                return (node_id, False)
            if not add_relationships(driver, node_id, connections):
                # The write failed on every attempt (already logged there);
                # do not report this node as connected.
                return (node_id, False)
            logger.info(f"✅ Successfully created connections for {node_id}")
            return (node_id, True)
        except Exception as e:
            logger.error(f"❌ Error processing {node_id}: {str(e)}")
            return (node_id, False)

    success_count = 0
    failure_count = 0
    with ThreadPoolExecutor(max_workers=8) as executor:
        future_to_node = {executor.submit(process_node, node, sep, typ): (node, sep, typ) for node, sep, typ in nodes}

        for future in as_completed(future_to_node):
            node_info = future_to_node[future]
            try:
                node_id, success = future.result()
                if success:
                    success_count += 1
                    logger.info(f"📢 Successfully processed node: {node_id} ({success_count}/{len(nodes)})")
                else:
                    failure_count += 1
                    logger.warning(f"⚠️ Failed to process node: {node_info[0]} ({failure_count} failures so far)")
            except Exception as e:
                failure_count += 1
                logger.error(f"❌ Exception in future for node {node_info[0]}: {str(e)}")

    logger.info(f"📊 Summary: Successfully processed {success_count} nodes, {failure_count} failures")


def main():
    """Connect unconnected concepts to their nearest neighbours from Pinecone."""
    driver = init_neo4j()
    try:
        index = init_pinecone()

        process_unconnected_concepts(driver, index)
    finally:
        # Release the connection pool even if setup or processing raises.
        driver.close()
    logger.info("✅ Processing complete.")


if __name__ == "__main__":
    main()
