In [4]:
!pip install -q neo4j==5.19 pandas tqdm

from neo4j import GraphDatabase
import pandas as pd
from tqdm import tqdm

# ============================================================
# CONFIG
# ============================================================
NEO4J_URI = "neo4j+s://dd4fd287.databases.neo4j.io"
NEO4J_USER = "neo4j"
NEO4J_PASSWORD = "qaV1xGWFqrgK21kI-RGWl9hG0v2by1VCjnGrWwJFgN4"

BASE_DIR = "/content/drive/MyDrive/Synced/ProjectSchool2025/AugToNov/Dataset/EnrichmentReport"
SIM_LINKS_FILE = f"{BASE_DIR}/sim_links.csv"

# ============================================================
# CONNECT
# ============================================================
driver = GraphDatabase.driver(
    NEO4J_URI,
    auth=(NEO4J_USER, NEO4J_PASSWORD)
)

print("Connected to Neo4j Aura")

# ============================================================
# ENSURE CONSTRAINTS (SAFE / IDEMPOTENT)
# ============================================================
with driver.session() as s:
    s.run("""
    CREATE CONSTRAINT IF NOT EXISTS
    FOR (c:EntityCanonical)
    REQUIRE c.canonical_id IS UNIQUE
    """)
print("Constraints ensured")

# ============================================================
# LOAD SIMILARITY DATA
# ============================================================
sim_df = pd.read_csv(SIM_LINKS_FILE)

# Robust column detection
def pick(colnames):
    for c in sim_df.columns:
        for k in colnames:
            if k in c.lower():
                return c
    return None

src_col = pick(["src", "source"])
tgt_col = pick(["tgt", "target"])
score_col = pick(["score"])

if not src_col or not tgt_col or not score_col:
    raise RuntimeError(f"Unsupported sim_links schema: {sim_df.columns.tolist()}")

print("Using sim_links columns:", src_col, tgt_col, score_col)

# ============================================================
# INGEST SIMILARITY LINKS (RESUMABLE)
# ============================================================
with driver.session() as s:
    for _, r in tqdm(sim_df.iterrows(), total=len(sim_df), desc="SIMILAR_TO links"):
        s.run(
            """
            MATCH (a:EntityCanonical {canonical_id:$a})
            MATCH (b:EntityCanonical {canonical_id:$b})
            MERGE (a)-[rel:SIMILAR_TO]->(b)
            SET rel.score = $score
            """,
            a=str(r[src_col]),
            b=str(r[tgt_col]),
            score=float(r[score_col])
        )

print("Similarity ingestion completed")

# ============================================================
# INTEGRITY REPORT
# ============================================================
with driver.session() as s:
    stats = {
        "canonical_nodes": s.run(
            "MATCH (c:EntityCanonical) RETURN count(c) AS c"
        ).single()["c"],
        "similarity_links": s.run(
            "MATCH (:EntityCanonical)-[r:SIMILAR_TO]->(:EntityCanonical) RETURN count(r) AS c"
        ).single()["c"],
        "canonical_without_similarity": s.run(
            """
            MATCH (c:EntityCanonical)
            WHERE NOT (c)-[:SIMILAR_TO]->()
              AND NOT ()-[:SIMILAR_TO]->(c)
            RETURN count(c) AS c
            """
        ).single()["c"]
    }

print("\n===== INTEGRITY REPORT =====")
for k, v in stats.items():
    print(f"{k}: {v}")

driver.close()
print("\nContinuation pipeline completed cleanly")

Connected to Neo4j Aura
Constraints ensured
Using sim_links columns: src tgt score


SIMILAR_TO links: 100%|██████████| 21040/21040 [1:21:22<00:00,  4.31it/s]


Similarity ingestion completed

===== INTEGRITY REPORT =====
canonical_nodes: 33757
similarity_links: 21040
canonical_without_similarity: 23088

Continuation pipeline completed cleanly


In [None]:
import time
from google.colab import runtime

print("[STATUS] Sleeping 60s for Drive sync...")
time.sleep(60)

print("[STATUS] Disconnecting Runtime...")
runtime.unassign()
