In [None]:
pip install "pycozo[embedded,pandas]"

In [None]:
# Connect to Cozo: open a client on the 'rocksdb' engine at path 'ds30.db'.
# `client` is used by the later cells to run queries.
import pandas as pd
from pycozo.client import Client

client = Client('rocksdb', 'ds30.db')

In [None]:
# create taggedComment subgraphs
import random
import pandas as pd

# HO-GDB imports
from HOGDB.db.label import Label
from HOGDB.db.neo4j import Neo4jDatabase
from HOGDB.db.property import Property
from HOGDB.graph.edge import Edge
from HOGDB.graph.graph_with_subgraph_storage import GraphwithSubgraphStorage
from HOGDB.graph.graph_with_tuple_storage import GraphwithTupleStorage
from HOGDB.graph.node import Node
from HOGDB.graph.subgraph import Subgraph

# Pool of example tag names that random.choice() draws from.
# Entries repeat on purpose-or-not ("Tech" appears nine times); the order and
# multiplicity are kept exactly because both affect which name a seeded
# random.choice() returns.
tag_names = [
    "Tech", "Productivity", "Coding", "Design", "RemoteWork", "Motivation",
    "Startups", "Learning", "JavaScript", "React", "Career", "Freelance",
    "AI", "Tech", "Lifestyle", "Tech", "OpenSource", "BugFix",
    "Tutorial", "Networking", "Tech", "Innovation", "Tips", "Resources",
    "Tech", "Tech", "Software", "Health", "Travel", "Inspiration",
    "Projects", "Collaboration", "Tech", "Tech", "Questions", "Feedback",
    "Goals", "Books", "Podcast", "Music", "Tech", "ProductLaunch",
]

# Fixed seed so repeated runs pick the same tag names (optional).
random.seed(42)

# Set up HO-GDB on top of Neo4j. Both storage wrappers are built on the same
# `db` object: `gs` is used for nodes and edges, `gs_sub` for subgraphs.
db = Neo4jDatabase()
gs = GraphwithTupleStorage(db)
gs_sub = GraphwithSubgraphStorage(db)

# Pull the three Cozo relations into DataFrames (`client` must already exist).
edge_label_df = pd.DataFrame(client.run('?[id_e, ln] := *edge_label[id_e, ln]'))
edge_df = pd.DataFrame(client.run('?[id_e, ns, nd] := *edge[id_e, ns, nd]'))
node_label_df = pd.DataFrame(client.run('?[id_n, ln] := *node_label[id_n, ln]'))

# Node-label lookups keyed by the edge's source (ns) and destination (nd).
src_labels = node_label_df.rename(columns={'id_n': 'ns', 'ln': 'ns_label'})[['ns', 'ns_label']]
dst_labels = node_label_df.rename(columns={'id_n': 'nd', 'ln': 'nd_label'})[['nd', 'nd_label']]

# Keep edges labelled hasTag, attach their endpoints, then attach the label of
# each endpoint. The left joins keep edges whose endpoint has no label row;
# those rows are dropped by the mask below because NaN never equals a label.
is_has_tag = edge_label_df['ln'] == 'hasTag'
hasTag_edges = (
    edge_label_df[is_has_tag]
    .merge(edge_df, on='id_e', how='inner')
    .merge(src_labels, on='ns', how='left')
    .merge(dst_labels, on='nd', how='left')
)

# Restrict to edges going from a comment node to a tag node.
comment_to_tag = (hasTag_edges['ns_label'] == 'comment') & (hasTag_edges['nd_label'] == 'tag')
hasTag_edges = hasTag_edges[comment_to_tag]

print(f"Found {len(hasTag_edges)} hasTag edges (comment -> tag).")

# First id handed out to the generated taggedComment subgraphs.
next_subgraph_id = 3000001

# For every comment -> tag row: store both endpoint nodes, the hasTag edge
# between them, and a taggedComment subgraph wrapping all three.
# Nothing is deduplicated, so a comment or tag that appears on several edges
# is added once per edge.
# NOTE(review): a tag id that appears on several edges gets an independently
# drawn random name each time -- confirm this is intended.
#
# The try/finally makes sure both connections are closed even when an insert
# raises part-way through.
try:
    # Only the endpoint columns are needed. enumerate() supplies a true running
    # count; the DataFrame index cannot be used for that because it has gaps
    # after the label filter.
    endpoints = zip(hasTag_edges['ns'], hasTag_edges['nd'])
    for processed, (ns, nd) in enumerate(endpoints, start=1):
        comment_id = int(ns)
        tag_id = int(nd)

        # Build the two endpoint nodes; the tag node gets a random name.
        n_comment = Node([Label("Comment")], [Property("id", int, comment_id)])
        tag_name = random.choice(tag_names)
        n_tag = Node(
            [Label("Tag")],
            [Property("id", int, tag_id), Property("name", str, tag_name)],
        )

        # Add the nodes first, then the edge that connects them.
        gs.add_node(n_comment)
        gs.add_node(n_tag)

        e_hasTag = Edge(n_comment, n_tag, Label("hasTag"), [])
        gs.add_edge(e_hasTag)

        # Wrap nodes + edge in a subgraph carrying the generated id.
        sg = Subgraph(
            subgraph_nodes=[n_comment, n_tag],
            subgraph_edges=[e_hasTag],
            labels=[Label("taggedComment")],
            properties=[Property("id", int, next_subgraph_id)],
        )
        gs_sub.add_subgraph(sg)

        next_subgraph_id += 1

        # Progress report every 1000 processed rows.
        if processed % 1000 == 0:
            print(f"Processed {processed} subgraphs, next id {next_subgraph_id}")
finally:
    # Close both wrappers; the nested finally ensures the second close is
    # attempted even if the first one raises.
    try:
        gs.close_connection()
    finally:
        gs_sub.close_connection()

# Only reached when every row was stored without error.
print("Done: taggedComment subgraphs created !")
