In [1]:
from arango import ArangoClient


class GraphManager:
    """Convenience wrapper around a single ArangoDB graph.

    On construction the wrapper opens the database and makes sure that the
    named graph, a ``nodes`` vertex collection and an ``edges`` edge
    definition (``nodes`` -> ``nodes``) exist, creating whichever is missing.
    The keyword query helpers filter on a ``keyword`` attribute of the node
    documents.
    """

    def __init__(self, db_host, db_name, username, password, graph_name):
        """Connect and bind ``self.graph``, ``self.nodes`` and ``self.edges``.

        Args:
            db_host: Host URL handed to ``ArangoClient(hosts=...)``.
            db_name: Name of the database to open.
            username: Database user.
            password: Password for ``username``.
            graph_name: Graph to open, created first if it does not exist.
        """
        # NOTE(review): verify_override=False looks like it disables TLS
        # certificate verification -- confirm this is wanted outside local dev.
        self.client = ArangoClient(hosts=db_host, verify_override=False)
        self.db = self.client.db(db_name, username=username, password=password)

        # Open the graph, creating it on first use.
        if self.db.has_graph(graph_name):
            self.graph = self.db.graph(graph_name)
        else:
            self.graph = self.db.create_graph(graph_name)

        # Open the vertex collection, creating it on first use.
        if self.graph.has_vertex_collection("nodes"):
            self.nodes = self.graph.vertex_collection("nodes")
        else:
            self.nodes = self.graph.create_vertex_collection("nodes")

        # Open the edge collection, creating its definition on first use.
        if self.graph.has_edge_definition("edges"):
            self.edges = self.graph.edge_collection("edges")
        else:
            self.edges = self.graph.create_edge_definition(
                edge_collection="edges",
                from_vertex_collections=["nodes"],
                to_vertex_collections=["nodes"],
            )

    # ------------------------------------------------------------------
    # CRUD operations for nodes
    # ------------------------------------------------------------------

    def create_node(self, node):
        """Insert ``node``, or replace the stored document with the same ``_key``.

        A document without ``_key`` is always inserted (previously this raised
        ``KeyError``); the key is then left for the database to assign.
        """
        key = node.get("_key")
        if key is not None and self.nodes.has(key):
            self.nodes.replace(node)
        else:
            self.nodes.insert(node)

    def read_node(self, key):
        """Return the result of ``nodes.get(key)``."""
        return self.nodes.get(key)

    def update_node(self, node):
        """Update the stored node identified by ``node["_key"]``.

        Raises:
            ValueError: If no node with that key exists.
        """
        if self.nodes.has(node["_key"]):
            self.nodes.update(node)
        else:
            raise ValueError(f"Node with key {node['_key']} does not exist.")

    def delete_node(self, key):
        """Delete the node stored under ``key``.

        Raises:
            ValueError: If no node with that key exists.
        """
        if self.nodes.has(key):
            self.nodes.delete(key)
        else:
            raise ValueError(f"Node with key {key} does not exist.")

    def delete_nodes(self, keys):
        """Delete several nodes, best effort.

        A failure on one key is reported with a printed warning and does not
        stop the remaining deletions.
        """
        for key in keys:
            try:
                self.nodes.delete(key)
            except Exception as e:
                print(f"Warning: Could not delete node with key {key}. Reason: {e}")

    def delete_all_nodes(self):
        """Truncate the ``nodes`` collection."""
        self.nodes.truncate()

    def get_nodes_by_keyword(self, keyword):
        """Return the nodes whose ``keyword`` attribute equals ``keyword``."""
        query = """
        FOR node IN nodes
            FILTER node.keyword == @keyword
            RETURN node
        """
        cursor = self.db.aql.execute(query, bind_vars={"keyword": keyword})
        return list(cursor)

    def get_nodes_containing_keyword(self, keyword):
        """Return the nodes for which AQL ``CONTAINS(node.keyword, keyword)`` holds."""
        query = """
        FOR node IN nodes
            FILTER CONTAINS(node.keyword, @keyword)
            RETURN node
        """
        cursor = self.db.aql.execute(query, bind_vars={"keyword": keyword})
        return list(cursor)

    # ------------------------------------------------------------------
    # CRUD operations for edges
    # ------------------------------------------------------------------

    def create_edge(self, edge):
        """Insert ``edge`` into the ``edges`` collection."""
        self.edges.insert(edge)

    def read_edge(self, key):
        """Return the result of ``edges.get(key)``."""
        return self.edges.get(key)

    def update_edge(self, edge):
        """Update the stored edge identified by ``edge["_key"]``.

        Raises:
            ValueError: If no edge with that key exists.
        """
        if self.edges.has(edge["_key"]):
            self.edges.update(edge)
        else:
            raise ValueError(f"Edge with key {edge['_key']} does not exist.")

    def delete_edge(self, key):
        """Delete the edge stored under ``key``.

        Raises:
            ValueError: If no edge with that key exists.
        """
        if self.edges.has(key):
            self.edges.delete(key)
        else:
            raise ValueError(f"Edge with key {key} does not exist.")

    def delete_edges(self, keys):
        """Delete several edges; a missing key only produces a printed warning."""
        for key in keys:
            if self.edges.has(key):
                self.edges.delete(key)
            else:
                print(f"Warning: Edge with key {key} does not exist.")

    def delete_all_edges(self):
        """Truncate the ``edges`` collection."""
        self.edges.truncate()

    # ------------------------------------------------------------------
    # Graph queries
    # ------------------------------------------------------------------

    def get_all_nodes(self):
        """Return every document of the ``nodes`` collection as a list."""
        return list(self.nodes.all())

    def _adjacent_nodes(self, node_key, direction):
        """Traverse ``edges`` from ``nodes/<node_key>`` and return the vertices reached.

        ``direction`` must be ``"OUTBOUND"`` or ``"INBOUND"``. The query gives
        no depth range.
        """
        if direction not in ("OUTBOUND", "INBOUND"):
            raise ValueError(f"Unsupported traversal direction: {direction}")
        # The direction is an AQL keyword and cannot be a bind parameter, so it
        # is interpolated from the fixed set checked above. The caller-supplied
        # key always travels as a bind parameter and never becomes query text.
        query = f"""
        FOR v, e IN {direction} @start_vertex edges
        RETURN v
        """
        cursor = self.db.aql.execute(
            query, bind_vars={"start_vertex": f"nodes/{node_key}"}
        )
        return list(cursor)

    def get_outgoing_edges(self, node_key):
        """Return the nodes reached by following edges out of ``node_key``.

        Despite the name, the result holds the target *vertex* documents, not
        the edge documents.
        """
        return self._adjacent_nodes(node_key, "OUTBOUND")

    def get_incoming_edges(self, node_key):
        """Return the nodes that have an edge pointing at ``node_key``.

        Despite the name, the result holds the source *vertex* documents, not
        the edge documents.
        """
        return self._adjacent_nodes(node_key, "INBOUND")

    def get_outgoing_edges_of_all_nodes(self):
        """Map every node key to the result of ``get_outgoing_edges`` for it.

        Runs one traversal query per node.
        """
        return {
            node["_key"]: self.get_outgoing_edges(node["_key"])
            for node in self.get_all_nodes()
        }
    

In [2]:
# local - docker
# Connection settings default to the local Docker instance. Set the
# ARANGO_* environment variables to point at another server without putting
# real credentials into the notebook.
import os

graph_manager = GraphManager(
    db_host=os.environ.get("ARANGO_HOST", "http://localhost:8529"),
    db_name=os.environ.get("ARANGO_DB", "my_database"),
    username=os.environ.get("ARANGO_USERNAME", "root"),
    password=os.environ.get("ARANGO_PASSWORD", "somepassword"),
    graph_name="my_graph",
)

create nodes

In [14]:
# Seed nodes shared by cases 1-3: (key, keyword, is_typo).
keyword_rows = [
    ("A", "กาแฟเบอร์ดี้ โรบัสต้า", False),
    ("B", "อเมซ้อน", True),
    ("C", "อเมซอน", False),
    ("D", "กาแฟกระป๋อง", False),
    ("E", "กาแฟป๋อง", True),
    ("F", "amazon", False),
    ("G", "amezon", True),
]
nodes_data = [
    {"_key": key, "keyword": keyword, "is_typo": is_typo}
    for key, keyword, is_typo in keyword_rows
]

# create_node replaces an existing document with the same key, so this cell
# can be run again without raising.
for node_doc in nodes_data:
    graph_manager.create_node(node_doc)



create edges

In [15]:
# Case 1: each of A, B, C links to each of D, E (six directed edges).
edges_data = [
    {"_from": f"nodes/{source}", "_to": f"nodes/{target}"}
    for source in ("A", "B", "C")
    for target in ("D", "E")
]
for edge_doc in edges_data:
    graph_manager.create_edge(edge_doc)



In [16]:
# Case 2: the reverse of case 1 -- each of D, E links to each of A, B, C.
edges_data = [
    {"_from": f"nodes/{source}", "_to": f"nodes/{target}"}
    for source in ("D", "E")
    for target in ("A", "B", "C")
]
for edge_doc in edges_data:
    graph_manager.create_edge(edge_doc)



In [193]:
# Case 3: B, C, F, G are all linked to one another -- every ordered pair of
# distinct nodes gets an edge (twelve directed edges).
synonym_group = ("B", "C", "F", "G")
edges_data = [
    {"_from": f"nodes/{source}", "_to": f"nodes/{target}"}
    for source in synonym_group
    for target in synonym_group
    if source != target
]
for edge_doc in edges_data:
    graph_manager.create_edge(edge_doc)

In [173]:
# Read node "A" back by key to inspect the document as stored.
graph_manager.read_node('A')

{'_key': 'A',
 '_id': 'nodes/A',
 '_rev': '_ir0GQYC---',
 'id': 1,
 'keyword': 'กาแฟเบอร์ดี้ โรบัสต้า',
 'is_typo': False}

delete nodes

In [16]:
# Batch delete: delete_nodes prints a warning for any key it cannot remove
# and carries on with the rest.
node_keys_to_delete = ['A', 'B', 'C', 'D', 'E']
graph_manager.delete_nodes(node_keys_to_delete)

In [4]:
# Truncate the nodes collection (removes every node document).
graph_manager.delete_all_nodes()



delete edges

In [5]:
# Truncate the edges collection (removes every edge document).
graph_manager.delete_all_edges()



incoming edges of node

In [169]:
# Returns the node documents reached by an INBOUND traversal from nodes/E,
# i.e. the sources of edges pointing at E -- not the edge documents.
graph_manager.get_incoming_edges('E')

[]

outgoing edges of node

In [174]:
# Returns the node documents reached by an OUTBOUND traversal from nodes/A,
# i.e. the targets of A's edges -- not the edge documents.
graph_manager.get_outgoing_edges('A')

[{'_key': 'E',
  '_id': 'nodes/E',
  '_rev': '_ir0GQZe---',
  'id': 5,
  'keyword': 'กาแฟป๋อง',
  'is_typo': True},
 {'_key': 'D',
  '_id': 'nodes/D',
  '_rev': '_ir0GQZG---',
  'id': 4,
  'keyword': 'กาแฟกระป๋อง',
  'is_typo': False}]

In [175]:
# Same OUTBOUND lookup for node D; the result is a list of target node documents.
graph_manager.get_outgoing_edges('D')

[]

In [4]:
# List the key of every document currently in the edges collection.
existing_edges = graph_manager.edges.all()
[edge_doc["_key"] for edge_doc in existing_edges]

[]

In [178]:
# Look up a single edge by key.
# NOTE(review): the edges above are inserted without an explicit _key, so
# '189611' is presumably a key generated during one particular run and will
# differ on a fresh database -- confirm before relying on it.
graph_manager.read_edge('189611')

{'_key': '189611',
 '_id': 'edges/189611',
 '_from': 'nodes/A',
 '_to': 'nodes/D',
 '_rev': '_ir0GTSG---'}

In [6]:
# Map every node key to the list returned by get_outgoing_edges for that node
# (the node documents it points to).
outgoing_edges = graph_manager.get_outgoing_edges_of_all_nodes()
outgoing_edges

{'A': [{'_key': 'E',
   '_id': 'nodes/E',
   '_rev': '_i0avhRC---',
   'keyword': 'กาแฟป๋อง',
   'is_typo': True},
  {'_key': 'D',
   '_id': 'nodes/D',
   '_rev': '_i0avhQ6---',
   'keyword': 'กาแฟกระป๋อง',
   'is_typo': False}],
 'B': [{'_key': 'E',
   '_id': 'nodes/E',
   '_rev': '_i0avhRC---',
   'keyword': 'กาแฟป๋อง',
   'is_typo': True},
  {'_key': 'D',
   '_id': 'nodes/D',
   '_rev': '_i0avhQ6---',
   'keyword': 'กาแฟกระป๋อง',
   'is_typo': False}],
 'C': [{'_key': 'E',
   '_id': 'nodes/E',
   '_rev': '_i0avhRC---',
   'keyword': 'กาแฟป๋อง',
   'is_typo': True},
  {'_key': 'D',
   '_id': 'nodes/D',
   '_rev': '_i0avhQ6---',
   'keyword': 'กาแฟกระป๋อง',
   'is_typo': False}],
 'D': [],
 'E': [],
 'F': [],
 'G': []}

convert outgoing to dataframe

In [41]:
import pandas as pd

# Column order of the edge table. Passing it explicitly keeps the columns
# present even when there are no edges, so the .query() cells below still work.
EDGE_COLUMNS = ['_from', 'is_typo_from', '_to', 'is_typo_to']

data = []

# outgoing_edges maps a source node key to the node documents it points to.
for from_node, targets in outgoing_edges.items():
    if not targets:
        # No outgoing links: nothing to record, so skip the database lookup.
        continue
    is_typo_from = graph_manager.read_node(from_node)['is_typo']
    for target in targets:
        # One row per (source, target) pair, with the typo flag of each end.
        data.append({
            '_from': from_node,
            'is_typo_from': is_typo_from,
            '_to': target['_key'],
            'is_typo_to': target['is_typo'],
        })

df = pd.DataFrame(data, columns=EDGE_COLUMNS)

df

Unnamed: 0,_from,is_typo_from,_to,is_typo_to
0,A,False,E,True
1,A,False,D,False
2,B,True,E,True
3,B,True,D,False
4,C,False,E,True
5,C,False,D,False
6,D,False,A,False
7,E,True,A,False


index synonym

In [42]:
# Keep only the links where neither endpoint is flagged as a typo.
df.query('is_typo_from == False and is_typo_to == False')

Unnamed: 0,_from,is_typo_from,_to,is_typo_to
1,A,False,D,False
5,C,False,D,False
6,D,False,A,False


search synonym

In [43]:
# Typo -> correct links can be used exactly as stored.
filtered_df1 = df.query('is_typo_from == True and is_typo_to == False')

# Correct -> typo links are relabelled so that the typo end is named `_from`.
# Only the column labels are swapped; the values stay where they are.
swap_endpoints = {
    '_from': '_to',
    'is_typo_from': 'is_typo_to',
    '_to': '_from',
    'is_typo_to': 'is_typo_from',
}
filtered_df2 = (
    df.query('is_typo_from == False and is_typo_to == True')
    .rename(columns=swap_endpoints)
)

In [44]:
# Links that already run from a typo node to a non-typo node.
filtered_df1

Unnamed: 0,_from,is_typo_from,_to,is_typo_to
3,B,True,D,False
7,E,True,A,False


In [45]:
# Links from a non-typo node to a typo node, after the column labels were swapped.
filtered_df2

Unnamed: 0,_to,is_typo_to,_from,is_typo_from
0,A,False,E,True
4,C,False,E,True


In [50]:
# Concatenate the filtered DataFrames
result_df = pd.concat([filtered_df1, filtered_df2])
result_df = result_df.drop_duplicates()
result_df

Unnamed: 0,_from,is_typo_from,_to,is_typo_to
3,B,True,D,False
7,E,True,A,False
4,E,True,C,False


naive conversion to Elasticsearch rules

In [70]:
# Sample DataFrame
sdata = {
    '_from': ['B', 'E', 'B', 'D', 'G'],
    'is_typo_from': [True, True, True, False, False],
    '_to': ['D', 'A', 'F', 'B', 'A'],
    'is_typo_to': [False, False, True, True, False]
}

sdf = pd.DataFrame(sdata)

# Convert to Elasticsearch rule format
elasticsearch_rules = sdf.apply(lambda row: f"{row['_from']} => {row['_to']}", axis=1).tolist()

# Print the resulting rules
for rule in elasticsearch_rules:
    print(rule)

B => D
E => A
B => F
D => B
G => A


get nodes by keyword property

In [152]:
# Exact match: only nodes whose keyword equals the whole search string.
nodes = graph_manager.get_nodes_by_keyword(keyword="กาแฟเบอร์ดี้ โรบัสต้า")
for matched_node in nodes:
    print(matched_node)

{'_key': 'A', '_id': 'nodes/A', '_rev': '_ipFkMgq---', 'id': 1, 'keyword': 'กาแฟเบอร์ดี้ โรบัสต้า', 'is_typo': False}


In [276]:
# Exact match again: longer keywords that merely start with this text are not returned.
nodes = graph_manager.get_nodes_by_keyword(keyword="กาแฟ")
for matched_node in nodes:
    print(matched_node)

{'_key': '5', '_id': 'nodes/5', '_rev': '_iteHVQi---', 'keyword': 'กาแฟ', 'is_typo': False}


get nodes that contain keyword

In [275]:
# Substring match: every node whose keyword contains the search string.
nodes = graph_manager.get_nodes_containing_keyword(keyword="กาแฟ")
for matched_node in nodes:
    print(matched_node)

{'_key': '5', '_id': 'nodes/5', '_rev': '_iteHVQi---', 'keyword': 'กาแฟ', 'is_typo': False}
{'_key': '7', '_id': 'nodes/7', '_rev': '_iteHVQu---', 'keyword': 'กาแฟป๋อง', 'is_typo': True}
{'_key': '8', '_id': 'nodes/8', '_rev': '_iteHVQ6---', 'keyword': 'กาแฟกระป๋อง', 'is_typo': False}
