# 01 Construct Graph Database

In this notebook, we will construct Graph Database before building our graph agent

In [2]:
import os
import duckdb
import numpy as np
import pandas as pd
from tqdm import tqdm
from langchain_neo4j import Neo4jGraph
from dotenv import load_dotenv

load_dotenv()

True

## Connecting Neo4j Database

In [5]:
graph = Neo4jGraph(
    url = "neo4j://127.0.0.1:7687",
    username = os.getenv("NEO4J_USER"),
    password = os.getenv("NEO4J_PASSWORD"),
    database = "chinook"
)

## Create a Graph Data Model

In [8]:
conn = duckdb.connect("../data/chinook.duckdb")
conn.sql("SELECT * FROM customers LIMIT 5").to_df()

Unnamed: 0,CustomerId,FirstName,LastName,Company,Address,City,State,Country,PostalCode,Phone,Fax,Email,SupportRepId
0,1,Luís,Gonçalves,Embraer - Empresa Brasileira de Aeronáutica S.A.,"Av. Brigadeiro Faria Lima, 2170",São José dos Campos,SP,Brazil,12227-000,+55 (12) 3923-5555,+55 (12) 3923-5566,luisg@embraer.com.br,3
1,2,Leonie,Köhler,,Theodor-Heuss-Straße 34,Stuttgart,,Germany,70174,+49 0711 2842222,,leonekohler@surfeu.de,5
2,3,François,Tremblay,,1498 rue Bélanger,Montréal,QC,Canada,H2G 1A7,+1 (514) 721-4711,,ftremblay@gmail.com,3
3,4,Bjørn,Hansen,,Ullevålsveien 14,Oslo,,Norway,0171,+47 22 44 22 22,,bjorn.hansen@yahoo.no,4
4,5,František,Wichterlová,JetBrains s.r.o.,Klanova 9/506,Prague,,Czech Republic,14700,+420 2 4172 5555,+420 2 4172 5555,frantisekw@jetbrains.com,4


### Create Node

In [9]:
employee_attr = ["EmployeeId", "LastName", "FirstName", "Title", "HireDate"]
customer_attr = ["CustomerId", "FirstName", "LastName", "Country"]
invoice_attr = ["InvoiceId", "InvoiceDate", "BillingCountry", "Total"]
invoice_item_attr = ["InvoiceId", "InvoiceLineId", "Quantity"]
track_attr = ["TrackId", "Name", "Bytes", "Milliseconds", "UnitPrice"]
artist_attr = ["ArtistId", "Name"]
album_attr = ["AlbumId", "Title", "ArtistId"]

In [10]:
# Employee Node
employee_attr_string = "{" + ", ".join(f"{attr}: ${attr}" for attr in employee_attr) + "}"
cypher = f"MERGE (: Employee {employee_attr_string})"
employee_df = conn.sql("select * from employees").to_df()

for _, row in tqdm(employee_df.iterrows()):
    graph.query(cypher, row.to_dict())

8it [00:01,  7.81it/s]


In [12]:
# Invoie Node
invoice_attr_string = "{" + ", ".join([f"{attr}: ${attr}" for attr in invoice_attr]) + "}"
cypher = f"MERGE (: Invoice {invoice_attr_string})"
invoice_df = conn.sql("select * from invoices").to_df()

for _, row in tqdm(invoice_df.iterrows()):
    graph.query(cypher, row.to_dict())
    graph.query("""
        MATCH (inv: Invoice {InvoiceId: $InvoiceId})
        MATCH (c: Customer {CustomerId: $CustomerId})
        MERGE (c)-[: MADE]-(inv)
    """, row.to_dict())

412it [00:08, 47.19it/s]


In [13]:
# Track Node
track_attr_string = "{" + ", ".join([f"{attr}: ${attr}" for attr in track_attr]) + "}"
cypher = f"MERGE (: Track {track_attr_string})"
track_df = conn.sql("select * from tracks").to_df()

for _, row in tqdm(track_df.iterrows()):
    graph.query(cypher, row.to_dict())

3503it [00:37, 93.53it/s] 


In [14]:
# InvoiceLine Node
invoice_item_attr_string = "{" + ", ".join([f"{attr}: ${attr}" for attr in invoice_item_attr]) + "}"
cypher = f"MERGE (: InvoiceItem {invoice_item_attr_string})"
invoice_item_df = conn.sql("select * from invoice_items").to_df()

for _, row in tqdm(invoice_item_df.iterrows()):
    graph.query(cypher, row.to_dict())
    graph.query("""
        MATCH (inv: Invoice {InvoiceId: $InvoiceId})
        MATCH (inv_item: InvoiceItem {InvoiceId: $InvoiceId, InvoiceLineId: $InvoiceLineId})
        MATCH (track: Track {TrackId: $TrackId})
        MERGE (inv)-[: HAS]-(inv_item)-[: FOR_TRACK]-(track)
    """, row.to_dict())

2240it [00:29, 75.98it/s] 


In [15]:
# Artist
artist_attr_string = "{" + ", ".join([f"{attr}: ${attr}" for attr in artist_attr]) + "}"
cypher = f"MERGE (: Artist {artist_attr_string})"
artist_df = conn.sql("select * from artists").to_df()

for _, row in tqdm(artist_df.iterrows()):
    graph.query(cypher, row.to_dict())

275it [00:01, 151.50it/s]


In [16]:
# Album
album_attr_string = "{" + ", ".join([f"{attr}: ${attr}" for attr in album_attr]) + "}"
cypher = f"MERGE (: Album {album_attr_string})"
album_df = conn.sql("select * from albums").to_df()

for _, row in tqdm(album_df.iterrows()):
    graph.query(cypher, row.to_dict())
    graph.query("""
        MATCH (artist: Artist {ArtistId: $ArtistId})
        MATCH (album: Album {AlbumId: $AlbumId})       
        MERGE (artist)-[: COMPOSED]-(album)
    """, row.to_dict())

for _, row in tqdm(track_df.iterrows()):
    #graph.query(cypher, row.to_dict())
    graph.query("""
        MATCH (track: Track {TrackId: $TrackId})
        MATCH (album: Album {AlbumId: $AlbumId})       
        MERGE (album)-[: CONTAINS]-(track)
    """, row.to_dict()) 

347it [00:04, 78.40it/s] 
3503it [00:23, 150.01it/s]


In [17]:
graph.refresh_schema()
print(graph.get_schema)

Node properties:
Customer {Country: STRING, CustomerId: INTEGER, FirstName: STRING, LastName: STRING}
Employee {FirstName: STRING, LastName: STRING, EmployeeId: INTEGER, HireDate: LOCAL_DATE_TIME, Title: STRING}
Invoice {InvoiceDate: LOCAL_DATE_TIME, InvoiceId: INTEGER, Total: FLOAT, BillingCountry: STRING}
InvoiceItem {InvoiceId: FLOAT, InvoiceLineId: FLOAT, Quantity: FLOAT}
Track {TrackId: INTEGER, UnitPrice: FLOAT, Bytes: INTEGER, Milliseconds: INTEGER, Name: STRING}
Artist {Name: STRING, ArtistId: INTEGER}
Album {Title: STRING, ArtistId: INTEGER, AlbumId: INTEGER}
Relationship properties:

The relationships:
(:Customer)-[:MADE]->(:Invoice)
(:Employee)-[:PROVIDE_SERVICE]->(:Customer)
(:Invoice)-[:HAS]->(:InvoiceItem)
(:InvoiceItem)-[:FOR_TRACK]->(:Track)
(:Artist)-[:COMPOSED]->(:Album)
(:Album)-[:CONTAINS]->(:Track)
