In [1]:
import warnings
warnings.filterwarnings('ignore')
import pandas as pd
import datetime
import os
from neo4j import GraphDatabase
from dotenv import load_dotenv

# Load variables from a local .env file into the process environment, then
# read the three Neo4j connection settings from it.
load_dotenv()
uri, user, password = (
    os.getenv(env_key)
    for env_key in ("NEO4J_URI", "NEO4J_USERNAME", "NEO4J_PASSWORD")
)

# Single driver instance shared by every session opened later in the notebook.
driver = GraphDatabase.driver(uri, auth=(user, password))

In [5]:
# # To delete all nodes (DETACH DELETE also removes every relationship attached to them)
# with driver.session() as session:
#     session.run("MATCH (n) DETACH DELETE n")

# # Delete all relationships
# with driver.session() as session:
#     session.run("MATCH ()-[r]->() DELETE r")

In [3]:
# Names of the datasets to process; each one is also used as a sheet name
# when reading sample_data.xlsx.
int_datasets = [
    "weather",
    "foot_traffic",
    "web_traffic",
    "social_media",
    "sec_master",
]

In [4]:
# Load the two metadata sheets: one row per dataset, and one row per
# (dataset, column) pair. Both are looked up while building the graph.
df_dataset_info, df_column_info = (
    pd.read_excel('sample_data.xlsx', sheet_name=metadata_sheet)
    for metadata_sheet in ('dataset_info', 'column_info')
)

Create the nodes and relationships in Neo4j (I am using the free Aura instance).

We add an additional label, `embeddable`, to the dataset nodes and the reference-column nodes so that embeddings can be added to them in the next stage.

In [6]:
# Build the metadata graph in Neo4j:
#   * one :dataset:embeddable node per sheet listed in int_datasets,
#   * one :column node per DataFrame column (reference columns additionally
#     get the :embeddable label),
#   * a HAS_COLUMN relationship from each dataset node to each of its columns.
#
# Every value is sent as a Cypher parameter instead of being interpolated into
# the query text, so descriptions or cell values containing quotes or
# backslashes can neither break nor alter the statement. Labels cannot be
# parameterised in Cypher, so the optional :embeddable label is chosen by
# picking between two fixed query strings.
#
# NOTE: the statements use CREATE, so re-running this cell without clearing
# the database first will add duplicate nodes and relationships.
CREATE_DATASET_QUERY = (
    "CREATE (d:dataset:embeddable {"
    "name: $name, type: 'dataset', source: 'snowflake', "
    "table: $table, description: $description})"
)
COLUMN_PROPERTIES = (
    "{name: $name, type: 'column', source: $source, col_type: $col_type, "
    "description: $description, values: $values}"
)
CREATE_COLUMN_QUERY = "CREATE (c:column " + COLUMN_PROPERTIES + ")"
CREATE_REFERENCE_COLUMN_QUERY = "CREATE (c:column:embeddable " + COLUMN_PROPERTIES + ")"
LINK_COLUMN_QUERY = (
    "MATCH (d:dataset {name: $dataset}), "
    "(c:column {name: $column, source: $dataset}) "
    "CREATE (d)-[:HAS_COLUMN]->(c)"
)

with driver.session() as session:
    for dataset in int_datasets:
        print(dataset)
        df = pd.read_excel('sample_data.xlsx', sheet_name=dataset)

        # Look up the dataset description; fail with a readable message rather
        # than a bare IndexError when the metadata sheet has no matching row.
        dataset_rows = df_dataset_info.loc[df_dataset_info['dataset'] == dataset, 'description']
        if dataset_rows.empty:
            raise ValueError(f"No row for dataset '{dataset}' in sheet 'dataset_info'")
        dataset_description = dataset_rows.iloc[0]

        # Create a node for the dataset
        session.run(
            CREATE_DATASET_QUERY,
            {
                "name": dataset,
                "table": f"graph_db.public.{dataset}",
                # str() keeps the stored property a string, as it was when the
                # value was formatted into the query text.
                "description": str(dataset_description),
            },
        )

        # Create nodes for each column in the DataFrame
        for column in df.columns:
            column_rows = df_column_info[
                (df_column_info['dataset'] == dataset) & (df_column_info['column'] == column)
            ]
            if column_rows.empty:
                raise ValueError(
                    f"No row for column '{column}' of dataset '{dataset}' in sheet 'column_info'"
                )
            row = column_rows.iloc[0]
            col_type = row['type']
            col_description = row['description']
            # Distinct values of the column, stored as one comma-separated string.
            unique_values = ','.join(df[column].astype(str).unique())

            create_query = (
                CREATE_REFERENCE_COLUMN_QUERY if col_type == 'reference' else CREATE_COLUMN_QUERY
            )
            session.run(
                create_query,
                {
                    "name": str(column),
                    "source": dataset,
                    "col_type": str(col_type),
                    "description": str(col_description),
                    "values": unique_values,
                },
            )

        # Create relationships between the dataset and each column
        for column in df.columns:
            session.run(LINK_COLUMN_QUERY, {"dataset": dataset, "column": str(column)})


weather
foot_traffic
web_traffic
social_media
sec_master


In [7]:
# Sanity check: count every node currently stored in the graph and show the
# result as a one-row DataFrame.
query = (
    "\n"
    "MATCH (n) \n"
    "RETURN count(n)\n"
)
with driver.session() as session:
    result = session.run(query)
    # Materialise the result while the session is still open.
    df_query = result.to_df()

df_query

Unnamed: 0,count(n)
0,46


In [9]:
# # Use this if you want to create the relationships manually instead of letting the LLM infer them in the next step. Mostly for testing purposes.
# with driver.session() as session:
#     session.run("""MATCH (a:column {name:'post_code',source:'foot_traffic'}), (b:column {name:'zip_code',source:'weather'})
#                  CREATE (a)-[:RELATED_TO {confidence:'90',type:'same'}]->(b)""")
#     session.run("""MATCH (a:column {name:'symbol',source:'foot_traffic'}), (b:column {name:'ticker',source:'sec_master'})
#                  CREATE (a)-[:RELATED_TO {confidence:'90',type:'same'}]->(b)""")
#     session.run("""MATCH (a:column {name:'website_owner',source:'web_traffic'}), (b:column {name:'entity_name',source:'sec_master'})
#                  CREATE (a)-[:RELATED_TO {confidence:'90',type:'same'}]->(b)""")
#     session.run("""MATCH (a:column {name:'website_brand',source:'web_traffic'}), (b:column {name:'page_owner',source:'social_media'})
#                  CREATE (a)-[:RELATED_TO {confidence:'90',type:'same'}]->(b)""")

### Loading Data into Snowflake
Optional: only needed if you want to build chains and agents on top of the data.

In [7]:
import snowflake.connector
from snowflake.snowpark import Session

# Snowflake connection settings, all read from SNOW_* environment variables.
ACCOUNT = os.getenv("SNOW_ACCOUNT")
USER = os.getenv("SNOW_USER")
PASSWORD = os.getenv("SNOW_PASSWORD")
ROLE = os.getenv("SNOW_ROLE")
WAREHOUSE = os.getenv("SNOW_WAREHOUSE")
DATABASE = os.getenv("SNOW_DATABASE")
SCHEMA = os.getenv("SNOW_SCHEMA")

# One mapping feeds both clients, so the connector and Snowpark always receive
# the same seven settings.
connection_parameters = dict(
    account=ACCOUNT,
    user=USER,
    password=PASSWORD,
    role=ROLE,
    warehouse=WAREHOUSE,
    database=DATABASE,
    schema=SCHEMA,
)

# Plain connector connection.
con = snowflake.connector.connect(**connection_parameters)
# Snowpark session.
snow_session = Session.builder.configs(connection_parameters).create()


Failed to import ArrowResult. No Apache Arrow result set format can be used. ImportError: DLL load failed while importing arrow_iterator: The specified procedure could not be found.


In [8]:
# Make sure the target database exists before any tables are written to it.
# This is a DDL statement, so it is executed through a connector cursor;
# pd.read_sql is intended for queries whose rows are loaded into a DataFrame,
# and its result here was a throwaway value.
sql = """
create database if not exists graph_db
"""
cur = con.cursor()
try:
    cur.execute(sql)
finally:
    # Release the cursor even if the statement fails.
    cur.close()

In [9]:
# Copy every sample sheet into Snowflake as <db>.public.<sheet name>,
# overwriting the table if it already exists.
db = 'graph_db'
for d in int_datasets:
    print(d)
    df = pd.read_excel('sample_data.xlsx', sheet_name=d)
    # Upper-case the column labels before upload
    # (presumably to match Snowflake's default identifier casing — confirm).
    df.columns = list(map(str.upper, df.columns))
    target_table = f"{db}.public.{d}"
    df_snowpark = snow_session.create_dataframe(df)
    df_snowpark.write.mode("overwrite").save_as_table(target_table)

weather
foot_traffic
web_traffic
social_media
sec_master
