# Writing To Vector DB

Write the content (scraped, chunked, embedded) to Neo4j. 

In [1]:
!pip install azure-identity azure-keyvault-secrets



In [2]:
!pip install neomodel



### Setup Keyvault

In [3]:
from azure.identity import DefaultAzureCredential
from azure.keyvault.secrets import SecretClient

# Key vault that holds the Neo4j connection settings.
key_vault_name = "kv-bsauwmno"
kv_uri = "https://" + key_vault_name + ".vault.azure.net/"

# Client for reading secrets from the vault.
credential = DefaultAzureCredential()
client = SecretClient(vault_url=kv_uri, credential=credential)

# Fetch the three secrets in order; afterwards neo4j_url, neo4j_user and
# neo4j_password are available to the rest of the notebook.
neo4j_url, neo4j_user, neo4j_password = (
    client.get_secret(secret_name).value
    for secret_name in ("NEO4JURL", "NEO4JUSER", "NEO4JPASSWORD")
)

In [4]:
import pandas as pd

# Define Neo4j Datamodel

Using neomodel

In [42]:
from neomodel.core import NodeClassAlreadyDefined
from neomodel import (
    config, 
    StructuredNode, 
    StringProperty, 
    IntegerProperty,
    UniqueIdProperty, 
    RelationshipTo, 
    ArrayProperty
)
from neomodel.contrib import SemiStructuredNode

try:
    class Chunk(StructuredNode):
        """One text chunk of a web page, stored together with its embedding."""
        chunk_id = IntegerProperty()
        embedding_model = StringProperty()
        embedding = ArrayProperty()
        text = StringProperty()
        chunk_order = IntegerProperty()
        chunk_size = IntegerProperty()
        chunk_overlap = IntegerProperty()
        # The target is given by name ('Chunk') because the class object
        # does not exist yet while its own body is being evaluated.
        next_chunk = RelationshipTo('Chunk', 'NEXT_CHUNK')

    class WebPage(StructuredNode):
        """A scraped web page; points to its chunks through HAS_CHUNK."""
        webpage_id = IntegerProperty(index=True, default=0)
        url = StringProperty(unique_index=True, required=True)
        content = StringProperty(unique_index=False, required=False)
        title = StringProperty(unique_index=False, required=False)
        # No `str` annotation here: the class attribute is a StringProperty.
        scrape_dt = StringProperty(unique_index=False, required=False)
        chunk = RelationshipTo(Chunk, 'HAS_CHUNK')

    class Catalog(StructuredNode):
        """A catalog (listing site); points to its pages through HAS_WEBPAGE."""
        catalog_id = IntegerProperty(index=True, default=0)
        name = StringProperty(unique_index=True, required=True)
        url = StringProperty(unique_index=False, required=False)
        webpage = RelationshipTo(WebPage, 'HAS_WEBPAGE')
except NodeClassAlreadyDefined:
    # Presumably raised when this cell is re-run in a kernel where the node
    # classes were already defined; the earlier definitions stay in use, so
    # restart the kernel after editing the model. TODO confirm.
    print("Classes already defined.")

Classes already defined.


In [6]:
from neo4j import GraphDatabase
from neomodel import db

# Database that neomodel should talk to.
config.DATABASE_NAME = "neo4j"

# Build bolt://<user>:<password>@<host part>; the scheme is stripped from the
# stored URL and '/:7687' is rewritten to ':7687/neo4j'.
# NOTE(review): assumes the stored secret looks like 'bolt://host/:7687' - confirm.
config.DATABASE_URL = "bolt://{}:{}@{}".format(
    neo4j_user,
    neo4j_password,
    neo4j_url.replace('bolt://', '').replace('/:7687', ':7687/neo4j'),
)

# Connect by URL (auto-managed connection).
db.set_connection(url=config.DATABASE_URL)


# Using Driver
# neo_driver = GraphDatabase.driver(
#     neo4j_url, 
#     auth=(neo4j_user, neo4j_password)
# )
# config.DRIVER = neo_driver
# db.set_connection(driver=neo_driver)


# Save Catalog Nodes

Single hard-coded node for now. 

In [9]:
# Read the raw catalog table from its pickle file.
# NOTE(review): unpickling can execute arbitrary code - only load pickle files produced by this project.
catalog_df = pd.read_pickle('./data/catalog_raw.pkl')

catalog_df.head()

Unnamed: 0,id,title,url
0,0,24 uur observatie van de pasgeborene,https://www.azstlucas.be/onderzoek-en-behandel...
1,1,24 uur ph-metrie meting,https://www.azstlucas.be/onderzoek-en-behandel...
2,2,24 uur stoelgang sparen: richtlijnen,https://www.azstlucas.be/onderzoek-en-behandel...
3,3,24 uur urine sparen,https://www.azstlucas.be/onderzoek-en-behandel...
4,4,24-uurs bloeddrukmeting,https://www.azstlucas.be/onderzoek-en-behandel...


In [11]:
# First (and for now only) catalog node; it is built in memory here and
# written to the database by the separate save() call.
azstlucas_catalog = Catalog(
    name='azstlucas',
    catalog_id=0,
    url='https://www.azstlucas.be/onderzoek-en-behandelingen',
)

In [12]:
# Write the catalog node to Neo4j; the cell output shows the saved node.
# NOTE(review): re-running after a successful save presumably creates a duplicate or fails on the unique `name` - confirm.
azstlucas_catalog.save()

<Catalog: {'catalog_id': 0, 'name': 'azstlucas', 'url': 'https://www.azstlucas.be/onderzoek-en-behandelingen', 'element_id_property': '4:c86bbba4-35aa-4716-8fd0-27df8c7da669:0'}>

# Save WebPage Nodes

In [13]:
# Read the scraped web pages (one row per page) from their pickle file.
# NOTE(review): unpickling can execute arbitrary code - only load pickle files produced by this project.
webpages_df = pd.read_pickle('./data/webpages_raw.pkl')

webpages_df

Unnamed: 0,content,banner_title,banner_divisions,intro,toc,url,scrape_date
0,\n## Waarom willen de kinderartsen dat je baby...,24 uur observatie van de pasgeborene,[{'division_url': 'https://www.azstlucas.be/sp...,Elke pasgeborene wordt - bij overnachting - bi...,[{'link_url': '#waarom-willen-we-dat-je-baby-m...,https://www.azstlucas.be/onderzoek-en-behandel...,12/11/2023 15:26:50
1,\nWe weten dat reflux en opboeren voor een aan...,24 uur ph-metrie meting,[{'division_url': 'https://www.azstlucas.be/sp...,Een ph-metrie meting met impedantiemeting is e...,"[{'link_url': '#hoe-gebeurt-de-meting', 'link_...",https://www.azstlucas.be/onderzoek-en-behandel...,12/11/2023 15:26:50
2,\n## Algemene richtlijnen\n\n\nIn opdracht van...,24 uur stoelgang sparen: richtlijnen,[{'division_url': 'https://www.azstlucas.be/sp...,,"[{'link_url': '#algemene-richtlijnen', 'link_t...",https://www.azstlucas.be/onderzoek-en-behandel...,12/11/2023 15:26:51
3,\n## Algemene richtlijnen\n\n\nIn opdracht van...,24 uur urine sparen,[{'division_url': 'https://www.azstlucas.be/sp...,,"[{'link_url': '#algemene-richtlijnen', 'link_t...",https://www.azstlucas.be/onderzoek-en-behandel...,12/11/2023 15:26:52
4,\nBij een 24-uurs bloeddrukmeting wordt de blo...,24-uurs bloeddrukmeting,[{'division_url': 'https://www.azstlucas.be/sp...,Als je last hebt van hoge bloeddruk of hyperte...,[],https://www.azstlucas.be/onderzoek-en-behandel...,12/11/2023 15:26:52
...,...,...,...,...,...,...,...
363,\n## Zorgen na een ingreep in de mond\n\n\nDe ...,Wortelpuntbehandeling (apicotomie),[{'division_url': 'https://www.azstlucas.be/sp...,Soms bevindt zich een ontsteking onder de wort...,[],https://www.azstlucas.be/onderzoek-en-behandel...,12/11/2023 15:31:25
364,* Stop bloedverdunners in samenspraak met je h...,Zenuwblokkade,[{'division_url': 'https://www.azstlucas.be/sp...,,"[{'link_url': '#wat-is-een-zenuwblokkade', 'li...",https://www.azstlucas.be/onderzoek-en-behandel...,12/11/2023 15:31:26
365,\nMet deze test meten we de afstand die je kun...,Zes minuten wandeltest,[{'division_url': 'https://www.azstlucas.be/sp...,﻿Deze test wordt gebruikt om de inspanningscap...,[],https://www.azstlucas.be/onderzoek-en-behandel...,12/11/2023 15:31:27
366,\nZuurstof (O2) is een kleurloos en reukloos g...,Zuurstoftherapie,[{'division_url': 'https://www.azstlucas.be/sp...,,"[{'link_url': '#conventie', 'link_title': 'Con...",https://www.azstlucas.be/onderzoek-en-behandel...,12/11/2023 15:31:27


In [14]:
# Persist one WebPage node per scraped row; the row's index label is used as
# webpage_id.
# TODO: division
# TODO: intro
# TODO: toc
for i, obj in webpages_df.iterrows():
    WebPage(
        webpage_id=i,
        url=obj['url'],
        content=obj['content'],
        title=obj['banner_title'],
        scrape_dt=obj['scrape_date'],
    ).save()

# Catalog to WebPage relationships

In [15]:
# The single catalog node every page is attached to.
catalog = Catalog.nodes.get(catalog_id=0)

# webpage_id was taken from the DataFrame index when the pages were saved, so
# the same index labels are used to look each page up again.
for i in webpages_df.index:
    catalog.webpage.connect(WebPage.nodes.get(webpage_id=i))

# Chunk Nodes

In [7]:
# Read the processed chunks (page columns plus chunk text, order and embedding) from their pickle file.
# NOTE(review): unpickling can execute arbitrary code - only load pickle files produced by this project.
chunks_df = pd.read_pickle('./data/chunks_processed_full.pkl')

# Preview the first rows; the index label repeats for chunks of the same page.
chunks_df.head()

Unnamed: 0,content,banner_title,banner_divisions,intro,toc,url,scrape_date,chunk_s500_o60,chunk_order,chunk_s500_o60_embedding
0,\n## Waarom willen de kinderartsen dat je baby...,24 uur observatie van de pasgeborene,[{'division_url': 'https://www.azstlucas.be/sp...,Elke pasgeborene wordt - bij overnachting - bi...,[{'link_url': '#waarom-willen-we-dat-je-baby-m...,https://www.azstlucas.be/onderzoek-en-behandel...,12/11/2023 15:26:50,## Waarom willen de kinderartsen dat je baby m...,0,"[0.04676021263003349, 0.009244767017662525, -0..."
0,\n## Waarom willen de kinderartsen dat je baby...,24 uur observatie van de pasgeborene,[{'division_url': 'https://www.azstlucas.be/sp...,Elke pasgeborene wordt - bij overnachting - bi...,[{'link_url': '#waarom-willen-we-dat-je-baby-m...,https://www.azstlucas.be/onderzoek-en-behandel...,12/11/2023 15:26:50,## Mogelijke afwijkingen\n\n\n### Aangeboren h...,1,"[0.0652947872877121, 0.012688170187175274, -0...."
0,\n## Waarom willen de kinderartsen dat je baby...,24 uur observatie van de pasgeborene,[{'division_url': 'https://www.azstlucas.be/sp...,Elke pasgeborene wordt - bij overnachting - bi...,[{'link_url': '#waarom-willen-we-dat-je-baby-m...,https://www.azstlucas.be/onderzoek-en-behandel...,12/11/2023 15:26:50,### Infecties\n\n\nVerschillende infecties wor...,2,"[0.09477945417165756, 0.012673679739236832, -0..."
0,\n## Waarom willen de kinderartsen dat je baby...,24 uur observatie van de pasgeborene,[{'division_url': 'https://www.azstlucas.be/sp...,Elke pasgeborene wordt - bij overnachting - bi...,[{'link_url': '#waarom-willen-we-dat-je-baby-m...,https://www.azstlucas.be/onderzoek-en-behandel...,12/11/2023 15:26:50,### Aangeboren darmafwijkingen\n\n\nHet is pas...,3,"[0.04747367277741432, 0.023175092414021492, -0..."
0,\n## Waarom willen de kinderartsen dat je baby...,24 uur observatie van de pasgeborene,[{'division_url': 'https://www.azstlucas.be/sp...,Elke pasgeborene wordt - bij overnachting - bi...,[{'link_url': '#waarom-willen-we-dat-je-baby-m...,https://www.azstlucas.be/onderzoek-en-behandel...,12/11/2023 15:26:50,De kinderarts onderzoekt je baby in normale om...,4,"[0.03207284212112427, 0.011688114143908024, -0..."


In [29]:
# Chunking settings shared by every row; they correspond to the 's500_o60'
# suffix of the chunk columns, so they are set once instead of per iteration.
chunk_size: int = 500
chunk_overlap: int = 60

# The DataFrame index repeats for chunks of the same page, so it cannot serve
# as a chunk id; enumerate() supplies the running row number instead.
for chunk_id, (_, obj) in enumerate(chunks_df.iterrows()):
    Chunk(
        chunk_id=chunk_id,
        embedding=obj['chunk_s500_o60_embedding'],
        embedding_model='hkunlp/instructor-xl',
        text=obj['chunk_s500_o60'],
        chunk_order=obj['chunk_order'],
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    ).save()

### Check embedding length

In [42]:
# Length of one embedding vector (third row), as a sanity check of the vector size.
len(chunks_df['chunk_s500_o60_embedding'].iloc[2])

768

# Chunk to WebPage Relationships

In [30]:
# Chunk ids were assigned as the running row number of chunks_df, so the row
# position is enough to find the chunk; the page is looked up by its URL.
for chunk_id, page_url in enumerate(chunks_df['url']):
    chunk = Chunk.nodes.get(chunk_id=chunk_id)
    webpage = WebPage.nodes.get(url=page_url)
    webpage.chunk.connect(chunk)

# Chunk to Chunk Relationships


In [24]:
# Display the chunk table; note that the index label repeats for all chunks of one page.
chunks_df

Unnamed: 0,content,banner_title,banner_divisions,intro,toc,url,scrape_date,chunk_s500_o60,chunk_order,chunk_s500_o60_embedding
0,\n## Waarom willen de kinderartsen dat je baby...,24 uur observatie van de pasgeborene,[{'division_url': 'https://www.azstlucas.be/sp...,Elke pasgeborene wordt - bij overnachting - bi...,[{'link_url': '#waarom-willen-we-dat-je-baby-m...,https://www.azstlucas.be/onderzoek-en-behandel...,12/11/2023 15:26:50,## Waarom willen de kinderartsen dat je baby m...,0,"[0.04676021263003349, 0.009244767017662525, -0..."
0,\n## Waarom willen de kinderartsen dat je baby...,24 uur observatie van de pasgeborene,[{'division_url': 'https://www.azstlucas.be/sp...,Elke pasgeborene wordt - bij overnachting - bi...,[{'link_url': '#waarom-willen-we-dat-je-baby-m...,https://www.azstlucas.be/onderzoek-en-behandel...,12/11/2023 15:26:50,## Mogelijke afwijkingen\n\n\n### Aangeboren h...,1,"[0.0652947872877121, 0.012688170187175274, -0...."
0,\n## Waarom willen de kinderartsen dat je baby...,24 uur observatie van de pasgeborene,[{'division_url': 'https://www.azstlucas.be/sp...,Elke pasgeborene wordt - bij overnachting - bi...,[{'link_url': '#waarom-willen-we-dat-je-baby-m...,https://www.azstlucas.be/onderzoek-en-behandel...,12/11/2023 15:26:50,### Infecties\n\n\nVerschillende infecties wor...,2,"[0.09477945417165756, 0.012673679739236832, -0..."
0,\n## Waarom willen de kinderartsen dat je baby...,24 uur observatie van de pasgeborene,[{'division_url': 'https://www.azstlucas.be/sp...,Elke pasgeborene wordt - bij overnachting - bi...,[{'link_url': '#waarom-willen-we-dat-je-baby-m...,https://www.azstlucas.be/onderzoek-en-behandel...,12/11/2023 15:26:50,### Aangeboren darmafwijkingen\n\n\nHet is pas...,3,"[0.04747367277741432, 0.023175092414021492, -0..."
0,\n## Waarom willen de kinderartsen dat je baby...,24 uur observatie van de pasgeborene,[{'division_url': 'https://www.azstlucas.be/sp...,Elke pasgeborene wordt - bij overnachting - bi...,[{'link_url': '#waarom-willen-we-dat-je-baby-m...,https://www.azstlucas.be/onderzoek-en-behandel...,12/11/2023 15:26:50,De kinderarts onderzoekt je baby in normale om...,4,"[0.03207284212112427, 0.011688114143908024, -0..."
...,...,...,...,...,...,...,...,...,...,...
366,\nZuurstof (O2) is een kleurloos en reukloos g...,Zuurstoftherapie,[{'division_url': 'https://www.azstlucas.be/sp...,,"[{'link_url': '#conventie', 'link_title': 'Con...",https://www.azstlucas.be/onderzoek-en-behandel...,12/11/2023 15:31:27,AZ Sint-Lucas maakt gebruik van cookies om uw ...,4439,"[0.04251086339354515, 0.03886432200670242, 0.0..."
367,\nOto-akoestische emissies (OAE’s) zijn geluid...,﻿Oto-akoestische emissies,[{'division_url': 'https://www.azstlucas.be/sp...,,[],https://www.azstlucas.be/onderzoek-en-behandel...,12/11/2023 15:31:28,Oto-akoestische emissies (OAE’s) zijn geluiden...,4440,"[0.0032778072636574507, 0.014101134613156319, ..."
367,\nOto-akoestische emissies (OAE’s) zijn geluid...,﻿Oto-akoestische emissies,[{'division_url': 'https://www.azstlucas.be/sp...,,[],https://www.azstlucas.be/onderzoek-en-behandel...,12/11/2023 15:31:28,Met deze test gaan we na of de uitwendige haar...,4441,"[0.05215251073241234, 0.02360747754573822, 0.0..."
367,\nOto-akoestische emissies (OAE’s) zijn geluid...,﻿Oto-akoestische emissies,[{'division_url': 'https://www.azstlucas.be/sp...,,[],https://www.azstlucas.be/onderzoek-en-behandel...,12/11/2023 15:31:28,## Verloop van de test\n\n\nWe brengen een oor...,4442,"[0.07470236718654633, 0.006344643887132406, -0..."


In [41]:
def get_webpage_id(df: pd.DataFrame) -> pd.DataFrame: 
    """
    Turn the (repeated) row index into a `webpage_id` column and number the rows.

    Per the original note, this relies on how chunks_df was created: with
    `explode`, keeping the page's index on every chunk row, so the index
    label identifies the source web page.

    Args:
        df: Frame whose index labels identify the web page of each row.

    Returns:
        A new frame with a fresh 0..n-1 index, a leading `webpage_id` column
        holding the former index labels, and a `chunk_id` column equal to the
        new row position. `df` itself is not modified.
    """
    # Name the index before resetting it, so the resulting column is always
    # called `webpage_id` - also when the index already has a name or the
    # frame already contains a column called "index".
    flattened = df.rename_axis("webpage_id").reset_index()
    flattened['chunk_id'] = flattened.index
    return flattened

# Work on a deep copy so chunks_df itself stays as loaded.
chunks_to_chunks_df = get_webpage_id(chunks_df.copy(deep=True))

# Ensure that number of ids equals number of scraped webpages.
assert chunks_to_chunks_df['webpage_id'].nunique(dropna=False) == 368

def chunk_order_rels(group: pd.DataFrame) -> pd.DataFrame: 
    """
    Pair every chunk of one web page with the chunk that follows it.

    Intended for `DataFrame.groupby('webpage_id').apply(...)`: each call
    receives the rows of one page, already in the desired chunk order, and
    pandas combines the per-page results into one frame.

    Args:
        group: Rows of a single page; must contain the columns `webpage_id`
            and `chunk_id`. The frame is not modified.

    Returns:
        A frame with the columns `webpage_id`, `source_chunk_id` and
        `target_chunk_id`, one row per consecutive pair. The last chunk of
        the group has no successor and is left out, so a group with fewer
        than two rows yields an empty frame. `target_chunk_id` is produced
        by a shift, so cast it back to int afterwards if needed.
    """
    # Column selection with a list plus rename() yields a new frame, so the
    # caller's group is never written to.
    pairs = group[['webpage_id', 'chunk_id']].rename(
        columns={'chunk_id': 'source_chunk_id'}
    )

    # Shift the chunk ids up by one row: each row gets the id of its successor.
    pairs['target_chunk_id'] = group['chunk_id'].shift(-1)

    # Exclude the last row as it has no target.
    return pairs.iloc[:-1]

def recast(df: pd.DataFrame, col:str, new_type:object) -> pd.DataFrame: 
    """
    Convert column `col` of `df` to `new_type` and hand the frame back.

    The column is replaced on `df` itself (the frame is modified in place);
    the same object is returned so the function can be used with `.pipe`.
    """
    converted = df[col].astype(new_type)
    df[col] = converted
    return df

# Order all chunks by chunk_order, build the (source, target) pairs page by
# page, flatten the result and cast the target ids to int.
chunks_to_chunks_df = recast(
    (
        chunks_to_chunks_df
        .sort_values('chunk_order', ignore_index=True)
        .groupby('webpage_id')
        .apply(chunk_order_rels)
        .reset_index(drop=True)
    ),
    col='target_chunk_id',
    new_type=int,
)

chunks_to_chunks_df.head(10)

Unnamed: 0,webpage_id,source_chunk_id,target_chunk_id
0,0,0,1
1,0,1,2
2,0,2,3
3,0,3,4
4,0,4,5
5,0,5,6
6,0,6,7
7,0,7,8
8,1,9,10
9,1,10,11


In [45]:
# Create one NEXT_CHUNK relationship per (source, target) pair.
for _, rel in chunks_to_chunks_df.iterrows():
    src_id, dst_id = rel['source_chunk_id'], rel['target_chunk_id']
    predecessor = Chunk.nodes.get(chunk_id=src_id)
    successor = Chunk.nodes.get(chunk_id=dst_id)
    predecessor.next_chunk.connect(successor)

# Manual Driver Queries



## Indexing

In [21]:
import neo4j
import pandas as pd
from typing import List 
from neo4j import GraphDatabase, RoutingControl

def to_pandas(records: List[object]) -> pd.DataFrame:
    """Build a DataFrame from `records`, converting each record with `dict()` first."""
    rows = list(map(dict, records))
    return pd.DataFrame.from_records(rows)

def add_fts_index(driver, label="Chunk", prop="text", db="neo4j"):
    """
    Create a full-text index on one node property if it does not exist yet.

    The index is named `fts_<label>_<prop>`.

    Args:
        driver: Object exposing `execute_query` (a neo4j driver).
        label: Node label to index.
        prop: Property of those nodes to index.
        db: Name of the database to run the statement against.

    Note:
        `label` and `prop` are inserted into the Cypher text as-is, so only
        pass trusted identifiers.
    """
    index_name = "fts_" + label + "_" + prop
    driver.execute_query(
        f"""
            CREATE FULLTEXT INDEX {index_name} IF NOT EXISTS
            FOR (n:{label})
            ON EACH [n.{prop}]
        """,
        database_=db,
    )

def get_indices(driver, db='neo4j'): 
    """Run `SHOW ALL INDEXES` on database `db` (read routing) and return the rows as a DataFrame."""
    result = driver.execute_query(
        "SHOW ALL INDEXES",
        routing_=RoutingControl.READ,
        database_=db,
    )
    # execute_query yields three values; only the records are needed here.
    index_records, _summary, _keys = result
    return to_pandas(index_records)

def get_chunks(driver, limit=20, db="neo4j") -> pd.DataFrame:
    """
    Fetch the text of up to `limit` Chunk nodes.

    Args:
        driver: Object exposing `execute_query` (a neo4j driver).
        limit: Maximum number of chunks to return; passed as a query parameter.
        db: Name of the database to query.

    Returns:
        A DataFrame with one `c.text` column. The query has no ORDER BY, so
        which chunks are returned is up to the database.
    """
    records, _, _ = driver.execute_query(
        # Single literal with a space between the clauses; the previous
        # adjacent literals were joined into "...(c:Chunk)RETURN ...".
        "MATCH (c:Chunk) RETURN c.text limit $limit",
        database_=db,
        limit=limit,
        routing_=RoutingControl.READ,
    )
    return to_pandas(records)

def add_vector_index(
        driver, 
        label="Chunk", 
        prop="embedding", 
        idx_type="cosine", 
        vec_size=768,
        db="neo4j"): 
    """
    Create a vector index on one node property.

    The index is named `vi_<label>_<prop>_<idx_type>` in lower case.

    Args:
        driver: Object exposing `execute_query` (a neo4j driver).
        label: Node label to index.
        prop: Property holding the embedding vector.
        idx_type: Similarity function passed to the procedure.
        vec_size: Dimension of the indexed vectors.
        db: Name of the database to run the procedure against.

    Note:
        NOTE(review): unlike the full-text helper there is no
        "IF NOT EXISTS" guard here, so calling this again for an existing
        index presumably raises - confirm before re-running the cell.
    """
    index_name = ("vi_" + label + "_" + prop + "_" + idx_type).lower()
    # All procedure arguments travel as query parameters instead of being
    # spliced into the Cypher text.
    driver.execute_query(
        """
            CALL db.index.vector.createNodeIndex(
                $index_name,
                $label,
                $prop,
                $vec_size,
                $idx_type
            )
        """,
        parameters_={
            "index_name": index_name,
            "label": label,
            "prop": prop,
            "vec_size": vec_size,
            "idx_type": idx_type,
        },
        database_=db,
    ) 

# Open a driver for the duration of the block: create both indexes first,
# then list all indexes and fetch a sample of chunk texts.
with GraphDatabase.driver(neo4j_url, auth=(neo4j_user, neo4j_password)) as driver:
    add_fts_index(driver)
    add_vector_index(driver)
    index_df = get_indices(driver)
    chunk_df = get_chunks(driver, limit=20)

# Show the index overview (the sampled chunks are in chunk_df).
index_df

Unnamed: 0,id,name,state,populationPercent,type,entityType,labelsOrTypes,properties,indexProvider,owningConstraint,lastRead,readCount
0,3,fts_Chunk_text,POPULATING,21.321962,FULLTEXT,NODE,[Chunk],[text],fulltext-1.0,,,
1,1,index_343aff4e,ONLINE,100.0,LOOKUP,NODE,,,token-lookup-1.0,,2023-11-13T20:38:06.185000000+00:00,569.0
2,2,index_f7700477,ONLINE,100.0,LOOKUP,RELATIONSHIP,,,token-lookup-1.0,,,0.0
3,4,vi_chunk_embedding_cosine,POPULATING,21.321962,VECTOR,NODE,[Chunk],[embedding],vector-1.0,,,


In [45]:
# Preview the chunk texts read back from Neo4j (chunk_df, not the chunks_df loaded from pickle).
chunk_df.head()

Unnamed: 0,c.text
0,## Waarom willen de kinderartsen dat je baby m...
1,## Mogelijke afwijkingen\n\n\n### Aangeboren h...
2,### Infecties\n\n\nVerschillende infecties wor...
3,### Aangeboren darmafwijkingen\n\n\nHet is pas...
4,De kinderarts onderzoekt je baby in normale om...
