# NebulaGraph Property Graph Index

NebulaGraph is an open-source, distributed graph database built for very large-scale graphs with millisecond latency.

If you already have an existing graph, please skip to the end of this notebook.

In [8]:
# Install LlamaIndex, its NebulaGraph graph-store integration, the jupyter-nebulagraph
# extension (used below for the %ngql magic), and nest_asyncio
%pip install llama-index llama-index-graph-stores-nebula jupyter-nebulagraph nest_asyncio

Looking in indexes: https://pypi.tuna.tsinghua.edu.cn/simple
Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 24.0 -> 24.2
[notice] To update, run: python.exe -m pip install --upgrade pip


## Docker Setup

To launch NebulaGraph locally, first ensure you have Docker installed. Then launch the database with the following commands.

```bash
mkdir nebula-docker-compose
cd nebula-docker-compose
curl --output docker-compose.yaml https://raw.githubusercontent.com/vesoft-inc/nebula-docker-compose/master/docker-compose-lite.yaml
docker compose up 
```

After this, you are ready to create your first property graph!

> Other options/details for deploying NebulaGraph can be found in the [docs](https://docs.nebula-graph.io/):
>
> - [ad-hoc cluster in Google Colab](https://docs.nebula-graph.io/master/4.deployment-and-installation/2.compile-and-install-nebula-graph/8.deploy-nebula-graph-with-lite/).
> - [Docker Desktop Extension](https://docs.nebula-graph.io/master/2.quick-start/1.quick-start-workflow/).


In [1]:
import os

# Connection settings. Each value may be overridden by an environment variable of
# the same name, so credentials do not have to be edited into the notebook.
NEBULA_HOST = os.environ.get("NEBULA_HOST", "127.0.0.1")
NEBULA_PORT = int(os.environ.get("NEBULA_PORT", "9669"))
NEBULA_USER = os.environ.get("NEBULA_USER", "root")
# "nebula" is NebulaGraph's default password for the root user
# (the previous literal "nubela" was a typo).
NEBULA_PASSWORD = os.environ.get("NEBULA_PASSWORD", "nebula")
NEBULA_SPACE_NAME = "example"

# Export the settings through the environment. The graph stores created later in
# this notebook are not given credentials explicitly; presumably they read these
# variables -- verify against the llama-index NebulaGraph integration.
os.environ["NEBULA_USER"] = NEBULA_USER
os.environ["NEBULA_PASSWORD"] = NEBULA_PASSWORD
os.environ["NEBULA_ADDRESS"] = f"{NEBULA_HOST}:{NEBULA_PORT}"

In [2]:
# Load the NebulaGraph Jupyter extension, which enables the %ngql magic
%load_ext ngql

# Connect to the NebulaGraph service using the connection settings defined earlier
%ngql --address $NEBULA_HOST --port $NEBULA_PORT --user $NEBULA_USER --password $NEBULA_PASSWORD

# Create the graph space (comparable to a database instance) named by NEBULA_SPACE_NAME,
# if it does not exist yet; vertex IDs use the FIXED_STRING(256) type
%ngql CREATE SPACE IF NOT EXISTS $NEBULA_SPACE_NAME(vid_type=FIXED_STRING(256));

[1;3;38;2;0;135;107m[OK] Connection Pool Created[0m


In [3]:
# List the graph spaces on the server to confirm that the new space is present
%ngql SHOW SPACES;

Unnamed: 0,Name
0,NL2SQL
1,example
2,phillies_rag


In [8]:
# Switch to the graph space (similar to "USE database" in MySQL).
# Space creation is asynchronous, so this statement can fail right after CREATE SPACE;
# if it does, wait a few seconds and re-run this cell.
%ngql USE $NEBULA_SPACE_NAME;

In [9]:
# Tag (entity type) names
TAG_TABLE_NAME = "table"
TAG_COLUMN_NAME = "column"

# Edge type names
EDGE_INCLUDE_NAME = "include"
EDGE_FOREIGN_NAME = "foreign_key"

# Tag index names
INDEX_TABLE_NAME = "table_index"
INDEX_COLUMN_NAME = "column_index"

In [10]:
# create tags
%ngql CREATE TAG IF NOT EXISTS $TAG_TABLE_NAME(name string)
%ngql CREATE TAG IF NOT EXISTS $TAG_COLUMN_NAME(name string)

# create edges
%ngql CREATE EDGE IF NOT EXISTS $EDGE_INCLUDE_NAME(label string)
%ngql CREATE EDGE IF NOT EXISTS $EDGE_FOREIGN_NAME(label string)

# create index
%ngql CREATE TAG INDEX IF NOT EXISTS $INDEX_TABLE_NAME ON table(name(256))
%ngql CREATE TAG INDEX IF NOT EXISTS $INDEX_COLUMN_NAME ON column(name(256))

## Env Setup

We need just a few environment settings to get started.

In [11]:
import nest_asyncio

# Allow nested asyncio event loops, since the notebook kernel already runs one
nest_asyncio.apply()

In [12]:
import os

# Keep a key that is already exported in the environment; only fall back to the
# placeholder (replace it with a real key) when none is set. Assigning
# unconditionally would overwrite a valid key with the placeholder.
os.environ.setdefault("OPENAI_API_KEY", "sk-xxx")

In [13]:
from llama_index.core import Settings
from llama_index.llms.openai import OpenAI
from llama_index.embeddings.openai import OpenAIEmbedding

# The incomplete `from llama_index.embeddings.huggingface import` statement was
# removed: it was a SyntaxError and nothing from that module is used in this notebook.

# LLM reached through an OpenAI-compatible endpoint.
# NOTE(review): api_base is normally the API root (e.g. ".../v1"); confirm that this
# gateway really expects the full "/chat/completions" path.
Settings.llm = OpenAI(
    model="qwen",
    temperature=0.3,
    api_base="https://xxx.com/v1/chat/completions",
    default_headers={"csb-token": "xxx"},  # presumably a gateway auth token -- replace with a real value
    additional_kwargs={"top_k": 5},
)

# Embedding model used when the index generates embeddings
Settings.embed_model = OpenAIEmbedding(model_name="text-embedding-3-small")

In [14]:
from llama_index.core.storage import StorageContext
from llama_index.graph_stores.nebula import NebulaGraphStore

# Schema elements the graph store works with
tags = [TAG_TABLE_NAME, TAG_COLUMN_NAME]
edge_types = [EDGE_INCLUDE_NAME, EDGE_FOREIGN_NAME]

# Gather the keyword arguments first so the tag/edge pairing is easy to read:
# one property name is listed per tag ("name") and per edge type ("label").
graph_store_kwargs = dict(
    space_name=NEBULA_SPACE_NAME,
    tags=tags,
    tag_prop_names=["name", "name"],
    edge_types=edge_types,
    rel_prop_names=["label", "label"],
)
graph_store = NebulaGraphStore(**graph_store_kwargs)

In [15]:
from llama_index.graph_stores.nebula import NebulaPropertyGraphStore

# Property graph store bound to the graph space named by NEBULA_SPACE_NAME
property_graph_store = NebulaPropertyGraphStore(space=NEBULA_SPACE_NAME)

In [16]:
from llama_index.core import SimpleDirectoryReader

# Read the two CSV files; exactly two documents are expected (asserted below)
source_files = ["data/all_tables.csv", "data/relation_ship.csv"]
documents = SimpleDirectoryReader(input_files=source_files).load_data()

assert len(documents) == 2

# The first document is printed as the table list, the second as the foreign-key list
tables_text = documents[0].get_content()
foreign_keys_text = documents[1].get_content()
separator = "-" * 20

print(f"tables: \n{tables_text}")
print(separator)
print(f"foreign_keys: \n{foreign_keys_text}")

tables: 
cars_data
car_names
model_list
car_makers
--------------------
foreign_keys: 
cars_data, id,  car_names,  model
car_names,  model,  model_list,  model_id
model_list,  model_id,  car_makers,  id


In [17]:
from llama_index.core.indices import load_index_from_storage
from llama_index.core.indices.property_graph import PropertyGraphIndex
from transformation.triplet_extractor import CustomizeTripletExtractor
from transformation.text_spliter import LineTextSpliter

index_persist_dir = "storage_graph"

# Try to reuse a previously persisted index before paying for a rebuild.
# - The property graph store is passed on the load path too, matching the storage
#   context used when the index is built below.
# - No index_id is passed: load_index_from_storage expects a single string id (or
#   None when the storage holds exactly one index). The NebulaGraph tag-index names
#   are not LlamaIndex index ids, and passing them as a list made every load fail.
try:
    storage_context = StorageContext.from_defaults(
        persist_dir=index_persist_dir,
        graph_store=graph_store,
        property_graph_store=property_graph_store,
    )
    pg_index = load_index_from_storage(storage_context=storage_context)
    index_loaded = True
except Exception as exc:
    # Best effort: any load failure (e.g. nothing persisted yet) falls back to a
    # rebuild, but the reason is reported instead of being silently swallowed.
    # `Exception` rather than a bare `except` lets KeyboardInterrupt propagate.
    print(f"Could not load index from {index_persist_dir!r} ({exc!r}); rebuilding it.")
    index_loaded = False

if not index_loaded:
    storage_context = StorageContext.from_defaults(graph_store=graph_store, property_graph_store=property_graph_store)

    # Build the property graph from the documents using the project's custom
    # line splitter and triplet extractor.
    pg_index = PropertyGraphIndex.from_documents(
        documents=documents,
        storage_context=storage_context,
        show_progress=True,
        use_async=False,
        transformations=[LineTextSpliter()],
        kg_extractors=[
            CustomizeTripletExtractor(directory="data", rel_column=EDGE_INCLUDE_NAME, rel_foreign_key=EDGE_FOREIGN_NAME)
        ],
    )

    # Persist so that the next run can take the load path above
    pg_index.storage_context.persist(persist_dir=index_persist_dir)

Parsing nodes:   0%|          | 0/2 [00:00<?, ?it/s]

Extracting triplets:   0%|          | 0/7 [00:00<?, ?it/s]

text: cars_data


Extracting triplets:  14%|█▍        | 1/7 [01:03<06:18, 63.15s/it]

text: car_names
text: model_list
text: car_makers
text: cars_data, id,  car_names,  model
t1: cars_data, c1: id, t2: car_names, c2: model


Extracting triplets:  71%|███████▏  | 5/7 [01:14<00:23, 11.90s/it]

text: car_names,  model,  model_list,  model_id
t1: car_names, c1: model, t2: model_list, c2: model_id


Extracting triplets:  86%|████████▌ | 6/7 [01:17<00:09,  9.65s/it]

text: model_list,  model_id,  car_makers,  id
t1: model_list, c1: model_id, t2: car_makers, c2: id


Extracting triplets: 100%|██████████| 7/7 [01:21<00:00, 11.70s/it]


Generating embeddings:   0%|          | 0/7 [00:00<?, ?it/s]

Generating embeddings:   0%|          | 0/42 [00:00<?, ?it/s]

Now that the graph is created, we can explore it with [jupyter-nebulagraph](https://github.com/wey-gu/jupyter_nebulagraph)

In [18]:
# List the tags defined in the current graph space
%ngql SHOW TAGS

Unnamed: 0,Name
0,Chunk__
1,Entity__
2,Node__
3,Props__
4,column
5,table


In [19]:
# List the edge types defined in the current graph space
%ngql SHOW EDGES

Unnamed: 0,Name
0,Relation__
1,__meta__node_label__
2,__meta__rel_label__
3,foreign_key
4,include


In [20]:
# Show up to 15 Entity__ -> Entity__ relations as (src, relation, dest) rows
%ngql MATCH p=(v:Entity__)-[r]->(t:Entity__) RETURN v.Entity__.name AS src, r.label AS relation, t.Entity__.name AS dest LIMIT 15;

Unnamed: 0,src,relation,dest
0,cars_data,include,cars_data::accelerate
1,cars_data,include,cars_data::cylinders
2,cars_data,include,cars_data::edispl
3,cars_data,include,cars_data::horsepower
4,cars_data,include,cars_data::id
5,cars_data,include,cars_data::mpg
6,cars_data,include,cars_data::weight
7,cars_data,include,cars_data::year
8,model_list::model_id,foreign_key,car_makers::id
9,car_names::model,foreign_key,model_list::model_id


In [21]:
# Return up to two complete paths, including all tag and edge properties
%ngql MATCH p=(v:Entity__)-[r]->(t:Entity__) RETURN p LIMIT 2;

Unnamed: 0,p
0,"(""cars_data"" :Node__{label: ""table""} :Props__{_node_content: __NULL__, _node_type: __NULL__, creation_date: __NULL__, doc_id: __NULL__, document_id: __NULL__, file_name: __NULL__, file_path: __NULL__, file_size: __NULL__, file_type: __NULL__, last_modified_date: __NULL__, ref_doc_id: __NULL__, triplet_source_id: ""817a463b-4f21-4ddf-8ee9-4dc2a223700a""} :Entity__{name: ""cars_data""})-[:Relation__@0{label: ""include"", file_path: __NULL__, file_name: __NULL__, file_type: __NULL__, file_size: __NULL__, _node_type: __NULL__, creation_date: __NULL__, document_id: __NULL__, last_modified_date: __NULL__, doc_id: __NULL__, _node_content: __NULL__, ref_doc_id: __NULL__, triplet_source_id: ""817a463b-4f21-4ddf-8ee9-4dc2a223700a""}]->(""cars_data::accelerate"" :Node__{label: ""column""} :Props__{_node_content: __NULL__, _node_type: __NULL__, creation_date: __NULL__, doc_id: __NULL__, document_id: __NULL__, file_name: __NULL__, file_path: __NULL__, file_size: __NULL__, file_type: __NULL__, last_modified_date: __NULL__, ref_doc_id: __NULL__, triplet_source_id: ""817a463b-4f21-4ddf-8ee9-4dc2a223700a""} :Entity__{name: ""cars_data::accelerate""})"
1,"(""cars_data"" :Node__{label: ""table""} :Props__{_node_content: __NULL__, _node_type: __NULL__, creation_date: __NULL__, doc_id: __NULL__, document_id: __NULL__, file_name: __NULL__, file_path: __NULL__, file_size: __NULL__, file_type: __NULL__, last_modified_date: __NULL__, ref_doc_id: __NULL__, triplet_source_id: ""817a463b-4f21-4ddf-8ee9-4dc2a223700a""} :Entity__{name: ""cars_data""})-[:Relation__@0{label: ""include"", file_path: __NULL__, file_name: __NULL__, file_type: __NULL__, file_size: __NULL__, _node_type: __NULL__, creation_date: __NULL__, document_id: __NULL__, last_modified_date: __NULL__, doc_id: __NULL__, _node_content: __NULL__, ref_doc_id: __NULL__, triplet_source_id: ""817a463b-4f21-4ddf-8ee9-4dc2a223700a""}]->(""cars_data::cylinders"" :Node__{label: ""column""} :Props__{_node_content: __NULL__, _node_type: __NULL__, creation_date: __NULL__, doc_id: __NULL__, document_id: __NULL__, file_name: __NULL__, file_path: __NULL__, file_size: __NULL__, file_type: __NULL__, last_modified_date: __NULL__, ref_doc_id: __NULL__, triplet_source_id: ""817a463b-4f21-4ddf-8ee9-4dc2a223700a""} :Entity__{name: ""cars_data::cylinders""})"


## Querying and Retrieval

In [22]:
# include_text controls whether source text is included in the returned nodes
# (default True); it is switched off here.
retriever = pg_index.as_retriever(include_text=False)

question = "what property cars_data include"
nodes = retriever.retrieve(question)

for node in nodes:
    print(node.text)

cars_data::mpg ({'triplet_source_id': '817a463b-4f21-4ddf-8ee9-4dc2a223700a'}) -> include -> cars_data ({'triplet_source_id': '817a463b-4f21-4ddf-8ee9-4dc2a223700a'})
cars_data::id ({'triplet_source_id': '05fee051-5cd2-46d3-b635-570f68b27bf0'}) -> include -> cars_data ({'triplet_source_id': '817a463b-4f21-4ddf-8ee9-4dc2a223700a'})
cars_data ({'triplet_source_id': '817a463b-4f21-4ddf-8ee9-4dc2a223700a'}) -> include -> cars_data::accelerate ({'triplet_source_id': '817a463b-4f21-4ddf-8ee9-4dc2a223700a'})
cars_data ({'triplet_source_id': '817a463b-4f21-4ddf-8ee9-4dc2a223700a'}) -> include -> cars_data::cylinders ({'triplet_source_id': '817a463b-4f21-4ddf-8ee9-4dc2a223700a'})
cars_data ({'triplet_source_id': '817a463b-4f21-4ddf-8ee9-4dc2a223700a'}) -> include -> cars_data::edispl ({'triplet_source_id': '817a463b-4f21-4ddf-8ee9-4dc2a223700a'})
cars_data ({'triplet_source_id': '817a463b-4f21-4ddf-8ee9-4dc2a223700a'}) -> include -> cars_data::horsepower ({'triplet_source_id': '817a463b-4f21-4d

In [23]:
# Query engine over the index, this time with include_text enabled
query_engine = pg_index.as_query_engine(include_text=True)
response = query_engine.query("what property cars_data include?")

# print() applies str() to its argument, so no explicit conversion is needed
print(response)

cars_data includes the following properties: mpg, accelerate, cylinders, edispl, horsepower, year, and weight.
