# GraphRAG with Azure OpenAI and NEO4J - Wikipedia example

https://console.neo4j.io/

<img src="img/neo4J_vectors.jpg">

In [1]:
#%pip install neo4j

In [27]:
import datetime
import openai
import os
import sys
import time

from dotenv import load_dotenv
from langchain.document_loaders import WikipediaLoader
from langchain.text_splitter import CharacterTextSplitter
from langchain.vectorstores import Neo4jVector
from langchain_openai import AzureOpenAIEmbeddings
from neo4j import GraphDatabase

In [3]:
# Show the interpreter and OpenAI SDK versions used for this run
print("Python version:", sys.version)
print("OpenAI version:", openai.__version__)

Python version: 3.10.11 (main, May 16 2023, 00:28:57) [GCC 11.2.0]
OpenAI version: 1.35.1


## Settings

In [4]:
# Load the settings stored in the local "azure.env" file
load_dotenv("azure.env")

# Neo4j connection details (each one is None when the variable is not defined)
NEO4J_URI = os.environ.get("NEO4J_URI")
NEO4J_USERNAME = os.environ.get("NEO4J_USERNAME")
NEO4J_PASSWORD = os.environ.get("NEO4J_PASSWORD")

# Azure OpenAI endpoint, key and API version
azure_endpoint = os.environ.get("AOAI_ENDPOINT")
api_key = os.environ.get("AOAI_KEY")
api_version = os.environ.get("AOAI_VERSION")

In [5]:
# Expose the Neo4j settings through os.environ. A missing setting is reported
# by name: assigning None to os.environ would otherwise fail with an unhelpful
# "str expected, not NoneType" TypeError.
for _name, _value in (
    ("NEO4J_URI", NEO4J_URI),
    ("NEO4J_USERNAME", NEO4J_USERNAME),
    ("NEO4J_PASSWORD", NEO4J_PASSWORD),
):
    if _value is None:
        raise EnvironmentError(
            f"{_name} is not set; define it in azure.env or in the environment"
        )
    os.environ[_name] = _value

In [6]:
# Passed as `model=` when the AzureOpenAIEmbeddings client is created
embedding_model = "text-embedding-ada-002"

# NOTE(review): neither of these two names is used by the cells shown in this
# notebook; presumably they are kept for a chat-completion step — confirm.
model = azure_deployment = "gpt-4o"

## Helper functions

In [7]:
def get_wikipedia_data(query: str):
    """
    Load the Wikipedia documents matching a search query.

    Args:
        query: Search text handed to ``WikipediaLoader``.

    Returns:
        Whatever ``WikipediaLoader(query=query).load()`` returns.
    """
    print("Get wikipedia data using query:", query)
    loader = WikipediaLoader(query=query)
    wikipedia_docs = loader.load()
    print("Done")

    return wikipedia_docs

In [8]:
def process_data(docs, chunk_size=1000, chunk_overlap=40):
    """
    Split documents into chunks and drop their "summary" metadata entry.

    Args:
        docs: Documents to split (for example the output of
            get_wikipedia_data).
        chunk_size: Target chunk size passed to the tiktoken-based splitter.
        chunk_overlap: Overlap between consecutive chunks passed to the
            splitter.

    Returns:
        The chunked documents returned by the splitter, none of which carries
        a "summary" metadata entry.
    """
    print("Chunking the data...")

    text_splitter = CharacterTextSplitter.from_tiktoken_encoder(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )
    documents = text_splitter.split_documents(docs)

    # NOTE(review): presumably the summary is removed so it is not stored
    # alongside every chunk — confirm.
    for d in documents:
        # pop() rather than del: a chunk without a "summary" entry must not
        # abort the whole run with a KeyError.
        d.metadata.pop("summary", None)

    print("Done")

    return documents

In [9]:
def store_data_neo4j(documents):
    """
    Store the documents in Neo4j through ``Neo4jVector.from_documents``.

    Uses the notebook-level ``embeddings`` client and the NEO4J_* connection
    settings. Nothing is returned; progress is printed.
    """
    print("Storing data to NEO4J...")

    connection = {
        "url": NEO4J_URI,
        "username": NEO4J_USERNAME,
        "password": NEO4J_PASSWORD,
    }
    Neo4jVector.from_documents(documents, embeddings, **connection)

    print("Done")

In [10]:
def init_neo4j_vector(index_name):
    """
    Build a Neo4jVector from an index that already exists in the database.

    Args:
        index_name: Name of the existing vector index.

    Returns:
        The object returned by ``Neo4jVector.from_existing_index``.
    """
    connection = dict(
        url=NEO4J_URI,
        username=NEO4J_USERNAME,
        password=NEO4J_PASSWORD,
    )
    return Neo4jVector.from_existing_index(
        embeddings,
        index_name=index_name,
        **connection,
    )

In [37]:
def ask_neo4j_vector(index_name, query):
    """
    Run a similarity search on a Neo4j vector index and print the best match.

    Prints the query, the text of the top-ranked result and the elapsed time
    in milliseconds. Errors are reported on stdout instead of being raised.

    Args:
        index_name: Name of the existing Neo4j vector index to query.
        query: Question to search for.

    Returns:
        None.
    """

    start_time = datetime.datetime.now()
    print("\033[1;31;35m")
    print(f"Your query: {query}")
    print("\033[1;31;34m")

    try:
        neo4j_vector = init_neo4j_vector(index_name)
        try:
            results = similarity_search(neo4j_vector, query)
        finally:
            # Close the driver even when the search fails, so a failed query
            # does not leave a connection open. `_driver` is a private
            # attribute of the vector store.
            neo4j_vector._driver.close()

        if not results:
            # Report an empty result explicitly instead of letting
            # results[0] fail with "list index out of range".
            print("[ERROR] No results returned for this query")
            return

        print(results[0].page_content)
        time_difference = datetime.datetime.now() - start_time
        time_difference_ms = round(time_difference.total_seconds() * 1000)

        print("\033[1;31;32m")
        print(f"Powered by Azure OpenAI & NEO4J. Done in {time_difference_ms} ms")

    except Exception as e:
        print(f"[ERROR] {e}")

In [12]:
def similarity_search(neo4j_vector, query):
    """
    Perform a vector similarity search.

    Args:
        neo4j_vector: Object exposing ``similarity_search(query)``.
        query: Text to search for.

    Returns:
        The result of ``neo4j_vector.similarity_search(query)``, or an empty
        list when the search raised (the error is printed).
    """
    try:
        return neo4j_vector.similarity_search(query)

    except Exception as e:
        print(f"[ERROR] {e}")
        # Hand back an empty list so the caller always receives a sequence;
        # there is no result variable to return after a failed search.
        return []

## Embeddings

In [13]:
# Embedding client shared by the Neo4j helper functions in this notebook
embeddings = AzureOpenAIEmbeddings(
    azure_endpoint=azure_endpoint,
    api_key=api_key,
    openai_api_version=api_version,
    model=embedding_model,
)

In [14]:
# Name of the Neo4j vector index to query. store_data_neo4j() does not pass an
# index name, so this presumably matches the name Neo4jVector assigns by
# default — TODO confirm.
index = "vector"

## Example

### Get wikipedia data

In [15]:
# Search text handed to the Wikipedia loader
wikipedia_query = "OpenAI"

In [16]:
# Load the Wikipedia documents for the query
docs = get_wikipedia_data(wikipedia_query)

Get wikipedia data using query: OpenAI
Done


### Processing data

In [17]:
# Split the documents into chunks; process_data also removes the "summary" metadata
processed_docs = process_data(docs)

Chunking the data...
Done


### Storing data into NEO4J

In [18]:
# Write the chunks to Neo4j.
# NOTE(review): re-running this cell presumably stores the same chunks again — confirm.
store_data_neo4j(processed_docs)

Storing data to NEO4J...




Done


### Testing

We have our embedded vectors in NEO4J:
<img src="img/neo4J_vectors.jpg">

In [38]:
# The question does not name the company; the answer comes from the indexed
# "OpenAI" Wikipedia documents. Only the top-ranked chunk is printed.
query = "What is the CEO of this company?"

ask_neo4j_vector(index, query)

[1;31;35m
Your query: What is the CEO of this company?
[1;31;34m
Samuel Harris Altman (born April 22, 1985) is an American entrepreneur and investor best known as the CEO of OpenAI since 2019 (he was briefly fired and reinstated in November 2023). He is also the chairman of clean energy companies Oklo Inc. and Helion Energy. Altman is considered to be one of the leading figures of the AI boom. He dropped out of Stanford University after two years and founded Loopt, a mobile social networking service, raising more than $30 million in venture capital. In 2011, Altman joined Y Combinator, a startup accelerator, and was its president from 2014 to 2019.


== Early life and education ==
Altman was born on April 22, 1985, in Chicago, Illinois, into a Jewish family, and grew up in St. Louis, Missouri. His mother is a dermatologist, while his father was a real estate broker. Altman is the eldest of four siblings. At the age of eight, he received his first computer, an Apple Macintosh, and beg

In [39]:
# General question about the organization; only the top-ranked chunk is printed
query = "What is openAI?"

ask_neo4j_vector(index, query)

[1;31;35m
Your query: What is openAI?
[1;31;34m
OpenAI is an American artificial intelligence (AI) research organization founded in December 2015 and headquartered in San Francisco. Its mission is to develop "safe and beneficial" artificial general intelligence, which it defines as "highly autonomous systems that outperform humans at most economically valuable work". As a leading organization in the ongoing AI boom, OpenAI is known for the GPT family of large language models, the DALL-E series of text-to-image models, and a text-to-video model named Sora. Its release of ChatGPT in November 2022 has been credited with catalyzing widespread interest in generative AI.
The organization consists of the non-profit OpenAI, Inc. registered in Delaware and its for-profit subsidiary OpenAI Global, LLC. Microsoft owns roughly 49% of OpenAI's equity, having invested US$13 billion. It also provides computing resources to OpenAI through its Microsoft Azure cloud platform.
In 2023 and 2024, OpenAI 

In [40]:
# Question about a specific model; in the recorded output the top hit is the
# opening of the GPT-4 article
query = "What can you tell about GPT-4?"

ask_neo4j_vector(index, query)

[1;31;35m
Your query: What can you tell about GPT-4?
[1;31;34m
Generative Pre-trained Transformer 4 (GPT-4) is a multimodal large language model created by OpenAI, and the fourth in its series of GPT foundation models. It was launched on March 14, 2023, and made publicly available via the paid chatbot product ChatGPT Plus, via OpenAI's API, and via the free chatbot Microsoft Copilot.  As a transformer-based model, GPT-4 uses a paradigm where pre-training using both public data and "data licensed from third-party providers" is used to predict the next token. After this step, the model was then fine-tuned with reinforcement learning feedback from humans and AI for human alignment and policy compliance.: 2 
Observers reported that the iteration of ChatGPT using GPT-4 was an improvement on the previous iteration based on GPT-3.5, with the caveat that GPT-4 retains some of the problems with earlier revisions. GPT-4, equipped with vision capabilities (GPT-4V), is capable of taking images a

In [41]:
# In the recorded output this returns the same top chunk as the
# "What is openAI?" query, since only the single best match is printed
query = "What can you tell about Microsoft and OpenAI?"

ask_neo4j_vector(index, query)

[1;31;35m
Your query: What can you tell about Microsoft and OpenAI?
[1;31;34m
OpenAI is an American artificial intelligence (AI) research organization founded in December 2015 and headquartered in San Francisco. Its mission is to develop "safe and beneficial" artificial general intelligence, which it defines as "highly autonomous systems that outperform humans at most economically valuable work". As a leading organization in the ongoing AI boom, OpenAI is known for the GPT family of large language models, the DALL-E series of text-to-image models, and a text-to-video model named Sora. Its release of ChatGPT in November 2022 has been credited with catalyzing widespread interest in generative AI.
The organization consists of the non-profit OpenAI, Inc. registered in Delaware and its for-profit subsidiary OpenAI Global, LLC. Microsoft owns roughly 49% of OpenAI's equity, having invested US$13 billion. It also provides computing resources to OpenAI through its Microsoft Azure cloud platf