#### Limitations of vector RAG 
1. **Themes and relationships** - Document embeddings capture semantic meaning but struggle to capture themes and relationships between entities across the document corpus.
2. **Scalability** - As the volume of the database grows, retrieval becomes less efficient because the computational load increases with the search space.
3. **Diverse data** - Structured and heterogeneous data are harder to embed.

In [1]:
import os 
from langchain_openai import OpenAIEmbeddings
from langchain_experimental.text_splitter import SemanticChunker
from langchain_core.documents import Document 
from langchain_community.document_loaders import PyPDFLoader
from langchain_community.retrievers import BM25Retriever
from langchain.prompts import ChatPromptTemplate
from langchain_openai import ChatOpenAI 
from langchain_core.runnables import RunnablePassthrough
from langchain_core.output_parsers import StrOutputParser
from langchain.prompts import PromptTemplate
from langsmith.evaluation import LangChainStringEvaluator
from ragas.integrations.langchain import EvaluatorChain #!pip install ragas==0.1.22
from ragas.metrics import faithfulness, ContextPrecision
import nest_asyncio 
nest_asyncio.apply()

  from pandas.core.computation.check import NUMEXPR_INSTALLED


In [None]:
# Load the pilot manual, split it into semantically coherent chunks and
# index those chunks with a BM25 retriever that returns the top 5 matches.
pdf_loader = PyPDFLoader(file_path='../../../data/pilot-manual.pdf')

# The embedding model is handed to the splitter, which uses it to decide
# where one chunk ends and the next begins.
openai_api_key = os.getenv('OPENAI_API_KEY')
embeddings = OpenAIEmbeddings(
    api_key=openai_api_key,
    model='text-embedding-3-small',
)
# NOTE(review): if the splitter's threshold type is the percentile default,
# 0.8 means the 0.8th percentile (not the 80th) - confirm this is intended.
semantic_splitter = SemanticChunker(
    embeddings=embeddings,
    breakpoint_threshold_amount=0.8,
)

# `documents` is reused by the graph-building cells further down.
documents = pdf_loader.load()
chunks = semantic_splitter.split_documents(documents)

bm25_retriever = BM25Retriever.from_documents(documents=chunks, k=5)


ValueError: File path ../../../data/pilot-manual-787.pdf is not a valid file or url

#### Document to graph

#### create nodes and edges data structure

In [3]:
# Turn the loaded pages into graph documents (nodes + relationships) with an LLM.
# `documents` comes from the PDF-loading cell above.
import os

from langchain_experimental.graph_transformers import LLMGraphTransformer
from langchain_openai import ChatOpenAI

openai_api_key = os.getenv("OPENAI_API_KEY")

# temperature=0 is used for the extraction model.
llm = ChatOpenAI(
    api_key=openai_api_key,
    temperature=0,
    model_name="gpt-4o-mini",
)
llm_transformer = LLMGraphTransformer(llm=llm)

# One GraphDocument is produced per input document; each carries its nodes,
# relationships and the source page it was extracted from.
graph_documents = llm_transformer.convert_to_graph_documents(documents)
print(graph_documents)

[GraphDocument(nodes=[Node(id='Pmdg 737', type='Aircraft'), Node(id='Tutorial Flight', type='Flight'), Node(id='Pmdg Simulations', type='Company'), Node(id='July 2022', type='Date')], relationships=[Relationship(source=Node(id='Pmdg 737', type='Aircraft'), target=Node(id='Tutorial Flight', type='Flight'), type='HAS_TUTORIAL'), Relationship(source=Node(id='Pmdg Simulations', type='Company'), target=Node(id='Pmdg 737', type='Aircraft'), type='DEVELOPER'), Relationship(source=Node(id='July 2022', type='Date'), target=Node(id='Tutorial Flight', type='Flight'), type='PUBLISHED_IN')], source=Document(metadata={'source': '../../../data/pilot-manual-787.pdf', 'page': 0}, page_content=' PMDG 7 37 0.00.1 \n  TUTORIAL FLIGHT  \n \nFor Simulator Use Only  DO NOT DUPLICATE  JULY 2022  \n \n \n \n \n \nPMDG 737 \n \n \nTutorial Flight  \n \nCopyright © 2011-2022 \nPMDG Simulations  \nAll Rights Reserved  \n   \n')), GraphDocument(nodes=[Node(id='Pmdg 737', type='Simulator'), Node(id='Microsoft Fligh

#### render the graph using neo4j

In [5]:
# Extract a graph from the first three pages and persist it in a local Neo4j
# instance, then print the resulting schema.
import os
from langchain_community.graphs import Neo4jGraph
from langchain_experimental.graph_transformers import LLMGraphTransformer
from langchain_openai import ChatOpenAI

openai_api_key = os.getenv("OPENAI_API_KEY")

# The database password is read from the environment so that no credential
# is stored in the notebook itself.
pwd = os.getenv("NEO4J_PASSWORD")
if not pwd:
    raise RuntimeError(
        "NEO4J_PASSWORD is not set; export it before running this cell."
    )
graph = Neo4jGraph(url="bolt://localhost:7687", username="neo4j", password=pwd)

# generation of graph from the text using LLM
# Only the first three documents are converted here.
llm = ChatOpenAI(api_key=openai_api_key, temperature=0, model_name="gpt-4o-mini")
llm_transformer = LLMGraphTransformer(llm=llm)
graph_documents = llm_transformer.convert_to_graph_documents(documents[:3])

# adding the graph into the Neo4j database
# include_source=True also stores the source Document nodes (visible in the
# printed schema as (:Document)-[:MENTIONS]->(...) relationships).
graph.add_graph_documents(
    graph_documents=graph_documents,
    include_source=True,
    baseEntityLabel=True
)

# Re-read the schema from the database after the insert, then show it.
graph.refresh_schema()
schema = graph.get_schema
print(schema)

Node properties:
Document {id: STRING, text: STRING, source: STRING, page: INTEGER}
Aircraft {id: STRING}
Flight {id: STRING}
Company {id: STRING}
Date {id: STRING}
Simulator {id: STRING}
Copyright {id: STRING}
Software {id: STRING}
Organization {id: STRING}
Relationship properties:

The relationships:
(:Document)-[:MENTIONS]->(:Aircraft)
(:Document)-[:MENTIONS]->(:Simulator)
(:Document)-[:MENTIONS]->(:Flight)
(:Document)-[:MENTIONS]->(:Company)
(:Document)-[:MENTIONS]->(:Date)
(:Document)-[:MENTIONS]->(:Copyright)
(:Document)-[:MENTIONS]->(:Document)
(:Document)-[:MENTIONS]->(:Software)
(:Document)-[:MENTIONS]->(:Organization)
(:Aircraft)-[:HAS_TUTORIAL]->(:Flight)
(:Aircraft)-[:USED_WITH]->(:Simulator)
(:Company)-[:HOLD]->(:Copyright)
(:Company)-[:DEVELOPER]->(:Aircraft)
(:Company)-[:DEVELOPER]->(:Simulator)
(:Company)-[:SUPPLIER_OF]->(:Company)
(:Company)-[:LICENSED_TO]->(:Aircraft)
(:Company)-[:LICENSED_TO]->(:Simulator)
(:Company)-[:OWNS]->(:Simulator)
(:Company)-[:GRANTS_LICENSE]

#### Querying the Graph 

![](images/cipher.png)

In [22]:
# Run a hand-written Cypher query: find the Organization node that the
# "Gpt-3" Model node points to through a DEVELOPED_BY relationship.
results = graph.query("""
MATCH (m:Model {id:"Gpt-3"})-[:DEVELOPED_BY]->(org:Organization)
RETURN org
""")
print(results)

# `results` is a list of rows; each row is a dict keyed by the RETURN alias
# ("org"), whose value holds the node's properties. Index 0 takes the first
# row - the recorded run returned exactly one.
print(results[0]['org']['id'])

[{'org': {'id': 'Openai'}}]
Openai


```cypher
MATCH (p:Person)-[:KNOWN_FOR]->(c:Concept {id: 'Theory of Relativity'})
RETURN p
```
**Explanation:**
- ```(p:Person)```: Matches nodes labeled Person.
- ```[:KNOWN_FOR]```: Matches the relationship indicating the person is known for a particular concept.
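- ```(c:Concept {id: 'Theory of Relativity'})```: Matches nodes labeled Concept with the id property set to "Theory of Relativity". The comparison is case-sensitive, so the value must match the stored id exactly (for example `Theory Of Relativity` if that is how the node was created).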
- ```RETURN p```: Returns the Person node(s) that match this condition.

**Code for retrieving the scientist known for the Theory of Relativity**
```python   
# Print the graph schema
print(graph.get_schema)

# Query the graph
results = graph.query("""
MATCH (relativity:Concept {id: "Theory Of Relativity"}) <-[:KNOWN_FOR]- (scientist:Person)
return scientist
""")

print(results[0])
```


![](images/graphical-rag-arch.png)
![](images/graph-cypher-qa-chain.png)

Steps - 
1. Receive a user input
2. Use LLM to convert the user input to Cypher query
3. Query the graph database using the Cypher query
4. Use LLM to convert the retrieved graph document into natural language. 
5. Return the natural language response to the user. 

In [None]:
# Let the LLM write the Cypher query instead of writing it by hand.
# Baseline attempt with default chain settings. In the recorded run this did
# NOT work: the generated query used the wrong node label and relationship
# direction, the graph lookup returned an empty context, and the chain
# answered "I don't know the answer."
from langchain_community.chains.graph_qa.cypher import GraphCypherQAChain

chain = GraphCypherQAChain.from_llm(
    llm=ChatOpenAI(
        api_key=openai_api_key,
        temperature=0,
        model_name="gpt-4o-mini",
    ),
    graph=graph,
    verbose=True,
)

result = chain.invoke({"query":"Which organization developed Gpt-3?"})
print(result)




[1m> Entering new GraphCypherQAChain chain...[0m
Generated Cypher:
[32;1m[1;3mcypher
MATCH (o:Organization)-[:DEVELOPED_BY]->(c:Concept {id: 'Gpt-3'})
RETURN o
[0m
Full Context:
[32;1m[1;3m[][0m

[1m> Finished chain.[0m
{'query': 'Which organization developed Gpt-3?', 'result': "I don't know the answer."}


#### Improving user-input-to-Cypher query generation
1. Filter graph schema
2. Validate cypher query 
3. Few shot prompting 

In [29]:
# Second attempt at automated Cypher generation.
# - exclude_types hides the listed node labels from the schema shown to the LLM.
#   On its own, filtering did not help.
# - validate_cypher=True is what helped: in the recorded run the generated
#   query had the correct relationship direction and returned 'Openai'.
chain = GraphCypherQAChain.from_llm(
    llm=ChatOpenAI(api_key=openai_api_key, temperature=0, model_name="gpt-3.5-turbo"),
    graph=graph,
    verbose=True,
    # "Timeperiod" must match the schema label exactly; a misspelt label is
    # silently ignored and therefore never excluded.
    exclude_types=[
        "Document", "Concept", "Date", "Timeperiod", "Dataset", "Event",
        "Technology", "Paper", "Person", "Leaderboard", "Architecture",
    ],
    validate_cypher=True,
)
result = chain.invoke({"query":"Which organization developed Gpt-3?"})
print(result)

# refresh_schema is a method: it has to be called, otherwise the statement is
# a no-op and the schema printed below may be stale.
graph.refresh_schema()
print(graph.get_schema)



[1m> Entering new GraphCypherQAChain chain...[0m
Generated Cypher:
[32;1m[1;3mMATCH (m:Model {id: 'Gpt-3'})-[:DEVELOPED_BY]->(o:Organization)
RETURN o;[0m
Full Context:
[32;1m[1;3m[{'o': {'id': 'Openai'}}][0m

[1m> Finished chain.[0m
{'query': 'Which organization developed Gpt-3?', 'result': 'Openai'}
Node properties:
Document {id: STRING, text: STRING, summary: STRING, source: STRING, title: STRING}
Concept {id: STRING}
Date {id: STRING}
Model {id: STRING}
Timeperiod {id: STRING}
Dataset {id: STRING}
Organization {id: STRING}
Event {id: STRING}
Technology {id: STRING}
Paper {id: STRING}
Person {id: STRING}
Leaderboard {id: STRING}
Architecture {id: STRING}
Ai model {id: STRING}
Entity {id: STRING}
Ai assistant {id: STRING}
Technique {id: STRING}
Language model {id: STRING}
Company {id: STRING}
Software {id: STRING}
Product {id: STRING}
Measurement {id: STRING}
Equation {id: STRING}
Unknown {id: STRING}
Variable {id: STRING}
Constant {id: STRING}
Mathematical expression {id

![](images/graph-pdf.png)

![](images/graph.pilot-man.png)

#### Few shot prompting

In [33]:
# Few-shot prompting: show the LLM example (question, Cypher) pairs so that
# the queries it generates follow the same shape.
from langchain_core.prompts import FewShotPromptTemplate, PromptTemplate

# Example question -> Cypher pairs rendered into the prompt.
examples = [
    {
        "question": "How many notable large language models are mentioned in the article?",
        "query": "MATCH (m:Concept {id: 'Large Language Model'}) RETURN count(DISTINCT m)",
    },
    {
        "question": "Which companies or organizations have developed the large language models mentioned?",
        "query": "MATCH (o:Organization)-[:DEVELOPS]->(m:Concept {id: 'Large Language Model'}) RETURN DISTINCT o.id",
    },
    {
        "question": "What is the largest model size mentioned in the article, in terms of number of parameters?",
        "query": "MATCH (m:Concept {id: 'Large Language Model'}) RETURN max(m.parameters) AS largest_model",
    },
]

# How a single example is rendered.
example_prompt = PromptTemplate.from_template(
    "User input: {question}\nCypher query: {query}")

# Full prompt = prefix (instructions + schema) + rendered examples + suffix
# (the actual user question). Both template variables are declared explicitly:
# the prefix uses {schema} and the suffix uses {question}.
cypher_prompt = FewShotPromptTemplate(
    examples=examples,
    example_prompt=example_prompt,
    prefix="You are a Neo4j expert. Given an input question, create a syntactically correct Cypher query to run.\n\nHere is the schema information\n{schema}.\n\n Below are a number of examples of questions and their corresponding Cypher queries.",
    suffix="User input: {question}\nCypher query: ",
    input_variables=["question", "schema"],
)

print(cypher_prompt)
llm=ChatOpenAI(api_key=openai_api_key, temperature=0, model_name="gpt-3.5-turbo")
# Create the graph Cypher QA chain
# cypher_prompt must be passed in explicitly; otherwise the chain falls back
# to its built-in Cypher-generation prompt and the few-shot examples above
# are never used.
graph_qa_chain = GraphCypherQAChain.from_llm(
    llm=llm,
    graph=graph,
    cypher_prompt=cypher_prompt,
    verbose=True,
    validate_cypher=True,
)

# Invoke the chain with the input provided
result = graph_qa_chain.invoke({"query": "Which organization created Gpt-3?"})
print(f"Final answer: {result['result']}")

input_variables=['question', 'schema'] examples=[{'question': 'How many notable large language models are mentioned in the article?', 'query': "MATCH (m:Concept {id: 'Large Language Model'}) RETURN count(DISTINCT m)"}, {'question': 'Which companies or organizations have developed the large language models mentioned?', 'query': "MATCH (o:Organization)-[:DEVELOPS]->(m:Concept {id: 'Large Language Model'}) RETURN DISTINCT o.id"}, {'question': 'What is the largest model size mentioned in the article, in terms of number of parameters?', 'query': "MATCH (m:Concept {id: 'Large Language Model'}) RETURN max(m.parameters) AS largest_model"}] example_prompt=PromptTemplate(input_variables=['query', 'question'], template='User input: {question}\nCypher query: {query}') suffix='User input: {question}\nCypher query: ' prefix='You are a Neo4j expert. Given an input question, create a syntactically correct Cypher query to run.\n\nHere is the schema information\n{schema}.\n\n Below are a number of examp