# **Requirements**

In [1]:
!pip install langchain_community

Collecting langchain_community
  Downloading langchain_community-0.3.8-py3-none-any.whl.metadata (2.9 kB)
Collecting SQLAlchemy<2.0.36,>=1.4 (from langchain_community)
  Downloading SQLAlchemy-2.0.35-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (9.6 kB)
Collecting dataclasses-json<0.7,>=0.5.7 (from langchain_community)
  Downloading dataclasses_json-0.6.7-py3-none-any.whl.metadata (25 kB)
Collecting httpx-sse<0.5.0,>=0.4.0 (from langchain_community)
  Downloading httpx_sse-0.4.0-py3-none-any.whl.metadata (9.0 kB)
Collecting langchain<0.4.0,>=0.3.8 (from langchain_community)
  Downloading langchain-0.3.9-py3-none-any.whl.metadata (7.1 kB)
Collecting langchain-core<0.4.0,>=0.3.21 (from langchain_community)
  Downloading langchain_core-0.3.21-py3-none-any.whl.metadata (6.3 kB)
Collecting pydantic-settings<3.0.0,>=2.4.0 (from langchain_community)
  Downloading pydantic_settings-2.6.1-py3-none-any.whl.metadata (3.5 kB)
Collecting marshmallow<4.0.0,>=3.18.0 (from datac

In [2]:
pip install neo4j

Collecting neo4j
  Downloading neo4j-5.27.0-py3-none-any.whl.metadata (5.9 kB)
Downloading neo4j-5.27.0-py3-none-any.whl (301 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m301.7/301.7 kB[0m [31m5.0 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: neo4j
Successfully installed neo4j-5.27.0


In [3]:
token = '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'

In [4]:
!huggingface-cli login


    _|    _|  _|    _|    _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|_|_|_|    _|_|      _|_|_|  _|_|_|_|
    _|    _|  _|    _|  _|        _|          _|    _|_|    _|  _|            _|        _|    _|  _|        _|
    _|_|_|_|  _|    _|  _|  _|_|  _|  _|_|    _|    _|  _|  _|  _|  _|_|      _|_|_|    _|_|_|_|  _|        _|_|_|
    _|    _|  _|    _|  _|    _|  _|    _|    _|    _|    _|_|  _|    _|      _|        _|    _|  _|        _|
    _|    _|    _|_|      _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|        _|    _|    _|_|_|  _|_|_|_|

    To log in, `huggingface_hub` requires a token generated from https://huggingface.co/settings/tokens .
Enter your token (input will not be visible): 
Add token as git credential? (Y/n) Y
Token is valid (permission: write).
The token `firstToken` has been saved to /root/.cache/huggingface/stored_tokens
[1m[31mCannot authenticate through git-credential as no helper is defined on your machine.
You might have to re-authent

In [5]:
import torch
import transformers
from transformers import AutoTokenizer
from  langchain import LLMChain, HuggingFacePipeline, PromptTemplate

In [6]:
from langchain import PromptTemplate

In [7]:
from typing import Any, Dict, List, Optional
from neo4j import GraphDatabase, exceptions
import os

In [8]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

In [9]:
from langchain.prompts.chat import (
    ChatPromptTemplate,
    SystemMessagePromptTemplate,
    HumanMessagePromptTemplate,
)

## **Connecting to Neo4j EC2 instance**

In [10]:
node_properties_query = """
CALL apoc.meta.data()
YIELD label, other, elementType, type, property
WHERE NOT type = "RELATIONSHIP" AND elementType = "node"
WITH label AS nodeLabels, collect({property:property, type:type}) AS properties
RETURN {labels: nodeLabels, properties: properties} AS output

"""

rel_properties_query = """
CALL apoc.meta.data()
YIELD label, other, elementType, type, property
WHERE NOT type = "RELATIONSHIP" AND elementType = "relationship"
WITH label AS nodeLabels, collect({property:property, type:type}) AS properties
RETURN {type: nodeLabels, properties: properties} AS output
"""

rel_query = """
CALL apoc.meta.data()
YIELD label, other, elementType, type, property
WHERE type = "RELATIONSHIP" AND elementType = "node"
RETURN "(:" + label + ")-[:" + property + "]->(:" + toString(other[0]) + ")" AS output
"""


def schema_text(node_props, rel_props, rels) -> str:
    return f"""
  This is the schema representation of the Neo4j database.
  Node properties are the following:
  {node_props}
  Relationship properties are the following:
  {rel_props}
  The relationships are the following
  {rels}
  """



In [11]:
class Neo4jDatabase:
    def __init__(
        self,
        host: str = "neo4j://ec2-54-211-13-125.compute-1.amazonaws.com:7687",
        user: str = "neo4j",
        password: str = "neo4jBigData",
        database: str = "neo4j",
        read_only: bool = True,
    ) -> None:
        """Initialize a neo4j database"""
        self._driver = GraphDatabase.driver(host, auth=(user, password))
        self._database = database
        self._read_only = read_only
        self.schema = ""
        # Verify connection
        try:
            self._driver.verify_connectivity()
        except exceptions.ServiceUnavailable:
            raise ValueError(
                "Could not connect to Neo4j database. "
                "Please ensure that the url is correct"
            )
        except exceptions.AuthError:
            raise ValueError(
                "Could not connect to Neo4j database. "
                "Please ensure that the username and password are correct"
            )
        try:
            self.refresh_schema()
        except:
            raise ValueError("Missing APOC Core plugin")

    @staticmethod
    def _execute_read_only_query(tx, cypher_query: str, params: Optional[Dict] = {}):
        result = tx.run(cypher_query, params)
        return [r.data() for r in result]

    def query(
        self, cypher_query: str, params: Optional[Dict] = {}
    ) -> List[Dict[str, Any]]:
        with self._driver.session(database=self._database) as session:
            try:
                if self._read_only:
                    result = session.read_transaction(
                        self._execute_read_only_query, cypher_query, params
                    )
                    return result
                else:
                    result = session.run(cypher_query, params)
                    # Limit to at most 10 results
                    return [r.data() for r in result]

            # Catch Cypher syntax errors
            except exceptions.CypherSyntaxError as e:
                return [
                    {
                        "code": "invalid_cypher",
                        "message": f"Invalid Cypher statement due to an error: {e}",
                    }
                ]

            except exceptions.ClientError as e:
                # Catch access mode errors
                if e.code == "Neo.ClientError.Statement.AccessMode":
                    return [
                        {
                            "code": "error",
                            "message": "Couldn't execute the query due to the read only access to Neo4j",
                        }
                    ]
                else:
                    return [{"code": "error", "message": e}]

    def refresh_schema(self) -> None:
        node_props = [el["output"] for el in self.query(node_properties_query)]
        rel_props = [el["output"] for el in self.query(rel_properties_query)]
        rels = [el["output"] for el in self.query(rel_query)]
        schema = schema_text(node_props, rel_props, rels)
        self.schema = schema
        print(schema)

    def check_if_empty(self) -> bool:
        data = self.query(
            """
        MATCH (n)
        WITH count(n) as c
        RETURN CASE WHEN c > 0 THEN true ELSE false END AS output
        """
        )
        return data[0]["output"]


In [12]:
graph = Neo4jDatabase(host=os.environ.get("NEO4J_URL", "neo4j://ec2-54-211-13-125.compute-1.amazonaws.com:7687"),
    user=os.environ.get("NEO4J_USER", "neo4j"),
    password=os.environ.get("NEO4J_PASS", "neo4jBigData"),
    database=os.environ.get("NEO4J_DATABASE", "neo4j"),)

  result = session.read_transaction(



  This is the schema representation of the Neo4j database.
  Node properties are the following:
  [{'labels': 'Subject', 'properties': [{'property': 'ID', 'type': 'STRING'}, {'property': 'name', 'type': 'STRING'}, {'property': 'semtype', 'type': 'STRING'}]}, {'labels': 'Object', 'properties': [{'property': 'ID', 'type': 'STRING'}, {'property': 'name', 'type': 'STRING'}, {'property': 'semtype', 'type': 'STRING'}]}]
  Relationship properties are the following:
  [{'properties': [{'property': 'sentence', 'type': 'STRING'}, {'property': 'stype', 'type': 'STRING'}], 'type': 'PART_OF'}, {'properties': [{'property': 'sentence', 'type': 'STRING'}, {'property': 'stype', 'type': 'STRING'}], 'type': 'METHOD_OF'}, {'properties': [{'property': 'sentence', 'type': 'STRING'}, {'property': 'stype', 'type': 'STRING'}], 'type': 'LOCATION_OF'}, {'properties': [{'property': 'sentence', 'type': 'STRING'}, {'property': 'stype', 'type': 'STRING'}], 'type': 'TREATS'}, {'properties': [{'property': 'sentence',

In [13]:
schema = graph.schema

# **Initializing final QA LLM**

In [14]:
model1 = "meta-llama/Meta-Llama-3-8B"

In [15]:
tokenizer1 = AutoTokenizer.from_pretrained(model1, use_auth_token=token)



tokenizer_config.json:   0%|          | 0.00/50.6k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.09M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/73.0 [00:00<?, ?B/s]

In [16]:
pipeline1 = transformers.pipeline(
    "text-generation",
    model=model1,
    tokenizer=tokenizer1,
    torch_dtype=torch.bfloat16,
    trust_remote_code=True,
    max_length=10000,
    device_map = 'auto',
    do_sample=True,
    top_k=10,
    num_return_sequences=1,
    eos_token_id=tokenizer1.eos_token_id
)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/654 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/23.9k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/4 [00:00<?, ?it/s]

model-00001-of-00004.safetensors:   0%|          | 0.00/4.98G [00:00<?, ?B/s]

model-00002-of-00004.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

model-00003-of-00004.safetensors:   0%|          | 0.00/4.92G [00:00<?, ?B/s]

model-00004-of-00004.safetensors:   0%|          | 0.00/1.17G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/177 [00:00<?, ?B/s]



In [17]:
template = """
              Note: Using the context provided below, answer the question that follows. Only generate a synthesized response based on the context, avoiding verbatim repetition of the data provided.
              Answer the following question delimited by triple backticks, and this is the context you should take into account while answering extracted from a trusted database: {context}
              Question:
              ```{question}```
              ANSWER:
           """

In [18]:
final_llm = HuggingFacePipeline(pipeline = pipeline1, model_kwargs = {'temperature':0})

  final_llm = HuggingFacePipeline(pipeline = pipeline1, model_kwargs = {'temperature':0})


In [19]:
prompt1 = PromptTemplate(template=template, input_variables=["context","question"])
llm_qa_chain = LLMChain(prompt=prompt1, llm=final_llm)

  llm_qa_chain = LLMChain(prompt=prompt1, llm=final_llm)


# **Initializing Text2Cypher LLM**

In [36]:
# model = "meta-llama/Llama-2-7b-chat-hf"
# model = "meta-llama/Meta-Llama-3-8B"
model = "tomasonjo/text2cypher-demo-16bit"

In [37]:
tokenizer = AutoTokenizer.from_pretrained(model, use_auth_token=token)



tokenizer_config.json:   0%|          | 0.00/50.9k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.09M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/459 [00:00<?, ?B/s]

In [38]:
pipeline = transformers.pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    torch_dtype=torch.bfloat16,
    trust_remote_code=True,
    max_length=10000,
    device_map = 'auto',
    do_sample=True,
    top_k=10,
    num_return_sequences=1,
    eos_token_id=tokenizer.eos_token_id
)

config.json:   0%|          | 0.00/730 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/23.9k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/4 [00:00<?, ?it/s]

model-00001-of-00004.safetensors:   0%|          | 0.00/4.98G [00:00<?, ?B/s]

model-00002-of-00004.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

model-00003-of-00004.safetensors:   0%|          | 0.00/4.92G [00:00<?, ?B/s]

model-00004-of-00004.safetensors:   0%|          | 0.00/1.17G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/143 [00:00<?, ?B/s]



In [39]:
llm = HuggingFacePipeline(pipeline = pipeline, model_kwargs = {'temperature':0})

In [40]:
examples = [
    {
        "question": "What entities are part of the brain?",
        "query": """
        MATCH (e:Subject)-[r:PART_OF]->(b:Object {name: 'Brain'}) RETURN e.name, e.semtytype, r.sentence, r.stype """
    },
    {
        "question": "Which therapeutic procedures are used for treating diseases that can also be prevented by 'Vaccines'?",
        "query": """
        MATCH (t:Subject {semtype: 'topp'})-[r1:TREATS]->(d:Subject)-[r2:PREVENTS]->(:Object {name: 'Vaccines'})
        RETURN t.name, d.name, r1.sentence, r2.sentence, r1,stype, r2.stype,
        """
    },
    {
        "question": "What stimulates Testesterone?",
        "query": """MATCH (s:Subsject)-[r:STIMULATES]->(t:Object {name: 'Testosterone'})
RETURN s.name , s.semtype, r.sentence, r.stype """
    },
    {
        "question": "Therapeutic procedures help with treating what diseases?",
        "query": """
        MATCH (t:Subject {semtype: 'topp'})-[r:TREATS]->(d:Object {semtype: 'dsyn'})
RETURN t.name, d.name, r.stype, r.sentence
        """
    },
    {
        "question": "List titles of 5 research papers you know about treating diseases",
        "query": """
        MATCH ()-[r:TREATS]->()
        WHERE r.stype = 'ti'
        RETURN r.sentennce
        LIMIT 5
        """
    },
    {
        "question": "Can you list titles of research papers you know about?",
        "query": """ MATCH ()-[r]->() WHERE r.stype = 'ti' RETURN r.sentence LIMIT 5 """,
    }
]


In [41]:
def get_similar_examples(examples, user_query):
  questions = [ex['question'] for ex in examples]

  vectorizer = TfidfVectorizer()
  tfidf_matrix = vectorizer.fit_transform(questions + [user_query])

  cosine_sim = cosine_similarity(tfidf_matrix[-1:], tfidf_matrix[:-1]).flatten()
  top_k_indices = np.argsort(cosine_sim)[-2:][::-1]

  selected_examples = [examples[i] for i in top_k_indices]

  return selected_examples



In [42]:
user_query = "What entities are part of kidney?"

In [43]:
see = get_similar_examples(examples, user_query)

In [44]:
see

[{'question': 'What entities are part of the brain?',
  'query': "\n        MATCH (e:Subject)-[r:PART_OF]->(b:Object {name: 'Brain'}) RETURN e.name, e.semtytype, r.sentence, r.stype "},
 {'question': 'What stimulates Testesterone?',
  'query': "MATCH (s:Subsject)-[r:STIMULATES]->(t:Object {name: 'Testosterone'})\nRETURN s.name , s.semtype, r.sentence, r.stype "}]

In [45]:
def get_system_message() -> str:
        system = """
        Your task is to convert questions about contents in a Neo4j database to Cypher queries to query the Neo4j database.(You are a Cypher query expert)
        Use only the provided relationship types and properties.
        Do not use any other relationship types or properties that are not provided.
        """
        system += """
        If you cannot generate a Cypher statement based on the provided schema, explain the reason to the user.
        Schema: {schema}
        """

        system += """
        You need to follow these Cypher examples when you are constructing a Cypher statement:
        {examples}
        """
        # Add note at the end and try to prevent LLM injections
        system += """Note: Do not include any explanations or apologies in your responses.
                     I repeat do not use any other relationship types or properties that are not provided in the schema, try to match from whatever is present in the database schema provided.
                     Do not include any text except the generated Cypher statement that is being asked to generate. This is very important if you want to get paid.
                     Please wrap the generated Cypher statement in triple backticks (`).
                     """
        return system

In [46]:
prompt = get_system_message()

In [47]:
sys_prompt: PromptTemplate = PromptTemplate(
    input_variables=["schema", "examples"],
    template= prompt )

system_message_prompt = SystemMessagePromptTemplate(prompt=sys_prompt)

In [48]:
user_prompt: PromptTemplate = PromptTemplate(
    input_variables=["question"],
    template="{question}. Can you provide a Cypher query to achieve this?"
)
user_message_prompt = HumanMessagePromptTemplate(prompt=user_prompt)

In [49]:
chat_prompt = ChatPromptTemplate.from_messages(
    [system_message_prompt, user_message_prompt])

In [50]:
import re
def extract_text_between_backticks(text):
    # Define the regular expression pattern
    pattern = r'```(.*?)```'

    # Find all matches
    matches = re.findall(pattern, text, re.DOTALL)

    return matches

# **Test Query1**

**Retriving relevant informajion from the database (Neo4j EC2)**

In [None]:
import gc
gc.collect()
torch.cuda.empty_cache()

In [None]:
chain =  LLMChain(prompt=chat_prompt, llm=llm)

question = "Can you list titles of research papers you know about?"
add = get_similar_examples(examples, question)

result = chain.run({
    "question": question,
    "schema": schema,
    "examples": add
})

print("Generated Cypher Query:", result)

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


Generated Cypher Query: System: 
        Your task is to convert questions about contents in a Neo4j database to Cypher queries to query the Neo4j database.(You are a Cypher query expert)
        Use only the provided relationship types and properties.
        Do not use any other relationship types or properties that are not provided.
        
        If you cannot generate a Cypher statement based on the provided schema, explain the reason to the user.
        Schema: 
  This is the schema representation of the Neo4j database.
  Node properties are the following:
  [{'labels': 'Subject', 'properties': [{'property': 'ID', 'type': 'STRING'}, {'property': 'name', 'type': 'STRING'}, {'property': 'semtype', 'type': 'STRING'}]}, {'labels': 'Object', 'properties': [{'property': 'name', 'type': 'STRING'}, {'property': 'semtype', 'type': 'STRING'}, {'property': 'ID', 'type': 'STRING'}]}]
  Relationship properties are the following:
  [{'properties': [{'property': 'sentence', 'type': 'STRING'}

In [None]:
def extract_cypher_query(result):
  pattern = r"`([^`]*)`"
  match = re.search(pattern, result)
  matches = re.findall(pattern, result)
  if match:
    cypher_query = matches[-1]
  return cypher_query

In [None]:
import re

query1 = extract_cypher_query(result)
print(query1)


MATCH ()-[r]->() WHERE r.stype = 'ti' RETURN r.sentence LIMIT 5


In [None]:
from neo4j import GraphDatabase

def execute_cypher_query(uri, user, password, query):
    driver = GraphDatabase.driver(uri, auth=(user, password))
    try:
        with driver.session() as session:
            result = session.run(query)
            return [record["r.sentence"] for record in result]
    finally:
        pass
        #driver.close()

results = execute_cypher_query(uri="neo4j://ec2-54-211-13-125.compute-1.amazonaws.com:7687", user="neo4j", password="neo4jBigData", query="MATCH ()-[r]->() WHERE r.stype = 'ti' RETURN r.sentence LIMIT 5")


In [None]:
print("Query Results:")
for idx, sentence in enumerate(results, start=1):
    print(f"{idx}. {sentence}")

Query Results:
1. Therapeutic and toxic effects observed with different dosage programs of cyclophosphamide in treatment of steroid-responsive but frequently relapsing nephrotic syndrome.
2. Torn collateral ligament of thumb.
3. Advances in anesthesia for plastic surgery in burns.
4. [Evolution of the therapy of carcinoma of the uterus and its history].
5. The deleterious effect of immediate postoperative prothesis in below-knee amputation for ischemic disease.


**Agmentation ang Generation**

In [None]:
response= llm_qa_chain.run({'question': question, 'context': results})

  response= llm_qa_chain.run({'question': question, 'context': results})
Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


In [None]:
print(response)


              Answer the following question delimited by triple backticks, and this is the context you should take into account while answering extracted from a trusted database: ['Therapeutic and toxic effects observed with different dosage programs of cyclophosphamide in treatment of steroid-responsive but frequently relapsing nephrotic syndrome.', 'Torn collateral ligament of thumb.', 'Advances in anesthesia for plastic surgery in burns.', '[Evolution of the therapy of carcinoma of the uterus and its history].', 'The deleterious effect of immediate postoperative prothesis in below-knee amputation for ischemic disease.'] 
              ```Can you list titles of research papers you know about?```
              ANSWER:
            ```[1] "Advances in anesthesia for plastic surgery in burns.", [2] "Therapeutic and toxic effects observed with different dosage programs of cyclophosphamide in treatment of steroid-responsive but frequently relapsing nephrotic syndrome.", [3] "Torn collater

# **Test Query2**

**Retrival**

In [None]:
gc.collect()
torch.cuda.empty_cache()

In [None]:
chain =  LLMChain(prompt=chat_prompt, llm=llm)

question = "What entities are part of kidney?"
add = get_similar_examples(examples, question)

result = chain.run({
    "question": question,
    "schema": schema,
    "examples": add
})

print("Generated Cypher Query:", result)
query2 = extract_cypher_query(result)
print(query2)

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


Generated Cypher Query: System: 
        Your task is to convert questions about contents in a Neo4j database to Cypher queries to query the Neo4j database.(You are a Cypher query expert)
        Use only the provided relationship types and properties.
        Do not use any other relationship types or properties that are not provided.
        
        If you cannot generate a Cypher statement based on the provided schema, explain the reason to the user.
        Schema: 
  This is the schema representation of the Neo4j database.
  Node properties are the following:
  [{'labels': 'Subject', 'properties': [{'property': 'ID', 'type': 'STRING'}, {'property': 'name', 'type': 'STRING'}, {'property': 'semtype', 'type': 'STRING'}]}, {'labels': 'Object', 'properties': [{'property': 'ID', 'type': 'STRING'}, {'property': 'name', 'type': 'STRING'}, {'property': 'semtype', 'type': 'STRING'}]}]
  Relationship properties are the following:
  [{'properties': [{'property': 'sentence', 'type': 'STRING'}

NameError: name 'extract_cypher_query' is not defined

In [None]:
print(result)

System: 
        Your task is to convert questions about contents in a Neo4j database to Cypher queries to query the Neo4j database.(You are a Cypher query expert)
        Use only the provided relationship types and properties.
        Do not use any other relationship types or properties that are not provided.
        
        If you cannot generate a Cypher statement based on the provided schema, explain the reason to the user.
        Schema: 
  This is the schema representation of the Neo4j database.
  Node properties are the following:
  [{'labels': 'Subject', 'properties': [{'property': 'ID', 'type': 'STRING'}, {'property': 'name', 'type': 'STRING'}, {'property': 'semtype', 'type': 'STRING'}]}, {'labels': 'Object', 'properties': [{'property': 'ID', 'type': 'STRING'}, {'property': 'name', 'type': 'STRING'}, {'property': 'semtype', 'type': 'STRING'}]}]
  Relationship properties are the following:
  [{'properties': [{'property': 'sentence', 'type': 'STRING'}, {'property': 'stype', 

In [None]:
query2 = """MATCH (e:Subject)-[r:PART_OF]->(k:Object {name: 'Kidney'}) RETURN e.name, e.type, r.sentence, r.stype"""

In [None]:
from neo4j import GraphDatabase

def execute_cypher_query(uri, user, password, query):
    driver = GraphDatabase.driver(uri, auth=(user, password))
    try:
        with driver.session() as session:
            result = session.run(query)
            return [dict(record) for record in result]
    finally:
        pass
        #driver.close()

results = execute_cypher_query(uri="neo4j://ec2-54-211-13-125.compute-1.amazonaws.com:7687", user="neo4j", password="neo4jBigData", query=query2)




In [None]:
print(results)

[{'e.name': 'Transferase', 'e.type': None, 'r.sentence': 'Castration of adult male mice reduced the ability of the transferase factors of the kidney to stimulate amino acid incorporation by polysomes in vitro.', 'r.stype': 'ab'}, {'e.name': 'Entire sympathetic nerve', 'e.type': None, 'r.sentence': 'Renin release and renal sympathetic nerve activity following vertebral artery embolism.', 'r.stype': 'ti'}, {'e.name': 'Kidney Tubules and Proximal', 'e.type': None, 'r.sentence': '[Electron microscopic studies on membrane-phosphatases and lysosomes in the proximal tubule of the rat kidney after application of folic acid].', 'r.stype': 'ti'}, {'e.name': 'Collagen Type IV', 'e.type': None, 'r.sentence': 'In ELISA inhibition assay, human kidney type IV collagen (HKIVC) only partially inhibited the binding of MKIVC to anti-MKIVC antiserum.', 'r.stype': 'ab'}, {'e.name': 'Endopeptidases|ERVK-7|ERVK-8|ERVK-10|ERVK-9|ERVK-21|ERVK-18|ERVK-25|ERVK-24|ERVK-19', 'e.type': None, 'r.sentence': 'Sex-rela

In [None]:
result_types = [[result['e.name'], result['r.sentence']] for result in results]

**Augmentation and Generation**

In [None]:
gc.collect()
torch.cuda.empty_cache()

In [None]:
response2 = llm_qa_chain.run({'question': question, 'context': result_types[0:4]})

Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


In [None]:
print(response2)


              Note: Using the context provided below, answer the question that follows. Only generate a synthesized response based on the context, avoiding verbatim repetition of the data provided.
              Answer the following question delimited by triple backticks, and this is the context you should take into account while answering extracted from a trusted database: [['Transferase', 'Castration of adult male mice reduced the ability of the transferase factors of the kidney to stimulate amino acid incorporation by polysomes in vitro.'], ['Entire sympathetic nerve', 'Renin release and renal sympathetic nerve activity following vertebral artery embolism.'], ['Kidney Tubules and Proximal', '[Electron microscopic studies on membrane-phosphatases and lysosomes in the proximal tubule of the rat kidney after application of folic acid].'], ['Collagen Type IV', 'In ELISA inhibition assay, human kidney type IV collagen (HKIVC) only partially inhibited the binding of MKIVC to anti-MKIVC a

# **Test Query3**

**Retrival**

In [None]:
chain =  LLMChain(prompt=chat_prompt, llm=llm)

question = "What substances stimulate enzymes?"
add = get_similar_examples(examples, question)

result = chain.run({
    "question": question,
    "schema": schema,
    "examples": add
})

  result = chain.run({
Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


In [None]:
print(result)

System: 
        Your task is to convert questions about contents in a Neo4j database to Cypher queries to query the Neo4j database.(You are a Cypher query expert)
        Use only the provided relationship types and properties.
        Do not use any other relationship types or properties that are not provided.
        
        If you cannot generate a Cypher statement based on the provided schema, explain the reason to the user.
        Schema: 
  This is the schema representation of the Neo4j database.
  Node properties are the following:
  [{'labels': 'Subject', 'properties': [{'property': 'ID', 'type': 'STRING'}, {'property': 'name', 'type': 'STRING'}, {'property': 'semtype', 'type': 'STRING'}]}, {'labels': 'Object', 'properties': [{'property': 'ID', 'type': 'STRING'}, {'property': 'name', 'type': 'STRING'}, {'property': 'semtype', 'type': 'STRING'}]}]
  Relationship properties are the following:
  [{'properties': [{'property': 'sentence', 'type': 'STRING'}, {'property': 'stype', 

In [None]:
query3 = """ MATCH (s:Subject)-[r:STIMULATES]->(e:Object {semtype: 'enzymes'})\nRETURN s.name, s.semtype, r.sentence, r.stype"""

In [20]:
#changes needed :
query3test = """ MATCH (s:Subject)-[r:STIMULATES]->(e:Object {name: 'Enzymes'})\nRETURN s.name, s.semtype, r.sentence, r.stype"""

In [21]:
from neo4j import GraphDatabase

def execute_cypher_query(uri, user, password, query):
    driver = GraphDatabase.driver(uri, auth=(user, password))
    try:
        with driver.session() as session:
            result = session.run(query)
            return [dict(record) for record in result]
    finally:
        pass
        #driver.close()

results = execute_cypher_query(uri="neo4j://ec2-54-211-13-125.compute-1.amazonaws.com:7687", user="neo4j", password="neo4jBigData", query=query3test)


In [22]:
print(results)

[{'s.name': 'Corticotropin|POMC', 's.semtype': 'horm', 'r.sentence': 'It is demonstrated that the tryptophan residue of the ACTH molecule is essential for stimulation of the enzyme.', 'r.stype': 'ab'}, {'s.name': 'human leukocyte interferon', 's.semtype': 'phsu', 'r.sentence': "In parallel, RA or dbcAMP also enhanced the level of 2'-5'-oligoadenylate (2-5A) synthetase, and enzyme induced by IFNs and implicated in their biological action.", 'r.stype': 'ab'}]


**Augmentation and Generation**

In [27]:
results

[{'s.name': 'Corticotropin|POMC',
  's.semtype': 'horm',
  'r.sentence': 'It is demonstrated that the tryptophan residue of the ACTH molecule is essential for stimulation of the enzyme.',
  'r.stype': 'ab'},
 {'s.name': 'human leukocyte interferon',
  's.semtype': 'phsu',
  'r.sentence': "In parallel, RA or dbcAMP also enhanced the level of 2'-5'-oligoadenylate (2-5A) synthetase, and enzyme induced by IFNs and implicated in their biological action.",
  'r.stype': 'ab'}]

In [24]:
question = "What substances stimulate enzymes?"

In [25]:
response3 = llm_qa_chain.run({'question': question, 'context': results})

  response2 = llm_qa_chain.run({'question': question, 'context': results})
Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


In [26]:
print(response3)


              Note: Using the context provided below, answer the question that follows. Only generate a synthesized response based on the context, avoiding verbatim repetition of the data provided.
              Answer the following question delimited by triple backticks, and this is the context you should take into account while answering extracted from a trusted database: [{'s.name': 'Corticotropin|POMC', 's.semtype': 'horm', 'r.sentence': 'It is demonstrated that the tryptophan residue of the ACTH molecule is essential for stimulation of the enzyme.', 'r.stype': 'ab'}, {'s.name': 'human leukocyte interferon', 's.semtype': 'phsu', 'r.sentence': "In parallel, RA or dbcAMP also enhanced the level of 2'-5'-oligoadenylate (2-5A) synthetase, and enzyme induced by IFNs and implicated in their biological action.", 'r.stype': 'ab'}]
              Question:
              ```What substances stimulate enzymes?```
              ANSWER:
           


# **Test Query4**

**Retrival**

In [None]:
chain =  LLMChain(prompt=chat_prompt, llm=llm)

question = "Efferent fibers are part of which body part?"
add = get_similar_examples(examples, question)

result = chain.run({
    "question": question,
    "schema": schema,
    "examples": add
})

In [None]:
print(result)

System: 
        Your task is to convert questions about contents in a Neo4j database to Cypher queries to query the Neo4j database.(You are a Cypher query expert)
        Use only the provided relationship types and properties.
        Do not use any other relationship types or properties that are not provided.
        
        If you cannot generate a Cypher statement based on the provided schema, explain the reason to the user.
        Schema: 
  This is the schema representation of the Neo4j database.
  Node properties are the following:
  [{'labels': 'Subject', 'properties': [{'property': 'ID', 'type': 'STRING'}, {'property': 'name', 'type': 'STRING'}, {'property': 'semtype', 'type': 'STRING'}]}, {'labels': 'Object', 'properties': [{'property': 'ID', 'type': 'STRING'}, {'property': 'name', 'type': 'STRING'}, {'property': 'semtype', 'type': 'STRING'}]}]
  Relationship properties are the following:
  [{'properties': [{'property': 'sentence', 'type': 'STRING'}, {'property': 'stype', 

In [28]:
query4= """MATCH (e:Subject {name: 'efferent fiber'})-[r:PART_OF]->(b:Object) RETURN b.name """

In [29]:
results = execute_cypher_query(uri="neo4j://ec2-54-211-13-125.compute-1.amazonaws.com:7687", user="neo4j", password="neo4jBigData", query=query4)

In [30]:
print(results)

[{'b.name': 'Neostriatum'}]


**Augmentation and Generation**

In [33]:
response4 = llm_qa_chain.run({'question': question, 'context': results})

Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


In [34]:
print(response4)


              Note: Using the context provided below, answer the question that follows. Only generate a synthesized response based on the context, avoiding verbatim repetition of the data provided.
              Answer the following question delimited by triple backticks, and this is the context you should take into account while answering extracted from a trusted database: [{'b.name': 'Neostriatum'}]
              Question:
              ```Efferent fibers are part of which body part?```
              ANSWER:
            Neostriatum
            ```Efferent fibers are part of which body part?```



# **Test Query5**

**Retrival**

In [None]:
chain =  LLMChain(prompt=chat_prompt, llm=llm)

question = "What is Triamcinolone?"
add = get_similar_examples(examples, question)

result = chain.run({
    "question": question,
    "schema": schema,
    "examples": add
})

In [None]:
semantic_types = {
    # Core Semantic Types for Diseases and Treatments
    "dsyn": {
        "full_form": "Disease or Syndrome",
        "description": "Represents diseases, syndromes, and medical conditions.",
        "examples": ["Diabetes", "Hypertension", "COVID-19"]
    },
    "topp": {
        "full_form": "Therapeutic or Preventive Procedure",
        "description": "Represents treatments, surgeries, and preventive measures.",
        "examples": ["Chemotherapy", "Vaccination"]
    },
    "phsu": {
        "full_form": "Pharmacologic Substance",
        "description": "Represents medicines, drugs, and pharmacological substances used for treatments.",
        "examples": ["Ibuprofen", "Insulin"]
    },
    "antb": {
        "full_form": "Antibiotic",
        "description": "Represents antibiotics specifically used to treat bacterial infections.",
        "examples": ["Amoxicillin", "Azithromycin"]
    },
    "horm": {
        "full_form": "Hormone",
        "description": "Represents hormones used in treatment or related to disease processes.",
        "examples": ["Estrogen", "Thyroxine"]
    },
    "vita": {
        "full_form": "Vitamin",
        "description": "Represents vitamins used in disease treatment or prevention.",
        "examples": ["Vitamin D", "Vitamin C"]
    },

    # Supporting Semantic Types for Contextual Information
    "patf": {
        "full_form": "Pathologic Function",
        "description": "Represents pathological processes associated with diseases.",
        "examples": ["Inflammation", "Neoplasia"]
    },
    "bpoc": {
        "full_form": "Body Part, Organ, or Organ Component",
        "description": "Represents anatomical structures affected by disease or targeted by treatment.",
        "examples": ["Heart", "Lungs"]
    },
    "phpr": {
        "full_form": "Physiologic Process",
        "description": "Represents physiological processes relevant to disease progression or treatment.",
        "examples": ["Blood clotting", "Inflammation"]
    },
    "bmod": {
        "full_form": "Biomedical Occupation or Discipline",
        "description": "Represents medical disciplines involved in disease treatment.",
        "examples": ["Oncology", "Cardiology"]
    },
    "hlca": {
        "full_form": "Health Care Activity",
        "description": "Represents activities in health care related to treatment or disease management.",
        "examples": ["Medical consultations", "Diagnostics"]
    },

    # Types for Substances and Mechanisms
    "chem": {
        "full_form": "Chemical",
        "description": "Represents broader chemical compounds, including active ingredients in medicines.",
        "examples": ["Acetaminophen", "Sodium chloride"]
    },
    "aapp": {
        "full_form": "Amino Acid, Peptide, or Protein",
        "description": "Represents biomolecules involved in diseases or targeted by treatments.",
        "examples": ["Insulin"]
    },
    "enzy": {
        "full_form": "Enzyme",
        "description": "Represents enzymes that may play a role in disease mechanisms or treatments.",
        "examples": ["Protease inhibitors"]
    },

    # Additional Types for Research and Supporting Information
    "neop": {
        "full_form": "Neoplastic Process",
        "description": "Represents cancerous and precancerous conditions.",
        "examples": ["Breast Cancer", "Leukemia"]
    },
    "genf": {
        "full_form": "Genetic Function",
        "description": "Represents genetic components of diseases and treatments involving gene therapy.",
        "examples": ["Mutations in BRCA1/BRCA2"]
    },
    "mcha": {
        "full_form": "Machine or Device",
        "description": "Represents devices used in treatment, such as ventilators or insulin pumps.",
        "examples": ["Dialysis machine"]
    }
}
