In [None]:
# Attach Google Drive at /content/drive; the Chroma store used later in this
# notebook lives under that mount point.
from google.colab import drive

drive.mount(
    "/content/drive",
    force_remount=True,
)

Mounted at /content/drive


In [None]:
pip install -U langchain-community langchain_openai chromadb neo4j

In [None]:
import getpass
import os

import openai
from langchain.chains import LLMChain
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.graphs import Neo4jGraph
from langchain.prompts import ChatPromptTemplate
from langchain.prompts import PromptTemplate
from langchain.schema import Document
from langchain.vectorstores import Chroma
from langchain_openai import ChatOpenAI
from sentence_transformers import CrossEncoder

In [None]:
# Setup API for LLM and text embedding model.
# The key is read from the OPENAI_API_KEY environment variable when it is already
# set (so an existing value is no longer overwritten by a placeholder); otherwise
# it is requested through a hidden prompt. Either way the secret never has to be
# typed into the notebook source.
API_KEY = os.environ.get('OPENAI_API_KEY') or getpass.getpass('OpenAI API key: ')
os.environ['OPENAI_API_KEY'] = API_KEY

In [None]:
# Directory under the mounted Google Drive that is passed to Chroma as its
# persist_directory (the on-disk vector store queried by the VectorRAG step).
CHROMA_PATH = '/content/drive/MyDrive/FinScope3D/Unstructured_Data/chroma_db'

In [None]:
# Open the Chroma store persisted at CHROMA_PATH. Queries are embedded with the
# OpenAI "text-embedding-ada-002" model configured here.
embeddings = OpenAIEmbeddings(
    model="text-embedding-ada-002",
)
db = Chroma(
    embedding_function=embeddings,
    persist_directory=CHROMA_PATH,
)

In [None]:
# Load neo4j database.
# Connection settings are taken from the NEO4J_URI / NEO4J_USERNAME /
# NEO4J_PASSWORD environment variables when they are set. URI and user fall back
# to the editable defaults below; the password is a secret, so when it is not in
# the environment it is requested through a hidden prompt instead of being
# written into the notebook.
URI = os.environ.get("NEO4J_URI", "YOUR_URI")
USER = os.environ.get("NEO4J_USERNAME", 'neo4j')
PASSWORD = os.environ.get("NEO4J_PASSWORD") or getpass.getpass("Neo4j password: ")
graph = Neo4jGraph(url=URI, username=USER, password=PASSWORD)

In [None]:
# Setup cross-encoder for re-ranking similarity search results.
# It is later called with [query, passage] pairs and returns one score per pair.
# The recorded output below shows the model files being downloaded on first use.
cross_encoder = CrossEncoder('cross-encoder/ms-marco-MiniLM-L-6-v2')

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/794 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/316 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

In [None]:
# Setup LLM: a ChatOpenAI client created with the library defaults (no model name
# or temperature is passed). The same instance is used both to generate the
# Cypher query and to produce the final answer.
# NOTE(review): presumably authenticates via the OPENAI_API_KEY environment
# variable set earlier in this notebook - confirm.
model = ChatOpenAI()

**Input Query**

In [None]:
# Read the user's question. The prompt no longer ends with a stray, unbalanced
# apostrophe, and surrounding whitespace is removed before the text is used for
# retrieval and sent to the LLM.
query = input("Question: ").strip()

**GraphRAG**

In [None]:
# Prompt + chain that ask the LLM to translate the user's question into Cypher.
# The single template variable is {query}.
cypher_prompt = PromptTemplate.from_template(
    "Convert the following natural language query into a Cypher query for a neo4j knowledge graph: {query}"
)
cypher_chain = LLMChain(prompt=cypher_prompt, llm=model)

In [None]:
# Graph result for GraphRAG
def _extract_cypher(llm_output):
    """Return the bare Cypher statement from an LLM reply.

    Chat models frequently wrap code in a Markdown fence (optionally tagged
    ``cypher``) and may add prose around it. When a fence is present, only the
    content of the first fenced section is kept; otherwise the reply is returned
    with surrounding whitespace removed.
    """
    text = llm_output.strip()
    if "```" in text:
        fenced = text.split("```")[1]  # content between the first pair of fences
        first_line, _, rest = fenced.partition("\n")
        # Drop a language tag / empty remainder of the opening fence line.
        if first_line.strip().lower() in ("", "cypher"):
            fenced = rest
        text = fenced.strip()
    return text


generated_cypher_query = _extract_cypher(cypher_chain.run(query))

# NOTE(security): this statement is produced by the LLM from free-form user input
# and is executed as-is. Connect with a read-only Neo4j user so a generated write
# or delete statement cannot modify the graph.
try:
    graph_results = graph.query(generated_cypher_query)
except Exception as exc:
    # Generated Cypher is often invalid for the actual graph. The HybridRAG cell
    # only appends graph context when graph_text is non-empty, so degrade to
    # "no graph context" instead of aborting the whole notebook run.
    print(f"Graph query failed, continuing without graph context: {exc}")
    graph_results = []

# One line per returned record, rendered as "key: value, key: value".
graph_text = "\n".join(
    ", ".join(f"{key}: {value}" for key, value in record.items())
    for record in graph_results
)

**VectorRAG**

In [None]:
# Template for the final answer prompt. {context} is filled with the retrieved
# passages (plus the graph facts when any were found) and {question} with the
# user's query.
PROMPT_TEMPLATE = """
Answer the question based only on the following context:

{context}

---

Answer the question based on the above context: {question}
"""

In [None]:
# Fetch the 10 closest chunks together with their relevance scores. These are
# only candidates; the cross-encoder re-orders them in the HybridRAG step.
results = db.similarity_search_with_relevance_scores(query, k=10)

# Flatten each (Document, score) pair into a plain dict.
ranked_documents = [
    dict(
        page_content=hit.page_content,
        metadata=hit.metadata,
        score=relevance,
    )
    for hit, relevance in results
]

# Same records, tagged with the query and with the vector score kept under
# "original_score" so it is not confused with the later re-rank score.
reranker_input = [
    dict(
        query=query,
        page_content=candidate["page_content"],
        metadata=candidate["metadata"],
        original_score=candidate["score"],
    )
    for candidate in ranked_documents
]

**HybridRAG from both results**

In [None]:
# Answer only when vector retrieval found something whose first hit reaches the
# 0.7 relevance threshold.
# NOTE(review): assumes `results` is ordered best-first so results[0] is the top
# score - confirm against the vector store's documentation.
if not results or results[0][1] < 0.7:
  print('Unable to find matching results')
else:
  pairs = [[query, doc['page_content']] for doc in reranker_input]

  # Re-ranking with cross encoder: one score per [query, passage] pair,
  # attached to the matching candidate record.
  rerank_scores = cross_encoder.predict(pairs)
  for entry, score in zip(reranker_input, rerank_scores):
    entry["rerank_score"] = score

  final_results = sorted(reranker_input, key=lambda x: x["rerank_score"], reverse=True)

  # Keep the 5 best re-ranked passages as context.
  top_k_documents = [
    {"page_content": doc["page_content"], "metadata": doc["metadata"]}
    for doc in final_results[:5]
  ]
  final_docs = [
    Document(page_content=doc["page_content"], metadata=doc["metadata"])
    for doc in top_k_documents
  ]
  text_context = '\n\n---\n\n'.join([doc.page_content for doc in final_docs])

  # Combine context from graph and text; the graph section is added only when
  # the graph query returned something.
  if graph_text:
    full_context = text_context + "\n\nGraph-Based Knowledge:\n" + graph_text
  else:
    full_context = text_context
  prompt_template = ChatPromptTemplate.from_template(PROMPT_TEMPLATE)
  prompt = prompt_template.format(context=full_context, question=query)

  # invoke() replaces the deprecated predict(); the answer text is the returned
  # message's .content.
  response = model.invoke(prompt).content

  # Sources are taken from the documents that were actually placed in the
  # context (the re-ranked top 5), not from every raw retrieval hit.
  sources = [
    f"{doc.metadata.get('company_ticker', '')}_{doc.metadata.get('year', None)}"
    for doc in final_docs
  ]
  formatted_response = f"Response: {response}"
  print(formatted_response)

Response: Based on the provided context, Apple's business model involves designing, manufacturing, and marketing smartphones, personal computers, tablets, wearables, accessories, and related services. Their focus is on continually improving their products and services to maintain functional and design advantages, introducing innovative new products, managing frequent product transitions, and expanding market opportunities in various technology sectors. They also compete by emphasizing design innovation, product quality, pricing, service features, and a strong ecosystem of third-party software and accessories. The company faces substantial competition in the market but aims to stimulate customer demand through new product introductions and upgrades.
