In [1]:
from pathlib import Path
from typing import List
from langchain.chains.openai_functions import create_structured_output_chain
from langchain.chat_models import ChatOpenAI
from langchain.document_loaders import TextLoader
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.graphs import Neo4jGraph
from langchain.prompts import ChatPromptTemplate
from langchain.text_splitter import TokenTextSplitter
from langchain_core.pydantic_v1 import BaseModel, Field
import os
import time
from langchain.llms import Ollama 
from langchain.document_loaders import WebBaseLoader, TextLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings import GPT4AllEmbeddings 
from langchain.vectorstores import Chroma
from langchain.chains import RetrievalQA

In [2]:
# Connect to the Neo4j instance that stores the parent/child chunk graph.
# Connection settings come from the environment (NEO4J_URI, NEO4J_USERNAME,
# NEO4J_PASSWORD) so real credentials never need to be saved in the notebook;
# the fallbacks are the local-development values used when nothing is set.
graph = Neo4jGraph(
    url=os.environ.get('NEO4J_URI', 'bolt://localhost:7687'),
    username=os.environ.get('NEO4J_USERNAME', 'neo4j'),
    password=os.environ.get('NEO4J_PASSWORD', 'password'),
)

In [3]:
# LLM handle for the llama2 model served by Ollama at localhost:11434;
# both RetrievalQA chains in this notebook are built on it.
ollama = Ollama(
    model='llama2',
    base_url='http://localhost:11434',
)

In [4]:
# Thoughtworks "Data Mesh in practice" articles.
system_1 = WebBaseLoader(
    'https://www.thoughtworks.com/en-de/insights/articles/data-mesh-in-practice-technology-and-the-architecture'
)
system_2 = WebBaseLoader(
    'https://www.thoughtworks.com/en-de/insights/articles/data-mesh-in-practice-organizational-operating-model'
)
system_3 = WebBaseLoader(
    'https://www.thoughtworks.com/en-de/insights/articles/data-mesh-in-practice-product-thinking-and-development'
)

# Thoughtworks blog posts on developer experience for data mesh.
loader = WebBaseLoader(
    'https://www.thoughtworks.com/en-in/insights/blog/data-strategy/dev-experience-data-mesh-product'
)
loader_1 = WebBaseLoader(
    'https://www.thoughtworks.com/en-in/insights/blog/data-strategy/dev-experience-data-mesh-platform'
)

# Load every source and flatten the results into one list, in this source order.
raw_documents = [
    page
    for source in (loader_1, loader, system_1, system_2, system_3)
    for page in source.load()
]

# Normal RAG

In [5]:
# Imports for the evaluation sections, grouped ahead of the TruLens setup.
from IPython.display import JSON
from langchain.chains import LLMChain
from langchain.prompts import PromptTemplate
from langchain.prompts.chat import ChatPromptTemplate, HumanMessagePromptTemplate
from trulens_eval import Feedback, Tru, TruChain

# Create the TruLens workspace handle and reset its database.
tru = Tru()
tru.reset_database()

🦑 Tru initialized with db url sqlite:///default.sqlite .
🛑 Secret keys may be written to the database. See the `database_redact_keys` option of `Tru` to prevent this.


In [6]:
from langchain.prompts import PromptTemplate

# Text of the QA prompt. `{context}` and `{question}` are the two template
# placeholders; the same prompt object is passed to both RetrievalQA chains.
_QA_TEMPLATE_TEXT = """Use the following pieces of context to answer the question at the end. 
If you don't know the answer, just say that you don't know, don't try to make up an answer.  

RULES:                                               
DO NOT INCLUDE THE INFORMATION IN YOUR ANSWER.

CONTEXT: 
{context}

Question:
{question}

"""

QA_CHAIN_PROMPT = PromptTemplate.from_template(_QA_TEMPLATE_TEXT)

In [7]:
# Baseline ("normal") RAG: chunk the loaded pages, embed the chunks into a
# Chroma store, and answer questions with a "stuff" RetrievalQA chain.
text_splitter = RecursiveCharacterTextSplitter(chunk_overlap=20, chunk_size=500)
all_splits = text_splitter.split_documents(raw_documents)

vectorstore = Chroma.from_documents(
    embedding=GPT4AllEmbeddings(),
    documents=all_splits,
)

qachain_full_stuff = RetrievalQA.from_chain_type(
    llm=ollama,
    retriever=vectorstore.as_retriever(),
    chain_type="stuff",
    chain_type_kwargs={"prompt": QA_CHAIN_PROMPT},
)

bert_load_from_file: gguf version     = 2
bert_load_from_file: gguf alignment   = 32
bert_load_from_file: gguf data offset = 695552
bert_load_from_file: model name           = BERT
bert_load_from_file: model architecture   = bert
bert_load_from_file: model file type      = 1
bert_load_from_file: bert tokenizer vocab = 30522


In [8]:
import litellm
from trulens_eval import LiteLLM

# Turn off LiteLLM's verbose flag.
litellm.set_verbose = False

# Feedback provider that scores answers with the Ollama-served llama2 model.
ollama_provider = LiteLLM(
    api_base='http://localhost:11434',
    model_engine="ollama/llama2",
)

# Relevance feedback evaluated on each record's main input and main output.
relevance = (
    Feedback(ollama_provider.relevance_with_cot_reasons)
    .on_input_output()
)

✅ In relevance_with_cot_reasons, input prompt will be set to __record__.main_input or `Select.RecordInput` .
✅ In relevance_with_cot_reasons, input response will be set to __record__.main_output or `Select.RecordOutput` .


In [9]:
# Call the feedback function directly on a hand-written prompt/response pair
# to see what it returns (a score plus a dict of reasons, per the output below).
ollama_provider.relevance_with_cot_reasons("what are tools used in?", "Snowflake,Talend,DBT,Collibra,Monte Carlo,Dataops.live,SOLE,OAMClient libraries")

(0.9, {'reason': 'Criteria: Relevance to the given prompt \n'})

In [12]:
# Wrap the baseline RAG chain so its runs are logged under app id
# 'normal_rag' with the relevance feedback attached.
tru_recorder = TruChain(
    qachain_full_stuff,
    feedbacks=[relevance],
    app_id='normal_rag',
)

In [13]:
# Ask the baseline RAG chain a question inside the recorder context so the
# call shows up in the TruLens records, then display the raw answer text.
with tru_recorder as recording:
    llm_response = qachain_full_stuff.run("what is data product")
display(llm_response)

'Thank you for providing the context. To answer your question, a data product differs from a data asset in that a data product is created to serve a specific user-driven goal as identified in the Lean Value Tree, while a data asset can be any entity composed of data, such as databases or application output files.\n\nA data product is subject to clearly defined Service Level Objectives (SLOs) and is owned by a single domain or stakeholder, maintained by a single data product team. Additionally, a data product is designed to meet the consumption requirements of specific consumers, who may access or consume it through various channels.\n\nIn contrast, a data asset does not have these specific goals, SLOs, or ownership structures, and its purpose may not be directly tied to meeting the needs of a particular consumer group.\n\nI hope this helps clarify the difference between a data product and a data asset. If you have any further questions, please let me know!'

In [14]:
# Show the records table: element 0 of what get_records_and_feedback returns.
# An empty app_ids list is passed, i.e. no specific app is named.
tru.get_records_and_feedback(app_ids=[])[0]

Unnamed: 0,app_id,app_json,type,record_id,input,output,tags,record_json,cost_json,perf_json,ts,relevance_with_cot_reasons,relevance_with_cot_reasons_calls,latency,total_tokens,total_cost
0,normal_rag,"{""tru_class_info"": {""name"": ""TruChain"", ""modul...",RetrievalQA(langchain.chains.retrieval_qa.base),record_hash_7479619b6f5a210c472ccd9ad83ff40b,"""what is data product""","""Thank you for providing the context. To answe...",-,"{""record_id"": ""record_hash_7479619b6f5a210c472...","{""n_requests"": 0, ""n_successful_requests"": 0, ...","{""start_time"": ""2024-01-05T11:46:15.080033"", ""...",2024-01-05T11:46:27.773222,0.9,"[{'args': {'prompt': 'what is data product', '...",12,0,0.0


# Neo4j RAG (Parent Retriever)

In [15]:
# Ingestion of data in the neo4j graph
from langchain.text_splitter import TokenTextSplitter
from langchain.document_loaders import WikipediaLoader

# Large "parent" chunks are what the LLM eventually reads; each parent is
# split again into small "child" chunks that are embedded and searched.
parent_splitter = TokenTextSplitter(chunk_size=1000, chunk_overlap=24)
child_splitter = TokenTextSplitter(chunk_size=100, chunk_overlap=24)

parent_documents = parent_splitter.split_documents(raw_documents)

for parent_doc in parent_documents:
    child_documents = child_splitter.split_documents([parent_doc])
    parent_text = parent_doc.page_content
    child_texts = [child.page_content for child in child_documents]

    # A parent without children can never be reached through the child index,
    # so do not write it.
    if not child_texts:
        continue

    # Create the parent node exactly once, then fan out over the children.
    # The parent CREATE must come BEFORE the UNWIND: placed after it, the
    # clause runs once per child row and writes a separate duplicate Parent
    # node (with the full parent text) for every single child.
    graph.query(
        """
        CREATE (p:Parent {text: $parent})
        WITH p
        UNWIND $children AS child
        CREATE (c:Child {text: child})
        CREATE (c)-[:HAS_PARENT]->(p)
        """,
        {"parent": parent_text, "children": child_texts}
    )


KeyboardInterrupt: 

In [16]:
from langchain.vectorstores.neo4j_vector import Neo4jVector
from langchain.embeddings import (
    OllamaEmbeddings,
    SentenceTransformerEmbeddings,
    BedrockEmbeddings,
)

# Custom retrieval step: hop from each matched Child node to its Parent and
# return the parent's text instead of the child's. Several matched children
# can resolve to the same parent text, so rows are collapsed per text (keeping
# the best score) rather than handing the LLM repeated copies of one passage.
# `node` and `score` are the variables Neo4jVector's index lookup provides to
# this query; it must return `text`, `score` and `metadata` columns.
retrieval_query = """
MATCH (node)-[:HAS_PARENT]->(parent)
WITH parent.text AS text, max(score) AS score
RETURN text, score, {} AS metadata
ORDER BY score DESC
"""

# Build the vector index over the `text` property of Child nodes, storing each
# embedding in the `embedding` property. Connection settings come from the
# environment (NEO4J_URI, NEO4J_USERNAME, NEO4J_PASSWORD) so credentials need
# not be saved in the notebook; the fallbacks are the local-development values.
vector_index = Neo4jVector.from_existing_graph(
    GPT4AllEmbeddings(),
    url=os.environ.get('NEO4J_URI', 'bolt://localhost:7687'),
    username=os.environ.get('NEO4J_USERNAME', 'neo4j'),
    password=os.environ.get('NEO4J_PASSWORD', 'password'),
    index_name="new_index_name",
    node_label="Child",
    text_node_properties=["text"],
    embedding_node_property="embedding",
    retrieval_query=retrieval_query,
)

bert_load_from_file: gguf version     = 2
bert_load_from_file: gguf alignment   = 32
bert_load_from_file: gguf data offset = 695552
bert_load_from_file: model name           = BERT
bert_load_from_file: model architecture   = bert
bert_load_from_file: model file type      = 1
bert_load_from_file: bert tokenizer vocab = 30522


In [17]:
# Parent-retriever RAG: same LLM, prompt and "stuff" strategy as the baseline
# chain, but documents come from the Neo4j vector index.
qachain = RetrievalQA.from_chain_type(
    llm=ollama,
    retriever=vector_index.as_retriever(),
    chain_type="stuff",
    chain_type_kwargs={"prompt": QA_CHAIN_PROMPT},
)

In [18]:
# Rebind the recorder to the Neo4j parent-retriever chain; its runs are
# logged under app id 'neo4j_parental_rag' with the relevance feedback.
tru_recorder = TruChain(
    qachain,
    feedbacks=[relevance],
    app_id='neo4j_parental_rag',
)

In [19]:
# Ask the parent-retriever chain the same question as the baseline, inside
# the recorder context so the call is logged, then display the raw answer.
with tru_recorder as recording:
    llm_response = qachain.run("what is data product")
display(llm_response)

'A data product is a structured set of data that provides value to a particular stakeholder or use case. It can be thought of as a "productized" data asset, meaning that it has been organized and packaged in a way that makes it easy for consumers to access, consume, and utilize.\n\nData products typically have the following characteristics:\n\n1. Defined scope and purpose: The data product has a clear definition of what it is intended to do and who it is intended to serve.\n2. Structured data: The data product is organized in a way that makes it easy for consumers to find and use the data they need. This may involve breaking down large datasets into smaller, more manageable chunks or creating new data entities to represent specific concepts or relationships.\n3. Consistent and reliable: The data product is designed to be consistent and reliable, so that consumers can rely on it to provide accurate and up-to-date information.\n4. Valuable: The data product provides value to the consumer

In [20]:
# Show the records table again (element 0 of the returned value); with an
# empty app_ids list the output below includes rows for both apps.
tru.get_records_and_feedback(app_ids=[])[0]

Unnamed: 0,app_id,app_json,type,record_id,input,output,tags,record_json,cost_json,perf_json,ts,relevance_with_cot_reasons,relevance_with_cot_reasons_calls,latency,total_tokens,total_cost
0,normal_rag,"{""tru_class_info"": {""name"": ""TruChain"", ""modul...",RetrievalQA(langchain.chains.retrieval_qa.base),record_hash_7479619b6f5a210c472ccd9ad83ff40b,"""what is data product""","""Thank you for providing the context. To answe...",-,"{""record_id"": ""record_hash_7479619b6f5a210c472...","{""n_requests"": 0, ""n_successful_requests"": 0, ...","{""start_time"": ""2024-01-05T11:46:15.080033"", ""...",2024-01-05T11:46:27.773222,0.9,"[{'args': {'prompt': 'what is data product', '...",12,0,0.0
1,neo4j_parental_rag,"{""tru_class_info"": {""name"": ""TruChain"", ""modul...",RetrievalQA(langchain.chains.retrieval_qa.base),record_hash_9744ee3913a215d5b877d6ffd22f89c8,"""what is data product""","""A data product is a structured set of data th...",-,"{""record_id"": ""record_hash_9744ee3913a215d5b87...","{""n_requests"": 0, ""n_successful_requests"": 0, ...","{""start_time"": ""2024-01-05T11:48:04.136092"", ""...",2024-01-05T11:48:43.352390,,,12,0,0.0


In [21]:
# Launch the TruLens dashboard (a streamlit process, per the Popen repr shown
# in the output) to browse the recorded runs.
tru.start_dashboard()

Starting dashboard ...
Config file already exists. Skipping writing process.
Credentials file already exists. Skipping writing process.


Accordion(children=(VBox(children=(VBox(children=(Label(value='STDOUT'), Output())), VBox(children=(Label(valu…

Dashboard started at http://192.168.1.6:8501 .


<Popen: returncode: None args: ['streamlit', 'run', '--server.headless=True'...>

In [23]:
# Shut the dashboard down again.
tru.stop_dashboard()

# Hallucinations

In [31]:
# Point the recorder at the Neo4j parent-retriever chain again for the
# off-topic question below; runs are logged under 'neo4j_parental_rag'.
tru_recorder = TruChain(
    qachain,
    feedbacks=[relevance],
    app_id='neo4j_parental_rag',
)

In [22]:
# Probe the parent-retriever chain with a question the source articles do not
# cover, recording the call, and display what the model answers.
with tru_recorder as recording:
    llm_response = qachain.run("what is life")
display(llm_response)

'I don\'t have personal experiences or knowledge about the specific context you provided, but I can try to help you understand it better.\n\nBased on the information provided in the article, "Data Mesh in practice: Technology and architecture (Part IV)" by Ammara Gafoor, Ian Murdoch, and Kiran Prakash, the authors discuss their learnings from implementing Data Mesh at Roche. They highlight the importance of technology and architecture in supporting data mesh practices.\n\nIn the article, the authors mention that Data Mesh is built on top of a robust technology stack, which includes Snowflake for storage, Compute for computation, and Monte Carlo for monitoring. They also note that the platform provides an isolation layer between the application and the underlying infrastructure, which allows for more agile development and faster time-to-market for data products.\n\nThe authors also discuss the importance of having a well-thought-out architecture to support data mesh practices. They ment

In [23]:
# Point the recorder back at the baseline chain for the same off-topic
# question; runs are logged under 'normal_rag'.
tru_recorder = TruChain(
    qachain_full_stuff,
    feedbacks=[relevance],
    app_id='normal_rag',
)

In [24]:
# Ask the baseline chain the same off-topic question, recording the call, and
# display its answer for comparison with the parent-retriever chain.
with tru_recorder as recording:
    llm_response = qachain_full_stuff.run("what is life")
display(llm_response)

A new object of type <class 'langchain.chains.retrieval_qa.base.RetrievalQA'> at 0x2905f4280 is calling an instrumented method <function Chain.__call__ at 0x11f6bc5e0>. The path of this call may be incorrect.
Guessing path of new object is app based on other object (0x2c40d1950) using this function.
A new object of type <class 'langchain.chains.retrieval_qa.base.RetrievalQA'> at 0x2905f4280 is calling an instrumented method <function BaseRetrievalQA._call at 0x12a60c2c0>. The path of this call may be incorrect.
Guessing path of new object is app based on other object (0x2c40d1950) using this function.
A new object of type <class 'langchain_core.vectorstores.VectorStoreRetriever'> at 0x17f128cd0 is calling an instrumented method <function BaseRetriever.get_relevant_documents at 0x11f697880>. The path of this call may be incorrect.
Guessing path of new object is app.retriever based on other object (0x2c40d1630) using this function.
A new object of type <class 'langchain_core.vectorstores

'Thank you for providing the context. Based on the information provided, I don\'t know the answer to your question "what is life." The context does not provide any information that would help me answer this question. Please let me know if there is anything else I can help you with.'

In [25]:
# Open the TruLens dashboard to inspect the off-topic runs recorded above.
tru.start_dashboard()

Starting dashboard ...
Config file already exists. Skipping writing process.
Credentials file already exists. Skipping writing process.
Dashboard already running at path:   Network URL: http://192.168.1.6:8501



<Popen: returncode: None args: ['streamlit', 'run', '--server.headless=True'...>