In [21]:
from langchain_community.document_loaders import JSONLoader


In [22]:
loader = JSONLoader(
    file_path='data.json',
    text_content=False,
    jq_schema='.faqs[]')

data = loader.load()

In [23]:
max_length_doc = max(data, key=lambda doc: len(doc.page_content))

print(len(max_length_doc.page_content))

376


In [24]:
from langchain_text_splitters import CharacterTextSplitter, RecursiveCharacterTextSplitter

# splitter = RecursiveJsonSplitter(max_chunk_size=300)
splitter = CharacterTextSplitter(chunk_overlap=0, chunk_size=300)

In [92]:
data

[Document(metadata={'source': '/Users/sidhantsriv/code/projects/yantra/sample.json', 'seq_num': 1}, page_content='{"question": "What is Python?", "tags": ["programming", "python", "basics"], "answer": "Python is a high-level, interpreted programming language known for its simplicity and versatility."}'), Document(metadata={'source': '/Users/sidhantsriv/code/projects/yantra/sample.json', 'seq_num': 2}, page_content='{"question": "What is Pydantic?", "tags": ["python", "validation", "models"], "answer": "Pydantic is a Python library for data validation and settings management using Python type annotations."}'), Document(metadata={'source': '/Users/sidhantsriv/code/projects/yantra/sample.json', 'seq_num': 3}, page_content='{"question": "How do I create a virtual environment in Python?", "tags": ["python", "virtualenv", "environment"], "answer": "You can create a virtual environment in Python by running `python -m venv <env_name>` and activating it using the respective command for your ope

In [93]:
texts = splitter.split_documents(data)

In [40]:
texts

[Document(metadata={'source': '/Users/sidhantsriv/code/projects/yantra/sample.json', 'seq_num': 1}, page_content='{"question": "What is Python?", "tags": ["programming", "python", "basics"], "answer": "Python is a high-level, interpreted programming language known for its simplicity and versatility."}'), Document(metadata={'source': '/Users/sidhantsriv/code/projects/yantra/sample.json', 'seq_num': 2}, page_content='{"question": "What is Pydantic?", "tags": ["python", "validation", "models"], "answer": "Pydantic is a Python library for data validation and settings management using Python type annotations."}'), Document(metadata={'source': '/Users/sidhantsriv/code/projects/yantra/sample.json', 'seq_num': 3}, page_content='{"question": "How do I create a virtual environment in Python?", "tags": ["python", "virtualenv", "environment"], "answer": "You can create a virtual environment in Python by running `python -m venv <env_name>` and activating it using the respective command for your ope

In [25]:
import os
from dotenv import load_dotenv

load_dotenv()
COHERE_API = os.environ.get("COHERE_API")
SUPABASE_SERVICE_KEY=os.environ.get("SUPABASE_SERVICE_KEY")
SUPABASE_URL="https://uggjxkgfewdwgxbdivox.supabase.co"
GROQ_API_KEY=os.environ.get("GROQ_API_KEY")



In [34]:
from langchain_cohere import CohereEmbeddings

embeddings = CohereEmbeddings(cohere_api_key=COHERE_API,model="embed-english-v3.0")


In [27]:
from supabase.client import Client, create_client
supabase: Client = create_client(SUPABASE_URL, SUPABASE_SERVICE_KEY)

In [97]:
from langchain_community.vectorstores import SupabaseVectorStore
from langchain.retrievers import EnsembleRetriever
from langchain_community.retrievers import BM25Retriever

bm25_retriever = BM25Retriever.from_documents(
    texts
)
bm25_retriever.k = 2
vector_store = SupabaseVectorStore.from_documents(
    texts,
    embeddings,
    client=supabase,
    table_name="documents",
    query_name="match_documents",
)
ensemble_retriever = EnsembleRetriever(
    retrievers=[bm25_retriever, vector_store.as_retriever()], weights=[0.5, 0.5]
)

In [98]:
query = "What does pydantic do?"
matched_docs = vector_store.similarity_search_with_relevance_scores(query)

print(matched_docs[0])



(Document(metadata={'source': '/Users/sidhantsriv/code/projects/yantra/sample.json', 'seq_num': 2}, page_content='{"question": "What is Pydantic?", "tags": ["python", "validation", "models"], "answer": "Pydantic is a Python library for data validation and settings management using Python type annotations."}'), 0.654289083369411)


In [99]:
from langchain_groq import ChatGroq
llm = ChatGroq(
    model="mixtral-8x7b-32768",
    temperature=0.0,
    max_retries=2,
)

In [100]:
from langchain.chains import RetrievalQA
from langchain.prompts import PromptTemplate
from langchain import hub
# prompt_template = """As a {persona}, use the following pieces of context to answer the question at the end. 
# If you don't know the answer, just say that you don't know, don't try to make up an answer or give anything other than the relevant information. 
# {context} 
# Question: {query} 
# Helpful Answer:"""

# PROMPT = PromptTemplate(
#     template=prompt_template,
#     input_variables=["persona", "context", "query"]
# )

PROMPT = hub.pull("rlm/rag-prompt")

qa = RetrievalQA.from_chain_type(llm=llm, chain_type="stuff",retriever=ensemble_retriever, chain_type_kwargs={"prompt": PROMPT})



In [101]:
response = qa({
    "query": "What is pydantic",

    "context": "You are an FAQ assistant. Do not give anything irrelevant. If it doesn't exist in the context, do not return it"
})
print(response)

{'query': 'What is pydantic', 'context': "You are an FAQ assistant. Do not give anything irrelevant. If it doesn't exist in the context, do not return it", 'result': "Pydantic is a Python library that uses type annotations for data validation and settings management. It's not a basic programming language like Python, but rather a tool to ensure data accuracy and structure in your Python code, similar to how JSON is a lightweight data format for data interchange."}


In [86]:
response['result']


"Pydantic is a Python library for data validation and settings management, using Python type annotations. It's not the same as Python, which is a high-level programming language, or JSON, a lightweight data-interchange format, but it can be used with both for handling data in a structured and controlled way."

In [20]:
from langchain.document_loaders import TextLoader

text_loader = TextLoader("./context.txt")
texts = text_loader.load()
splitter_r = RecursiveCharacterTextSplitter(chunk_size=400)
d = splitter.transform_documents(texts)


Created a chunk of size 324, which is longer than the specified 300
Created a chunk of size 322, which is longer than the specified 300


6

In [38]:
from langchain_community.vectorstores import SupabaseVectorStore
from langchain.retrievers import EnsembleRetriever
from langchain_community.retrievers import BM25Retriever

supabase: Client = create_client(SUPABASE_URL, SUPABASE_SERVICE_KEY)
vector_store = SupabaseVectorStore.from_documents(
    d,
    embeddings,
    client=supabase,
    table_name="documents",
    query_name="match_documents",
)

  vector_store = SupabaseVectorStore.from_documents(


In [31]:
vector_store.add_documents(d)

AttributeError: 'coroutine' object has no attribute 'add_documents'