### Custom function for loading and splitting JSON data

In [10]:
import json

def split_json_by_quotes(data, chunk_size=3):
    """Split a ``{category: [quote, ...]}`` mapping into fixed-size chunks.

    Quotes are consumed in the mapping's iteration order and packed into
    chunks holding ``chunk_size`` quotes in total. A chunk may span several
    categories, and one category's quotes may be spread over several chunks.

    Args:
        data: Mapping of category name to a list of quotes.
        chunk_size: Number of quotes per chunk. Values below 1 behave like 1,
            because the size check runs after each quote has been added.

    Returns:
        A list of ``{category: [quote, ...]}`` dicts. Every chunk except
        possibly the last holds exactly ``chunk_size`` quotes, and a category
        appears in a chunk only if it contributes at least one quote to it.
    """
    chunks = []
    current_chunk = {}
    count = 0

    for category, quotes in data.items():
        for quote in quotes:
            # Create the category entry lazily so that a chunk never carries
            # an empty list for a category whose quotes ended up elsewhere.
            current_chunk.setdefault(category, []).append(quote)
            count += 1

            if count >= chunk_size:
                chunks.append(current_chunk)
                current_chunk = {}
                count = 0

    # Flush the trailing, partially filled chunk (if any).
    if current_chunk:
        chunks.append(current_chunk)

    return chunks

# Load JSON data from file.
# Only the failures expected while reading or parsing the file are caught
# (JSONDecodeError and UnicodeDecodeError are both ValueError subclasses).
# The error is reported and then re-raised so that later cells do not run
# without `split_data` and fail with an unrelated NameError.
try:
    with open("shayari_data.json", "r", encoding="utf-8") as file:
        shayari_data = json.load(file)
except (OSError, ValueError) as e:
    print(f"Error occurred: {e}")
    raise

# Split data into chunks of 3 quotes each
split_data = split_json_by_quotes(shayari_data, chunk_size=3)


### Converting the split data (chunks) into a format suitable for the vector store


In [14]:
from langchain.schema import Document  

# Flatten every chunk into one Document per quote: the text is rendered as
# "<quote> - <author>" and the category is kept as metadata.
documents = [
    Document(
        page_content=f"{entry['quote']} - {entry['author']}",
        metadata={"category": label},
    )
    for batch in split_data
    for label, entries in batch.items()
    for entry in entries
]

In [18]:
from langchain_community.vectorstores import Chroma
from langchain_ollama import OllamaEmbeddings

# Embed the documents with the "llama3.2" model via Ollama and index them
# in a Chroma vector store.
embed = OllamaEmbeddings(model="llama3.2")
vector_store = Chroma.from_documents(embedding=embed, documents=documents)
# Report how many entries the store holds.
print(len(vector_store))


66

In [23]:
# Expose the vector store through the retriever interface; no search options
# are passed, so the library's default retrieval settings apply.
retriever = vector_store.as_retriever()


In [24]:

from langchain_ollama import ChatOllama
# Name of the Ollama model used for chat generation.
local_llm = "llama3.2"
# temperature=0 is presumably chosen to keep answers as repeatable as possible (confirm).
llm = ChatOllama(model=local_llm, temperature=0)

In [25]:
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.output_parsers import StrOutputParser


# Prompt template with two placeholders, {context} and {question}; it tells
# the model to answer using only the text supplied in the context section.
prompt = ChatPromptTemplate.from_template("""Answer the following question based only on the provided context:

<context>
{context}
</context>

Question: {question}""")
# Output parser instance for turning a model response into a plain string.
parser = StrOutputParser()

In [26]:
from langchain_core.runnables import RunnablePassthrough

# Post-processing
def format_docs(docs):
    """Return the ``page_content`` of each document, separated by blank lines."""
    contents = [item.page_content for item in docs]
    separator = "\n\n"
    return separator.join(contents)

# Chain: the retriever output is flattened by format_docs into the {context}
# slot while the incoming question is passed through unchanged; the rendered
# prompt is sent to the model and the reply is parsed into a string.
rag_chain = (
    {"context": retriever | format_docs, "question": RunnablePassthrough()}
    | prompt
    | llm
    | parser  # reuse the StrOutputParser created alongside the prompt
)

In [30]:
# Run the chain on a sample request; the returned string is the cell's output.
rag_chain.invoke("fetch some shayaris from context")


'Here are a few shayaris (poems) in Hindi from the provided context:\n\n1. दोस्त एक दर्द है, दोस्त एक मरहम है - फिराक गोरखपुरी\n2. इश्क की इंतहा अजीब दास्तान है, यह कहां से शुरू, कहां से ख़तम।'