# Installing dependencies

In [None]:
!pip install langchain pypdf chromadb rapidocr-onnxruntime lark tiktoken

# Loading and Preprocessing the data

## Load files from a directory

In [None]:
from langchain_community.document_loaders.pdf import PyPDFDirectoryLoader

# Directory handed to the PDF directory loader
DOC_DIR = './commentary_files'

# Load Documents
# extract_images=True is requested from the loader (presumably so text inside embedded
# images is OCR'd via the rapidocr-onnxruntime package installed above — TODO confirm)
loader = PyPDFDirectoryLoader(DOC_DIR, extract_images=True)
docs = loader.load()

## Check file metadata

In [None]:
# Print the metadata dict of every loaded document to see which fields are available
# print(len(docs))
for doc in docs:
    # doc.metadata['title'] = doc.metadata['source'].rsplit('/')[-1]
    # print("--------")
    print(doc.metadata)

## Split the documents into chunks

In [None]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Split
# chunk_size=1000 with chunk_overlap=200 (measured by the splitter's default length
# function — presumably characters; confirm if a token-based length is intended)
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
splits = text_splitter.split_documents(docs)

In [None]:
# Flatten list-valued `keywords` metadata into a single comma-separated string
# (presumably because the vector store only accepts scalar metadata values — confirm).
for doc in splits:
    keywords = doc.metadata.get('keywords')
    # isinstance also matches list subclasses, unlike an exact type comparison
    if isinstance(keywords, list):
        print("Fixing list", keywords)
        # str() guards against non-string items, which would make join() raise
        doc.metadata['keywords'] = ','.join(str(keyword) for keyword in keywords)

## Generate some metadata for each chunk

In [None]:
import re
import json

def extract_json_objects(text):
    """Extract every flat JSON object embedded in ``text``.

    Candidates are brace-delimited spans that contain no nested braces, so
    a nested object is only ever matched at its innermost level. Each
    candidate is parsed unmodified first; only when that fails are trailing
    commas before ``}`` / ``]`` removed and the parse retried. Trying the
    untouched text first keeps valid objects from being altered by the
    repair (e.g. a string value that happens to contain ``", ]"``).

    Args:
        text: Arbitrary text (for example raw LLM output) that may contain
            JSON objects.

    Returns:
        A list of the successfully parsed objects, in order of appearance.
        Candidates that are still invalid after the repair are skipped.
    """
    # Potential JSON objects (not guaranteed to be valid JSON)
    candidate_pattern = r'\{[^{}]*\}'
    # A comma followed by optional whitespace and a closing brace/bracket;
    # \s* (not \s+) so that ",}" without any whitespace is repaired as well.
    trailing_comma = re.compile(r',\s*([}\]])')

    valid_json_objects = []
    for candidate in re.findall(candidate_pattern, text):
        for attempt in (candidate, trailing_comma.sub(r'\1', candidate)):
            try:
                valid_json_objects.append(json.loads(attempt))
            except json.JSONDecodeError:
                # Not valid in this form; try the repaired form or give up
                continue
            break

    return valid_json_objects

In [None]:
from langchain.chat_models import ChatOllama
from langchain_core.prompts.chat import ChatPromptTemplate
from langchain_core.output_parsers import StrOutputParser

# Prompt
template = """Given some input text, summarize it and generate a JSON containing the following keys only:
- topic: A title that describes the content
- summary: A concise and accurate summary of the input document
- keywords: A string of comma-separated semantic keywords associated with the content text

Return the output as a json only.

Text: {text}
"""

# LLM
# ChatOllama selects the model through its `model` field; `model_name` is not a
# field of the class, so the model is now requested explicitly.
llm = ChatOllama(model="llama2", temperature=1)

prompt = ChatPromptTemplate.from_template(template)
# prompt -> chat model -> plain string output
summarizer = prompt | llm | StrOutputParser()

prompt

In [None]:
import json

# Tag every chunk with the topic / summary / keywords JSON produced by the summarizer.
for idx, doc in enumerate(splits):
    print(f"-------------- Processing chunk {idx}")
    raw_output = summarizer.invoke(doc.page_content)
    parsed_objects = extract_json_objects(raw_output)

    if parsed_objects:
        tags = parsed_objects[0]
    else:
        # No JSON object could be extracted: ask for it manually. An explicit
        # emptiness check replaces the former bare `except:`, which also
        # swallowed KeyboardInterrupt and any unrelated error.
        print(f"Received non-json for document {doc.metadata['source']}. \nReceived: {raw_output}\n\nPlease enter the expected JSON:")
        while True:
            entered = input().strip()
            if not entered:
                # A blank line skips this chunk instead of aborting the run
                tags = None
                break
            try:
                tags = json.loads(entered)
            except json.JSONDecodeError as err:
                print(f"Invalid JSON ({err}). Try again, or press Enter to skip this chunk:")
                continue
            if isinstance(tags, dict):
                break
            print("Expected a JSON object. Try again, or press Enter to skip this chunk:")

    if tags:
        doc.metadata.update(tags)

 {     "topic": "Changes to B3i Sector structure",     "summary": "The author invites the reader to consider modifying their B3i sector structure to take advantage of a new designation for users. The process is straightforward, and the author will guide the reader through it.",     "keywords": "B3i, sector structure, modification, users, designation." }


## Vectorize the chunks (using OpenAI due to a compatibility issue with LangChain and Ollama for Self Query retrievers)

In [None]:
from langchain_community.vectorstores import Chroma
from langchain_community.embeddings import OpenAIEmbeddings

# Embed
# Builds a Chroma store from the chunks using OpenAIEmbeddings, with
# ./chroma_db given as the persist directory
vectorstore = Chroma.from_documents(documents=splits, 
                                    embedding=OpenAIEmbeddings(), persist_directory="./chroma_db")

# Retriever over the vector store, created with as_retriever()'s default arguments
retriever = vectorstore.as_retriever()

In [None]:
# Try the retriever on a sample question.
# NOTE: this rebinds `docs`, which earlier held the documents returned by loader.load().
docs = retriever.get_relevant_documents("What are mega grants?")

In [None]:
# Show how many documents `docs` holds, then each one's metadata keys and text
print(len(docs))
for doc in docs:
    print("--------")
    print(doc.metadata.keys())
    print(doc.page_content)

## Use this LLM for all RAG variants below except Self Query (which needs OpenAI)

In [None]:
# LLM
from langchain.chat_models import ChatOllama
# ChatOllama's model is chosen via the `model` field (`model_name` is not a field of the class)
llm = ChatOllama(model="llama2", temperature=0.7)

# RAGs

## Basic RAG

In [None]:
from langchain.chat_models import ChatOllama
from langchain_core.prompts.chat import ChatPromptTemplate

# Prompt
template = """Please list the semantic keywords associated with the following user question. Return as a comma-separated list. Also, answer the question based only on the following context:
{context}

Question: {question}
"""

# Chat prompt built from the template above; it has `context` and `question` placeholders
prompt = ChatPromptTemplate.from_template(template)
prompt

In [None]:
# Chain
# Pipes the formatted prompt into the LLM; no output parser is attached here
chain = prompt | llm

In [None]:
# Run
# The `docs` list is passed directly as the prompt's `context` value
chain.invoke({"context": docs, "question": "What are pending mega grants?"})

In [None]:
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough

# The chain input (a question string) is routed to the retriever to produce `context`
# and passed through unchanged as `question`; the filled prompt then goes to the LLM
# and the result is parsed to a plain string.
rag_chain = (
    {"context": retriever, "question": RunnablePassthrough()}
    | prompt
    | llm
    | StrOutputParser()
)

rag_chain.invoke("What are pending mega grants?")

## Multi Query

In [None]:
from langchain_core.prompts.chat import ChatPromptTemplate
from langchain_core.output_parsers import StrOutputParser
from langchain.chat_models import ChatOllama

question = 'What are insights?'

template = """You are an AI language model assistant. Your task is to generate five 
different versions of the given user question to retrieve relevant documents from a vector 
database. By generating multiple perspectives on the user question, your goal is to help
the user overcome some of the limitations of the distance-based similarity search. 
Provide these alternative questions separated by newlines. Original question: {question}"""
prompt_perspectives = ChatPromptTemplate.from_template(template)

# Asks the LLM for alternative phrasings of the question and returns them as a list of strings.
generate_queries = (
    prompt_perspectives
    # ChatOllama's model is chosen via the `model` field (`model_name` is not a field of the class)
    | ChatOllama(model="llama2", temperature=1)
    | StrOutputParser()
    # Drop blank lines so that empty strings are never used as retrieval queries
    | (lambda x: [line.strip() for line in x.split("\n") if line.strip()])
)

generate_queries.invoke({"question": question})

In [None]:
from langchain.load import dumps, loads

def get_unique_union(documents: list[list]):
    """Return the unique documents from several retrieval result lists.

    Documents are compared by their serialized form (``dumps``), so two
    documents are duplicates only when they serialize identically.

    Args:
        documents: A list of document lists, one per query.

    Returns:
        A flat list of deserialized documents without duplicates, in
        first-seen order.
    """
    # Flatten list of lists, and convert each Document to string
    serialized_docs = (dumps(doc) for sublist in documents for doc in sublist)
    # dict.fromkeys de-duplicates while keeping first-seen order; a set would
    # return the documents in an arbitrary order that can differ between runs.
    unique_docs = dict.fromkeys(serialized_docs)
    return [loads(doc) for doc in unique_docs]

In [None]:
from operator import itemgetter
from langchain.chat_models import ChatOllama
from langchain_core.runnables import RunnablePassthrough

# RAG
template = """Answer the following question based on this context:

{context}

Question: {question}
"""

prompt = ChatPromptTemplate.from_template(template)

# ChatOllama's model is chosen via the `model` field (`model_name` is not a field of the class)
llm = ChatOllama(model="llama2", temperature=0)
# Generate query variants -> retrieve for each variant -> merge into unique documents
retrieval_chain = generate_queries | retriever.map() | get_unique_union

# Both branches receive the input dict: one runs the retrieval chain to build
# `context`, the other extracts the original `question`.
final_rag_chain = (
    {"context": retrieval_chain,
     "question": itemgetter("question")}
    | prompt
    | llm
    | StrOutputParser()
)

final_rag_chain.invoke({"question": question})

## RAG-Fusion

In [None]:
from langchain_core.prompts.chat import ChatPromptTemplate

template = """You are a helpful assistant thinks through a question based on a context. Reply only with a list of questions. \n
For context: {context}, generate multiple search queries related to: {question} \n
Output (4 queries):"""
# Query-generation prompt for RAG-Fusion; the template has `context` and `question` placeholders
prompt_rag_fusion = ChatPromptTemplate.from_template(template)

In [None]:
from langchain_core.output_parsers import StrOutputParser
from langchain.chat_models import ChatOllama

# Generates the RAG-Fusion search queries and returns them as a list of strings.
generate_queries = (
    prompt_rag_fusion
    | llm
    | StrOutputParser()
    # Drop blank lines so that empty strings are never used as retrieval queries
    | (lambda x: [line.strip() for line in x.split("\n") if line.strip()])
)

In [None]:
from langchain.load import dumps, loads

def reciprocal_rank_fusion(results: list[list], k=60):
    """Fuse several ranked document lists with Reciprocal Rank Fusion.

    Every document earns ``1 / (rank + k)`` for each list it appears in,
    where ``rank`` is its 0-based position in that list; the contributions
    are summed per document. Documents are identified by their serialized
    form (``dumps``).

    Args:
        results: Ranked lists of documents, one list per query.
        k: Constant added to the rank in the RRF formula.

    Returns:
        A list of ``(document, fused_score)`` tuples sorted by fused score
        in descending order.

    Side effects:
        Prints the ``source`` metadata value (``None`` when absent) and the
        fused score of every reranked document.
    """
    # Fused score per unique (serialized) document
    fused_scores = {}

    for ranked_docs in results:
        for rank, doc in enumerate(ranked_docs):
            # Serialized form doubles as a hashable identity for the document
            doc_key = dumps(doc)
            fused_scores[doc_key] = fused_scores.get(doc_key, 0) + 1 / (rank + k)

    # Highest fused score first
    reranked_results = [
        (loads(doc_key), score)
        for doc_key, score in sorted(
            fused_scores.items(), key=lambda item: item[1], reverse=True
        )
    ]

    # Debug output; .get() keeps a document without a 'source' entry from raising
    for doc, score in reranked_results:
        print(doc.metadata.get('source'), score)
    return reranked_results

# Pipeline: generate search queries -> retrieve documents for each query -> fuse the rankings
retrieval_chain_rag_fusion = generate_queries | retriever.map() | reciprocal_rank_fusion
# docs = retrieval_chain_rag_fusion.invoke({"question": question})
# len(docs)

In [None]:
# len(docs)
# for doc in docs:
#     print(doc[0].metadata['source'])

In [None]:
from langchain_core.runnables import RunnablePassthrough
from operator import itemgetter

# Retrieve
question = input("Q:")
# question = "What are pending mega grants?"

# Preview the search queries generated for the question.
# NOTE(review): the retriever object itself is passed as `context`, so the prompt
# presumably receives its string form rather than retrieved text — confirm intent.
questions = generate_queries.invoke({"question": question, "context": retriever})
# Use a dedicated loop variable: iterating with `question` would overwrite the
# user's question with the last generated query before the final chain runs.
for generated_question in questions:
    print(generated_question)
# docs = retrieval_chain_rag_fusion.invoke({"question": question})

# retrieval_chain = generate_queries | retriever.map() | get_unique_union
# docs = retrieval_chain.invoke({"context": retriever, "question": question})
# for doc in docs:
#     print(doc.metadata['source'])
# print(len(docs))

# RAG
template = """Answer the following questions ONLY from this context:
{context}

Question: {question}
"""

prompt = ChatPromptTemplate.from_template(template)

# Both branches receive the input dict: the fusion retrieval chain produces `context`,
# itemgetter extracts `question`; the filled prompt goes to the LLM and is parsed to a string.
final_rag_chain = (
    {"context": retrieval_chain_rag_fusion, 
     "question": itemgetter("question")} 
    | prompt
    | llm  
    | StrOutputParser()
)

# The answer string is split on blank lines into a list of paragraphs
final_rag_chain.invoke({"question": question, "context": retriever}).split('\n\n')

## Decomposition

In [None]:
from langchain_core.prompts.chat import ChatPromptTemplate

# Decomposition
template = """You are a helpful assistant that generates multiple sub-questions related to an input question. \n
The goal is to break down the input into a set of sub-problems / sub-questions that can be answers in isolation. \n
Generate multiple search queries related to: {question} \n
Output (3 queries):"""
# Sub-question generation prompt; the template has a single `question` placeholder
prompt_decomposition = ChatPromptTemplate.from_template(template)

In [None]:
from langchain_core.output_parsers import StrOutputParser

question = "What are mega grants?"

# Chain
# Returns the generated sub-questions as a list of strings; blank lines are
# dropped so that empty strings are never retrieved for or answered later.
generate_queries_decomposition = (
    prompt_decomposition
    | llm
    | StrOutputParser()
    | (lambda x: [line.strip() for line in x.split("\n") if line.strip()])
)

# Run
questions = generate_queries_decomposition.invoke({"question": question})

In [None]:
# Display the generated sub-questions
questions

In [None]:
# Prompt
template = """Here is the question you need to answer:

\n --- \n {question} \n --- \n

Here is any available background question + answer pairs:

\n --- \n {q_a_pairs} \n --- \n

Here is additional context relevant to the question: 

\n --- \n {context} \n --- \n

Use the above context and any background question + answer pairs to answer the question: \n {question}
"""

# Answering prompt for one sub-question; the template has `question`, `q_a_pairs` and `context` placeholders
decomposition_prompt = ChatPromptTemplate.from_template(template)

In [None]:
from operator import itemgetter
from langchain_core.output_parsers import StrOutputParser

def format_qa_pair(question, answer):
    """Render one question/answer pair as text.

    The result is ``Question: ...`` and ``Answer: ...`` on consecutive
    lines, with surrounding whitespace stripped from the whole string.
    """
    return f"Question: {question}\nAnswer: {answer}\n\n".strip()

# Answer the sub-questions one after another; the Q/A pairs collected so far are
# fed into the prompt of every later sub-question.

# The chain does not depend on the loop variable, so it is built once up front.
rag_chain = (
    {"context": itemgetter("question") | retriever,
     "question": itemgetter("question"),
     "q_a_pairs": itemgetter("q_a_pairs")}
    | decomposition_prompt
    | llm
    | StrOutputParser()
)

q_a_pairs = ""
for q in questions:
    # Splitting LLM output on newlines can leave empty entries; skip them
    # rather than retrieving and answering for an empty question.
    if not q.strip():
        continue

    answer = rag_chain.invoke({"question": q, "q_a_pairs": q_a_pairs})
    q_a_pair = format_qa_pair(q, answer)
    q_a_pairs = q_a_pairs + "\n---\n" + q_a_pair

In [None]:
# Display the answer produced for the last sub-question processed by the loop
answer

In [None]:
template = """
You are an assistant for question-answering tasks. Use the following pieces of retrieved context to answer the question. If you don't know the answer, just say that you don't know. Use three sentences maximum and keep the answer concise.
Question: {question} 
Context: {context} 
"""

# Concise question-answering prompt; the template has `question` and `context` placeholders
prompt_rag = ChatPromptTemplate.from_template(template)

def retrieve_and_rag(question, prompt_rag, sub_question_generator_chain):
    """Answer every generated sub-question with its own retrieval + LLM call.

    Relies on the module-level ``retriever`` and ``llm``.

    Args:
        question: The original user question.
        prompt_rag: Prompt with ``context`` and ``question`` placeholders
            used to answer a single sub-question.
        sub_question_generator_chain: Runnable invoked with
            ``{"question": question}``; expected to return an iterable of
            sub-question strings.

    Returns:
        A ``(rag_results, sub_questions)`` tuple of index-aligned lists: the
        answers and the sub-questions they belong to. Blank sub-questions
        are dropped from both.
    """
    generated = sub_question_generator_chain.invoke({"question": question})
    # Splitting LLM output on newlines can leave empty entries; answering
    # those would waste calls and add meaningless Q/A pairs.
    sub_questions = [sub_q for sub_q in generated if sub_q.strip()]

    # Loop-invariant, so build the answering chain once
    answer_chain = prompt_rag | llm | StrOutputParser()

    rag_results = []
    for sub_question in sub_questions:
        # Retrieve documents for each sub-question
        retrieved_docs = retriever.get_relevant_documents(sub_question)
        rag_results.append(
            answer_chain.invoke({"context": retrieved_docs, "question": sub_question})
        )

    return rag_results, sub_questions

# Run retrieval + RAG for every sub-question of `question` (a direct call, not a RunnableLambda);
# this rebinds `questions` to the sub-questions returned by the call.
answers, questions = retrieve_and_rag(question, prompt_rag, generate_queries_decomposition)

In [None]:
def format_qa_pairs(questions, answers):
    """Render numbered question/answer pairs as one text block.

    Pairs are numbered from 1 and separated by a blank line; pairing stops
    at the shorter of the two inputs, and surrounding whitespace is stripped
    from the final string.
    """
    numbered_pairs = enumerate(zip(questions, answers), start=1)
    blocks = [
        f"Question {idx}: {q}\nAnswer {idx}: {a}\n\n"
        for idx, (q, a) in numbered_pairs
    ]
    return "".join(blocks).strip()

# Text block of numbered sub-question/answer pairs, used as the synthesis prompt's context
context = format_qa_pairs(questions, answers)

# Prompt
template = """Here is a set of Q+A pairs:

{context}

Use these to synthesize an answer to the question: {question}
"""

prompt = ChatPromptTemplate.from_template(template)

# No retrieval step here: the Q+A text is supplied directly as `context` at invoke time
final_rag_chain = (
    prompt
    | llm
    | StrOutputParser()
)

final_rag_chain.invoke({"context":context,"question":question})

## Contextual Compression

In [None]:
from langchain.retrievers import ContextualCompressionRetriever
from langchain.retrievers.document_compressors import LLMChainExtractor

# LLM-based extractor used as the compressor on top of the base retriever
compressor = LLMChainExtractor.from_llm(llm)
compression_retriever = ContextualCompressionRetriever(
    base_compressor=compressor, base_retriever=retriever
)

In [None]:
def pretty_print_docs(docs):
    """Print each document's source and content, separated by dashed rules.

    Args:
        docs: Iterable of objects exposing ``metadata`` (a mapping with a
            ``'source'`` key) and ``page_content``.

    Every entry is headed ``Document <n> (<source>):`` with ``n`` counted
    from 1, and entries are separated by a line of 100 dashes.
    """
    separator = f"\n{'-' * 100}\n"
    print(
        separator.join(
            # Read the source from the document being formatted (`d`); the
            # previous code used an unrelated outer `doc` variable here.
            f"Document {i + 1} ({d.metadata['source']}):\n\n{d.page_content}"
            for i, d in enumerate(docs)
        )
    )

# Run the compression retriever on a sample question and print the returned documents
compressed_docs = compression_retriever.invoke(
    "What are mega grants?"
)

pretty_print_docs(compressed_docs)

In [None]:
from langchain.retrievers.document_compressors import EmbeddingsFilter
from langchain_community.embeddings import OllamaEmbeddings

# Alternative compressor: an EmbeddingsFilter using OllamaEmbeddings with
# similarity_threshold=0.5 (presumably drops documents scoring below the
# threshold against the query — TODO confirm)
embeddings = OllamaEmbeddings()
embeddings_filter = EmbeddingsFilter(embeddings=embeddings, similarity_threshold=0.5)
# Rebinds `compression_retriever` to use the embeddings filter instead of the LLM extractor
compression_retriever = ContextualCompressionRetriever(
    base_compressor=embeddings_filter, base_retriever=retriever
)

compressed_docs = compression_retriever.invoke(
    "What are mega grants?"
)

pretty_print_docs(compressed_docs)

## Self Query

In [None]:
from langchain.chains.query_constructor.base import AttributeInfo

# Descriptions of the metadata fields handed to the self-query retriever; the names
# match the keys requested from the summarizer prompt (topic, summary, keywords)
metadata_field_info = [
    AttributeInfo(
        name="topic",
        description="The topic describing the content",
        type="string",
    ),
    AttributeInfo(
        name="summary",
        description="A brief summary of the content",
        type="string",
    ),
    AttributeInfo(
        name="keywords",
        description="Semantic keywords associated with the document",
        type="string",
    )
]

print(metadata_field_info)

In [None]:
from langchain.retrievers.self_query.base import SelfQueryRetriever
from langchain.chat_models import ChatOpenAI

# Short description of what the indexed documents contain, passed to the self-query retriever
document_content_description = "B3i usage guides"

# NOTE: this rebinds the notebook-wide `llm` (previously a ChatOllama instance) and
# `retriever` (previously vectorstore.as_retriever()); cells run after this one
# pick up the OpenAI model and the self-query retriever.
llm = ChatOpenAI(temperature=0.7)
retriever = SelfQueryRetriever.from_llm(
    llm,
    vectorstore,
    document_content_description,
    metadata_field_info
)

In [None]:
# Try the self-query retriever on a sample question
retriever.get_relevant_documents("What are mega grants?")

In [None]:
# Same sample query via invoke(); prints every returned document (rebinds `docs`)
docs = retriever.invoke("What are mega grants?")
for doc in docs:
    print(doc)

In [None]:
from langchain_core.prompts.chat import ChatPromptTemplate
from langchain_core.output_parsers import StrOutputParser
from operator import itemgetter

template = """
You are a helpful assistant and your job is to answer the following question only from this context. Make it layman and easy to understand. Suggest steps to explore the product and explain how to interact. Do not make stuff up, if there's no data, say you don't know
{context}

Question: {question}
"""
prompt = ChatPromptTemplate.from_template(template)

# The chain is invoked with a dict, and every branch of the mapping receives that
# whole dict. The retriever needs the question string, so extract it first;
# passing the dict straight to the retriever would query with the dict itself.
final_rag_chain = (
    {"context": itemgetter("question") | retriever,
     "question": itemgetter("question")}
    | prompt
    | llm
    | StrOutputParser()
)

question = input()
final_rag_chain.invoke({"question": question})