In [None]:
'''
RAG - Decomposing a question into sub-questions and aggregating each sub-question/answer pair
as part of the prompt context for the next sub-question, until the final one.

RAG - Decomposition question to sub-questions, and provide the final answer from the 
retrieved documents of each sub-question.

Decomposition translation is different from multi-query or fusion, instead of 
creating similar questions, it creates sub-questions from the original question. 

step_by_step_decomposition, at the end, is based on the LangChain RAG YouTube tutorial below; 
the code from the tutorial is listed in the first half of this IPython notebook. 

This function follows the same logic as the tutorial but without the abstraction of 
Chain/Pipeline. This approach helps me understand the sub-question creation and the 
LLM calls, and it also makes it easier to improve the final prompt preparation. 

Of course, the langsmith is a great tool as well. 

I may be wrong, but it seems to me that treating the sub-questions as separate, individual questions
can return more accurate results than the aggregation approach. 
However, the aggregation decomposition approach provides more detail in the answer.


Input arguments to step_by_step_decomposition are
1. question: the question string
2. additional_background: extra instructions that help ChatGPT return 
a better answer to each Q&A.
3. selection: 1 means aggregating the earlier sub-question Q&A pairs into the prompt for the next
sub-question; any other value answers each sub-question separately.


Please review the RAG tutorial from LangChain in detail (part 7 
- decomposition query translation)
https://www.youtube.com/watch?v=h0OPWlEOank

'''
#RAG-- Common Rag
#Query Translation:   Multi-Query, Fusion and Decomposition 
import os
from dotenv import load_dotenv
# Read a local .env file (if present) into the process environment.
load_dotenv()

#if (GCP_PROJECT_ID == None): print ("Not set")
# Kept as notebook-level names; the visible cells never pass them on explicitly.
OPENAI_API_KEY = os.environ.get('OPENAI_API_KEY')
LANGCHAIN_API_KEY = os.environ.get('LANGCHAIN_API_KEY')

# LangSmith settings: tracing stays off unless the flag is switched to 'true'.
os.environ.update({
    'LANGCHAIN_TRACING_V2': 'false',  #true for trace
    'LANGCHAIN_ENDPOINT': 'https://api.smith.langchain.com',
    "LANGCHAIN_PROJECT": "RAG - Decomposition-Query-Translation",
})

In [None]:
#RAG-- Common Rag
#Query Translation:   Multi-Query, Fusion and Decomposition 
from langchain import hub
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.text_splitter import CharacterTextSplitter
from langchain_community.document_loaders import WebBaseLoader
from langchain_community.vectorstores import Chroma
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough
from langchain_openai import ChatOpenAI, OpenAIEmbeddings

#text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=100)
# Splitter used on every resume; built with its default settings.
text_splitter = CharacterTextSplitter()
# Only one question may be live at a time. The alternative is kept commented out,
# like the other alternatives in this notebook, instead of being assigned and
# immediately overwritten.
#question = "Please list all the documents that contains the robotic experience"
question="List all candidates by name that who have Java work experience?"

In [None]:
#RAG-- Common Rag
#Query Translation:   Multi-Query, Fusion and Decomposition 
#Loading PDF resumes from local instead of the Web

import uuid
import hashlib
from PyPDF2 import PdfReader
def extract_pdf_text(file_path):
    """Return the text of every page of the PDF at ``file_path``, concatenated in page order."""
    reader = PdfReader(file_path)
    # Pages are joined with no separator between them.
    return ''.join(page.extract_text() for page in reader.pages)


# Folder that holds the resume PDFs, relative to the notebook's working directory.
# Built with os.path.join so the notebook also runs on macOS/Linux; on Windows
# the joined file paths are the same as with the previous ".\docs\" literal.
resume_dir = os.path.join(".", "docs")
pdf_text = []


#Change 1
# Embedding model and the Chroma collection ("langchain") that stores the resumes.
embeddings = OpenAIEmbeddings()
vectorstore = Chroma("langchain", embeddings)



def create_uuid_from_string(val: str):
    """Derive a deterministic ``uuid.UUID`` from ``val``.

    The UTF-8 bytes of ``val`` are hashed with MD5 and the 16-byte digest is
    used directly as the UUID value, so equal strings give equal UUIDs.
    """
    digest = hashlib.md5(val.encode("UTF-8")).digest()
    return uuid.UUID(bytes=digest)


# Load every PDF in resume_dir into the vector store, replacing any copy that a
# previous run stored for the same file.
for file in os.listdir(resume_dir):
    if not file.endswith('.pdf'):
        continue
    filepath = os.path.join(resume_dir, file)
    chunk_ids = []
    try:
        #Change 2
        # Prefix the text with its file name before splitting.
        pdf_text = ["File Name: "+filepath+"  \n"+extract_pdf_text(filepath)]
        split_docs = text_splitter.create_documents(pdf_text)

        # add_documents needs exactly one id per chunk. The first chunk keeps the
        # bare file name (as before); further chunks get "<file>#<n>". Passing
        # ids=[file] for several chunks made the add fail and the resume was
        # silently skipped.
        chunk_ids = [file if n == 0 else f"{file}#{n}" for n in range(len(split_docs))]

        # Remove whatever is already stored for this file.
        stale_ids = [stored_id for stored_id in vectorstore.get()['ids']
                     if stored_id == file or stored_id.startswith(file + "#")]
        if stale_ids:
            print("Deleting Duplication .....", file)
            vectorstore.delete(ids=stale_ids)

        #Need to provide IDS list to add_documents, otherwise, it will only pick up the first character of the file name
        langchain_ids = vectorstore.add_documents(ids=chunk_ids, documents=split_docs)
        print("Adding Langchain ID - ", langchain_ids, " File Name - ", file)
    except Exception as exc:
        # Best effort: report the file and the reason, then continue with the
        # next resume. KeyboardInterrupt/SystemExit are no longer swallowed.
        print("Can't add.. ", file, "-", exc)
        if chunk_ids:
            vectorstore.delete(ids=chunk_ids)




In [None]:
#Retrieving
#RAG-- Common Rag

#vectorstore = Chroma.from_documents(documents=split_docs, embedding = embedding)
#retriever=vectorstore.as_retriever(search_kwargs={"k": 2}, max_tokens_limit=10000)
# Retriever over the resume collection; search_kwargs asks for k=2 results per query.
# NOTE(review): max_tokens_limit and collection_metadata do not look like
# retriever search settings — confirm they have any effect here (HNSW options
# are presumably meant for collection creation; verify against the Chroma docs).
retriever=vectorstore.as_retriever(search_kwargs={"k": 2}, max_tokens_limit=10000,
                                   collection_metadata={"hnsw:M": 1024,"hnsw:ef": 64})

In [None]:
#Decomposition - Method 1, Aggregating subquestions.
from langchain.prompts import ChatPromptTemplate
#question = "Please list all the documents that contains the robotic experience"
#Decomposition
# Prompt that asks the model to break {question} into three sub-questions.
# The "\n" sequences are real newlines in the prompt text (the string is not raw).
template = """You are a helpful assistant that generates multiple sub-questions related to an input question about a resume. \n
The goal is to break down the input question into a set of sub-question about the experience listed in a resume, and each sub-question that can be answers in separately \n
However, you must keep the principle of the origin question in sub-question \n
Generate multiple search queries related to: {question} \n
Output (3 queries):"""
# Chat prompt with a single input variable, "question".
prompt_decomposition = ChatPromptTemplate.from_template(template)

In [None]:
#Decomposition - Method 1  Continue
from langchain_openai import ChatOpenAI
from langchain_core.output_parsers import StrOutputParser
#question="How long has each candidate been involved with robotics, and what roles have they held in this field?"
#question ="Please list all the candidate name that who has the robotic experience"
# LLM
question="List all candidates by name that who have Java work experience?"
llm = ChatOpenAI(temperature=0.2)
# ***relaxing temperature between 0 - 0.5 may have a better sub-questions generated.
# Chain: prompt -> model -> plain text -> one sub-question per line.
generate_queries_decomposition = (
    prompt_decomposition
    | llm
    | StrOutputParser()
    # Blank lines in the model output are not questions; dropping them avoids
    # empty retrieval queries and wasted LLM calls in the cells below.
    | (lambda x: [line for line in x.split("\n") if line.strip()])
)
# Run
questions = generate_queries_decomposition.invoke({"question":question})

In [None]:
#Decomposition - Method 1 Continue

# Free-text description of how each resume document is laid out.
# NOTE(review): the Method 1 chain in the next cell fills {context} from the
# retriever, so this variable does not appear to reach that prompt — confirm
# whether it was meant to be passed as extra background.
context = """Each document (or resume) contains

0. The file name at the first line,
1. the name, the phone number and the address of a person, \n
2. the education or attending Schools and years of graduation.  \n
3. the work histroy lists all the companies a person worked for.  Each job contains the start and end dates.
the end date can be missing since it is current job.  Each job has the job titles, and associated 
responsibilities or experience of that person. Ecah job is indpendent from other jobs in a document. 
4. Any awards or social networking information of this person. \n
5. Each job The first job should have the farest year from current date, the last or currnet job has the 
year closest to the current date.  From the last and first job you should be able to calcuate
the total year a person has worked for that job \n

"""

# Answer prompt for one sub-question. Inputs: {question}, the accumulated
# {q_a_pairs} from earlier sub-questions, and the retrieved {context}.
template = """Here is the question you need to answer:

\n --- \n {question} \n --- \n

Here is any available background question + answer pairs:

\n --- \n {q_a_pairs} \n --- \n

Here is additional context relevant to the answer of the question: 

\n --- \n {context} \n --- \n

Use the above context and any background question + answer pairs to answer the question: \n {question}
"""

decomposition_prompt = ChatPromptTemplate.from_template(template)

In [None]:
#Decomposition - Method 1 Continue
from operator import itemgetter
from langchain_core.output_parsers import StrOutputParser

def format_qa_pair(question, answer):
    """Render one question and its answer as a ``Question: ... / Answer: ...`` block.

    Trailing whitespace (including whitespace at the end of ``answer``) is removed.
    """
    # The text always starts with "Question:", so only the right-hand side can
    # carry strippable whitespace.
    return f"Question: {question}\nAnswer: {answer}".rstrip()

# llm
# llm
llm = ChatOpenAI(model_name="gpt-3.5-turbo", temperature=0)

# The chain does not depend on the individual sub-question, so it is built once
# instead of on every loop iteration. {context} comes from the retriever; the
# question and the accumulated Q&A pairs are passed through from the input dict.
rag_chain = (
    {"context": itemgetter("question") | retriever,
     "question": itemgetter("question"),
     "q_a_pairs": itemgetter("q_a_pairs")}
    | decomposition_prompt
    | llm
    | StrOutputParser())

q_a_pairs = ""
# Pre-set so the final print cannot hit an unbound name when there are no questions.
answer = ""

for q in questions:
    answer = rag_chain.invoke({"question":q,"q_a_pairs":q_a_pairs})
    # Each answered sub-question is appended to the background for the next one.
    q_a_pair = format_qa_pair(q,answer)
    q_a_pairs = q_a_pairs + "\n---\n"+  q_a_pair
# The answer to the last sub-question is reported as the aggregated result.
print("Aggregation answer is ...", answer)

In [None]:
#Decomposition - Method 2, treat sub questions as separated question and 
#Answer each sub-question individually before passing final RAG Q&A

from langchain import hub
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.runnables import RunnablePassthrough, RunnableLambda
from langchain_core.output_parsers import StrOutputParser
from langchain_openai import ChatOpenAI

# RAG prompt
#prompt_rag = hub.pull("rlm/rag-prompt")
#retriever.get_relevant_documents(sub_question)
#calling invoke direclty which will call get_relevant_documents

'''
#Original Codes
def retrieve_and_rag(question,prompt_rag,sub_question_generator_chain):
    """RAG on each sub-question"""
    
    # Use our decomposition / 
    sub_questions = sub_question_generator_chain.invoke({"question":question})
    # Initialize a list to hold RAG chain results
    rag_results = []
    
    for sub_question in sub_questions:
        print("subquestion::: ", sub_question)
        # Retrieve documents for each sub-question
        retrieved_docs = retriever.get_relevant_documents(sub_question)
        #Same as Invoke, since Invoke will call the get_relevant_documents evatually 
        #Use retrieved documents and sub-question separately in RAG chain
        answer = (prompt_rag | llm | StrOutputParser()).invoke({"context": retrieved_docs, 
                                                                "question": sub_question})
        #print ("answer:::", answer)
        rag_results.append(answer)
    
    return rag_results,sub_questions
'''
question="How long has each candidate been involved with robotics, and what roles have they held in this field?"
# Same sub-question prompt as Method 1 ("\n" sequences are real newlines).
template = """You are a helpful assistant that generates multiple sub-questions related to an input question about a resume. \n
The goal is to break down the input question into a set of sub-question about the experience listed in a resume, and each sub-question that can be answers in separately \n
However, you must keep the principle of the origin question in sub-question \n
Generate multiple search queries related to: {question} \n
Output (3 queries):"""
prompt_rag2 = ChatPromptTemplate.from_template(template)
llm = ChatOpenAI(temperature=0.2)
# ***relaxing temperature between 0 - 0.5 may have a better sub-questions generated.
# Chain: prompt -> model -> plain text -> one sub-question per line.
generate_queries_decomposition2 = (
    prompt_rag2
    | llm
    | StrOutputParser()
    # Blank lines in the model output are not questions; dropping them keeps the
    # questions/answers lists free of empty entries.
    | (lambda x: [line for line in x.split("\n") if line.strip()])
)
questions = generate_queries_decomposition2.invoke({"question":question})

# Free-text description of how each resume document is laid out.
# NOTE(review): the chain later in this cell fills {context} from the retriever,
# so this variable does not appear to reach that prompt — confirm intent.
context = """Each document (or resume) contains

0. The file name at the first line,
1. the name, the phone number and the address of a person, \n
2. the education or attending Schools and years of graduation.  \n
3. the work histroy lists all the companies a person worked for.  Each job contains the start and end dates.
the end date can be missing since it is current job.  Each job has the job titles, and associated 
responsibilities or experience of that person. Ecah job is indpendent from other jobs in a document. 
4. Any awards or social networking information of this person. \n
5. Each job The first job should have the farest year from current date, the last or currnet job has the 
year closest to the current date.  From the last and first job you should be able to calcuate
the total year a person has worked for that job \n

"""

# Answer prompt for one sub-question in Method 2: only {question} and the
# retrieved {context} are inputs (no {q_a_pairs} placeholder).
template = """Here is the question you need to answer:

\n --- \n {question} \n --- \n

Here is additional context relevant to the answer of the question: 

\n --- \n {context} \n --- \n
Use the above context and any background question + answer pairs to answer the question: \n {question}
"""

#decomposition_prompts by removing the last question and answer pair in the prompt by comparing to the method 1
'''Here is any available background question + answer pairs:

\n --- \n {q_a_pairs} \n --- \n
'''
decomposition_prompt2 = ChatPromptTemplate.from_template(template)

# The chain is the same for every sub-question, so it is built once instead of
# inside the loop. {context} is filled by the retriever from the question text.
rag_chain = (
    {"context": itemgetter("question") | retriever,
     "question": itemgetter("question")}
    | decomposition_prompt2
    | llm
    | StrOutputParser())

# Each sub-question is answered on its own; answers stay aligned with `questions`.
answers = []
for q in questions:
    answer = rag_chain.invoke({"question":q})
    #print("answer...", answer)
    answers.append(answer)

# Wrap the retrieval and RAG process in a RunnableLambda for integration into a chain
# with 2nd method, the subquestions generation is the same as the 1st method
#answers, questions = retrieve_and_rag(question, prompt_rag, generate_queries_decomposition)

In [None]:
#Decomposition - Method 2 Continue
def format_qa_pairs(questions, answers):
    """Render numbered ``Question i`` / ``Answer i`` blocks, one per zipped pair.

    Pairs are numbered from 1 and separated by a blank line; extra items in the
    longer sequence are ignored by ``zip``. Returns "" when there are no pairs.
    """
    blocks = [
        f"Question {number}: {asked}\nAnswer {number}: {reply}\n\n"
        for number, (asked, reply) in enumerate(zip(questions, answers), start=1)
    ]
    return "".join(blocks).strip()

# The numbered sub-question/answer pairs become the context of the final prompt.
context = format_qa_pairs(questions, answers)

# The prompt now names what the context holds ("Q+A pairs"); the previous
# wording "quest pairs" was a garbled form of it.
template = """Here is a set of Q+A pairs:

{context}

Use these to synthesize an answer to the question: {question}
"""

'''
also, can be included following as additional_background 
the context contains

0. The file name at the first line,
1. the name, the phone number and the address of a person, \n
2. the education or attending Schools and years of graduation.  \n
3. the work histroy lists all the companies a person worked for.  Each job contains the start and end dates.
the end date can be missing since it is current job.  Each job has the job titles, and associated 
responsibilities or experience of that person. Ecah job is indpendent from other jobs in a document. 
4. Any awards or social networking information of this person. \n
5. Each job The first job should have the farest year from current date, the last or currnet job has the 
year closest to the current date.  From the last and first job you should be able to calcuate
the total year a person has worked for that job \n : {question}
'''

# Final synthesis step: prompt -> model -> plain text.
prompt = ChatPromptTemplate.from_template(template)
final_rag_chain = prompt | llm | StrOutputParser()

# The Q&A context and the original question are the only prompt inputs.
final_inputs = {"context": context, "question": question}
final_rag = final_rag_chain.invoke(final_inputs)
print ("final_separation_rag is ",  final_rag)

In [None]:
# Query Translation:  step_by_step_decomposition with aggregation and separation as one function

import os
from dotenv import load_dotenv
from langchain import hub
from langchain_core.output_parsers import StrOutputParser
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.text_splitter import CharacterTextSplitter
from langchain_community.document_loaders import WebBaseLoader
from langchain_community.vectorstores import Chroma
from langchain_core.runnables import RunnablePassthrough
from langchain_openai import ChatOpenAI, OpenAIEmbeddings
import uuid
import hashlib
from PyPDF2 import PdfReader
from langchain.prompts import ChatPromptTemplate
from operator import itemgetter
from langchain_core.output_parsers import StrOutputParser
#question = "Please list all the documents that contains the robotic experience"
question="How long has each candidate been involved with robotics, and what roles have they held in this field?"

def extract_pdf_text(file_path):
    """Read the PDF at ``file_path`` and return all page texts joined without a separator."""
    page_texts = [page.extract_text() for page in PdfReader(file_path).pages]
    return ''.join(page_texts)

def format_qa_pair(question, answer):
    """Return ``"Question: <question>\\nAnswer: <answer>"`` with surrounding whitespace stripped."""
    lines = (f"Question: {question}", f"Answer: {answer}")
    # strip() removes any trailing whitespace contributed by the answer.
    return "\n".join(lines).strip()
    
def format_qa_pairs(questions, answers):
    """Return numbered question/answer blocks for the zipped inputs, separated by blank lines.

    Numbering starts at 1. Unpaired trailing items are dropped by ``zip``; an
    empty input gives an empty string.
    """
    pieces = []
    number = 0
    for asked, reply in zip(questions, answers):
        number += 1
        pieces.append(f"Question {number}: {asked}\nAnswer {number}: {reply}\n\n")
    return "".join(pieces).strip()

def _chunk_ids(file_name, chunk_count):
    """Return one id per chunk: the bare file name first, then ``<file_name>#<n>``."""
    return [file_name if n == 0 else f"{file_name}#{n}" for n in range(chunk_count)]


def _index_resume_pdfs(vectorstore, text_splitter, resume_dir):
    """Embed every ``.pdf`` in ``resume_dir`` into ``vectorstore``, replacing earlier copies."""
    for file in os.listdir(resume_dir):
        if not file.endswith('.pdf'):
            continue
        filepath = os.path.join(resume_dir, file)
        print("adding... ", filepath)
        chunk_ids = []
        try:
            # Prefix the text with its file name before splitting.
            split_docs = text_splitter.create_documents(
                ["File Name: " + filepath + "  \n" + extract_pdf_text(filepath)])
            # add_documents needs exactly one id per chunk; a single [file] id
            # made the add fail whenever a resume was split into several chunks.
            chunk_ids = _chunk_ids(file, len(split_docs))

            # Remove whatever is already stored for this file.
            stale_ids = [stored_id for stored_id in vectorstore.get()['ids']
                         if stored_id == file or stored_id.startswith(file + "#")]
            if stale_ids:
                print("Deleting Duplication .....", file)
                vectorstore.delete(ids=stale_ids)

            vectorstore.add_documents(ids=chunk_ids, documents=split_docs)
        except Exception as exc:
            # Best effort: report the file and the reason, then go on with the
            # next resume. KeyboardInterrupt/SystemExit are not swallowed.
            print("Can't add.. ", file, "-", exc)
            if chunk_ids:
                vectorstore.delete(ids=chunk_ids)


def step_by_step_decomposition(question, additional_background, selection,
                               resume_dir=os.path.join(".", "docs")):
    """Answer ``question`` over the resume PDFs by decomposing it into sub-questions.

    The PDFs in ``resume_dir`` are (re)indexed into the "langchain" Chroma
    collection, the model is asked for sub-questions, and every sub-question is
    answered with the documents retrieved for it.

    Args:
        question: The original question.
        additional_background: Extra text inserted into every sub-question
            prompt as ``{background}``; may be "".
        selection: ``1`` selects aggregation (each answered sub-question is
            added to the prompt of the next one and the last answer is the
            result). Any other value selects separation (sub-questions are
            answered independently and one more model call synthesizes the
            final answer from all Q&A pairs).
        resume_dir: Folder containing the resume PDFs. Defaults to ``./docs``.

    Returns:
        The final answer text. It is also printed, as before.
    """
    load_dotenv()

    #if (GCP_PROJECT_ID == None): print ("Not set")
    os.environ['LANGCHAIN_TRACING_V2'] = 'false'  #true for trace
    os.environ['LANGCHAIN_ENDPOINT'] = 'https://api.smith.langchain.com'
    os.environ["LANGCHAIN_PROJECT"] = "RAG - Decomposition-Query-Translation"

    #text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=100)
    text_splitter = CharacterTextSplitter()
    vectorstore = Chroma("langchain", OpenAIEmbeddings())
    _index_resume_pdfs(vectorstore, text_splitter, resume_dir)

    # Author's note: adding search_kwargs={"k": 2} avoids the runtime error
    # "Cannot return the results in a contigious 2D array. Probably ef or M is too small".
    # NOTE(review): max_tokens_limit / collection_metadata do not look like
    # retriever search settings — confirm they have any effect here.
    retriever=vectorstore.as_retriever(max_tokens_limit=10000,
                                       collection_metadata={"hnsw:M": 1024,"hnsw:ef": 64})

    template_subquestions = """You are a helpful assistant that generates multiple sub-questions related to an input question about a resume. \n
The goal is to break down the input question into a set of sub-question about the experience listed in a resume, and each sub-question that can be answers in separately \n
However, you must keep the principle of the origin question in sub-question \n
Generate multiple search queries related to: {question} \n
Output (3 queries):"""

    prompt_decomposition = ChatPromptTemplate.from_template(template_subquestions)

    # ***relaxing temperature between 0 - 0.5 may have a better sub-questions generated.
    llm = ChatOpenAI(temperature=0.2)
    parser = StrOutputParser()

    # prompt -> model -> text, written out step by step instead of as a chain.
    # invoke() is used on the model rather than calling the model object directly.
    sub_question_text = parser.invoke(
        llm.invoke(prompt_decomposition.invoke({"question": question})))
    # One sub-question per non-blank line; blank lines are not questions.
    questions = [line for line in sub_question_text.split("\n") if line.strip()]
    if not questions:
        # Nothing usable came back: fall back to answering the question itself.
        questions = [question]

    template = """Here is the question you need to answer:

    \n --- \n {question} \n --- \n

    Here is any available background question + answer pairs:

    \n --- \n {q_a_pairs} \n --- \n

    Here is additional background \n --- \n {background}
    and context relevant to the answer of the question: 

    \n --- \n {context} \n --- \n
    Use the above context and any background question + answer pairs to answer the question: \n {question}
    """

    decomposition_prompt = ChatPromptTemplate.from_template(template)

    aggregate = (selection == 1)
    answers = []
    q_a_pairs = ""  # stays empty in separation mode
    for q in questions:
        print("question... ", q, "\n")
        docs = retriever.invoke(q)
        print("docs... ", docs)
        answer_prompt = decomposition_prompt.invoke({"question": q,
                                                     "q_a_pairs": q_a_pairs,
                                                     "context": docs,
                                                     "background": additional_background})
        answer = parser.invoke(llm.invoke(answer_prompt))
        answers.append(answer)
        if aggregate:
            # Feed this Q&A pair into the prompt of the next sub-question.
            q_a_pairs = q_a_pairs + "\n" + format_qa_pair(q, answer)

    if aggregate:
        # In aggregation mode the answer to the last sub-question is the result.
        final_rag1 = answers[-1]
        print("\n\n\nFinal Rag - Aggregation answer is:    ", final_rag1)
        return final_rag1

    context = format_qa_pairs(questions, answers)
    # The prompt names what the context holds ("Q+A pairs"); the previous
    # wording "quest pairs" was a garbled form of it.
    template = """Here is a set of Q+A pairs:

        {context}

        Use these to synthesize an answer to the question: {question}
        """
    prompt = ChatPromptTemplate.from_template(template)
    separation = prompt.invoke({"question": question, "context": context})
    final_rag2 = parser.invoke(llm.invoke(separation))
    print("\n\n\nFinal Rag - Separation answer is:    ", final_rag2)
    return final_rag2


In [None]:
'''
additional_background = """Each document (or resume) contains

    0. The file name at the first line,
    1. the name, the phone number and the address of a person, \n
    2. the education or attending Schools and years of graduation.  \n
    3. the work histroy lists all the companies a person worked for.  Each job contains the start and end dates.
    the end date can be missing since it is current job.  Each job has the job titles, and associated 
    responsibilities or experience of that person. Ecah job is indpendent from other jobs in a document. 
    4. Any awards or social networking information of this person. \n
    5. Each job The first job should have the farest year from current date, the last or currnet job has the 
    year closest to the current date.  From the last and first job you should be able to calcuate
    the total year a person has worked for that job \n
    
    
"""
'''

# No extra background text for this run.
additional_background = ""
question="How long has each candidate been involved with robotics, and what roles have they held in this field?"
# selection=1 -> aggregation: each answered sub-question feeds the next prompt.
step_by_step_decomposition(question, additional_background, selection=1)

In [None]:
# Free-text description of the resume layout, intended as `additional_background`.
# NOTE: the statement right after this literal reassigns additional_background
# to "", so this text is not what the call in this cell receives.
additional_background = """Each document (or resume) contains

    0. The file name at the first line,
    1. the name, the phone number and the address of a person, \n
    2. the education or attending Schools and years of graduation.  \n
    3. the work histroy lists all the companies a person worked for.  Each job contains the start and end dates.
    the end date can be missing since it is current job.  Each job has the job titles, and associated 
    responsibilities or experience of that person. Ecah job is indpendent from other jobs in a document. 
    4. Any awards or social networking information of this person. \n
    5. Each job The first job should have the farest year from current date, the last or currnet job has the 
    year closest to the current date.  From the last and first job you should be able to calcuate
    the total year a person has worked for that job \n
    
    
"""
# Replaces any background text assigned earlier, so this run sends none.
additional_background = ""
question="Who have the Java experience?"
# selection=0 -> separation: sub-questions answered independently, then synthesized.
step_by_step_decomposition(question, additional_background, selection=0)

In [None]:
question="Please provide summary of all candidates"
# NOTE(review): `context` is a notebook-level name that earlier cells reassign more
# than once (resume-layout text, then the formatted Q&A pairs of Method 2) —
# confirm which value is meant to be used as background here.
# selection=0 -> separation mode.
step_by_step_decomposition(question, context, selection=0)

In [None]:
#cleanup the vectordb
def clean_vectorstore():
    """Delete every document stored in the "langchain" Chroma collection.

    The function opens the collection itself, so it no longer depends on a
    notebook-level ``vectorstore`` variable being defined (the previous code
    assigned the store to a misspelled name and then used the global).
    Each id and its stored record are printed before deletion.
    """
    embeddings = OpenAIEmbeddings()
    vectorstore = Chroma("langchain", embeddings)
    # get() with no arguments returns the whole collection; only the ids are needed.
    ids = vectorstore.get().get('ids') or []
    print("ids ", ids)
    for doc_id in ids:
        docs = vectorstore.get(doc_id)
        print("docs ", docs)
        print(doc_id)
        vectorstore.delete(ids=[doc_id])

In [None]:
# Run the cleanup defined above: it deletes every id returned by the store's get().
clean_vectorstore()