In [None]:
'''
RAG - Multi-Query Query translation

Multi-Query translation creates several similar queries from the original question. Based on those
queries, the algorithm retrieves the relevant information/documents to use as part of the
prompt context before the final Q&A.

step_by_step_query, at the end, is based on the LangChain RAG YouTube tutorial below; the code from the tutorial is
listed in the first half of this Python notebook.

This function follows the same logic as the tutorial but without the Chain/Pipeline abstraction. This approach
helps me understand the LLM processing steps and can also help improve the prompt preparation.
Of course, LangSmith is a great tool for this as well.


Input arguments to step_by_step_query are
1. Document (`resume`): path to the resume PDF used as the source of information
2. Question (`user_question`): the question string


Please review the RAG tutorial from Langchain in details (part 5 - multi-query query translation)
https://www.youtube.com/watch?v=JChPi0CRnDY

'''
#RAG-- Common Rag
# Query Translation:   Multi-Query, Fusion and Decompsoition 
import os
from dotenv import load_dotenv
load_dotenv()

# API keys are looked up in the process environment; each is None when the variable is unset.
OPENAI_API_KEY = os.environ.get('OPENAI_API_KEY')
LANGCHAIN_API_KEY = os.environ.get('LANGCHAIN_API_KEY')

# LangSmith settings. Tracing is disabled; set LANGCHAIN_TRACING_V2 to 'true' to trace.
os.environ.update({
    'LANGCHAIN_TRACING_V2': 'false',
    'LANGCHAIN_ENDPOINT': 'https://api.smith.langchain.com',
    'LANGCHAIN_PROJECT': 'RAG - Multi-Query-Translation',
})

In [None]:
#RAG-- Common Rag
# Query Translation:   Multi-Query, Fusion and Decompsoition 
from langchain import hub
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.text_splitter import CharacterTextSplitter
from langchain_community.document_loaders import WebBaseLoader
from langchain_community.vectorstores import Chroma
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough
from langchain_openai import ChatOpenAI, OpenAIEmbeddings


# Using CharacterTextSplitter may give better results than RecursiveCharacterTextSplitter
# for this resume (author's observation). Alternative that was tried:
#text_splitter = RecursiveCharacterTextSplitter(chunk_size=200, chunk_overlap=40)
# Default-constructed splitter: no separator or chunk size is passed here.
text_splitter = CharacterTextSplitter()
# Author's note: chunk size appeared to have no effect with CharacterTextSplitter.
# NOTE(review): presumably because the text has few separator boundaries to split on — confirm.

In [None]:
#### Example: retrieving a document from the Web (disabled; kept inside a string for reference) ####
'''
# Load Documents from Web
loader = WebBaseLoader(
    web_paths=("https://lilianweng.github.io/posts/2023-06-23-agent/",),
    bs_kwargs=dict(
        parse_only=bs4.SoupStrainer(
            class_=("post-content", "post-title", "post-header")
        )
    ),
)
doc = loader.load()
docs = text_splitter.split_documents(doc)
'''

In [None]:
#RAG-- Common Rag
# Query Translation:   Multi-Query, Fusion and Decompsoition 

from PyPDF2 import PdfReader
def extract_pdf_text(file_path):
    """Return the text of every page of the PDF at ``file_path`` as one string.

    Page texts are concatenated in page order with nothing inserted between them.
    """
    reader = PdfReader(file_path)
    return ''.join(page.extract_text() for page in reader.pages)
# Resume PDF used as the knowledge source for the chain-based cells below.
# Forward slashes are accepted by Windows as well as POSIX systems, so the path no longer
# only resolves on Windows (the backslash form is a single odd file name elsewhere).
resume_name = "./docs/samcyangResume_Gen123.pdf"
pdf_text = extract_pdf_text(resume_name)
# Split the raw text into chunks, then wrap the chunks as documents for the vector store.
pdf_texts = text_splitter.split_text(pdf_text)
split_docs = text_splitter.create_documents(pdf_texts)

In [None]:
# Retrieval setup (common RAG step)
# Embed the resume chunks and index them in a Chroma vector store.
resume_embeddings = OpenAIEmbeddings(model="text-embedding-3-small")
vectorstore = Chroma.from_documents(
    documents=split_docs,
    embedding=resume_embeddings,
)

# Retriever configured with k=4 results per query.
retriever = vectorstore.as_retriever(
    search_kwargs={"k": 4},
    max_tokens_limit=10000,
)

In [None]:
#Now, Multi-Query or Query-Translation
from langchain.prompts import ChatPromptTemplate
# Default question used by the chain-based cells below.
question = "List all the companies Sam have worked for, please also list the year he worked for those companies"
# Multi Query: Different Perspectives
# Alternative question:
#question = "Please provide summary of Sam work experience"
# Prompt asking the LLM to rewrite the user question as 5 alternative questions,
# separated by newlines, to broaden what the similarity search can find.
template = """You are an AI language model assistant. Your task is to generate 5
different versions of the given user question to retrieve relevant documents from a vector 
database. By generating multiple perspectives on the user question, your goal is to help
the user overcome some of the limitations of the distance-based similarity search. 
Provide these alternative questions separated by newlines.  Original question: {question}"""
# Alternative, resume-specific prompt kept for reference inside a bare string literal (not used).
'''
template = """A resume contains 1. the name, the phone number and 
the address of a person, 2.  the education or attending Schools and years 
of graduation.  3. the companies a person worked for with the job titles, 
starting and ending dates, and responsibilities of that person.  4. the 
awards or social networking information of this person. Original question: {question}"""
'''

# Prompt object built from the template above; its only placeholder is {question}.
prompt_perspectives = ChatPromptTemplate.from_template(template)

from langchain_core.output_parsers import StrOutputParser
from langchain_openai import ChatOpenAI

def _split_queries(llm_text):
    """Split the LLM reply into one query per line, dropping blank lines.

    A reply that contains empty lines (e.g. a blank line between questions) would
    otherwise send empty strings to the retriever as queries.
    """
    return [line.strip() for line in llm_text.splitlines() if line.strip()]


# Question dict -> multi-query prompt -> LLM -> plain text -> list of alternative queries.
generate_queries = (
    prompt_perspectives
    | ChatOpenAI(temperature=0)
    | StrOutputParser()
    | _split_queries
)

In [None]:
# Multi Query: Different Perspectives
from langchain.load import dumps, loads


def get_unique_union(documents: list[list]):
    """Return the unique documents from several retrieval results, in first-seen order.

    Args:
        documents: One list of retrieved documents per generated query.

    Returns:
        A flat list containing each distinct document exactly once. Documents are
        compared by their serialized form. The order of first appearance is kept so
        the result is the same on every run; de-duplicating through a plain ``set``
        yields an arbitrary order that can change between runs.
    """
    # Serialize every document so it can serve as a hashable de-duplication key.
    serialized_docs = [dumps(doc) for sublist in documents for doc in sublist]
    # dict.fromkeys drops duplicates while preserving insertion order.
    unique_docs = list(dict.fromkeys(serialized_docs))
    # Turn the serialized forms back into document objects.
    return [loads(doc) for doc in unique_docs]

# Retrieve
# Alternative question:
#question = "Which Universities Sam have attended to?"
# With set_debug and set_verbose enabled, you can see the intermediate questions
# generated from the original question.

from langchain.globals import set_verbose, set_debug

# Uncomment to turn on LangChain debug/verbose output:
#set_debug(True)
#set_verbose(True)

# Generate the alternative queries, run the retriever once per query (retriever.map()),
# then merge the per-query results with get_unique_union.
retrieval_chain = generate_queries | retriever.map() | get_unique_union
# Example of running the retrieval part on its own:
#docs = retrieval_chain.invoke({"question":question})
#len(docs)

# Uncomment to turn the debug/verbose output off again:
#set_debug(False)
#set_verbose(False)

In [None]:
# Multi Query: Different Perspectives
from operator import itemgetter
from langchain_openai import ChatOpenAI
from langchain_core.runnables import RunnablePassthrough

# RAG
# Simpler answer prompt, kept for reference inside a bare string literal (not used).
'''
template = """Answer the following question based on this context:

{context}

Question: {question}
"""
'''

# Answer prompt in use: tells the model what a resume contains, then supplies the
# retrieved {context} and the user's {question}.
template =  """Answer the question based only on the following context 
with assumptions that A resume contains 1. the name, the phone number and 
the address of a person, 2.  the education or attending Schools and years 
of graduation.  3. the companies a person worked for with the job titles, 
starting and ending dates, and responsibilities of that person.  4. the 
awards or social networking information of this person.

{context}

Question: {question}
"""


prompt = ChatPromptTemplate.from_template(template)

llm = ChatOpenAI(temperature=0)

# "context" is produced by running retrieval_chain on the input dict;
# "question" is taken from the input dict's "question" key.
final_rag_chain = (
    {"context": retrieval_chain, 
     "question": itemgetter("question")} 
    | prompt
    | llm
    | StrOutputParser()
)

# Last expression of the cell, so the answer string is displayed as the cell output.
final_rag_chain.invoke({"question":question})

In [25]:

#Without using the Chain/Pipeline but same processes
'''
generate_queries = (
    prompt_perspectives 
    | ChatOpenAI(temperature=0) 
    | StrOutputParser() 
    | (lambda x: x.split("\n"))
)
retrieval_chain = generate_queries | retriever.map() | get_unique_union

and
final_rag_chain = (
    {"context": retrieval_chain, 
     "question": itemgetter("question")} 
    | prompt
    | llm
    | StrOutputParser()
)

'''
from langchain import hub
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.text_splitter import CharacterTextSplitter
from langchain_community.document_loaders import WebBaseLoader
from langchain_community.vectorstores import Chroma
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough
from langchain_openai import ChatOpenAI, OpenAIEmbeddings
from langchain.load import dumps, loads
from PyPDF2 import PdfReader
# CharacterTextSplitter may give better results than RecursiveCharacterTextSplitter here
# (author's observation). Alternative that was tried:
#text_splitter = RecursiveCharacterTextSplitter(chunk_size=200, chunk_overlap=40)
# NOTE(review): this re-creates the text_splitter already defined in an earlier cell.
text_splitter = CharacterTextSplitter()
# Keeping the default separator ('\n\n') worked better than separator="\n" (author's observation).

def extract_pdf_text(file_path):
    """Read the PDF at ``file_path`` and return all page text joined into a single string."""
    page_texts = [page.extract_text() for page in PdfReader(file_path).pages]
    return ''.join(page_texts)


def get_unique_union(documents: list[list]):
    """Flatten per-query retrieval results into a list of unique documents.

    Args:
        documents: A list holding one list of retrieved documents per query.

    Returns:
        Each distinct document once, ordered by first appearance. Equality is decided
        on the serialized form of a document. Keeping first-seen order (instead of
        going through a ``set``, whose iteration order is arbitrary) makes the
        context handed to the LLM reproducible across runs.
    """
    # Serialized form is hashable, which the documents themselves may not be.
    serialized_docs = [dumps(doc) for sublist in documents for doc in sublist]
    # Order-preserving de-duplication.
    unique_docs = list(dict.fromkeys(serialized_docs))
    return [loads(doc) for doc in unique_docs]



def step_by_step_query(user_question, resume, verbose=True):
    """Answer a question about a resume PDF with multi-query RAG, one explicit step at a time.

    This performs the same steps as a LangChain chain/pipeline, but each step is
    invoked by hand so every intermediate value can be inspected:

    1. Extract the PDF text, split it, and index the chunks in a Chroma vector store.
    2. Ask the LLM for alternative phrasings of the question.
    3. Retrieve chunks for every phrasing and keep the unique ones.
    4. Ask the LLM to answer the original question from those chunks and print the answer.

    Args:
        user_question: The question to answer.
        resume: Path to the resume PDF used as the source of information.
        verbose: When True (the default), also print the raw per-query retriever
            output and the de-duplicated documents. These dumps can be very long.

    Returns:
        None. The generated queries and the final answer are printed.
    """
    # Local imports keep this function independent of which earlier cells were run.
    import uuid
    from langchain.prompts import ChatPromptTemplate

    perspectives_template = """You are an AI language model assistant. Your task is to generate 5 different versions of the given user question to retrieve relevant documents from a vector 
database. By generating multiple perspectives on the user question, your goal is to help the user overcome some of the limitations of the distance-based similarity search.  Provide these alternative questions separated by newlines.  Original question: {question}"""

    answer_template =  """Answer the question based only on the following context with assumptions that A resume contains 1. the name, the phone number and the address of a person, 2.  the education or attending Schools and years 
of graduation.  3. the companies a person worked for with the job titles, starting and ending dates, and responsibilities of that person.  4. the awards or social networking information of this person.

    {context}

    Question: {question}
    """

    question = user_question
    pdf_text = extract_pdf_text(resume)
    pdf_texts = text_splitter.split_text(pdf_text)
    split_docs = text_splitter.create_documents(pdf_texts)

    # Give every call a collection of its own. Without a collection name the chunks go
    # into Chroma's default collection, so repeated calls in one kernel (possibly with
    # different resumes) would accumulate and be retrieved together.
    vectorstore = Chroma.from_documents(documents=split_docs,
                                        embedding=OpenAIEmbeddings(model="text-embedding-3-small"),
                                        collection_name=f"resume-{uuid.uuid4().hex}")
    try:
        retriever = vectorstore.as_retriever(search_kwargs={"k": 4}, max_tokens_limit=10000)

        # Step 1: build the multi-query prompt from the template defined in this function
        # (not from a global left behind by another cell).
        prompt_perspectives = ChatPromptTemplate.from_template(perspectives_template)
        prompt_perspectives_output = prompt_perspectives.invoke({"question": question})

        # Step 2: ask the LLM for the alternative questions; .invoke replaces the
        # deprecated direct-call form llm(...).
        chat_openai_output = ChatOpenAI(temperature=0).invoke(prompt_perspectives_output)
        parser_output = StrOutputParser().invoke(chat_openai_output)

        # Step 3: one query per non-blank line; blank lines would become empty queries.
        generate_queries = [line.strip() for line in parser_output.splitlines() if line.strip()]
        if not generate_queries:
            # The LLM returned nothing usable: fall back to the original question.
            generate_queries = [question]

        print(generate_queries)

        # Step 4: run the retriever once per query, then merge the results.
        retriever_output = retriever.map().invoke(generate_queries)
        if verbose:
            print("output after retriever map()", retriever_output)

        get_union_doc_output = get_unique_union(retriever_output)
        if verbose:
            print("output after get_unique_union", get_union_doc_output)

        # Step 5: answer the original question from the retrieved context.
        prompt = ChatPromptTemplate.from_template(answer_template)
        llm = ChatOpenAI(temperature=0)

        prompt_outputf = prompt.invoke({"context": get_union_doc_output, "question": question})
        llm_output = llm.invoke(prompt_outputf)
        print(llm_output.content)
    finally:
        # Drop the per-call collection so it does not linger in the kernel.
        vectorstore.delete_collection()


In [26]:
question = "List all the companies Sam have worked for, please also list the year he worked for those companies"
# './' instead of '.\': '\s' is an invalid escape sequence in a normal string literal,
# and forward slashes work on Windows as well as POSIX.
resume = './samcyangResume_Gen123.pdf'
step_by_step_query(question, resume)

['1. Can you provide a list of companies where Sam has been employed along with the corresponding years of his employment?', '2. Which companies has Sam worked for, and can you include the years he worked at each company?', "3. I'm interested in knowing the companies that Sam has worked at and the specific years he was employed at each company. Can you provide this information?", '4. Could you list the companies where Sam has gained work experience, including the years he spent at each company?', '5. Please provide a comprehensive list of the companies where Sam has worked, along with the respective years of his employment at each company.']
output after retriever map() [[Document(page_content='Sam C. Yang  \n21226  Ventura Blvd., #358, Woodland Hills California 91364            \nVoice:    US: (818) 929 -9292;  \nEmail: samcyang2004@yahoo.com  \n \nObjective  \nBring  best products to  the market with high profit return  \n \nSummary  \n• 20+ years of Software development/PM experienc

In [20]:
question = "Does Sam have experience in both hardware and software development"
# './' instead of '.\': '\s' is an invalid escape sequence in a normal string literal,
# and forward slashes work on Windows as well as POSIX.
resume = './samcyangResume_Gen123.pdf'
step_by_step_query(question, resume)
# Observation: using the splitter's default separator works better than separator='\n'.

["1. What is Sam's background in hardware and software development?", '2. Can Sam demonstrate proficiency in both hardware and software development?', '3. Has Sam worked on projects involving both hardware and software development?', '4. Is Sam skilled in both hardware and software development?', '5. Does Sam possess expertise in both hardware and software development?']
Yes, Sam has experience in both hardware and software development.


In [21]:
question = "how many years of NPI experience Sam has it and in what companies"
# './' instead of '.\': '\s' is an invalid escape sequence in a normal string literal,
# and forward slashes work on Windows as well as POSIX.
resume = './samcyangResume_Gen456.pdf'
step_by_step_query(question, resume)
# Observation: the years are not answered correctly; the prompt template may need fine-tuning.

['1. What is the duration of NPI experience that Sam possesses and which companies has he gained this experience in?', '2. In which companies has Sam accumulated his NPI experience over the years?', '3. How many years of NPI experience does Sam have, and where has he worked to gain this experience?', '4. Which companies has Sam worked for to acquire his NPI experience, and for how many years?', '5. Can you provide details on the duration of NPI experience Sam has and the companies where he gained this experience?']
Sam has 15+ years of NPI experience. He gained this experience at the following companies:
1. Tarana Wireless
2. Quasar Science
3. Riverbed/Xirrus Inc


In [22]:
# Observation: the LLM gives a wrong answer when asked to list ALL companies where Sam
# used AI and robotics, so this question asks only for the last company.
question = "Does Sam have AI and robotics experience, and please list the last company and years he uses the AI and Robotics experience?"
# './' instead of '.\': '\s' is an invalid escape sequence in a normal string literal,
# and forward slashes work on Windows as well as POSIX.
resume = './samcyangResume_Gen456.pdf'
step_by_step_query(question, resume)
# Observation: the answer is not ideal.

["1. What is Sam's experience with AI and robotics, and can you provide details on the last company he worked for and the years he utilized his AI and robotics skills?", "2. Can you tell me about Sam's background in AI and robotics, including information on his most recent company and the timeframe during which he applied his expertise in this field?", '3. Has Sam worked with AI and robotics before, and if so, could you specify the company where he gained this experience and the duration of his involvement?', "4. I'm interested in Sam's AI and robotics experience. Could you share details about his previous company and the years he spent working with AI and robotics technologies?", "5. What is Sam's history with AI and robotics, and can you provide insights into the last company he was associated with and the period during which he utilized his AI and robotics knowledge?"]
Yes, Sam has AI and robotics experience. The last company where he used his AI and Robotics experience is Miso Robo