In [None]:
'''
RAG - Multi-Query Query translation

Multi-Query translation will create similar queries from original question. Based those
queris, this algorithm will retieve the relevent information/document as part of the
prompt context before the final Q&A

step_to_step_query, at end, is based on the LangChain Rag youtube tutorial below, the code from the tutorial are 
listed in the first half of the python notebook. 

This function follows the same logic from the tutorial but without the abstract of Chain/Pipeline. This approach
it helps me to understand the llm processes and also can improve the prompt preparation.
Of course, the langsmith is a great tool as well. 


Input arguments to the step_to_step_fusion are
1. Document: the resume as the source of information
2. Quesition: question string


Please review the RAG tutorial from Langchain in details (part 5 - multi-query query translation)
https://www.youtube.com/watch?v=JChPi0CRnDY

'''
#RAG-- Common Rag
# Query Translation:   Multi-Query, Fusion and Decompsoition 
import os
from dotenv import load_dotenv
load_dotenv()

#if (GCP_PROJECT_ID == None): print ("Not set")
OPENAI_API_KEY = os.getenv('OPENAI_API_KEY')
LANGCHAIN_API_KEY = os.getenv('LANGCHAIN_API_KEY')
os.environ['LANGCHAIN_TRACING_V2'] = 'false'  #true for trace
os.environ['LANGCHAIN_ENDPOINT'] = 'https://api.smith.langchain.com'
os.environ["LANGCHAIN_PROJECT"] = "RAG - Multi-Query-Translation"

In [None]:
#RAG-- Common Rag
# Query Translation:   Multi-Query, Fusion and Decompsoition 
from langchain import hub
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.text_splitter import CharacterTextSplitter
from langchain_community.document_loaders import WebBaseLoader
from langchain_community.vectorstores import Chroma
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough
from langchain_openai import ChatOpenAI, OpenAIEmbeddings


#Using CharaterTextSplitter may have better result than RecursiveCharacterTextSplitter
#text_splitter = RecursiveCharacterTextSplitter(chunk_size=200, chunk_overlap=40)
text_splitter = CharacterTextSplitter()
#Chunk size has no effects on CharacterTextSplitter

In [None]:
####Example: Document retriving from Web ####
'''
# Load Documents from Web
loader = WebBaseLoader(
    web_paths=("https://lilianweng.github.io/posts/2023-06-23-agent/",),
    bs_kwargs=dict(
        parse_only=bs4.SoupStrainer(
            class_=("post-content", "post-title", "post-header")
        )
    ),
)
doc = loader.load()
docs = text_splitter.split_documents(doc)
'''

In [None]:
#RAG-- Common Rag
# Query Translation:   Multi-Query, Fusion and Decompsoition 

from PyPDF2 import PdfReader
def extract_pdf_text(file_path):
    pdf_file = PdfReader(file_path)
    text_data = ''
    for pg in pdf_file.pages:
        text_data += pg.extract_text()
    return text_data
resume_name = ".\\docs\\samcyangResume_Gen123.pdf"
pdf_text = extract_pdf_text(resume_name)
pdf_texts = text_splitter.split_text(pdf_text)
split_docs = text_splitter.create_documents(pdf_texts)

In [None]:
#Retrieving
# RAG-- Common Rag

vectorstore = Chroma.from_documents(documents=split_docs, 
                                    embedding=OpenAIEmbeddings(model="text-embedding-3-small"))

retriever = vectorstore.as_retriever(search_kwargs={"k": 4}, max_tokens_limit=10000)

In [None]:
#Now, Multi-Query or Query-Translation
from langchain.prompts import ChatPromptTemplate
question = "List all the companies Sam have worked for, please also list the year he worked for those companies"
# Multi Query: Different Perspectives
#question = "Please provide summary of Sam work experience"
template = """You are an AI language model assistant. Your task is to generate 5
different versions of the given user question to retrieve relevant documents from a vector 
database. By generating multiple perspectives on the user question, your goal is to help
the user overcome some of the limitations of the distance-based similarity search. 
Provide these alternative questions separated by newlines.  Original question: {question}"""
'''
template = """A resume may include 0. The file name is listed in the first line,
1. The resume contains the name, the phone number and the address of a person, \n
2. The resume contains the degree and attending university and years of graduation.  \n
3. The resume contains the work history of all the companies a person worked for.  
Each job may contain the start and end dates, and job title. If the end date is 
missing then, the job is the current job.  
4. Each job has the job titles, and associated responsibilities or experience of 
that person. Each job is independent from other jobs in the same document. 
5. The resume may contain any awards or social networking information of this person. \n
6. The first job should have the farest year from current date, the last or current job 
has the closest to the current date.  From the last and first job you should be able 
to calculate the total year a person has worked for each job and total years of a person
has been worked.\n 

Original question: {question}"""
'''

prompt_perspectives = ChatPromptTemplate.from_template(template)

from langchain_core.output_parsers import StrOutputParser
from langchain_openai import ChatOpenAI

generate_queries = (
    prompt_perspectives 
    | ChatOpenAI(temperature=0) 
    | StrOutputParser() 
    | (lambda x: x.split("\n"))
)

In [None]:
# Multi Query: Different Perspectives
from langchain.load import dumps, loads


def get_unique_union(documents: list[list]):
    """ Unique union of retrieved docs """
    # Flatten list of lists, and convert each Document to string
    flattened_docs = [dumps(doc) for sublist in documents for doc in sublist]
    # Get unique documents
    unique_docs = list(set(flattened_docs))
    # Return
    return [loads(doc) for doc in unique_docs]

# Retrieve
#question = "Which Universities Sam have attended to?"
# Retrieve
# with set_debug and set_verbose, you can see the intermediated questions generated from original question.

from langchain.globals import set_verbose, set_debug

#set_debug(True)
#set_verbose(True)

retrieval_chain = generate_queries | retriever.map() | get_unique_union
#docs = retrieval_chain.invoke({"question":question})
#len(docs)

#set_debug(False)
#set_verbose(False)

In [None]:
# Multi Query: Different Perspectives
from operator import itemgetter
from langchain_openai import ChatOpenAI
from langchain_core.runnables import RunnablePassthrough

# RAG
'''
template = """Answer the following question based on this context:

{context}

Question: {question}
"""
'''

template =  """Answer the question based only on the following context 
with assumptions that A resume 0. The file name is listed in the first line,
1. The resume contains the name, the phone number and the address of a person, \n
2. The resume contains the degree and attending university and years of graduation.  \n
3. The resume contains the work history of all the companies a person worked for.  
Each job may contain the start and end dates, and job title. If the end date is 
missing then, the job is the current job.  
4. Each job has the job titles, and associated responsibilities or experience of 
that person. Each job is independent from other jobs in the same document. 
5. The resume may contain any awards or social networking information of this person. \n
6. The first job should have the farest year from current date, the last or current job 
has the closest to the current date.  From the last and first job you should be able 
to calculate the total year a person has worked for each job and total years of a person
has been worked.\n

{context}

Question: {question}
"""


prompt = ChatPromptTemplate.from_template(template)

llm = ChatOpenAI(temperature=0)

final_rag_chain = (
    {"context": retrieval_chain, 
     "question": itemgetter("question")} 
    | prompt
    | llm
    | StrOutputParser()
)

final_rag_chain.invoke({"question":question})

In [9]:


#Without using the Chain/Pipeline but same processes
'''
generate_queries = (
    prompt_perspectives 
    | ChatOpenAI(temperature=0) 
    | StrOutputParser() 
    | (lambda x: x.split("\n"))
)
retrieval_chain = generate_queries | retriever.map() | get_unique_union

and
final_rag_chain = (
    {"context": retrieval_chain, 
     "question": itemgetter("question")} 
    | prompt
    | llm
    | StrOutputParser()
)

'''
from langchain import hub
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.text_splitter import CharacterTextSplitter
from langchain_community.document_loaders import WebBaseLoader
from langchain_community.vectorstores import Chroma
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough
from langchain_openai import ChatOpenAI, OpenAIEmbeddings
from langchain.prompts import ChatPromptTemplate
from langchain.load import dumps, loads
from PyPDF2 import PdfReader
#Using CharaterTextSplitter maybe have better results than RecursiveCharacterTextSplitter
#text_splitter = RecursiveCharacterTextSplitter(chunk_size=200, chunk_overlap=40)
text_splitter = CharacterTextSplitter()
#keep the default separator ('\n\n') is better than separator="\n"

def extract_pdf_text(file_path):
    pdf_file = PdfReader(file_path)
    text_data = ''
    for pg in pdf_file.pages:
        text_data += pg.extract_text()
    return text_data


def get_unique_union(documents: list[list]):
    """ Unique union of retrieved docs """
    # Flatten list of lists, and convert each Document to string
    flattened_docs = [dumps(doc) for sublist in documents for doc in sublist]
    # Get unique documents
    unique_docs = list(set(flattened_docs))
    # Return
    return [loads(doc) for doc in unique_docs]



def step_by_step_query(user_question, resume):
    from langchain_core.output_parsers import CommaSeparatedListOutputParser
    #question = "List all the companies Sam have worked for, please also list the year he worked for those companies"
# Multi Query: Different Perspectives
    question = user_question
    pdf_text = extract_pdf_text(resume)
    pdf_texts = text_splitter.split_text(pdf_text)
    split_docs = text_splitter.create_documents(pdf_texts)
    ''' Don't use OpenAIEmbeddings
    client = OpenAI()
    embedding_repsonse = client.embeddings.create(input=sentences, 
                                      model="text-embedding-3-large")
     '''
    
    vectorstore = Chroma.from_documents(documents=split_docs, 
                                    embedding=OpenAIEmbeddings(model="text-embedding-3-small"))
    retriever = vectorstore.as_retriever(search_kwargs={"k": 4}, max_tokens_limit=10000)
    
    template = """You are an AI language model assistant. Your task is to generate 5 different versions of the given user question to retrieve relevant documents from a vector 
database. By generating multiple perspectives on the user question, your goal is to help the user overcome some of the limitations of the distance-based similarity search.  Provide these alternative questions separated by newlines.  Original question: {question}"""
    prompt_perspectives = ChatPromptTemplate.from_template(template)

    prompt_perspectives_output =  prompt_perspectives.invoke(question)
    #print("Output after prompt_perspectives:", prompt_perspectives_output)

    chat_openai_output = ChatOpenAI(temperature=0)(prompt_perspectives_output)
    ##print("Output after ChatOpenAI:", chat_openai_output)
    #output_parser = CommaSeparatedListOutputParser()
    parser_output = StrOutputParser().invoke(chat_openai_output)
    #print("output after StrOutputParser:", parser_output)
    #str_output_parser_output = output_parser(chat_openai_output)
    #print("Output after StrOutputParser:", str_output_parser_output)

    generate_queries= (lambda x: x.split("\n"))(parser_output)

    print (generate_queries)

    #retrieval_chain = generate_queries | retriever.map() | get_unique_union
    # and 
    #docs = retrieval_chain.invoke({"question":question})

    retriever_output = retriever.map().invoke(generate_queries)

    #print("output after retriever map()", retriever_output)

    get_union_doc_output = get_unique_union(retriever_output)

    #print("output after get_unique_union", get_union_doc_output)

    #docs = retrieval_chain.invoke({"question":question})
    #docs = get_union_doc_output

    #print("output after retriever_chain", docs)

    template =  """Answer the question based only on the following context 
    with assumptions that A resume may include
    0. The file name is listed in the first line,
    1. The resume contains the name, the phone number and the address of a person, \n
    2. The resume contains the degree and attending university and years of graduation.  \n
    3. The resume contains the work history of all the companies a person worked for.  
    Each job may contain the start and end dates, and job title. If the end date is 
    missing then, the job is the current job.  
    4. Each job has the job titles, and associated responsibilities or experience of 
    that person. Each job is independent from other jobs in the same document. 
    5. The resume may contain any awards or social networking information of this person. \n
    6. The first job should have the farest year from current date, the last or current job 
    has the closest to the current date.  From the last and first job you should be able 
    to calculate the total year a person has worked for each job and total years of a person
    has been worked.\n
    {context}

    Question: {question}
    """


    #prompt_perspectives = ChatPromptTemplate.from_template(template)

    prompt = ChatPromptTemplate.from_template(template)

    llm = ChatOpenAI(temperature=0)


    #docsf = retriever.map().invoke({"question":question})

    #print("output after retriever_chain again.. ", docsf)
    prompt_outputf =  prompt.invoke({"context": get_union_doc_output[0], "question":question})
    #print("output from prompt again", prompt_outputf)
    llm_output = llm(prompt_outputf)
    #print("Output from llm again", llm_output)
    print (llm_output.content)


In [10]:
question = "List all the companies Sam have worked for, please also list the year he worked for those companies"
resume='.\docs\samcyangResume_Gen123.pdf'
step_by_step_query(question, resume)

['1. Can you provide a list of companies where Sam has been employed along with the corresponding years of his employment?', '2. Which companies has Sam worked for, and can you include the years during which he was employed at each company?', '3. Please list the companies that Sam has worked at, and include the years of his employment at each company.', '4. Could you give me a rundown of the companies where Sam has held positions, including the years he worked at each company?', "5. I'm interested in knowing the companies that Sam has worked for and the specific years he was employed at each company. Can you provide this information?"]
1. Tarana Wireless - 10/2022 - 12/2023
2. Miso Robotics - 11/2021 - 09/2022
3. Quasar Science - 6/2020 - 11/2021
4. Riverbed/Xirrus Inc - 10/2012 - 9/2019
5. JigoCity/Ecommerce - 1/2011 - 5/2012
6. Concordware International/China Offshore Software Development - 5/2009 - 10/2010
7. Asoka USA - 8/2007 - 8/2008
8. Boingo Wireless - 11/2006 - 3/2007
9. Infos

In [11]:
question = "Does Sam have experience in both hardware and software development"
resume='.\docs\samcyangResume_Gen123.pdf'
step_by_step_query(question, resume)
#using default separator in splitter is better than setting separator='\n'

["1. What is Sam's background in hardware and software development?", '2. Can Sam demonstrate proficiency in both hardware and software development?', '3. Has Sam worked on projects involving both hardware and software development?', '4. Is Sam skilled in both hardware and software development?', "5. Are there any examples of Sam's experience in hardware and software development?"]
Yes, Sam has experience in both hardware and software development.


In [12]:
question = "how many years of NPI experience Sam has it and in what companies"
resume='.\docs\samcyangResume_Gen123.pdf'
step_by_step_query(question, resume)
#Doesn't answer the years correctly, may need to find-tune the template#

['1. What is the duration of NPI experience that Sam possesses, and which companies has he gained this experience in?', '2. In which companies has Sam accumulated his NPI experience, and for how many years?', '3. How long has Sam been involved in NPI, and which companies have contributed to his experience?', '4. Can you provide information on the number of years Sam has worked in NPI, and the companies where he has gained this experience?', '5. Which companies has Sam worked for to gain his NPI experience, and for how many years has he been involved in this field?']
Sam has 15+ years of NPI experience and has worked in the following companies for NPI roles:
1. Tarana Wireless from 10/2022 to 12/2023
2. Miso Robotics from 11/2021 to 09/2022
3. Quasar Science from 6/2020 to 11/2021
4. Riverbed/Xirrus Inc from 10/2012 to 9/2019


In [13]:
# llm responses the wrong answer if you ask list all companies he uses AI and robotics,
question = "Does Sam have AI and robotics experience, and please list the last company and years he uses the AI and Robotics experience?"
resume='.\docs\samcyangResume_Gen123.pdf'
step_by_step_query(question, resume)
#The answer is not ideas

["1. What is Sam's experience with AI and robotics, and can you provide details on the last company he worked for and the years he utilized his AI and robotics skills?", "2. Can you tell me about Sam's background in AI and robotics, including information on the most recent company where he applied his expertise and the duration of his involvement?", "3. How extensive is Sam's experience in AI and robotics, and could you specify the company where he last worked with these technologies along with the timeframe?", "4. I'm interested in Sam's AI and robotics experience - could you share details about the last company he was employed at and the years during which he gained experience in these fields?", "5. What can you tell me about Sam's involvement in AI and robotics, particularly at his most recent company? Please include the years he spent utilizing his skills in these areas."]
Yes, Sam has AI and robotics experience. The last company where he uses AI and Robotics experience is Miso Rob