In [None]:
'''
RAG - Multi-Query Query Translation
Besides using the LangChain chain (pipeline) illustrated in the LangChain RAG tutorial,
the function in the last cell executes the same logic step by step. This helps me
understand the process and improve the prompts without having to switch to
LangSmith as often.

Please review the LangChain RAG tutorial in detail (part 5 - multi-query query translation):
https://github.com/langchain-ai/rag-from-scratch/blob/main/rag_from_scratch_1_to_4.ipynb

'''


#RAG-- Common RAG 
import os
from dotenv import load_dotenv
load_dotenv()

#if (GCP_PROJECT_ID == None): print ("Not set")
# Pick up both API keys from the process environment (load_dotenv() has already run);
# a key that is not set comes back as None.
OPENAI_API_KEY, LANGCHAIN_API_KEY = (
    os.environ.get(key_name) for key_name in ('OPENAI_API_KEY', 'LANGCHAIN_API_KEY')
)

# LangSmith settings for this notebook.
# NOTE(review): the original comment says 'true' is the value that turns tracing on;
# whether the current value 'trace' enables it depends on the installed
# LangChain/LangSmith version - confirm which behaviour is intended.
os.environ.update({
    'LANGCHAIN_TRACING_V2': 'trace',
    'LANGCHAIN_ENDPOINT': 'https://api.smith.langchain.com',
    "LANGCHAIN_PROJECT": "RAG - Multi-Query-Translation",
})

In [None]:
#https://github.com/langchain-ai/rag-from-scratch/blob/main/rag_from_scratch_1_to_4.ipynb
#RAG-- Common RAG 
# import bs4
from langchain import hub
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.text_splitter import CharacterTextSplitter
from langchain_community.document_loaders import WebBaseLoader
from langchain_community.vectorstores import Chroma
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough
from langchain_openai import ChatOpenAI, OpenAIEmbeddings

# Module-level counter; no other cell visible in this notebook reads or updates it.
global_index = 0
# Author's note: for this resume, CharacterTextSplitter gave much better results than
# RecursiveCharacterTextSplitter, so the recursive variant is kept only as a
# commented-out alternative.
#text_splitter = RecursiveCharacterTextSplitter(chunk_size=200, chunk_overlap=40)
# Splitter shared by the indexing cells below; created with the library defaults.
text_splitter = CharacterTextSplitter()
# Author's note: changing the chunk size had no observable effect with CharacterTextSplitter.
# NOTE(review): presumably because the text contains few of the splitter's default
# separators - confirm before relying on this.

In [None]:
####Example: Document retrieving from Web ####
'''
# Load Documents from Web
loader = WebBaseLoader(
    web_paths=("https://lilianweng.github.io/posts/2023-06-23-agent/",),
    bs_kwargs=dict(
        parse_only=bs4.SoupStrainer(
            class_=("post-content", "post-title", "post-header")
        )
    ),
)
doc = loader.load()
docs = text_splitter.split_documents(doc)
'''

In [None]:
#Indexing
#Loading Document from PDF, Sam's resume from local,  instead of Web
#RAG-- Common Rag

from PyPDF2 import PdfReader
def extract_pdf_text(file_path):
    """Return the text of every page of a PDF as one string.

    Args:
        file_path: Location of the PDF; passed unchanged to ``PdfReader``.

    Returns:
        The text of all pages concatenated in page order, with no separator
        added between pages. Empty string when there are no pages or no page
        yields any text.
    """
    pdf_file = PdfReader(file_path)
    # `or ''` keeps a page that yields no text (None) from raising a TypeError
    # (defensive), and a single join avoids rebuilding the string for every page.
    return ''.join(pg.extract_text() or '' for pg in pdf_file.pages)

# Read the whole resume into one string.
# NOTE(review): the path is an absolute location on the author's Windows machine, so this
# cell only runs there; later cells load the resume from a path relative to the notebook.
pdf_text = extract_pdf_text('c:\\workspace\\python\\csv\\docs\\samcyangResume_Gen123.pdf')
# Cut the text into chunks with the shared splitter ...
pdf_texts = text_splitter.split_text(pdf_text)
# ... and wrap the chunks as documents, the input expected by the vector-store cell below.
split_docs = text_splitter.create_documents(pdf_texts)

In [None]:
#Retrieving
# RAG-- Common Rag

# Embed the resume chunks with OpenAI's "text-embedding-3-small" model and index them in Chroma.
# No collection name or persistence directory is given, so library defaults apply.
vectorstore = Chroma.from_documents(documents=split_docs, 
                                    embedding=OpenAIEmbeddings(model="text-embedding-3-small"))

# Retriever configured with k=4 in search_kwargs (presumably the number of chunks per query).
# NOTE(review): max_tokens_limit does not look like a retriever option and may be silently
# ignored - confirm against the installed LangChain version.
retriever = vectorstore.as_retriever(search_kwargs={"k": 4}, max_tokens_limit=10000)

In [None]:
#Now, Multi-Query or Query-Translation
from langchain.prompts import ChatPromptTemplate
# Question used by the retrieval and answer cells that follow.
question = "List all the companies Sam have worked for, please also list the year he worked for those companies"
# Multi Query: Different Perspectives

# Prompt asking the model for 5 rephrasings of {question}, separated by newlines;
# the generate_queries chain below splits the reply on "\n" to recover them.
template = """You are an AI language model assistant. Your task is to generate 5
different versions of the given user question to retrieve relevant documents from a vector 
database. By generating multiple perspectives on the user question, your goal is to help
the user overcome some of the limitations of the distance-based similarity search. 
Provide these alternative questions separated by newlines.  Original question: {question}"""
# Alternative, resume-specific prompt kept for reference. It sits inside a bare string
# literal, so the assignment in it never runs and `template` keeps the value set above.
'''
template = """A resume contains 1. the name, the phone number and 
the address of a person, 2.  the education or attending Schools and years 
of graduation.  3. the companies a person worked for with the job titles, 
starting and ending dates, and responsibilities of that person.  4. the 
awards or social networking information of this person. Original question: {question}"""
'''

# Prompt object with the single input variable {question}.
prompt_perspectives = ChatPromptTemplate.from_template(template)

from langchain_core.output_parsers import StrOutputParser
from langchain_openai import ChatOpenAI

def _split_queries(llm_text):
    """Split the model's newline-separated reply into individual queries.

    Blank lines (for example an empty line between numbered questions) are dropped so
    that no empty query is sent to the retriever; surrounding whitespace is trimmed.
    """
    return [line.strip() for line in llm_text.split("\n") if line.strip()]


# question -> prompt -> chat model -> plain string -> list of alternative queries.
generate_queries = (
    prompt_perspectives 
    | ChatOpenAI(temperature=0) 
    | StrOutputParser() 
    | _split_queries
)

In [None]:
# Multi Query: Different Perspectives
from langchain.load import dumps, loads


def get_unique_union(documents: list[list]):
    """Merge per-query retrieval results into one list without duplicates.

    Args:
        documents: One list of retrieved documents per query.

    Returns:
        Every distinct document exactly once, in the order it was first seen.
        Two documents count as the same when their ``dumps`` serialisations are
        equal. An empty input gives an empty list.
    """
    # Serialise each document to a string so it can be compared and hashed.
    serialized = (dumps(doc) for sublist in documents for doc in sublist)
    # dict.fromkeys removes duplicates while keeping first-seen order. A set would
    # return them in arbitrary order, making the prompt context differ between runs.
    return [loads(doc) for doc in dict.fromkeys(serialized)]

# Retrieve
#question = "Which Universities Sam have attended to?"
# Retrieve
# with set_debug and set_verbose, you can see the intermediated questions generated from original question.

from langchain.globals import set_verbose, set_debug

#set_debug(True)
#set_verbose(True)

# Full retrieval pipeline: generate the alternative queries, send them through the
# retriever via retriever.map() (presumably one retrieval per query), then merge the
# per-query results into a single de-duplicated list.
retrieval_chain = generate_queries | retriever.map() | get_unique_union
# Run the pipeline for the question defined in the previous cell.
docs = retrieval_chain.invoke({"question":question})
# Last expression of the cell: shows how many unique documents were retrieved.
len(docs)

#set_debug(False)
#set_verbose(False)

In [None]:
# Multi Query: Different Perspectives
from operator import itemgetter
from langchain_openai import ChatOpenAI
from langchain_core.runnables import RunnablePassthrough

# RAG
'''
template = """Answer the following question based on this context:

{context}

Question: {question}
"""
'''

# Answer prompt: describes what a resume usually contains, then supplies the retrieved
# {context} and the user's {question}.
template =  """Answer the question based only on the following context 
with assumptions that A resume contains 1. the name, the phone number and 
the address of a person, 2.  the education or attending Schools and years 
of graduation.  3. the companies a person worked for with the job titles, 
starting and ending dates, and responsibilities of that person.  4. the 
awards or social networking information of this person.

{context}

Question: {question}
"""


prompt = ChatPromptTemplate.from_template(template)

# temperature=0 for the answering model.
llm = ChatOpenAI(temperature=0)

# The dict step builds the prompt inputs: "context" comes from running retrieval_chain
# (defined in the previous cell) on the input, "question" is copied from the input dict.
final_rag_chain = (
    {"context": retrieval_chain, 
     "question": itemgetter("question")} 
    | prompt
    | llm
    | StrOutputParser()
)

# Last expression of the cell: the answer string is shown as the cell output.
final_rag_chain.invoke({"question":question})

In [269]:
#Instead of using a pipeline/chain, run the same logic step by step, which helps to understand
# the process and enables easy debugging
'''
generate_queries = (
    prompt_perspectives 
    | ChatOpenAI(temperature=0) 
    | StrOutputParser() 
    | (lambda x: x.split("\n"))
)
retrieval_chain = generate_queries | retriever.map() | get_unique_union

and
final_rag_chain = (
    {"context": retrieval_chain, 
     "question": itemgetter("question")} 
    | prompt
    | llm
    | StrOutputParser()
)

'''
from langchain import hub
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.text_splitter import CharacterTextSplitter
from langchain_community.document_loaders import WebBaseLoader
from langchain_community.vectorstores import Chroma
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough
from langchain_openai import ChatOpenAI, OpenAIEmbeddings
from langchain.load import dumps, loads
from PyPDF2 import PdfReader
# Author's note: CharacterTextSplitter gave much better results than RecursiveCharacterTextSplitter.
#text_splitter = RecursiveCharacterTextSplitter(chunk_size=200, chunk_overlap=40)
# Re-created here (same as in the earlier setup cell) so this cell can run on its own;
# step_by_step_query below reads this module-level splitter.
text_splitter = CharacterTextSplitter()

def extract_pdf_text(file_path):
    """Return the text of every page of a PDF as one string.

    Args:
        file_path: Location of the PDF; passed unchanged to ``PdfReader``.

    Returns:
        The text of all pages concatenated in page order, with no separator
        added between pages. Empty string when there are no pages or no page
        yields any text.
    """
    pdf_file = PdfReader(file_path)
    # `or ''` keeps a page that yields no text (None) from raising a TypeError
    # (defensive), and a single join avoids rebuilding the string for every page.
    return ''.join(pg.extract_text() or '' for pg in pdf_file.pages)


def get_unique_union(documents: list[list]):
    """Merge per-query retrieval results into one list without duplicates.

    Args:
        documents: One list of retrieved documents per query.

    Returns:
        Every distinct document exactly once, in the order it was first seen.
        Two documents count as the same when their ``dumps`` serialisations are
        equal. An empty input gives an empty list.
    """
    # Serialise each document to a string so it can be compared and hashed.
    serialized = (dumps(doc) for sublist in documents for doc in sublist)
    # dict.fromkeys removes duplicates while keeping first-seen order. A set would
    # return them in arbitrary order, making the prompt context differ between runs.
    return [loads(doc) for doc in dict.fromkeys(serialized)]



def step_by_step_query(user_question, resume):
    """Answer a question about a resume PDF with multi-query RAG, one explicit step at a time.

    This performs the same stages as the chained version
    (``generate_queries | retriever.map() | get_unique_union`` followed by
    ``prompt | llm``), but every intermediate value is a local variable so it can be
    printed or inspected while tuning the prompts.

    Args:
        user_question: The question to answer.
        resume: Path of the resume PDF to index and query.

    Returns:
        None. The model's answer is printed.

    Everything used for the answer (index, retriever, prompts) is built inside this
    function from ``resume``; it only relies on the module-level ``text_splitter``,
    ``extract_pdf_text`` and ``get_unique_union``.
    """
    # Local imports so this cell does not depend on names imported by earlier cells.
    import uuid
    from langchain.prompts import ChatPromptTemplate

    question = user_question

    # Prompt that asks for 5 alternative phrasings of the question, one per line.
    multi_query_template = """You are an AI language model assistant. Your task is to generate 5 different versions of the given user question to retrieve relevant documents from a vector 
database. By generating multiple perspectives on the user question, your goal is to help the user overcome some of the limitations of the distance-based similarity search.  Provide these alternative questions separated by newlines.  Original question: {question}"""

    # Prompt that answers the question from the retrieved context.
    answer_template =  """Answer the question based only on the following context with assumptions that A resume contains 1. the name, the phone number and the address of a person, 2.  the education or attending Schools and years 
of graduation.  3. the companies a person worked for with the job titles, starting and ending dates, and responsibilities of that person.  4. the awards or social networking information of this person.

    {context}

    Question: {question}
    """

    # --- Indexing: PDF -> text -> chunks -> documents -> vector store ---
    pdf_text = extract_pdf_text(resume)
    pdf_texts = text_splitter.split_text(pdf_text)
    split_docs = text_splitter.create_documents(pdf_texts)

    # A unique collection per call keeps chunks of different resumes (and of earlier
    # calls) out of this index.
    # NOTE(review): with the default collection name, Chroma appears to reuse one
    # in-memory collection for the whole process - confirm for the installed version.
    vectorstore = Chroma.from_documents(
        documents=split_docs,
        embedding=OpenAIEmbeddings(model="text-embedding-3-small"),
        collection_name=f"resume-{uuid.uuid4().hex}",
    )
    try:
        retriever = vectorstore.as_retriever(search_kwargs={"k": 4})

        # Step 1: fill the multi-query prompt with the user's question.
        prompt_perspectives = ChatPromptTemplate.from_template(multi_query_template)
        prompt_perspectives_output = prompt_perspectives.invoke({"question": question})
        #print("Output after prompt_perspectives:", prompt_perspectives_output)

        # Step 2: ask the model for the alternative questions.
        chat_openai_output = ChatOpenAI(temperature=0).invoke(prompt_perspectives_output)
        #print("Output after ChatOpenAI:", chat_openai_output)

        # Step 3: reduce the chat message to its text.
        parser_output = StrOutputParser().invoke(chat_openai_output)
        #print("Output after StrOutputParser:", parser_output)

        # Step 4: one query per non-empty line; blank lines would become empty queries.
        queries = [line.strip() for line in parser_output.split("\n") if line.strip()]
        if not queries:
            # Nothing usable came back; fall back to the original question.
            queries = [question]
        #print("Generated queries:", queries)

        # Step 5: run the retriever on every generated query.
        retriever_output = retriever.map().invoke(queries)
        #print("Output after retriever.map():", retriever_output)

        # Step 6: merge the per-query results into one de-duplicated context.
        context_docs = get_unique_union(retriever_output)
        #print("Output after get_unique_union:", context_docs)

        # Step 7: build the answer prompt from THIS resume's context and ask the model.
        prompt = ChatPromptTemplate.from_template(answer_template)
        prompt_output = prompt.invoke({"context": context_docs, "question": question})
        #print("Output from prompt:", prompt_output)

        llm = ChatOpenAI(temperature=0)
        llm_output = llm.invoke(prompt_output)
        #print("Output from llm:", llm_output)
    finally:
        # Drop the per-call collection so indexes do not pile up in the kernel.
        vectorstore.delete_collection()

    print (llm_output.content)


In [272]:
question = "List all the companies Sam have worked for, please also list the year he worked for those companies"
# Forward slash instead of '.\s...': "\s" is an invalid escape sequence (warning on
# current Python), and the forward-slash form also works outside Windows.
resume = './samcyangResume_Gen123.pdf'
step_by_step_query(question, resume)

1. Tarana Wireless - 10/2022 - 12/2023
2. Miso Robotics - 11/2021 - 09/2022
3. Quasar Science - 6/2020 - 11/2021
4. Riverbed/Xirrus Inc - 10/2012 - 9/2019
5. JigoCity/Ecommerce - 1/2011 - 5/2012
6. Concordware International/China Offshore Software Development - 5/2009 - 10/2010
7. Asoka USA - 8/2007 - 8/2008
8. Boingo Wireless - 11/2006 - 3/2007
9. Infospace Mobile Division - 7-11/2006
10. Telemac - 8/2004 - 6/2006


In [274]:
question = "Does Sam have experience in both hardware and software development"
# Forward slash instead of '.\s...': "\s" is an invalid escape sequence (warning on
# current Python), and the forward-slash form also works outside Windows.
resume = './samcyangResume_Gen123.pdf'
step_by_step_query(question, resume)

Based on the provided context, Sam C. Yang has experience in both hardware and software development. This can be inferred from his roles such as NPI Program Manager at Quasar Science, where he worked with internal and external engineer teams on hardware design, and as a Director of Engineering at Boingo Wireless, where he implemented, enhanced, operated, and maintained WISP authentication systems.


In [277]:
question = "how many years of NPI experience Sam has it and in what companies"
# Forward slash instead of '.\s...': "\s" is an invalid escape sequence (warning on
# current Python), and the forward-slash form also works outside Windows.
resume = './samcyangResume_Gen456.pdf'
step_by_step_query(question, resume)

Sam has 15+ years of NPI experience. He gained this experience at the following companies:
1. Tarana Wireless
2. Quasar Science
3. Riverbed/Xirrus Inc


In [285]:
# Author's note: the LLM gives a wrong answer when asked to list ALL companies where
# Sam used AI and robotics, so this question asks only for the last one.
question = "Does Sam have AI and robotics experience, and please list the last company and years he uses the AI and Robotics experience?"
# Forward slash instead of '.\s...': "\s" is an invalid escape sequence (warning on
# current Python), and the forward-slash form also works outside Windows.
resume = './samcyangResume_Gen456.pdf'
step_by_step_query(question, resume)

Yes, Sam has AI and robotics experience. The last company where he used his AI and Robotics experience is Miso Robotics from 11/2021 - 09/2022.
