In [None]:
'''
RAG - Multi-Query Query translation

Besides using the LangChain Chain/Pipeline as illustrated in the LangChain RAG tutorial below, this program reads the PDF resumes from a directory; each resume is saved as a document without duplication.
We also create a single step_by_step_fusion(question, template) function that executes the same logic but without the abstraction of Chain/Pipeline. It helps me understand the process and improve the template (prompt or question) preparation. Of course, LangSmith is a great tool as well.
The template input passed to step_by_step_fusion describes the format you want ChatGPT to return,

so that the final Q&A step gives a better answer.
Please review the RAG tutorial from LangChain in detail (part 6 - multi-fusion query translation): https://www.youtube.com/watch?v=77qELPbNgxA
'''

#RAG-- Common Rag
# Multi-Query, Fusion and Decomposition

import os
from dotenv import load_dotenv
# Pull variables from a local .env file (if present) into the process environment.
load_dotenv()

# Module-level handles on the API keys; each one is None when its variable is unset.
OPENAI_API_KEY, LANGCHAIN_API_KEY = (
    os.environ.get(key_name) for key_name in ('OPENAI_API_KEY', 'LANGCHAIN_API_KEY')
)

# Tracing configuration: stays off unless the flag is switched to 'true'.
os.environ.update({
    'LANGCHAIN_TRACING_V2': 'false',
    'LANGCHAIN_ENDPOINT': 'https://api.smith.langchain.com',
})

In [None]:

#RAG-- Common Rag
# import bs4
from langchain import hub
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.text_splitter import CharacterTextSplitter
from langchain_community.document_loaders import WebBaseLoader
from langchain_community.vectorstores import Chroma
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough
from langchain_openai import ChatOpenAI, OpenAIEmbeddings

# Project name exported for LangChain tracing (endpoint configured in the first cell).
os.environ["LANGCHAIN_PROJECT"] = "RAG Fusion"

# Alternative splitter kept for reference (small, overlapping chunks):
#text_splitter = RecursiveCharacterTextSplitter(chunk_size=200, chunk_overlap=40)
# Splitter with the library's default settings; the indexing cell uses it to wrap each resume's text in documents.
text_splitter = CharacterTextSplitter()
# Default query used by the retrieval and RAG-Fusion cells below.
question = "Please identify all the documents that have the robotic experience"

In [None]:
####Example: Document retrieval from the Web ####
'''
# Load Documents from Web
loader = WebBaseLoader(
    web_paths=("https://lilianweng.github.io/posts/2023-06-23-agent/",),
    bs_kwargs=dict(
        parse_only=bs4.SoupStrainer(
            class_=("post-content", "post-title", "post-header")
        )
    ),
)
doc = loader.load()
docs = text_splitter.split_documents(doc)
'''

In [None]:
#Indexing
#Loading Document from PDF, Sam's resume from local,  instead of Web
#RAG-- Common Rag
'''
from PyPDF2 import PdfReader
def extract_pdf_text(file_path):
    pdf_file = PdfReader(file_path)
    text_data = ''
    for pg in pdf_file.pages:
        text_data += pg.extract_text()
    return text_data

pdf_text = extract_pdf_text('c:\\workspace\\python\\csv\\docs\\samcyangResume_Gen123.pdf')
pdf_texts = text_splitter.split_text(pdf_text)
split_docs = text_splitter.create_documents(pdf_texts)
'''

import uuid
import hashlib
from PyPDF2 import PdfReader
def extract_pdf_text(file_path):
    """Return the text of every page of a PDF, concatenated in page order.

    Args:
        file_path: Location of the PDF; passed unchanged to ``PdfReader``.

    Returns:
        str: The page texts joined without any separator. A page whose
        ``extract_text()`` yields ``None`` or ``''`` contributes nothing.
    """
    pdf_file = PdfReader(file_path)
    # ``or ''`` guards against a page yielding None, which would make string
    # concatenation raise TypeError; join avoids repeated string copying.
    return ''.join((pg.extract_text() or '') for pg in pdf_file.pages)


# Directory scanned for PDF resumes. Built with os.path.join so the notebook also
# runs on non-Windows systems, where the literal ".\\docs\\" is not a directory path.
resume_dir = os.path.join(".", "docs")
pdf_text = []


#Change 1
# One embedding model and one Chroma collection ("langchain") shared by the cells below.
embeddings = OpenAIEmbeddings()
vectorstore = Chroma("langchain", embeddings)

def create_uuid_from_string(val: str):
    """Map a string to the UUID formed from the MD5 digest of its UTF-8 bytes.

    The same input always produces the same UUID.
    """
    digest = hashlib.md5(val.encode("utf-8")).digest()
    return uuid.UUID(bytes=digest)

# Index every PDF resume in resume_dir as a Chroma entry whose id is the file name.
for file in os.listdir(resume_dir):
    filepath = os.path.join(resume_dir, file)
    if not filepath.endswith('.pdf'):
        continue

    #Change 2
    # The file path is placed on the first line of the text so that answers can
    # name the resume a passage came from.
    pdf_text = ["File Name: " + filepath + "  \n" + extract_pdf_text(filepath)]
    split_docs = text_splitter.create_documents(pdf_text)

    # Re-running the cell must not duplicate a resume: drop an entry already
    # stored under this id before adding the fresh copy.
    existing = vectorstore.get(file)
    if existing['ids'] != [] and existing['ids'][0] == file:
        print("Deleting Duplication .....", file)
        vectorstore.delete(file)

    # Add documents back to collection
    try:
        # An explicit ids list is required; passing the bare string would be
        # treated as a sequence of single-character ids.
        # NOTE(review): ids has exactly one entry, so this presumably fails when
        # the splitter returns more than one chunk for a long resume - confirm.
        langchain_ids = vectorstore.add_documents(ids=[file], documents=split_docs)
        print("Adding Langchain ID - ", langchain_ids, " File Name - ", file)
        #langchain_ids should be equal to file str
    except Exception as exc:
        # Catch Exception (not a bare except) so Ctrl-C still interrupts the
        # cell, remove any partial entry, and report why the add failed.
        vectorstore.delete(file)
        print("Can't add.. ", file, "-", repr(exc))


In [None]:
#Retrieving
# RAG-- Common Rag
#Change 1
# Earlier approach kept for reference (inactive string literal): build a fresh
# store from split_docs instead of reusing the vectorstore filled above.
'''vectorstore = Chroma.from_documents(documents=split_docs, 
                                    embedding=OpenAIEmbeddings(model="text-embedding-3-small"))
'''
# Retriever over the shared vectorstore; search_kwargs asks for 2 results per query.
# NOTE(review): max_tokens_limit does not look like a retriever option and may be
# silently ignored - confirm against the installed LangChain version.
retriever = vectorstore.as_retriever(search_kwargs={"k": 2}, max_tokens_limit=10000)

In [None]:
#RAG-- Simple Indexing does not work well;
#Retrieving the relevant embedding documents
#
# Baseline: query the retriever with the raw question (no query rewriting) so the
# result can be compared with the RAG-Fusion output further down.
# invoke() is the Runnable entry point that replaces the deprecated
# get_relevant_documents().
docs = retriever.invoke(question)
docs

In [None]:
#RAG-Fusion

from langchain.prompts import ChatPromptTemplate

# Prompt asking the model to rewrite the user's question into three
# resume-focused search queries. The text is sent to the model verbatim; each
# "\n" escape is followed by a real line break, so lines are separated by a blank line.
template = """You are a helpful assistant that generates multiple sub-questions related to an input question about a resume. \n
The goal is generate multiple search queries related to the experience listed in a resume, and each sub-question that can be answers in separately \n
However, you must keep the main point in the original questions \n
Generate multiple search queries related to: {question} \n
Output (3 queries):"""
# RAG-Fusion: Related
# Generic four-query variant of the prompt, kept for reference (inactive string literal).
'''
template = """You are a helpful assistant that generates multiple search queries based on a single input query. \n
Generate multiple search queries related to: {question} \n
Output (4 queries):"""
'''
# Wrap the active template; {question} is its only placeholder.
prompt_rag_fusion = ChatPromptTemplate.from_template(template)

In [None]:
#RAG-Fusion
from langchain_core.output_parsers import StrOutputParser
from langchain_openai import ChatOpenAI
# Chat model with temperature=0, shared by the query-generation and answer chains.
llm = ChatOpenAI(temperature=0)
# prompt -> model -> plain string -> list of queries (one per output line).
# Blank lines in the model output are dropped and surrounding whitespace is
# stripped, so no empty query is handed to the retriever downstream.
generate_queries = (
    prompt_rag_fusion
    | llm
    | StrOutputParser()
    | (lambda x: [line.strip() for line in x.split("\n") if line.strip()])
)



In [None]:
  #step by step of the generate_queries chain above
  '''
  prompt_perspectives_output =  prompt_rag_fusion.invoke(question)
    #print("Output after prompt_perspectives:", prompt_perspectives_output)

  chat_openai_output = ChatOpenAI(temperature=0)(prompt_perspectives_output)
    ##print("Output after ChatOpenAI:", chat_openai_output)
    #output_parser = CommaSeparatedListOutputParser()
  parser_output = StrOutputParser().invoke(chat_openai_output)
    #print("output after StrOutputParser:", parser_output)
    #str_output_parser_output = output_parser(chat_openai_output)
    #print("Output after StrOutputParser:", str_output_parser_output)

  final_output = (lambda x: x.split("\n"))(parser_output)
    #print("Final Output:", final_output)

  generate_queries = final_output
  final_output
'''

In [None]:
#RAG-Fusion
from langchain.load import dumps, loads

def reciprocal_rank_fusion(results: list[list], k=60, *, verbose=False):
    """Fuse several ranked document lists into one ranking (Reciprocal Rank Fusion).

    Each document receives ``1 / (rank + k)`` for every list it appears in,
    where ``rank`` is its 0-based position in that list; the contributions are
    summed per document.

    Args:
        results: One ranked list of documents per query. Documents must be
            serializable with ``dumps``: the serialized string is the key used
            to recognise the same document across lists.
        k: Constant added to the rank in the RRF formula.
        verbose: When True, print per-list progress and the final ranking
            (debugging aid). Off by default so that a normal run does not dump
            every retrieved document to the cell output.

    Returns:
        list[tuple]: ``(document, fused_score)`` pairs sorted by score, highest
        first. Empty when ``results`` is empty.
    """
    # Serialized document -> accumulated score.
    fused_scores = {}
    if verbose:
        print("results....", results)

    for docs in results:
        if verbose:
            print("docs... ", docs)
        for rank, doc in enumerate(docs):
            doc_str = dumps(doc)
            if verbose:
                print("rank...", rank, "doc..", doc_str)
            # RRF contribution of this occurrence: 1 / (rank + k).
            fused_scores[doc_str] = fused_scores.get(doc_str, 0) + 1 / (rank + k)
        if verbose:
            print("Fused Scores::", len(fused_scores))
            for score in fused_scores.values():
                print("....", score)

    # Highest fused score first; documents are deserialized back from their keys.
    reranked_results = [
        (loads(doc), score)
        for doc, score in sorted(fused_scores.items(), key=lambda x: x[1], reverse=True)
    ]
    if verbose:
        print(reranked_results)
    return reranked_results

# Full retrieval pipeline: generate the queries, run the retriever once per
# query (.map()), then fuse the per-query rankings. The plain function is
# combined into the chain by the | operator.
retrieval_chain_rag_fusion = generate_queries | retriever.map() | reciprocal_rank_fusion
docs = retrieval_chain_rag_fusion.invoke({"question": question})
# Number of (document, score) pairs left after fusion.
len(docs)

In [None]:
# Show the fused (document, score) pairs computed in the previous cell.
docs

In [None]:
#RAG-Fusion
from langchain_core.runnables import RunnablePassthrough
from operator import itemgetter
# RAG
# Answer prompt: the model receives the fused retrieval results as context and
# is asked to name the two best-matching resume files.
template = """Answer the following question based on these context \n
and return two documents that can best match the question. Please note, the first line
of each document contains the filename, you only need to return the file names.:

{context}

Question: {question}
"""

prompt = ChatPromptTemplate.from_template(template)

# Prompt inputs: the question is passed through unchanged, while the context is
# produced by running the RAG-Fusion retrieval chain on the same input dict.
rag_inputs = {
    "question": itemgetter("question"),
    "context": retrieval_chain_rag_fusion,
}
final_rag_chain = rag_inputs | prompt | llm | StrOutputParser()

final_rag_chain.invoke({"question": question})

In [None]:
# step_by_step_fusion())

import os
from dotenv import load_dotenv
from langchain import hub
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.text_splitter import CharacterTextSplitter
from langchain_community.document_loaders import WebBaseLoader
from langchain_community.vectorstores import Chroma
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough
from langchain_openai import ChatOpenAI, OpenAIEmbeddings
import uuid
import hashlib
from PyPDF2 import PdfReader
from langchain.load import dumps, loads
from langchain_core.runnables import RunnablePassthrough
from operator import itemgetter
# Same default question as in the earlier configuration cell, re-declared for this cell.
question = "Please identify all the documents that have the robotic experience"

def step_by_step_fusion(question_input, template_input, resume_dir=os.path.join(".", "docs")):
    """Run the RAG-Fusion pipeline one explicit step at a time and print the answer.

    Steps: index the PDF resumes found in ``resume_dir`` into the "langchain"
    Chroma collection, expand the question into several search queries,
    retrieve documents for each query, fuse the rankings with reciprocal rank
    fusion, and ask the chat model to answer from the fused context.

    Relies on the notebook-level ``prompt_rag_fusion`` (the query-generation
    prompt) defined in an earlier cell.

    Args:
        question_input: The user's question.
        template_input: Instruction text placed before the ``{context}`` and
            ``{question}`` slots of the answer prompt. The combined text is
            parsed as a prompt template, so literal braces in it would be
            read as placeholders.
        resume_dir: Directory scanned (non-recursively) for ``.pdf`` files.
            Defaults to ``./docs``, built with ``os.path.join`` so it is valid
            on every platform.

    Returns:
        None. The model's answer is printed.
    """
    # Imported here so the function does not rely on an import made in another cell.
    from langchain.prompts import ChatPromptTemplate

    question = question_input
    load_dotenv()

    os.environ['LANGCHAIN_TRACING_V2'] = 'false'  #true for trace
    os.environ['LANGCHAIN_ENDPOINT'] = 'https://api.smith.langchain.com'
    os.environ["LANGCHAIN_PROJECT"] = "RAG Fusion"

    def extract_pdf_text(file_path):
        """Return the concatenated text of all pages of the PDF at file_path."""
        pdf_file = PdfReader(file_path)
        # ``or ''`` keeps a page that yields None from breaking the concatenation.
        return ''.join((pg.extract_text() or '') for pg in pdf_file.pages)

    def reciprocal_rank_fusion(results: list[list], k=60):
        """Fuse ranked document lists; returns (document, score) pairs, best first.

        Each occurrence of a document at 0-based position ``rank`` adds
        ``1 / (rank + k)`` to that document's score.
        """
        # Serialized document -> accumulated score; the serialized form is the
        # key that identifies the same document across lists.
        fused_scores = {}
        for docs in results:
            for rank, doc in enumerate(docs):
                doc_str = dumps(doc)
                fused_scores[doc_str] = fused_scores.get(doc_str, 0) + 1 / (rank + k)
        return [
            (loads(doc), score)
            for doc, score in sorted(fused_scores.items(), key=lambda x: x[1], reverse=True)
        ]

    #---------------------------Start
    embeddings = OpenAIEmbeddings()
    vectorstore = Chroma("langchain", embeddings)
    #text_splitter = RecursiveCharacterTextSplitter(chunk_size=200, chunk_overlap=40)
    text_splitter = CharacterTextSplitter()

    # --- Indexing: one Chroma entry per resume, keyed by file name -----------
    for file in os.listdir(resume_dir):
        filepath = os.path.join(resume_dir, file)
        if not filepath.endswith('.pdf'):
            continue

        #Change 2
        # The file path goes on the first line so answers can cite the resume.
        pdf_text = ["File Name: " + filepath + "  \n" + extract_pdf_text(filepath)]
        split_docs = text_splitter.create_documents(pdf_text)

        # Drop an entry already stored under this id so re-runs do not duplicate it.
        existing = vectorstore.get(file)
        if existing['ids'] != [] and existing['ids'][0] == file:
            vectorstore.delete(file)

        # Add documents back to collection
        try:
            # An explicit ids list is required; a bare string would be treated
            # as a sequence of single-character ids.
            # NOTE(review): ids has exactly one entry, so this presumably fails
            # when the splitter returns more than one chunk - confirm.
            vectorstore.add_documents(ids=[file], documents=split_docs)
        except Exception as exc:
            # Catch Exception (not a bare except) so Ctrl-C still works, remove
            # any partial entry, and report why the add failed.
            vectorstore.delete(file)
            print("Can't add.. ", file, "-", repr(exc))

    retriever = vectorstore.as_retriever(search_kwargs={"k": 2}, max_tokens_limit=10000)

    # One chat model instance serves both query generation and the final answer.
    chat_model = ChatOpenAI(temperature=0)

    # --- Query generation ------------------------------------------------------
    prompt_perspectives_output = prompt_rag_fusion.invoke({"question": question})
    chat_openai_output = chat_model.invoke(prompt_perspectives_output)
    parser_output = StrOutputParser().invoke(chat_openai_output)
    # One query per non-blank output line; blank lines would otherwise reach
    # the retriever as empty queries.
    final_output = [line.strip() for line in parser_output.split("\n") if line.strip()]

    # --- Retrieval and fusion --------------------------------------------------
    retriever_output = retriever.map().invoke(final_output)

    reciprocal_output = reciprocal_rank_fusion(retriever_output)

    # --- Answer generation -----------------------------------------------------
    template = template_input + """

    {context}

    Question: {question}
    """

    prompt = ChatPromptTemplate.from_template(template)

    prompt_output = prompt.invoke({"context": reciprocal_output, "question":question})
    llm_output = chat_model.invoke(prompt_output)
    print (llm_output.content)

In [None]:
# Ask for the two best-matching resumes. `template` is the instruction text that
# step_by_step_fusion places ahead of the retrieved context and the question.
question = "Please identify all the documents that have the robotic experience"
template =  """Answer the following question based on these context \n
    and return two documents that can best match the question. Please note, the first line
    of each document contains the filename, you only need to return the file names.:
    """
step_by_step_fusion(question, template)


In [91]:
# Free-form question with a minimal instruction template; the answer is printed below.
question = "Please list all the companies that Sam Yang has worked for"
template =  """Answer the following question based on these context \n"""
step_by_step_fusion(question, template)


1. Tarana Wireless
2. Miso Robotics
3. Quasar Science
4. Riverbed/Xirrus Inc
5. JigoCity/Ecommerce
6. Concordware International/China Offshore Software Development
7. Asoka USA
8. Boingo Wireless
9. Infospace Mobile Division
10. Telemac
11. Fiserv - Chase Credit System
12. Frontera
13. Americast
14. UCLA Computer Science Dept
15. Hughes (Aircraft) Research Lab
16. Symbolics Inc
17. Protocol Computer Inc


In [92]:
# Same pipeline, but the template also asks for contact details in the answer.
question = "Please list the name who has both Hardware and Software development experience"
template =  """Answer the following question based on these context and provide the name, email and phone number\n"""
step_by_step_fusion(question, template)


Name: Sam C. Yang
Email: samcyang2004@yahoo.com
Phone Number: (818) 929-9292


In [None]:
#cleanup the vectordb
def clean_vectorstore():
    """Delete every entry of the "langchain" Chroma collection, printing each one first.

    Opens its own handle on the collection, so it does not depend on a
    notebook-level ``vectorstore`` variable being defined.

    Returns:
        None.
    """
    embeddings = OpenAIEmbeddings()
    # Bind the handle to a local name and use that same name below (the earlier
    # version assigned to a misspelled name and then used a global instead).
    store = Chroma("langchain", embeddings)
    # NOTE(review): presumably this handle sees the same in-process collection
    # that the indexing cells filled - confirm for the Chroma client in use.
    ids = store.get().get('ids') or []
    print("ids ", ids)
    for doc_id in ids:
        print("docs ", store.get(doc_id))
        print(doc_id)
        store.delete(doc_id)


In [None]:
# Embedding and Indexing Example 
# 
# What this section does: embeds a question and a document, then compares them by cosine similarity.
# Documents
# Toy query/document pair for the embedding demo.
# Note: this reassigns the notebook-level `question` used by earlier cells.
question = "What kinds of pets do I like?"
document = "My favorite pet is a cat."
import tiktoken

def num_tokens_from_string(string: str, encoding_name: str) -> int:
    """Count the tokens the named tiktoken encoding produces for ``string``."""
    return len(tiktoken.get_encoding(encoding_name).encode(string))

# Token count of the question under the cl100k_base encoding (the value is not
# displayed because this is not the cell's last expression).
num_tokens_from_string(question, "cl100k_base")

from langchain_openai import OpenAIEmbeddings
# Embed both texts with the same embedding object so the vectors can be compared.
embd = OpenAIEmbeddings()
query_result = embd.embed_query(question)
document_result = embd.embed_query(document)
# Length of the embedding returned for the question.
len(query_result)

import numpy as np

def cosine_similarity(vec1, vec2):
    """Return the cosine of the angle between two vectors.

    Args:
        vec1: First vector (any 1-D array-like accepted by numpy).
        vec2: Second vector, same length as ``vec1``.

    Returns:
        ``dot(vec1, vec2) / (norm(vec1) * norm(vec2))``. When either vector has
        zero norm the direction is undefined and 0.0 is returned, instead of
        the ``nan`` (plus RuntimeWarning) that dividing 0 by 0 would produce.
    """
    denominator = np.linalg.norm(vec1) * np.linalg.norm(vec2)
    if denominator == 0:
        return 0.0
    return np.dot(vec1, vec2) / denominator

# Compare the question embedding with the document embedding and print the score.
similarity = cosine_similarity(query_result, document_result)
print("Cosine Similarity:", similarity)