In [0]:
# Install the third-party packages used by this notebook.
# `-q` keeps pip quiet, `-U` upgrades to the newest release; the lines using
# `==` pin an exact version instead.
!pip install -qU openai
!pip install -qU langchain 
!pip install -qU langchain-openai
!pip install -qU InstructorEmbedding
!pip install -qU transformers
!pip install -qU chromadb
!pip install -qU faiss-cpu
!pip install -qU tiktoken
!pip install -q pydantic==1.10.9  #https://stackoverflow.com/questions/76934579/pydanticusererror-if-you-use-root-validator-with-pre-false-the-default-you
!pip install -q urllib3==1.26.18
!pip install -q requests==2.28.1
!pip install -qU SQLAlchemy
#!pip install -qU docarray
#!pip install -qU python-docx
!pip install -qU pypdf
!pip install -qU docx2txt
#!pip install -qU unstructured[pdf]
!pip install -qU lark

# NOTE(review): presumably restarts the Python process so the packages
# installed above become importable in the following cells - confirm against
# the Databricks documentation.
dbutils.library.restartPython()

In [0]:
import os
import sys 
import glob
from pathlib import Path
from IPython.display import display, Markdown
import warnings
warnings.filterwarnings('ignore')
import pandas as pd
import tiktoken
#from funcy import lcat, lmap, linvoke

## Langchain LLM Objects
import openai
from langchain.llms import OpenAI
from langchain_openai import AzureOpenAI
from langchain.chat_models import ChatOpenAI
from langchain_openai import AzureChatOpenAI

## Langchain Prompt Templates
from langchain.prompts import PromptTemplate
from langchain.prompts import ChatPromptTemplate

## Langchain Chains 
#from langchain.chains import ConversationChain
#from langchain.chains import LLMChain
#from langchain.chains import ConversationChain
from langchain.chains import RetrievalQA
from langchain.chains import ConversationalRetrievalChain
#from langchain.chains.mapreduce import MapReduceChain
#from langchain.chains.summarize import load_summarize_chain
from langchain.chains.question_answering import load_qa_chain
#from langchain.chains import SimpleSequentialChain
#from langchain.chains import SequentialChain
#from langchain.chains.router import MultiPromptChain
#from langchain.chains.router.llm_router import LLMRouterChain, RouterOutputParser

## Langchain Output Parsers 
#from langchain.output_parsers import ResponseSchema
#from langchain.output_parsers import StructuredOutputParser

## Langchain Memory 
#from langchain.memory import ConversationBufferMemory
#from langchain.memory import ConversationBufferWindowMemory
#from langchain.memory import ConversationTokenBufferMemory
#from langchain.memory import ConversationSummaryBufferMemory

## Langchain Text Splitters
from langchain.text_splitter import CharacterTextSplitter
from langchain.text_splitter import RecursiveCharacterTextSplitter
#from langchain.text_splitter import TokenTextSplitter
#from langchain.text_splitter import MarkdownHeaderTextSplitter

## Langchain Document Object and Loaders
#from docx import Document
from langchain.docstore.document import Document
from langchain.schema import Document as LangchainDocument
from langchain.document_loaders import CSVLoader
from langchain.document_loaders import PyPDFLoader
from langchain.document_loaders import Docx2txtLoader
from langchain.document_loaders import TextLoader
from langchain.document_loaders import DirectoryLoader

## Langchain Vector Databases
#from langchain.vectorstores import DocArrayInMemorySearch
#from langchain.vectorstores.base import VectorStore
from langchain.vectorstores import Chroma
from langchain.vectorstores import FAISS

## Langchain  Embedding Models
#from langchain.embeddings import OpenAIEmbeddings
from langchain.embeddings import HuggingFaceInstructEmbeddings
from InstructorEmbedding import INSTRUCTOR

## Langchain retrievers
from langchain.retrievers.self_query.base import SelfQueryRetriever ## Error
from langchain.chains.query_constructor.base import AttributeInfo
from langchain.retrievers import ContextualCompressionRetriever
from langchain.retrievers.document_compressors import LLMChainExtractor
from langchain.retrievers import SVMRetriever
from langchain.retrievers import TFIDFRetriever

# Point the TRANSFORMERS_CACHE environment variable at the workspace Models folder.
# NOTE(review): this runs after the Hugging Face related imports above; confirm
# the libraries read the variable lazily, otherwise it has to be set earlier.
os.environ['TRANSFORMERS_CACHE'] = "/Workspace/ds-academy-research/Models/"

# --- Azure OpenAI connection settings ----------------------------------------
azure_endpoint = "https://rg-rbi-aa-aitest-dsacademy.openai.azure.com/"
#azure_endpoint = "https://chatgpt-summarization.openai.azure.com/"
deployment_name = "model-gpt-35-turbo"
openai_model_name = "gpt-35-turbo"

# Module-level settings on the `openai` package; the key is read from the
# environment and a missing OPENAI_API_KEY raises KeyError right here.
openai.api_type = "azure"
openai.api_version = "2023-07-01-preview"
openai.api_key = os.environ["OPENAI_API_KEY"]

# Completion-style LLM wrapper (no deployment is given for this one).
client = AzureOpenAI(
    azure_endpoint=azure_endpoint,
    api_version=openai.api_version,
    api_key=openai.api_key,
)

# Chat model bound to `deployment_name`; this is the LLM passed to the chains
# built further down in the notebook.
chat = AzureChatOpenAI(
    deployment_name=deployment_name,
    azure_endpoint=azure_endpoint,
    openai_api_version=openai.api_version,
    openai_api_type=openai.api_type,
    openai_api_key=os.environ["OPENAI_API_KEY"],
)

### Examining files in the examples folder  
It may be necessary to change the default folder for your documents.

In [0]:
# Folder that holds the CV PDFs; edit this path to point at your own documents.
fullpath = "/Workspace/ds-academy-research/Docs/Test_CVs/"

# Alphabetically ordered names of the PDF files found directly in that folder.
docs = sorted(name for name in os.listdir(fullpath) if name.endswith(".pdf"))

# One file name per output line.
for pdf_name in docs:
    print(pdf_name)

In [0]:
# Load every PDF found in `fullpath` and collect the loader output in one flat
# list (`documents`); the closing message reports the list length as pages.
documents = []

# os.listdir returns entries in arbitrary order, so sort the listing to keep
# the ingestion order (and everything derived from it) reproducible.
for filename in sorted(os.listdir(fullpath)):
    if not filename.endswith('.pdf'):
        continue
    print(f"Ingesting document {filename}")
    # os.path.join works whether or not `fullpath` ends with a path separator;
    # plain string concatenation silently builds a wrong path when it does not.
    pdf_path = os.path.join(fullpath, filename)
    loader = PyPDFLoader(pdf_path)
    documents.extend(loader.load())

print(f"We have {len(documents)} pages from all the pdf files")

In [0]:
# Split the loaded documents into overlapping chunks of at most 1000
# characters (length measured with `len`), with 200 characters of overlap.
#
# The separators are tried in order: paragraph break, line break, sentence
# end, space, and finally the empty string.
# The sentence separator is written as the plain string ". ": no regex flag is
# passed to the splitter, so separators are matched literally, and the former
# "\. " (an invalid escape sequence that Python keeps as backslash + dot +
# space) could never occur in ordinary text.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=200,
    separators=["\n\n", "\n", ". ", " ", ""],
    length_function=len,
)
chunked_documents = text_splitter.split_documents(documents)

# Number of chunks produced.
print(len(chunked_documents))

In [0]:
# Embedding model used for both indexing and querying: the
# "hkunlp/instructor-xl" model with a custom instruction prefix for queries.
# Alternative kept for reference (Azure OpenAI embeddings):
#openai_embeddings = OpenAIEmbeddings(deployment="model-text-embedding-ada-002", chunk_size = 1)
instruct_embeddings = HuggingFaceInstructEmbeddings(
    model_name="hkunlp/instructor-xl",
    query_instruction="Represent the query for retrieval: ",
)

In [0]:
# (Re)build the FAISS index from the chunked documents and persist it to the
# VectorDB_CVs folder.

# Build the new index first: if embedding fails, the index saved by a previous
# run is still on disk instead of having been deleted already.
faissdb = FAISS.from_documents(chunked_documents,
                               embedding=instruct_embeddings,
                               )
#print(f"There are {faissdb.index.ntotal} vectors in the index")

# Remove the files left behind by an earlier run. Only regular files are
# deleted: os.remove raises on directories, so sub-folders are skipped.
files = glob.glob('/Workspace/ds-academy-research/VectorDB_CVs/*')

for f in files:
    if os.path.isfile(f):
        os.remove(f)

faissdb.save_local('/Workspace/ds-academy-research/VectorDB_CVs/')

In [0]:
# Reload the index saved in the VectorDB_CVs folder, handing over the same
# embedding object that was used to build it.
# NOTE(review): some LangChain releases refuse to load the pickled docstore
# unless allow_dangerous_deserialization=True is passed - confirm against the
# installed version before adding it.
docsearch = FAISS.load_local(
    "/Workspace/ds-academy-research/VectorDB_CVs/",
    instruct_embeddings,
)

# Number of entries in the index-position -> docstore-id mapping.
n_entries = len(docsearch.index_to_docstore_id)
print(n_entries)

#### Search using a score function and a maximum number of documents in return

In [0]:
# Ask for the 2 best matches for the query together with their scores.
query = "Python"
result = docsearch.similarity_search_with_score(query, k=2)

# Print every hit followed by one blank line.
for scored_hit in result:
    print(scored_hit, end="\n\n")

#### Addressing Diversity: Maximum marginal relevance

`Maximum marginal relevance` strives to achieve both relevance to the query *and diversity* among the results.

In [0]:
# Same query as before, this time through the maximum-marginal-relevance
# search, again limited to 2 results.
query = "Python"
result = docsearch.max_marginal_relevance_search(query, k=2)

# Print every hit followed by one blank line.
for hit in result:
    print(hit, end="\n\n")

#### Addressing Specificity: working with metadata

To make a search more specific (for example, restricting it to a single source document), many vectorstores support operations on `metadata`.

`metadata` provides context for each embedded chunk.

In [0]:
# Run the same question once per CV, restricting each search to the chunks
# whose `source` metadata equals that CV's file path.
query = "Whom is this curriculum about?"
for cv in docs:
    # Build the path from `fullpath` rather than repeating the folder as a
    # literal, so the filter keeps matching after the documents folder is
    # changed at the top of the notebook.
    source_path = os.path.join(fullpath, cv)
    result = docsearch.similarity_search(query, k=1, filter={"source": source_path})
    for r in result:
        print(r)
        print()


#### [Stuffing Chain](https://medium.com/@minh.hoque/what-are-llm-chains-671b84103ba9)
The Stuffing chain serves as a solution for scenarios where the context length of the LLM is inadequate to handle extensive documents or a substantial amount of information. In such cases, a large document can be divided into smaller segments, and semantic search techniques can be employed to retrieve relevant documents based on the query. These retrieved documents are then “stuffed” into the LLM context, allowing for the generation of a response.

Benefits:

+ Consolidation of multiple documents: The Stuffing chain allows the aggregation of several relevant documents, overcoming the context length limitation of LLMs for large documents.
+ Comprehensive information processing: By leveraging multiple documents, the chain can generate more comprehensive and relevant answers.

Disadvantages:

+ Increased complexity: Retrieving relevant documents requires a good semantic search and a vector database.
+ Potential loss of contextual coherency: Since we are retrieving N documents, the LLM might not have all relevant context to generate a cohesive answer.

Use Cases:

+ Document Retrieval Question Answering: With the Stuffing chain, chunks retrieved from a larger document can be effectively leveraged to provide accurate answers to your questions. For example, suppose you have a lengthy legal document and need to find specific answers to legal questions. By using the Stuffing chain, you can break down the document into smaller chunks, retrieve relevant chunks based on the question, and utilize the information within those chunks to generate accurate answers.
+ Complex Question Answering: When answering complex questions that require information from diverse sources, the Stuffing chain can provide more comprehensive and accurate responses. For instance, imagine you have a research project that requires answering complex scientific queries. The Stuffing chain allows you to divide relevant scientific papers into smaller chunks, retrieve the necessary information from these chunks, and synthesize it to provide a thorough and precise response to the complex question at hand.

In [0]:
# RetrievalQA chain using the "stuff" strategy on top of the FAISS retriever.
qa_chain = RetrievalQA.from_chain_type(
    llm=chat,
    chain_type="stuff",
    # Default retriever settings; use
    # docsearch.as_retriever(search_kwargs={'k': 7}) to change how many
    # chunks are retrieved.
    retriever=docsearch.as_retriever(),
    return_source_documents=True,
)

# Ask one question and show only the answer text of the response.
query = "What is an extreme outlier?"
result = qa_chain(query)
answer = result['result']
print(answer)


#### [Map-Reduce Chain](https://medium.com/@minh.hoque/what-are-llm-chains-671b84103ba9)
The Map-Reduce chain enables the iteration over a list of documents, generating individual outputs for each document, which can later be combined to produce a final result. This chain is useful for tasks that involve processing documents in parallel and then aggregating the outputs. 

Benefits:

+ Parallel processing: The Map-Reduce chain allows for parallel execution of the language model on individual documents, improving efficiency and reducing processing time.
+ Scalability: The chain can handle large collections of documents by distributing the processing load across multiple iterations.
+ Enhanced information extraction: By generating individual outputs for each document, the chain can extract specific information that contributes to a more comprehensive final result.  

Disadvantages:

+ Complexity in output aggregation: Combining the outputs of multiple iterations requires careful handling to ensure coherency and meaningful synthesis.
+ Potential redundancy: In some cases, the individual outputs of the Map-Reduce chain may contain redundant information, necessitating further post-processing steps.

Use Cases:

+ Multiple document summarization: The Map-Reduce chain can be used to generate summaries for many documents and then to combine the singular summaries to create a final comprehensive summary for the whole group of documents. For example, imagine you have a collection of research papers on a particular topic. By employing the Map-Reduce chain, you can generate summaries for each research paper, and finally merge the individual summaries to produce a comprehensive summary that captures the key information from the entire collection of papers. This approach enables efficient and accurate summarization of large volumes of documents.

In [0]:
# RetrievalQA chain using the "map_reduce" strategy on top of the FAISS retriever.
qa_chain = RetrievalQA.from_chain_type(
    llm=chat,
    chain_type="map_reduce",
    # Default retriever settings; use
    # docsearch.as_retriever(search_kwargs={'k': 7}) to change how many
    # chunks are retrieved.
    retriever=docsearch.as_retriever(),
    return_source_documents=True,
)

# Ask one question and show only the answer text of the response.
query = "What is an extreme outlier?"
result = qa_chain(query)
answer = result['result']
print(answer)

#### [Refine Chain](https://medium.com/@minh.hoque/what-are-llm-chains-671b84103ba9)  
The Refine chain focuses on iterative refinement of the output by feeding the output of one iteration into the next, aiming to enhance the accuracy and quality of the final result.

Benefits:

+ Continuous improvement: The Refine chain allows for progressive refinement of the output by iteratively updating and enhancing the information.
+ Enhanced accuracy: By refining the output in each iteration, the chain can improve the accuracy and relevance of the final result.

Disadvantages:

+ Increased computational resources: The iterative nature of the Refine chain may require additional computational resources compared to non-iterative approaches.
+ Longer processing time: Each iteration adds to the overall processing time, which may be a consideration when real-time or near-real-time responses are required.

Use Cases:

+ Long-form text generation: The Refine chain proves exceptionally valuable in the creation of extensive text compositions, such as essays, articles, or stories, where the iterative refinement process greatly enhances coherence and readability. For instance, envision interacting with a substantial research paper and progressively employing the LLM to craft an abstract, refining it with each iteration to achieve an optimal outcome.
+ Answer synthesis: The Refine chain demonstrates its prowess in synthesizing answers derived from multiple sources or generating comprehensive responses. Through iterative refinement, the chain progressively improves the accuracy and comprehensiveness of the final answer. This capability is especially advantageous when each retrieved document contributes crucial context to the answer generation process.

In [0]:
# RetrievalQA chain using the "refine" strategy on top of the FAISS retriever.
qa_chain = RetrievalQA.from_chain_type(
    llm=chat,
    chain_type="refine",
    # Default retriever settings; use
    # docsearch.as_retriever(search_kwargs={'k': 7}) to change how many
    # chunks are retrieved.
    retriever=docsearch.as_retriever(),
    return_source_documents=True,
)

# Ask one question and show only the answer text of the response.
query = "What is an extreme outlier?"
result = qa_chain(query)
answer = result['result']
print(answer)

#### [Map Rerank](https://api.python.langchain.com/en/latest/chains/langchain.chains.combine_documents.map_rerank.MapRerankDocumentsChain.html#)  

Combining documents by mapping a chain over them, then reranking results.
This algorithm calls an LLMChain on each input document. The LLMChain is expected to have an OutputParser that parses the result into both an answer (answer_key) and a score (rank_key). The answer with the highest score is then returned.

In [0]:
# RetrievalQA chain using the "map_rerank" strategy on top of the FAISS retriever.
qa_chain = RetrievalQA.from_chain_type(
    llm=chat,
    chain_type="map_rerank",
    # Default retriever settings; use
    # docsearch.as_retriever(search_kwargs={'k': 7}) to change how many
    # chunks are retrieved.
    retriever=docsearch.as_retriever(),
    return_source_documents=True,
)

# Ask one question and show only the answer text of the response.
query = "What is an extreme outlier?"
result = qa_chain(query)
answer = result['result']
print(answer)

#### Adding Chat History
Now, if we want to take things one step further, we can also make it so that our chatbot will remember any previous questions.

Implementation-wise, all that happens is that on each interaction with the chatbot, all of our previous conversation history, including the questions and answers, needs to be passed into the prompt. That is because the LLM does not have a way to store information about our previous requests, so we must pass in all the information on every call to the LLM.

Fortunately, LangChain provides a class that does this out of the box: the `ConversationalRetrievalChain`, which accepts an extra parameter called `chat_history` containing the list of our previous exchanges with the LLM.

In [0]:
# Conversational variant of the retrieval chain: same chat model and FAISS
# retriever, with the source documents included in every response.
qa_chain = ConversationalRetrievalChain.from_llm(
    llm=chat,
    return_source_documents=True,
    retriever=docsearch.as_retriever(),
)

The chain call accepts `chat_history` as a parameter. We must build up this list ourselves from our conversation with the LLM.  
The chain does not do this out of the box, so after each question and answer we append the pair to a list called `chat_history`, which we pass back into the chain on every call.

In [0]:
# Interactive question/answer loop on top of the conversational chain.
# `chat_history` accumulates (question, answer) tuples and is handed back to
# the chain on every call, since the chain does not keep it by itself.
chat_history = []
exit_commands = {"exit", "quit", "q"}

while True:
    # Ask the user for the next question. Surrounding whitespace is dropped so
    # that " quit " or a stray newline behaves as expected.
    try:
        query = input('Prompt: ').strip()
    except (EOFError, KeyboardInterrupt):
        # No more input available (or the cell was interrupted): leave the
        # loop cleanly instead of ending with a traceback.
        print('Exiting')
        break

    # Give the user a way to leave the loop; matching is case-insensitive.
    if query.lower() in exit_commands:
        print('Exiting')
        break

    # Do not spend an LLM call on an empty prompt.
    if not query:
        continue

    # Send the question together with the history collected so far; the chain
    # receives both under the 'question' and 'chat_history' keys.
    result = qa_chain({'question': query, 'chat_history': chat_history})
    print('Answer: ' + result['answer'])

    # Remember this exchange so the next call can refer back to it.
    chat_history.append((query, result['answer']))

In [0]:
# Display the (question, answer) tuples collected by the chat loop above.
chat_history