# MVP

Startup: Isearch

Done by: Sebastian Sarasti

This notebook aims to be a guide of how to build a elastic search engine to search into different documents.

## Data loading

In [1]:
from langchain.vectorstores import Chroma

Load all pdfs from the data folder, show a progress bar, and use all threads available

In [2]:
from langchain.document_loaders import PyPDFLoader, DirectoryLoader

In [3]:
loader = DirectoryLoader('../data/', '**/*.pdf', loader_cls=PyPDFLoader, use_multithreading=True)
docs = loader.load()

In [13]:
from langchain.document_loaders import Docx2txtLoader

In [16]:
word_loader = DirectoryLoader('../data/', '**/*.docx', loader_cls = Docx2txtLoader, use_multithreading=True)
# word_loader = Docx2txtLoader("../data/carpeta3/HORARIOS PARA EL RIEGO DE AGUA megaproyecto ( reformado).docx")
word_docs = word_loader.load()

In [17]:
word_docs

[Document(page_content='JUNTA DE AGUA DE RIEGO Y /O DRENAJE “JESÚS DEL GRAN PODER”\n\nSAN JUAN - PUJILÍ - COTOPAXI\n\n\n\nHORARIOS PARA EL RIEGO DE AGUA POR ASPERSIÓN DEL MEGAPROYECTO\n\n\n\nPRIMER  TURNO ( SABADO) \n\nDE 06H00 HASTA 12H00\n\n\n\n                         \n\n1\n\nLAVERDE AMORES ANTONIO JAVIER\n\n2\n\nHEREDEROS DE GUANO SARATE CARLOS GUILLERMO\n\n3\n\nCHIGUANO SANDOVAL PAOLA  NATALY\n\n4\n\nCHIGUANO SANDOVAL PAOLA  NATALY\n\n5\n\nCHUGCHILAN TIPANTUÑA REIMUNDO\n\n6\n\nAMORES TOSCANO SABINA DE JESÚS\n\n7\n\nHEREDEROS  DE  TROYA AMORES LUIS FERNANDO\n\n8\n\nROMERO GUEVARA AIDA MARÍA\n\n9\n\nHEREDEROS DE ROMERO CALERO SEGUNDO LUCIANO\n\n10\n\nLEMA CHASILOA LUIS GUILLERMO\n\n11\n\nHEREDEROS  DE  TROYA AMORES LUIS FERNANDO\n\n12\n\nROMERO LOVARO RAFAEL MARÍA\n\n13\n\nMONJE NETO JORGE ALCIDES\n\n14\n\nBEDON LOZADA ZOILA EMERITA\n\n15\n\nVACA CÁRDENAS MARÍA\n\n16\n\nBEDON LOZADA ZOILA EMERITA\n\n17\n\nBEDON LOZADA ZOILA EMERITA\n\n18\n\nCALERO GUANO SEGUNDO JORGE\n\n19\n\nROMER

## Text splitting

Once data has been loaded, it has to be splitted into data chunks to be considered useful in the LLM.

In [4]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

In [5]:
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size = 300,
    chunk_overlap  = 100,
    length_function = len,
    add_start_index = True
)

In [6]:
texts = text_splitter.split_documents(docs)

## Text embeddings

In this part, the chunks are going to be created into embeddings. 

In [7]:
from langchain.embeddings import SentenceTransformerEmbeddings

In [8]:
embedding = SentenceTransformerEmbeddings(model_name = 'intfloat/multilingual-e5-base')

  from .autonotebook import tqdm as notebook_tqdm


## Vector database

In [9]:
from langchain.vectorstores import Chroma

Generate the DB through the embedding

In [10]:
NAME_VECTOR_STORE = 'embeddings-mvp'
# vectorstore = Chroma.from_documents(documents=texts, embedding=embedding, persist_directory=NAME_VECTOR_STORE)

Save the embedding in the disk

In [11]:
# vectorstore.persist()

Load vector store from the DB

In [12]:
vectorstore = Chroma(persist_directory=NAME_VECTOR_STORE, embedding_function=embedding)

## Retrieval object

This object helps to get objects that are most likely to answer a question

In [13]:
from langchain.chains import RetrievalQAWithSourcesChain
from langchain.retrievers.document_compressors import EmbeddingsFilter
from langchain.retrievers import ContextualCompressionRetriever

In [14]:
embeddings_filter = EmbeddingsFilter(embeddings=embedding, similarity_threshold=0.75)
retriever = vectorstore.as_retriever()

In [15]:
compression_retriever = ContextualCompressionRetriever(base_compressor=embeddings_filter, base_retriever=retriever)

In [16]:
query = "En donde se aprueba la renovacion de la licencia de Encode SA"
documents = compression_retriever.get_relevant_documents(query)

## LLM integration

In [17]:
from langchain.chat_models import ChatOpenAI

In [18]:
# load openai key
with open('../openaikey.txt', "r") as file:
    # Read the contents of the file
    OPENAI_KEY = file.read()

In [19]:
llm = ChatOpenAI(openai_api_key=OPENAI_KEY, model_name='gpt-3.5-turbo', temperature=0, max_tokens=250)

Define an schema for the conversation

In [20]:
from langchain.schema import SystemMessage, HumanMessage, AIMessage

In [21]:
def message_processing(input):
    message = [
    SystemMessage(content='Hello, I am a chatbot of the company ... localizada en Argentina. Mi objetivo es ayudarte a encontrar información en tus documentos disponibles.'),
    HumanMessage(content=input),
    ]
    res = llm(message)
    return res

In [22]:
query = "En donde se aprueba la renovacion de la licencia de ..., por favor dame documentos de soporte"

In [23]:
message_processing(query)

AIMessage(content='Lamentablemente, como chatbot, no tengo acceso directo a los documentos de Encode S.A. ni puedo buscar información específica sobre la renovación de licencias de la empresa. Te recomendaría contactar directamente a Encode S.A. para obtener la información que necesitas.', additional_kwargs={}, example=False)

Define an structure to get the source

In [24]:
from langchain.chains import RetrievalQAWithSourcesChain

In [25]:
from langchain.chains import RetrievalQAWithSourcesChain
from langchain.llms import OpenAI

In [26]:
chain = RetrievalQAWithSourcesChain.from_chain_type(OpenAI(temperature=0,  openai_api_key = OPENAI_KEY), chain_type="stuff", retriever= compression_retriever)

In [27]:
answer_1=chain({"question": query})

In [28]:
answer_1

{'question': 'En donde se aprueba la renovacion de la licencia de Encode SA, por favor dame documentos de soporte',
 'answer': ' La renovación de la licencia de Encode SA se aprueba por la Resolución Nº 946/2021 de la Secretaria de Innovación Pública.\n',
 'sources': '..\\data\\carpeta2\\ENCODE Manual de Procedimientos de Certificacion Publico v1.12.pdf, ..\\data\\carpeta3\\renovacion-de-la-licencia-del-certificador-licenciado-encodesa.pdf, https://www.boletinoficial.gob.ar/#!DetalleNorma/268174/20220808'}

In [29]:
sources = answer_1['sources']
sources = sources.split(',')
sources = [i.strip() for i in sources]

In [30]:
metadata_final = []

for source in sources:
    for document in documents:
        if source in document.metadata['source']:
            metadata_final.append({'document': source, 'page': document.metadata['page']})


In [31]:
metadata_final

[{'document': '..\\data\\carpeta2\\ENCODE Manual de Procedimientos de Certificacion Publico v1.12.pdf',
  'page': 11},
 {'document': '..\\data\\carpeta2\\ENCODE Manual de Procedimientos de Certificacion Publico v1.12.pdf',
  'page': 66},
 {'document': '..\\data\\carpeta3\\renovacion-de-la-licencia-del-certificador-licenciado-encodesa.pdf',
  'page': 2}]

In [32]:
def get_document_information_retrieval(query, chain, retriever):
    answer_1 = chain({"question": query})
    final_text = answer_1['answer']
    sources = answer_1['sources']
    sources = sources.split(',')
    sources = [i.strip() for i in sources]
    documents = retriever.get_relevant_documents(query)
    document_info_dict = {}
    document_counter = 1

    for source in sources:
        for document in documents:
            if source in document.metadata['source']:
                if source in document_info_dict:
                    document_info_dict[source]['pages'].append(document.metadata['page'])
                else:
                    document_info_dict[source] = {'document': source, 'pages': [document.metadata['page']]}
                    
    document_info = ', '.join([f"Document #{document_counter} :'{entry['document']}', pages: {', '.join(map(str, entry['pages']))}" for entry in document_info_dict.values()])
    final_response = f"{final_text}\n\nPlease check the following documents for more information: {document_info}."

    return final_response        

In [33]:
a = get_document_information_retrieval(query, chain, compression_retriever)

In [34]:
compression_retriever.get_relevant_documents(query)

[_DocumentWithState(page_content='autoridad de aplicación en el momento del licenciamiento de ENCODE S.A., tomando en cuenta finalmente el\ncompromiso reflejado allí por parte del Certificador Licenciado para subsanar cuestiones de índole\ntécnico/administrativas que en el informe se señalan.', metadata={'page': 1, 'source': '..\\data\\carpeta3\\renovacion-de-la-licencia-del-certificador-licenciado-encodesa.pdf', 'start_index': 2955}, state={'embedded_doc': [-0.005786222405731678, 0.032586727291345596, -0.02060757763683796, 0.01574745401740074, 0.025773899629712105, -0.04173695296049118, -0.032051365822553635, -0.048708826303482056, 0.02789979800581932, -0.007466250564903021, 0.014861345291137695, 0.025983065366744995, 0.07594884186983109, 0.07626070827245712, -0.04439229518175125, -0.033620309084653854, 0.01753144897520542, 0.0099038602784276, 0.05329211801290512, 0.005059891380369663, 0.016598889604210854, -0.03851846978068352, 0.03408048301935196, -0.04176807776093483, 0.03363905474

In [35]:
a

" La renovación de la licencia de Encode SA se aprueba por la Resolución Nº 946/2021 de la Secretaria de Innovación Pública.\n\n\nPlease check the following documents for more information: Document #1 :'..\\data\\carpeta2\\ENCODE Manual de Procedimientos de Certificacion Publico v1.12.pdf', pages: 11, 30, Document #1 :'..\\data\\carpeta3\\renovacion-de-la-licencia-del-certificador-licenciado-encodesa.pdf', pages: 1, 2."