In [4]:
import os
from dotenv import load_dotenv

# Load variables from a local .env file (if one exists) into the process environment.
load_dotenv()
# Name of the Ollama model; the cells below pass it to both the LLM and the embeddings.
MODEL = "llama3"

In [30]:
#@ Importing libraries
# from langchain_chroma import Chroma
from langchain_community.vectorstores import Chroma
from langchain_core.output_parsers import StrOutputParser
from langchain_community.document_loaders import PyPDFLoader
from langchain_core.runnables import RunnablePassthrough
from langchain_text_splitters import RecursiveCharacterTextSplitter

In [32]:

# Use a local Llama model served by Ollama instead of an OpenAI-hosted model.
# Both classes come from langchain_community: the old `langchain.llms` path is
# a deprecated re-export of the same class.
from langchain_community.llms import Ollama
from langchain_community.embeddings import OllamaEmbeddings

# The same model generates the answers and embeds the text chunks / queries.
model = Ollama(model=MODEL)
embeddings = OllamaEmbeddings(model=MODEL)

### Create Vector Store using ChromaDB
First we load the PDF document using ```PyPDFLoader```. After loading, we have to generate embeddings for the document's content so that it can be compared with the question when selecting the passages that provide relevant context. To do that, we first split each loaded document into chunks using the ```RecursiveCharacterTextSplitter```. Then we represent each chunk using ```OllamaEmbeddings```, which uses the ```llama3``` model. Once the embedding vector for each chunk has been generated, it is stored in a database (here we use a local ChromaDB) called the vector store.

### Load the document

In [34]:
# Read the source PDF into LangChain documents.
PDF_PATH = "introduction-to-natural-language-processing.pdf"
loader = PyPDFLoader(PDF_PATH)
docs = loader.load()

# Output parser used as the last step of the chain to get a plain string.
parser = StrOutputParser()

### Split into chunks

In [38]:
# Split the loaded documents into chunks of at most 500 characters, with a
# 20-character overlap between neighbouring chunks.
text_splitter = RecursiveCharacterTextSplitter(chunk_size = 500, chunk_overlap = 20)
text_chunks = text_splitter.split_documents(docs)

#@ Vector store
PERSIST_DIR = "db/vectorstore"

# Open whatever is already persisted on disk first. Calling from_documents on
# every run would re-embed the whole PDF and append duplicate chunks to the
# persisted collection, so the index is only built when the store is empty.
# Delete PERSIST_DIR to force a rebuild (e.g. after changing the PDF or MODEL).
vectorstore = Chroma(embedding_function=embeddings, persist_directory=PERSIST_DIR)
if not vectorstore.get(limit=1)["ids"]:
    vectorstore = Chroma.from_documents(text_chunks, embedding=embeddings, persist_directory=PERSIST_DIR)
    # Kept from the original cell; NOTE(review): presumably only needed for
    # older chromadb releases that do not persist automatically - confirm.
    vectorstore.persist()

We can test the vector store by calling its similarity_search method with a query, as below. As you can see, we retrieve a list of four documents related to the question. Note that each document has several fields, namely page_content and metadata.

In [39]:
# Sanity check: fetch the stored chunks most similar to the query.
# The returned list of Document objects is displayed as the cell output.
vectorstore.similarity_search("What is natural language processing")

[Document(page_content='CO3354 Introduction to natural language processing\nannotation, it is good practice to use multiple annotators for at least a sample of\nthe corpus and report the level of inter-annotator agreement that was\nachieved.\n2. Some uses of corpora include:\nLexicography (compiling dictionaries).\nCompiling grammars for education and reference purposes.\nStylistics: developing techniques to identify the author or genre of a\ndocument; investigating the effect on language use of different channels', metadata={'page': 47, 'source': 'introduction-to-natural-language-processing.pdf'}),
 Document(page_content='CO3354 Introduction to natural language processing\nYour turn . You may also ﬁnd it useful to attempt some of the exercises provided at\nthe end of each chapter.\nFrom this chapter onwards you will be running Python sessions and using the NLTK.\nYou should get into the habit of starting sessions with the following commands:\n>>> from __future__ import division\n>>> i

### Retrieve and generate relevant snippets from the documents

In [40]:
# PromptTemplate lives in langchain_core; importing it from the `langchain`
# root package is a deprecated shortcut.
from langchain_core.prompts import PromptTemplate
# NOTE(review): `context` is not used in this cell and looks like an accidental
# auto-import; kept in case another cell relies on the name - confirm and drop.
from multiprocessing import context

# Prompt that asks the model to answer from the supplied context only and to
# reply "I don't know" when that context is not relevant.
template = """
Answer the question based on the context below. If the context is not relevant, just reply as "I don't know"

context: {context}

question: {question}
"""

# from_template derives the input variables ({context}, {question}) from the
# template text, so it does not depend on the constructor inferring them.
prompt = PromptTemplate.from_template(template)

# Retriever view of the vector store: given a question, returns similar chunks.
retriever = vectorstore.as_retriever()

def format_docs(docs):
    """Concatenate the text of the given documents into one context string.

    Args:
        docs: Iterable of objects exposing a ``page_content`` string attribute.

    Returns:
        The ``page_content`` values joined by a blank line; an empty string
        when ``docs`` is empty.
    """
    separator = "\n\n"
    page_texts = [document.page_content for document in docs]
    return separator.join(page_texts)

# Inputs for the prompt, both derived from the question passed to the chain:
# "context" runs the retriever and flattens the hits with format_docs, while
# "question" forwards the input unchanged.
rag_inputs = {
    "context": retriever | format_docs,
    "question": RunnablePassthrough(),
}

# Full pipeline: build the prompt inputs, fill the prompt, call the model,
# then parse the model output with the string parser.
chain = rag_inputs | prompt | model | parser


In [42]:
# Run the whole chain on one question; the answer is shown as the cell output.
chain.invoke("what is brown corpus")

'Based on the context, I can answer that the Brown Corpus refers to a text corpus or collection of texts used for linguistic analysis and statistical modeling. It appears that this particular corpus contains two documents, one from an HTML file and another from a PDF file ("Guide to data protection").'