In [1]:
from langchain.document_loaders import DirectoryLoader

# Collect every .txt file matched by the recursive glob under ./docs.
# `documents` is consumed by the text splitter in the following cell.
loader = DirectoryLoader(
    path='./docs',
    glob="**/*.txt",
)
documents = loader.load()


In [2]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Cut the loaded documents into overlapping chunks (chunk_size=512,
# chunk_overlap=64); `texts` is what gets embedded into the vector store.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_overlap=64,
    chunk_size=512,
)
texts = text_splitter.split_documents(documents)

In [3]:
from langchain.embeddings import HuggingFaceEmbeddings

# Embedding model that is handed to the vector store when it is built.
embeddings = HuggingFaceEmbeddings(
    model_name="sentence-transformers/all-MiniLM-L6-v2",
)

In [4]:
import os

from langchain.vectorstores import Chroma

# Chroma.from_documents() adds to whatever collection already lives under
# persist_directory, so re-running this cell against an existing "db_local"
# would index every chunk again and let the retriever return duplicate
# passages. Reuse the on-disk store when it is already populated and only
# build it otherwise.
# NOTE: delete the "db_local" directory to force a rebuild after the contents
# of ./docs change.
PERSIST_DIR = "db_local"

if os.path.isdir(PERSIST_DIR) and os.listdir(PERSIST_DIR):
    # Existing index: open it with the same embedding function used to build it.
    db = Chroma(persist_directory=PERSIST_DIR, embedding_function=embeddings)
else:
    db = Chroma.from_documents(texts, embeddings, persist_directory=PERSIST_DIR)
    # Flush to disk explicitly instead of relying on interpreter teardown.
    db.persist()

In [5]:
from langchain.llms import GPT4All

# Path to the local model weights file, loaded with the "gptj" backend.
model_path = "./models/ggml-gpt4all-j-v1.3-groovy.bin"

llm = GPT4All(
    model=model_path,
    backend="gptj",
    n_ctx=2048,
    verbose=False,
)

Found model file at  ./models/ggml-gpt4all-j-v1.3-groovy.bin
gptj_model_load: loading model from './models/ggml-gpt4all-j-v1.3-groovy.bin' - please wait ...
gptj_model_load: n_vocab = 50400
gptj_model_load: n_ctx   = 2048
gptj_model_load: n_embd  = 4096
gptj_model_load: n_head  = 16
gptj_model_load: n_layer = 28
gptj_model_load: n_rot   = 64
gptj_model_load: f16     = 2
gptj_model_load: ggml ctx size = 5401.45 MB
gptj_model_load: kv self size  =  896.00 MB
gptj_model_load: ................................... done
gptj_model_load: model size =  3609.38 MB / num tensors = 285


In [7]:
from langchain.chains import RetrievalQA

# Question-answering chain over the Chroma store. The retriever is asked for
# k=2 chunks per query, and the retrieved source documents are returned
# alongside the answer so the retrieval step can be inspected.
qa = RetrievalQA.from_chain_type(
    llm=llm,
    chain_type="stuff",
    return_source_documents=True,
    verbose=False,
    retriever=db.as_retriever(search_kwargs=dict(k=2)),
)

The block below verifies that we are reading data from the vector database. It relies on a verification.txt file, placed in the ./docs folder, that contains a keyword. Judging from the response, a colon appears to be inserted into the keyword for some reason, so there may be a parsing issue. Also, the 7B local model does not perform very well.

In [8]:
# Note the limitations visible in the response: it looks like the parsing put
# a colon in the keyword, and the local model's answer (' UMD') is not the
# keyword contained in the retrieved source document.
res = qa(
    "What is the UMD20 verification keyword? "
    "Extract it from the text. "
    "Print only the keyword and nothing else."
)
print(res)

 UMD
{'query': 'What is the UMD20 verification keyword? Extract it from the text. Print only the keyword and nothing else.', 'result': ' UMD', 'source_documents': [Document(page_content='UMD20 verification keyword is 213sd834d', metadata={'source': 'docs/verification.txt'}), Document(page_content='provider will need to include 9-digit UID number on the check. The check can then be mailed in or dropped off in person. If the check is written to both the Student and The University of Maryland: The student will first need to endorse the check (sign the back of the check on the endorsement line). The student/scholarship provider will also need to include 9-digit UID number on the check. The check can then be mailed in or dropped off in person. If the check is written just to the Student: The student will', metadata={'source': 'docs/bursar.txt'})]}
