In [1]:
import os

from langchain.chains import RetrievalQA
from langchain.document_loaders import DirectoryLoader
from langchain.document_loaders import PyPDFLoader
from langchain.llms import OpenAI
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import Chroma

In [2]:
from langchain.llms import LlamaCpp
from langchain.callbacks.manager import CallbackManager

from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler

In [3]:
# InstructorEmbedding 
from InstructorEmbedding import INSTRUCTOR
from langchain.embeddings import HuggingFaceInstructEmbeddings

  from tqdm.autonotebook import trange


In [4]:
# Base directory of the project; used further down to build the embedding-store path.
root_dir = './'

In [5]:
# Load every PDF directly under ./reports/, parsing each file with PyPDFLoader.
# (For a single plain-text file, TextLoader('single_text_file.txt') can be used
# instead; it would need to be imported first.)
loader = DirectoryLoader('./reports/', glob="./*.pdf", loader_cls=PyPDFLoader)
documents = loader.load()

In [6]:
# Show the metadata of every loaded document to confirm what was picked up.
[doc.metadata for doc in documents]

[{'source': 'reports/2023_GPT4All-J_Technical_Report_2.pdf', 'page': 0},
 {'source': 'reports/2023_GPT4All-J_Technical_Report_2.pdf', 'page': 1},
 {'source': 'reports/2023_GPT4All-J_Technical_Report_2.pdf', 'page': 2},
 {'source': 'reports/2023_GPT4All_Technical_Report.pdf', 'page': 0},
 {'source': 'reports/2023_GPT4All_Technical_Report.pdf', 'page': 1},
 {'source': 'reports/2023_GPT4All_Technical_Report.pdf', 'page': 2}]

In [7]:
# Split the loaded documents into overlapping chunks
# (chunk_size=1000, chunk_overlap=200) ready for embedding.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_overlap=200,
    chunk_size=1000,
)

texts = text_splitter.split_documents(documents)

In [8]:
# Peek at the first chunk to sanity-check the split.
texts[0]

Document(page_content='GPT4All-J: An Apache-2 Licensed Assistant-Style Chatbot\nYuvanesh Anand\nyuvanesh@nomic.aiZach Nussbaum\nzach@nomic.aiBrandon Duderstadt\nbrandon@nomic.ai\nBenjamin M. Schmidt\nben@nomic.aiAdam Treat\ntreat.adam@gmail.comAndriy Mulyar\nandriy@nomic.ai\nAbstract\nGPT4All-J is an Apache-2 licensed chatbot\ntrained over a massive curated corpus of as-\nsistant interactions including word problems,\nmulti-turn dialogue, code, poems, songs, and\nstories. It builds on the March 2023 GPT4All\nrelease by training on a significantly larger\ncorpus, by deriving its weights from the\nApache-licensed GPT-J model rather than the\nGPL-licensed of LLaMA, and by demonstrat-\ning improved performance on creative tasks\nsuch as writing stories, poems, songs and\nplays. We openly release the training data,\ndata curation procedure, training code, and fi-\nnal model weights to promote open research\nand reproducibility. Additionally, we release\nPython bindings and a Chat UI to a qu

In [9]:
# Number of chunks produced by the splitter.
len(texts)

22

In [10]:
import pickle
import faiss
from langchain.vectorstores import FAISS

In [11]:
def store_embeddings(docs, embeddings, sotre_name, path):
    """Build a FAISS vector store from ``docs`` and pickle it under ``path``.

    The store is written to ``<path>/faiss_<sotre_name>.pkl``.

    Args:
        docs: Documents handed to ``FAISS.from_documents``.
        embeddings: Embedding model handed to ``FAISS.from_documents``.
        sotre_name: Store name embedded in the pickle file name. The spelling
            is kept as-is because callers may pass it by keyword.
        path: Directory that receives the pickle file; it is created when it
            does not exist yet.
    """
    pickle_path = f"{path}/faiss_{sotre_name}.pkl"

    # Create the target directory before the (expensive) embedding step so a
    # missing folder neither raises FileNotFoundError in open() nor wastes the
    # work of building the index.
    os.makedirs(os.path.dirname(pickle_path), exist_ok=True)

    vector_store = FAISS.from_documents(docs, embeddings)

    with open(pickle_path, "wb") as f:
        pickle.dump(vector_store, f)

In [12]:
def load_embeddings(sotre_name, path):
    """Load the object pickled at ``<path>/faiss_<sotre_name>.pkl``.

    Args:
        sotre_name: Store name embedded in the pickle file name.
        path: Directory that contains the pickle file.

    Returns:
        Whatever object was pickled into that file.

    Note:
        Unpickling can execute arbitrary code, so only load files that were
        produced by a trusted source.
    """
    pickle_file = f"{path}/faiss_{sotre_name}.pkl"
    with open(pickle_file, "rb") as handle:
        return pickle.load(handle)

In [13]:
from langchain.embeddings import HuggingFaceInstructEmbeddings

# Instructor-XL embedding model, placed on the "mps" device.
instructor_embeddings = HuggingFaceInstructEmbeddings(
    model_kwargs={"device": "mps"},
    model_name="hkunlp/instructor-xl",
)

load INSTRUCTOR_Transformer
max_seq_length  512


In [14]:
# Directory for persisted embedding stores. os.path.join avoids the doubled
# separator ('.//Embedding_store') that plain string formatting produces when
# root_dir already ends with '/'.
Embedding_store_path = os.path.join(root_dir, "Embedding_store")

In [15]:
# NOTE(review): only the first chunk (texts[:1]) is embedded and indexed, so the
# store holds a single document no matter how many chunks the splitter produced.
# This looks like a debugging shortcut. Confirm before switching to the full
# `texts` list: a larger index means more retrieved text is stuffed into each
# prompt, which may not fit the LLM's context window.
db_instructEmbedd = FAISS.from_documents(texts[:1], instructor_embeddings)

  assert torch.sum(attention_mask[local_idx]).item() >= context_masks[local_idx].item(),\


In [16]:
# Expose the FAISS store as a retriever that is asked for k=3 documents per query.
retriever = db_instructEmbedd.as_retriever(search_kwargs={"k": 3})

In [17]:
# Show which search type the retriever is configured with.
retriever.search_type

'similarity'

In [18]:
# Show the search keyword arguments configured on the retriever.
retriever.search_kwargs

{'k': 3}

In [19]:
# Try the retriever on its own before wiring it into a QA chain.
docs = retriever.get_relevant_documents("Who are the authors of GPT4All report?")

In [20]:
# Inspect the first retrieved document.
docs[0]

Document(page_content='GPT4All-J: An Apache-2 Licensed Assistant-Style Chatbot\nYuvanesh Anand\nyuvanesh@nomic.aiZach Nussbaum\nzach@nomic.aiBrandon Duderstadt\nbrandon@nomic.ai\nBenjamin M. Schmidt\nben@nomic.aiAdam Treat\ntreat.adam@gmail.comAndriy Mulyar\nandriy@nomic.ai\nAbstract\nGPT4All-J is an Apache-2 licensed chatbot\ntrained over a massive curated corpus of as-\nsistant interactions including word problems,\nmulti-turn dialogue, code, poems, songs, and\nstories. It builds on the March 2023 GPT4All\nrelease by training on a significantly larger\ncorpus, by deriving its weights from the\nApache-licensed GPT-J model rather than the\nGPL-licensed of LLaMA, and by demonstrat-\ning improved performance on creative tasks\nsuch as writing stories, poems, songs and\nplays. We openly release the training data,\ndata curation procedure, training code, and fi-\nnal model weights to promote open research\nand reproducibility. Additionally, we release\nPython bindings and a Chat UI to a qu

## Creating the Llama 2 LLM

In [21]:
# Callback manager with a single stdout streaming handler; it is passed to the
# LLM so generated text is written to the cell output while the model runs.
callback_manager = CallbackManager([StreamingStdOutCallbackHandler()])


In [22]:
# llama.cpp offload / batching settings.
n_gpu_layers = 1  # With Metal, a value of 1 is sufficient.
n_batch = 512  # Keep within 1..n_ctx; size it to the RAM of your Apple Silicon chip.

# Adjust model_path to wherever the model file lives on your machine.
llm = LlamaCpp(
    model_path="../llama-2-7b-chat.ggmlv3.q4_0.bin",
    n_batch=n_batch,
    n_gpu_layers=n_gpu_layers,
    f16_kv=True,  # Required: leaving this off causes problems after a couple of calls.
    verbose=True,
    callback_manager=callback_manager,
)

llama.cpp: loading model from ../llama-2-7b-chat.ggmlv3.q4_0.bin
llama_model_load_internal: format     = ggjt v3 (latest)
llama_model_load_internal: n_vocab    = 32000
llama_model_load_internal: n_ctx      = 512
llama_model_load_internal: n_embd     = 4096
llama_model_load_internal: n_mult     = 256
llama_model_load_internal: n_head     = 32
llama_model_load_internal: n_head_kv  = 32
llama_model_load_internal: n_layer    = 32
llama_model_load_internal: n_rot      = 128
llama_model_load_internal: n_gqa      = 1
llama_model_load_internal: rnorm_eps  = 1.0e-06
llama_model_load_internal: n_ff       = 11008
llama_model_load_internal: freq_base  = 10000.0
llama_model_load_internal: freq_scale = 1
llama_model_load_internal: ftype      = 2 (mostly Q4_0)
llama_model_load_internal: model size = 7B
llama_model_load_internal: ggml ctx size =    0.08 MB
llama_model_load_internal: mem required  = 3917.73 MB (+  256.00 MB per state)
llama_new_context_with_model: kv self size  =  256.00 MB
ggml_metal_

## Creating the LLM Chain

In [23]:
# Build a "stuff"-type RetrievalQA chain on top of the retriever; the source
# documents are returned alongside each answer so they can be cited.
qa_chain_instrucEmbed = RetrievalQA.from_chain_type(
    llm=llm,
    retriever=retriever,
    chain_type="stuff",
    return_source_documents=True,
)

In [24]:
## Cite sources

import textwrap

def wrap_text_preserve_newlines(text, width=110):
    """Wrap ``text`` to ``width`` columns while keeping its existing line breaks.

    The text is split on newline characters, every resulting line is wrapped
    on its own with ``textwrap.fill``, and the pieces are joined back together
    with newlines. An empty line therefore stays an empty line.

    Args:
        text: The string to wrap.
        width: Maximum line width handed to ``textwrap.fill`` (default 110).

    Returns:
        The wrapped text as a single string.
    """
    return '\n'.join(
        textwrap.fill(segment, width=width) for segment in text.split('\n')
    )

def process_llm_response(llm_response):
    """Print a chain response: the wrapped answer followed by its sources.

    Args:
        llm_response: Mapping with a ``'result'`` string and a
            ``'source_documents'`` iterable whose items expose
            ``metadata['source']``.
    """
    wrapped_answer = wrap_text_preserve_newlines(llm_response['result'])
    print(wrapped_answer)
    print('\nSources:')
    for document in llm_response["source_documents"]:
        origin = document.metadata['source']
        print(origin)

In [25]:
query = 'who are the authors of GPT4all technical report?'

print('-------------------Instructor Embeddings------------------\n')
# Running the chain streams the answer to stdout through the LLM's callback
# handler; process_llm_response then prints the wrapped answer again together
# with its sources, which is why the answer text appears twice in the output.
llm_response = qa_chain_instrucEmbed(query)
process_llm_response(llm_response)

-------------------Instructor Embeddings------------------

 The authors of the GPT4All technical report are Yuvanesh Anand, Zach Nussbaum, Brandon Duderstadt, Benjamin M. Schmidt, Adam Treat, and Andriy Mulyar. The authors of the GPT4All technical report are Yuvanesh Anand, Zach Nussbaum, Brandon Duderstadt, Benjamin
M. Schmidt, Adam Treat, and Andriy Mulyar.

Sources:
reports/2023_GPT4All-J_Technical_Report_2.pdf



llama_print_timings:        load time = 13129.63 ms
llama_print_timings:      sample time =    32.37 ms /    45 runs   (    0.72 ms per token,  1390.00 tokens per second)
llama_print_timings: prompt eval time = 13129.58 ms /   365 tokens (   35.97 ms per token,    27.80 tokens per second)
llama_print_timings:        eval time =  5153.40 ms /    44 runs   (  117.12 ms per token,     8.54 tokens per second)
llama_print_timings:       total time = 18381.38 ms


In [26]:
query = 'What is the topic of the report?'

print('-------------------Instructor Embeddings------------------\n')
# The answer is streamed to stdout while the chain runs and is printed a second
# time, wrapped and followed by its sources, by process_llm_response.
llm_response = qa_chain_instrucEmbed(query)
process_llm_response(llm_response)

-------------------Instructor Embeddings------------------



Llama.generate: prefix-match hit


 The topic of the report is the GPT4All-J chatbot, specifically its features, capabilities, and how it was trained. The topic of the report is the GPT4All-J chatbot, specifically its features, capabilities, and how it was
trained.

Sources:
reports/2023_GPT4All-J_Technical_Report_2.pdf



llama_print_timings:        load time = 13129.63 ms
llama_print_timings:      sample time =    22.25 ms /    29 runs   (    0.77 ms per token,  1303.31 tokens per second)
llama_print_timings: prompt eval time =   520.26 ms /    13 tokens (   40.02 ms per token,    24.99 tokens per second)
llama_print_timings:        eval time =   993.83 ms /    28 runs   (   35.49 ms per token,    28.17 tokens per second)
llama_print_timings:       total time =  1992.34 ms
