In [1]:
from langchain.callbacks.manager import CallbackManager
from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler
from langchain.chains import LLMChain
from langchain.prompts import PromptTemplate
from langchain_community.llms import LlamaCpp

from dotenv import load_dotenv, find_dotenv
import os
from langchain.document_loaders import DirectoryLoader
from langchain.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.chains.question_answering import load_qa_chain
from langchain.vectorstores import Chroma
from langchain.chains import RetrievalQA
from langchain_community.embeddings import HuggingFaceInstructEmbeddings
from langchain_community.embeddings import HuggingFaceEmbeddings

In [2]:
load_dotenv(find_dotenv())
HUGGINGFACE_API_KEY = os.environ.get("HUGGINGFACE_API_KEY")

In [3]:
loader = PyPDFLoader("/media/sf_VMShare/C13_Networking.pdf")

#Load the document by calling loader.load()
pages = loader.load()

print(len(pages))
print(pages[0].page_content[0:500])

print(pages[0].metadata)

100
620Fundamentals of
Networking and
Network Protocols
Imagine that your computer has no connectivity to the Internet. Would we consider such
a computer fully functional? Probably not. Although we take the Internet and network
connectivity for granted, it is a revelation to learn how we got to this point in the first
place. We will do such a historical review at the end of this chapter. First, we will under-
stand the basic elements of networking that allow computers to talk to one another, be
they
{'source': '/media/sf_VMShare/C13_Networking.pdf', 'page': 0}


In [4]:
rsplitter = RecursiveCharacterTextSplitter(chunk_size=250, chunk_overlap=10)
splits = rsplitter.split_documents(pages)
print(len(splits))
print(len(pages))

552
100


In [5]:
# https://medium.com/international-school-of-ai-data-science/implementing-rag-with-langchain-and-hugging-face-28e3ea66c5f7

# Define the path to the pre-trained model you want to use
modelPath = "sentence-transformers/all-MiniLM-l6-v2"

# Create a dictionary with model configuration options, specifying to use the CPU for computations
model_kwargs = {'device':'cpu'}

# Create a dictionary with encoding options, specifically setting 'normalize_embeddings' to False
encode_kwargs = {'normalize_embeddings': False}

# Initialize an instance of HuggingFaceEmbeddings with the specified parameters
embeddings = HuggingFaceEmbeddings(
    model_name=modelPath,     # Provide the pre-trained model's path
    model_kwargs=model_kwargs, # Pass the model configuration options
    encode_kwargs=encode_kwargs # Pass the encoding options
)

  from .autonotebook import tqdm as notebook_tqdm


In [6]:
# Create the vector store
persist_directory = 'docs/chroma/'
vectorDB = Chroma.from_documents(documents=splits,
                                     embedding=embeddings,
                                     persist_directory=persist_directory)
print(vectorDB._collection.count())

840


In [7]:
# Callbacks support token-wise streaming
callback_manager = CallbackManager([StreamingStdOutCallbackHandler()])

# Make sure the model path is correct for your system!
llm = LlamaCpp(
    model_path="/media/sf_VMShare/zephyr-7b-beta.Q2_K.gguf",
    temperature=0,
    max_tokens=2000,
    top_p=1,
    n_ctx=512,
    callback_manager=callback_manager,
    verbose=True,  # Verbose is required to pass to the callback manager
)

llama_model_loader: loaded meta data with 21 key-value pairs and 291 tensors from /media/sf_VMShare/zephyr-7b-beta.Q2_K.gguf (version GGUF V3 (latest))
llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
llama_model_loader: - kv   0:                       general.architecture str              = llama
llama_model_loader: - kv   1:                               general.name str              = huggingfaceh4_zephyr-7b-beta
llama_model_loader: - kv   2:                       llama.context_length u32              = 32768
llama_model_loader: - kv   3:                     llama.embedding_length u32              = 4096
llama_model_loader: - kv   4:                          llama.block_count u32              = 32
llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
llama_model_loader: - kv   6:                 llama.rope.dimension_count u32              = 128
llama_model_loader: - kv   7:                 l

In [8]:
# Build prompt
template = """Use the following pieces of context to answer the question at the end. If you don't know the answer, just say that you don't know, don't try to make up an answer. Use three sentences maximum. Keep the answer as concise as possible. Always say "thanks for asking!" at the end of the answer.
{context}
Question: {question}
Helpful Answer:"""
QA_CHAIN_PROMPT = PromptTemplate.from_template(template)# Run chain
qaChain = RetrievalQA.from_chain_type(
        llm,
        retriever=vectorDB.as_retriever(),
        return_source_documents=True,
        chain_type_kwargs={"prompt": QA_CHAIN_PROMPT}
    )

In [9]:
def AskMe(question):
  result = qaChain({"query": question})
  return result["result"]

In [10]:
# Pass question to the qa_chain
question = "What are major Network Layers?"
print(AskMe(question))

  warn_deprecated(


ValueError: Requested tokens (521) exceed context window of 512