In [1]:
from langchain.document_loaders import DirectoryLoader, TextLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import FAISS
from langchain.llms import CTransformers
from langchain.llms import LlamaCpp
from langchain import PromptTemplate
from langchain.chains import RetrievalQA
from langchain.callbacks.manager import CallbackManager
from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler
from IPython.display import display, HTML
from langchain.chains import LLMChain
import json
import time
import pathlib
from langchain_community.document_loaders import WebBaseLoader

### **Consume Information from the Documents (.txt)**

In [None]:
# define what documents to load
loader = DirectoryLoader("./data/", glob="*.txt", loader_cls=TextLoader)

# interpret information in the documents
documents = loader.load()
splitter = RecursiveCharacterTextSplitter()
texts = splitter.split_documents(documents)
embeddings = HuggingFaceEmbeddings(
    model_name="sentence-transformers/all-MiniLM-L6-v2",
    model_kwargs={'device': 'cpu'})

# create and save the local database
db = FAISS.from_documents(texts, embeddings)
db.save_local("faiss")

### **Load data from web**

In [2]:
loader = WebBaseLoader("https://en.wikipedia.org/wiki/Muhammad_Yunus")

In [3]:
data = loader.load()
splitter = RecursiveCharacterTextSplitter()
texts = splitter.split_documents(data)
embeddings = HuggingFaceEmbeddings(
    model_name="sentence-transformers/all-MiniLM-L6-v2",
    model_kwargs={'device': 'cpu'})

# create and save the local database
db = FAISS.from_documents(texts, embeddings)
db.save_local("faiss")

  from .autonotebook import tqdm as notebook_tqdm


In [4]:
data

[Document(page_content='\n\n\n\nMuhammad Yunus - Wikipedia\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nJump to content\n\n\n\n\n\n\n\nMain menu\n\n\n\n\n\nMain menu\nmove to sidebar\nhide\n\n\n\n\t\tNavigation\n\t\n\n\nMain pageContentsCurrent eventsRandom articleAbout WikipediaContact usDonate\n\n\n\n\n\n\t\tContribute\n\t\n\n\nHelpLearn to editCommunity portalRecent changesUpload file\n\n\n\n\n\nLanguages\n\nLanguage links are at the top of the page.\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nSearch\n\n\n\n\n\n\n\n\n\n\n\nSearch\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nCreate account\n\nLog in\n\n\n\n\n\n\n\n\nPersonal tools\n\n\n\n\n\n Create account Log in\n\n\n\n\n\n\t\tPages for logged out editors learn more\n\n\n\nContributionsTalk\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nContents\nmove to sidebar\nhide\n\n\n\n\n(Top)\n\n\n\n\n\n1Early life and education\n\n\n\nToggle Early life and education subsection\n\n\n\n\n\n1.1Ear

#### **Prepare Template**

In [5]:
template = """Use the following pieces of information to answer the user's question.
If you don't know the answer, just say that you don't know, don't try to make up an answer.
Context: {context}
Question: {question}
Only return the helpful answer below and nothing else.
Helpful answer:
"""

#### **load the interpreted information from the local database**

In [6]:
embeddings = HuggingFaceEmbeddings(
    model_name="sentence-transformers/all-MiniLM-L6-v2",
    model_kwargs={'device': 'cpu'})
db = FAISS.load_local("faiss", embeddings)

In [7]:
# prepare a version of the llm pre-loaded with the local content
retriever = db.as_retriever(search_kwargs={'k': 2})

In [8]:
# prompt = PromptTemplate(template=template, input_variables=['context', 'question'])
prompt = PromptTemplate(
    template=template,
    input_variables=['context', 'question'])

In [9]:
# Callbacks support token-wise streaming
callback_manager = CallbackManager([StreamingStdOutCallbackHandler()])

### **Load the Model**

In [10]:
# Make sure the model path is correct for your system!
llm = LlamaCpp(
    # model_path="./llama.cpp/models/quantized_q4_1.gguf",
    model_path="../model/llama-2-7b-chat.gguf.q4_0.bin",
    temperature=0.01,
    max_tokens=1000,
    top_p=1,
    n_ctx=6000,
    # callback_manager=callback_manager,
    # verbose=True,  # Verbose is required to pass to the callback manager
)

llama_model_loader: loaded meta data with 19 key-value pairs and 291 tensors from ../model/llama-2-7b-chat.gguf.q4_0.bin (version GGUF V3 (latest))
llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
llama_model_loader: - kv   0:                       general.architecture str              = llama
llama_model_loader: - kv   1:                               general.name str              = llama-2-7b-chat.ggmlv3.q4_0.bin
llama_model_loader: - kv   2:                        general.description str              = converted from legacy GGJTv3 MOSTLY_Q...
llama_model_loader: - kv   3:                          general.file_type u32              = 2
llama_model_loader: - kv   4:                       llama.context_length u32              = 2048
llama_model_loader: - kv   5:                     llama.embedding_length u32              = 4096
llama_model_loader: - kv   6:                          llama.block_count u32              = 32
llama_model_load

#### **Model Retrieval Chain and Query Function**

In [11]:
QA_LLM = RetrievalQA.from_chain_type(llm=llm,
                                     chain_type='stuff',
                                     retriever=retriever,
                                     return_source_documents=True,
                                     chain_type_kwargs={'prompt': prompt})

def query(model, question):
    # model_path = model.combine_documents_chain.llm_chain.llm.model
    # model_name = pathlib.Path(model_path).name
    time_start = time.time()
    output = model({'query': question})
    response = output["result"]
    time_elapsed = time.time() - time_start
    # display(HTML(f'<code>{model_name} response time: {time_elapsed:.02f} sec</code>'))
    display(HTML(f'<strong>Question:</strong> {question}'))
    display(HTML(f'<strong>Answer:</strong> {response}'))
    # print(output)

#### **Ask your question Here**

In [12]:
query(QA_LLM, "When was Yunus born?")


llama_print_timings:        load time =    6128.37 ms
llama_print_timings:      sample time =      58.56 ms /    24 runs   (    2.44 ms per token,   409.86 tokens per second)
llama_print_timings: prompt eval time = 1399356.23 ms /  1430 tokens (  978.57 ms per token,     1.02 tokens per second)
llama_print_timings:        eval time =   31614.97 ms /    23 runs   ( 1374.56 ms per token,     0.73 tokens per second)
llama_print_timings:       total time = 1433332.32 ms


In [13]:
query(QA_LLM, "Where was Yunus born?")

Llama.generate: prefix-match hit

llama_print_timings:        load time =    6128.37 ms
llama_print_timings:      sample time =      39.19 ms /    15 runs   (    2.61 ms per token,   382.75 tokens per second)
llama_print_timings: prompt eval time =   26923.40 ms /    24 tokens ( 1121.81 ms per token,     0.89 tokens per second)
llama_print_timings:        eval time =   18538.93 ms /    14 runs   ( 1324.21 ms per token,     0.76 tokens per second)
llama_print_timings:       total time =   45758.72 ms


In [14]:
query(QA_LLM, "How many siblings Muhammad Yunus has?")

Llama.generate: prefix-match hit

llama_print_timings:        load time =    6128.37 ms
llama_print_timings:      sample time =    1929.64 ms /  1000 runs   (    1.93 ms per token,   518.23 tokens per second)
llama_print_timings: prompt eval time = 1436161.90 ms /  2214 tokens (  648.67 ms per token,     1.54 tokens per second)
llama_print_timings:        eval time =  989529.69 ms /   999 runs   (  990.52 ms per token,     1.01 tokens per second)
llama_print_timings:       total time = 2442631.69 ms


In [15]:
query(QA_LLM, "What are the contributions of Yunus?")

Llama.generate: prefix-match hit
