In [1]:
from langchain.document_loaders import PyMuPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import Chroma
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.llms import LlamaCpp
from langchain.chains import RetrievalQA

In [5]:
import os
# Enumerate GPUs in PCI bus order (see issue #152) and restrict this
# process to the devices listed in CUDA_VISIBLE_DEVICES.
os.environ.update(
    CUDA_DEVICE_ORDER="PCI_BUS_ID",
    CUDA_VISIBLE_DEVICES="2,3",
)

In [2]:
# Read the source PDF into LangChain documents via PyMuPDF.
pdf_path = "Virtual_characters.pdf"
loader = PyMuPDFLoader(pdf_path)
PDF_data = loader.load()

In [3]:
# Break the loaded PDF documents into small, slightly overlapping chunks
# before they are embedded.
splitter_settings = {"chunk_size": 100, "chunk_overlap": 5}
text_splitter = RecursiveCharacterTextSplitter(**splitter_settings)
all_splits = text_splitter.split_documents(PDF_data)

In [6]:
# Embedding model configuration: a sentence-transformers model name and
# the keyword arguments forwarded to it (here, the device to run on).
model_name = "all-MiniLM-L6-v2"
model_kwargs = {'device': 'cuda'}
embedding = HuggingFaceEmbeddings(
    model_name=model_name,
    model_kwargs=model_kwargs,
)

# Embed every chunk and index it in Chroma. Because a persist_directory
# is supplied, the embeddings are stored on disk under that folder.
persist_directory = 'db'
vectordb = Chroma.from_documents(
    documents=all_splits,
    embedding=embedding,
    persist_directory=persist_directory,
)

In [7]:
from langchain.callbacks.manager import CallbackManager
from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler
from langchain_community.llms import LlamaCpp, GPT4All

# Local GGUF file holding the quantized Llama-2 7B chat weights.
local_path = "llama-2-7b-chat.Q4_K_M.gguf"

# Load the model through llama.cpp and stream generated tokens to stdout.
# NOTE: GPT4All (imported above) is an alternative backend that could be
# constructed here with the same `local_path` and callbacks.
llm = LlamaCpp(
    model_path=local_path,
    n_gpu_layers=-1,  # presumably "offload every layer to the GPU" -- confirm against the LlamaCpp docs
    n_batch=512,
    n_ctx=2048,  # context window used here; the model metadata reports context_length = 4096
    f16_kv=True,
    # `callback_manager=` is deprecated on LangChain LLMs; `callbacks=`
    # accepts the same manager object without the deprecation warning.
    callbacks=CallbackManager([StreamingStdOutCallbackHandler()]),
    verbose=True,
)

ggml_init_cublas: GGML_CUDA_FORCE_MMQ:   no
ggml_init_cublas: CUDA_USE_TENSOR_CORES: yes
ggml_init_cublas: found 2 CUDA devices:
  Device 0: NVIDIA GeForce RTX 3090, compute capability 8.6, VMM: yes
  Device 1: NVIDIA GeForce RTX 3090, compute capability 8.6, VMM: yes
llama_model_loader: loaded meta data with 19 key-value pairs and 291 tensors from llama-2-7b-chat.Q4_K_M.gguf (version GGUF V2)
llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
llama_model_loader: - kv   0:                       general.architecture str              = llama
llama_model_loader: - kv   1:                               general.name str              = LLaMA v2
llama_model_loader: - kv   2:                       llama.context_length u32              = 4096
llama_model_loader: - kv   3:                     llama.embedding_length u32              = 4096
llama_model_loader: - kv   4:                          llama.block_count u32              = 32
llama_model_loade

In [8]:
from langchain.chains import LLMChain
from langchain.chains.prompt_selector import ConditionalPromptSelector
from langchain.prompts import PromptTemplate

# Prompt text used when the model is a LlamaCpp instance. It is spelled out
# line by line so that the significant trailing spaces and indentation of
# the template remain visible in the source.
_LLAMA_TEMPLATE = (
    "<<SYS>> \n"
    "    You are a helpful assistant eager to assist with providing better Google search results.\n"
    "    <</SYS>> \n"
    "    \n"
    "    [INST] Provide an answer to the following question in 150 words. Ensure that the answer is informative, "
    "            relevant, and concise:\n"
    "            {question} \n"
    "    [/INST]"
)

# Single-line fallback prompt for any other model type.
_GENERIC_TEMPLATE = (
    "You are a helpful assistant eager to assist with providing better Google search results. "
    "        Provide an answer to the following question in about 150 words. Ensure that the answer is informative, "
    "        relevant, and concise: "
    "        {question}"
)

DEFAULT_LLAMA_SEARCH_PROMPT = PromptTemplate(
    template=_LLAMA_TEMPLATE,
    input_variables=["question"],
)

DEFAULT_SEARCH_PROMPT = PromptTemplate(
    template=_GENERIC_TEMPLATE,
    input_variables=["question"],
)


def _is_llama_cpp(model):
    """Return True when `model` is a LlamaCpp instance."""
    return isinstance(model, LlamaCpp)


# Choose the Llama-specific prompt for LlamaCpp models and fall back to
# the generic prompt otherwise.
QUESTION_PROMPT_SELECTOR = ConditionalPromptSelector(
    default_prompt=DEFAULT_SEARCH_PROMPT,
    conditionals=[(_is_llama_cpp, DEFAULT_LLAMA_SEARCH_PROMPT)],
)

prompt = QUESTION_PROMPT_SELECTOR.get_prompt(llm)
prompt

PromptTemplate(input_variables=['question'], template='<<SYS>> \n    You are a helpful assistant eager to assist with providing better Google search results.\n    <</SYS>> \n    \n    [INST] Provide an answer to the following question in 150 words. Ensure that the answer is informative,             relevant, and concise:\n            {question} \n    [/INST]')

In [9]:
# Sanity-check the bare LLM (no retrieval) with a general-knowledge question.
question = "What is Taiwan known for?"
llm_chain = LLMChain(llm=llm, prompt=prompt)
chain_input = {"question": question}
llm_chain.invoke(chain_input)

  Taiwan is known for its rich cultural heritage, stunning natural beauty, and vibrant cities. Some of its most famous attractions include the Taroko Gorge, a breathtaking marble canyon; Sun Moon Lake, the largest lake in Taiwan; and the night markets of Taipei, where visitors can try delicious street food and shop for local souvenirs. Taiwan is also famous for its friendly locals, efficient public transportation system, and low cost of living, making it an ideal destination for travelers on a budget. Additionally, Taiwan has a thriving tech industry and is home to many world-renowned companies, earning it the nickname "Asia's Silicon Valley."


llama_print_timings:        load time =     100.72 ms
llama_print_timings:      sample time =      86.30 ms /   155 runs   (    0.56 ms per token,  1796.14 tokens per second)
llama_print_timings: prompt eval time =     100.57 ms /    84 tokens (    1.20 ms per token,   835.21 tokens per second)
llama_print_timings:        eval time =    1419.38 ms /   154 runs   (    9.22 ms per token,   108.50 tokens per second)
llama_print_timings:       total time =    2165.93 ms /   238 tokens


{'question': 'What is Taiwan known for?',
 'text': '  Taiwan is known for its rich cultural heritage, stunning natural beauty, and vibrant cities. Some of its most famous attractions include the Taroko Gorge, a breathtaking marble canyon; Sun Moon Lake, the largest lake in Taiwan; and the night markets of Taipei, where visitors can try delicious street food and shop for local souvenirs. Taiwan is also famous for its friendly locals, efficient public transportation system, and low cost of living, making it an ideal destination for travelers on a budget. Additionally, Taiwan has a thriving tech industry and is home to many world-renowned companies, earning it the nickname "Asia\'s Silicon Valley."'}

In [10]:
# Expose the Chroma store as a retriever and wire it, together with the
# LLM, into a "stuff"-type RetrievalQA chain.
retriever = vectordb.as_retriever()

qa_options = {
    "chain_type": "stuff",
    "retriever": retriever,
    "verbose": True,
}
qa = RetrievalQA.from_chain_type(llm=llm, **qa_options)

In [11]:
# Ask a question whose answer has to come from the indexed PDF.
query = "Tell me about Alison Hawk's career and age"
qa.invoke({"query": query})



> Entering new RetrievalQA chain...
 Alison Hawk is a 28-year

Llama.generate: prefix-match hit


-old female researcher.


llama_print_timings:        load time =     100.72 ms
llama_print_timings:      sample time =       9.52 ms /    18 runs   (    0.53 ms per token,  1890.36 tokens per second)
llama_print_timings: prompt eval time =      65.77 ms /   159 tokens (    0.41 ms per token,  2417.70 tokens per second)
llama_print_timings:        eval time =     155.79 ms /    17 runs   (    9.16 ms per token,   109.12 tokens per second)
llama_print_timings:       total time =     288.46 ms /   176 tokens



> Finished chain.


{'query': "Tell me about Alison Hawk's career and age",
 'result': ' Alison Hawk is a 28-year-old female researcher.'}