In [6]:
# Import deps
from langchain.document_loaders import TextLoader
from langchain.text_splitter import CharacterTextSplitter

In [7]:
# Load the raw text file from disk through TextLoader.
# `docs` is whatever loader.load() returns; the recorded run further down
# reports len(docs) == 1 for this file.
loader = TextLoader('../docs/raw.txt')
docs = loader.load()

In [8]:
# Transform into chunks.
# NOTE(review): chunk_size=10 is far below the chunk lengths reported in the
# recorded output (102, 295 and 229 characters), so the recorded run emits
# "longer than the specified 10" warnings and the limit does not cap chunk
# length. Presumably the tiny value is a deliberate way to keep one chunk per
# paragraph -- confirm before raising it, since a larger value may merge
# short paragraphs.
text_splitter = CharacterTextSplitter(chunk_size=10, chunk_overlap=0)
texts = text_splitter.split_documents(docs)

Created a chunk of size 102, which is longer than the specified 10
Created a chunk of size 295, which is longer than the specified 10
Created a chunk of size 229, which is longer than the specified 10


In [9]:
# Compare the size of docs and texts: first, the number of loaded documents
# before splitting.
len(docs)

1

In [10]:
# Number of chunks produced by the splitter, for comparison with len(docs).
len(texts)

4

In [11]:
# Visualize chunks and doc: print the whole list of loaded documents
# (page_content plus metadata) as it looks before splitting.
print(docs)

[Document(page_content='Batman does not posess any superpowers, instead relying on his intellect, fighting skills, and wealth.\n\nAs a baby, his parents sent him to Earth in a small spaceship moments before Krypton was destroyed in a natural cataclysm. His ship landed in the American countryside, near the fictional town of Smallsville. He was found and adopted by farmers Jonathan and Martha Kent, who named him Clark Kent.\n\nIn her homeland, the island nation of Themiyscira, her official title is Princess Diana of Themiyscira. She is known for her compassion, strength, and commitment to justice, serving as a symbol of female empowerment and equality.\n\nThis new Flash was Barry Allen, a police scientist who gained super-speed when blasted by chemicals after a shelf of them was struck by lightning. He adopted the name The Scarlet Speedster after reading a comic book featuring the Golden Age Flash.', metadata={'source': '../docs/raw.txt'})]


In [12]:
# Print the first chunk so it can be compared with the unsplit document.
print(texts[0])

page_content='Batman does not posess any superpowers, instead relying on his intellect, fighting skills, and wealth.' metadata={'source': '../docs/raw.txt'}


In [13]:
# Import deps
from langchain.embeddings import LlamaCppEmbeddings

In [14]:
# Build the embedding model from the local GGUF file given by model_path.
# The recorded output under this cell is the model-loader log printed while
# this line runs.
embeddings = LlamaCppEmbeddings(model_path="../models/llama-2-7b.Q2_K.gguf")

llama_model_loader: loaded meta data with 19 key-value pairs and 291 tensors from ../models/llama-2-7b.Q2_K.gguf (version GGUF V2)
llama_model_loader: - tensor    0:                token_embd.weight q2_K     [  4096, 32000,     1,     1 ]
llama_model_loader: - tensor    1:           blk.0.attn_norm.weight f32      [  4096,     1,     1,     1 ]
llama_model_loader: - tensor    2:            blk.0.ffn_down.weight q3_K     [ 11008,  4096,     1,     1 ]
llama_model_loader: - tensor    3:            blk.0.ffn_gate.weight q3_K     [  4096, 11008,     1,     1 ]
llama_model_loader: - tensor    4:              blk.0.ffn_up.weight q3_K     [  4096, 11008,     1,     1 ]
llama_model_loader: - tensor    5:            blk.0.ffn_norm.weight f32      [  4096,     1,     1,     1 ]
llama_model_loader: - tensor    6:              blk.0.attn_k.weight q2_K     [  4096,  4096,     1,     1 ]
llama_model_loader: - tensor    7:         blk.0.attn_output.weight q3_K     [  4096,  4096,     1,     1 ]
llama

In [15]:
# Convert langchain docs to str: keep only the text of each chunk, in the
# same order, because the embedding call below takes plain strings rather
# than Document objects.
_texts = [chunk.page_content for chunk in texts]

In [16]:
# Sanity check: the first chunk is still a Document (text plus metadata).
texts[0]

Document(page_content='Batman does not posess any superpowers, instead relying on his intellect, fighting skills, and wealth.', metadata={'source': '../docs/raw.txt'})

In [17]:
# Sanity check: the converted entry is the bare page_content string.
_texts[0]

'Batman does not posess any superpowers, instead relying on his intellect, fighting skills, and wealth.'

In [18]:
# Embed list of texts: a single embed_documents call over all chunk strings.
# NOTE(review): `embedded_texts` is not referenced by any later cell visible
# here; the vector store cell is given `embeddings` itself -- confirm whether
# this result is kept only for inspection.
embedded_texts = embeddings.embed_documents(_texts)


llama_print_timings:        load time =    7670.66 ms
llama_print_timings:      sample time =       0.00 ms /     1 runs   (    0.00 ms per token,      inf tokens per second)
llama_print_timings: prompt eval time =   26874.01 ms /    26 tokens ( 1033.62 ms per token,     0.97 tokens per second)
llama_print_timings:        eval time =       0.00 ms /     1 runs   (    0.00 ms per token,      inf tokens per second)
llama_print_timings:       total time =   26886.47 ms

llama_print_timings:        load time =    7670.66 ms
llama_print_timings:      sample time =       0.00 ms /     1 runs   (    0.00 ms per token,      inf tokens per second)
llama_print_timings: prompt eval time =   67546.42 ms /    74 tokens (  912.79 ms per token,     1.10 tokens per second)
llama_print_timings:        eval time =       0.00 ms /     1 runs   (    0.00 ms per token,      inf tokens per second)
llama_print_timings:       total time =   67567.23 ms

llama_print_timings:        load time =    7670.66 ms
l

In [19]:
# Import deps
from langchain.vectorstores import Chroma

In [20]:
# Create a chroma vectorstore from a list of documents.
# The chunks and the embedding model are handed to Chroma together; the llama
# timing logs recorded under this cell indicate the chunks are embedded again
# here rather than reusing `embedded_texts`.
db = Chroma.from_documents(texts, embeddings)


llama_print_timings:        load time =    7670.66 ms
llama_print_timings:      sample time =       0.00 ms /     1 runs   (    0.00 ms per token,      inf tokens per second)
llama_print_timings: prompt eval time =   24213.46 ms /    26 tokens (  931.29 ms per token,     1.07 tokens per second)
llama_print_timings:        eval time =       0.00 ms /     1 runs   (    0.00 ms per token,      inf tokens per second)
llama_print_timings:       total time =   24220.69 ms

llama_print_timings:        load time =    7670.66 ms
llama_print_timings:      sample time =       0.00 ms /     1 runs   (    0.00 ms per token,      inf tokens per second)
llama_print_timings: prompt eval time =   67042.90 ms /    74 tokens (  905.99 ms per token,     1.10 tokens per second)
llama_print_timings:        eval time =       0.00 ms /     1 runs   (    0.00 ms per token,      inf tokens per second)
llama_print_timings:       total time =   67061.19 ms

llama_print_timings:        load time =    7670.66 ms
l

In [21]:
# Perform similarity search with the query over db and show the best hit.
# The hits are stored under their own name instead of `docs`, so the
# documents loaded from disk at the top of the notebook are not overwritten
# and re-running the splitting cell afterwards still sees its original input.
query = "Who is an orphan here"
similar_docs = db.similarity_search(query, k=1)
similar_docs


llama_print_timings:        load time =    7670.66 ms
llama_print_timings:      sample time =       0.00 ms /     1 runs   (    0.00 ms per token,      inf tokens per second)
llama_print_timings: prompt eval time =    6081.69 ms /     7 tokens (  868.81 ms per token,     1.15 tokens per second)
llama_print_timings:        eval time =       0.00 ms /     1 runs   (    0.00 ms per token,      inf tokens per second)
llama_print_timings:       total time =    6083.71 ms


[Document(page_content='As a baby, his parents sent him to Earth in a small spaceship moments before Krypton was destroyed in a natural cataclysm. His ship landed in the American countryside, near the fictional town of Smallsville. He was found and adopted by farmers Jonathan and Martha Kent, who named him Clark Kent.', metadata={'source': '../docs/raw.txt'})]

In [22]:
# Search for documents using query vector: embed the query explicitly, then
# look up the closest stored chunk by that vector.
# The hits are stored under their own name instead of `docs`, so the
# documents loaded from disk at the top of the notebook are not overwritten
# and re-running the splitting cell afterwards still sees its original input.
query = "Who is an orphan here"
query_vector = embeddings.embed_query(query)
similar_docs_by_vector = db.similarity_search_by_vector(query_vector, k=1)
similar_docs_by_vector


llama_print_timings:        load time =    7670.66 ms
llama_print_timings:      sample time =       0.00 ms /     1 runs   (    0.00 ms per token,      inf tokens per second)
llama_print_timings: prompt eval time =    6174.98 ms /     7 tokens (  882.14 ms per token,     1.13 tokens per second)
llama_print_timings:        eval time =       0.00 ms /     1 runs   (    0.00 ms per token,      inf tokens per second)
llama_print_timings:       total time =    6177.40 ms


[Document(page_content='As a baby, his parents sent him to Earth in a small spaceship moments before Krypton was destroyed in a natural cataclysm. His ship landed in the American countryside, near the fictional town of Smallsville. He was found and adopted by farmers Jonathan and Martha Kent, who named him Clark Kent.', metadata={'source': '../docs/raw.txt'})]

In [23]:
# Import deps
from langchain.chains import LLMChain
from langchain.prompts import PromptTemplate

In [24]:
# Craft prompt template that works best for our LLM.
# Two placeholders are filled in at query time: {context} and {question}.
# The text ends with "Answer:", presumably so that the model's completion
# starts right at the answer.
template = """Use the following pieces of context to answer the question at the end.
If you don't know the answer, just say you don't know, don't try to make up an answer.
{context}
Question: {question}
Answer:"""

In [25]:
# Build the prompt object from the template string. `context` will carry the
# similar document found in the vector db and `question` will carry the query.
prompt = PromptTemplate.from_template(template)
# Show which placeholders were picked up from the template.
prompt.input_variables

['context', 'question']

In [27]:
# Question to answer; used both for retrieval and for the final prompt.
query = "Who is an orphan here?"

In [26]:
# Retrieve the single closest chunk for `query` and keep its text as the
# context that will be inserted into the prompt.
# NOTE(review): the recorded execution counts show this cell ([26]) ran
# before the cell above it ([27]) that sets `query`, so the recorded output
# may come from the earlier `query` value without the trailing "?" --
# re-run top to bottom to confirm.
similar_doc = db.similarity_search(query, k=1)
# similarity_search returns a list; with k=1 the first element is the only hit.
context = similar_doc[0].page_content
context


llama_print_timings:        load time =    7670.66 ms
llama_print_timings:      sample time =       0.00 ms /     1 runs   (    0.00 ms per token,      inf tokens per second)
llama_print_timings: prompt eval time =    6314.89 ms /     7 tokens (  902.13 ms per token,     1.11 tokens per second)
llama_print_timings:        eval time =       0.00 ms /     1 runs   (    0.00 ms per token,      inf tokens per second)
llama_print_timings:       total time =    6317.23 ms


'As a baby, his parents sent him to Earth in a small spaceship moments before Krypton was destroyed in a natural cataclysm. His ship landed in the American countryside, near the fictional town of Smallsville. He was found and adopted by farmers Jonathan and Martha Kent, who named him Clark Kent.'

In [31]:
# Import deps
from langchain.llms import LlamaCpp

In [32]:
# Import LLM: create the text-generation model from the local GGUF file.
# The model_path is the same file path the embedding cell passes to
# LlamaCppEmbeddings.
llm = LlamaCpp(model_path="../models/llama-2-7b.Q2_K.gguf")

llama_model_loader: loaded meta data with 19 key-value pairs and 291 tensors from ../models/llama-2-7b.Q2_K.gguf (version GGUF V2)
llama_model_loader: - tensor    0:                token_embd.weight q2_K     [  4096, 32000,     1,     1 ]
llama_model_loader: - tensor    1:           blk.0.attn_norm.weight f32      [  4096,     1,     1,     1 ]
llama_model_loader: - tensor    2:            blk.0.ffn_down.weight q3_K     [ 11008,  4096,     1,     1 ]
llama_model_loader: - tensor    3:            blk.0.ffn_gate.weight q3_K     [  4096, 11008,     1,     1 ]
llama_model_loader: - tensor    4:              blk.0.ffn_up.weight q3_K     [  4096, 11008,     1,     1 ]
llama_model_loader: - tensor    5:            blk.0.ffn_norm.weight f32      [  4096,     1,     1,     1 ]
llama_model_loader: - tensor    6:              blk.0.attn_k.weight q2_K     [  4096,  4096,     1,     1 ]
llama_model_loader: - tensor    7:         blk.0.attn_output.weight q3_K     [  4096,  4096,     1,     1 ]
llama

In [36]:
# Use LLM to generate an answer from the context: wire the prompt and the
# model into a chain, then run it with the retrieved context and the query.
query_llm = LLMChain(llm=llm, prompt=prompt)
chain_inputs = {"context": context, "question": query}
response = query_llm.run(chain_inputs)

Llama.generate: prefix-match hit

llama_print_timings:        load time =    6227.32 ms
llama_print_timings:      sample time =       4.85 ms /    22 runs   (    0.22 ms per token,  4534.21 tokens per second)
llama_print_timings: prompt eval time =       0.00 ms /     1 tokens (    0.00 ms per token,      inf tokens per second)
llama_print_timings:        eval time =  138846.24 ms /    22 runs   ( 6311.19 ms per token,     0.16 tokens per second)
llama_print_timings:       total time =  138956.16 ms
