In [100]:
!pip install llama-index transformers



# Setup

#### Download Data

In [101]:
!mkdir -p 'data/paul_graham/'
!wget 'https://raw.githubusercontent.com/jerryjliu/llama_index/main/docs/examples/data/paul_graham/paul_graham_essay.txt' -O 'data/paul_graham/paul_graham_essay.txt'

--2024-02-11 14:04:50--  https://raw.githubusercontent.com/jerryjliu/llama_index/main/docs/examples/data/paul_graham/paul_graham_essay.txt
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 75042 (73K) [text/plain]
Saving to: ‘data/paul_graham/paul_graham_essay.txt’


2024-02-11 14:04:50 (6.32 MB/s) - ‘data/paul_graham/paul_graham_essay.txt’ saved [75042/75042]



#### Load Data

In [102]:
from llama_index import SimpleDirectoryReader

# Point the reader at the download directory and load its contents into
# `documents`, which the index-building cells below consume.
reader = SimpleDirectoryReader(input_dir="./data/paul_graham/")
documents = reader.load_data()

In [103]:
# documents[1]

# Building a QA System with an Open-Source LLM

In [104]:
import os
from getpass import getpass

from llama_index.llms.anyscale import Anyscale
from llama_index import ServiceContext, VectorStoreIndex
from llama_index.embeddings.openai import OpenAIEmbedding
import openai

# Credentials are read from the environment, with an interactive prompt as a
# fallback, instead of being hard-coded: a key written into a notebook is
# exposed to everyone who can read the file or its history.
# Any key that was previously committed here must be treated as compromised
# and revoked with the provider.
ANYSCALE_ENDPOINT_TOKEN = os.environ.get("ANYSCALE_ENDPOINT_TOKEN") or getpass(
    "Anyscale endpoint token: "
)
openai.api_key = os.environ.get("OPENAI_API_KEY") or getpass("OpenAI API key: ")

# Define LLM: Llama-2 70B chat, accessed through Anyscale
llm = Anyscale(
    model="meta-llama/Llama-2-70b-chat-hf",
    api_key=ANYSCALE_ENDPOINT_TOKEN,
)

# Alternative model that can be swapped in:
# model = 'mistralai/Mistral-7B-Instruct-v0.1'

# Define Embedding Model (OpenAI; relies on the API key configured above)
embed_model = OpenAIEmbedding(model="text-embedding-ada-002")

# Bundle the LLM and the embedding model so the index uses both
service_context = ServiceContext.from_defaults(
    llm=llm,
    embed_model=embed_model,
)

# Create index over the loaded documents
index = VectorStoreIndex.from_documents(documents, service_context=service_context)

In [105]:
# Create a query engine over `index` using the default settings.
query_engine = index.as_query_engine()

In [106]:
# Inspect the prompt templates the query engine uses (returned as a dict keyed by prompt name).
query_engine.get_prompts()

{'response_synthesizer:text_qa_template': SelectorPromptTemplate(metadata={'prompt_type': <PromptType.QUESTION_ANSWER: 'text_qa'>}, template_vars=['context_str', 'query_str'], kwargs={}, output_parser=None, template_var_mappings={}, function_mappings={}, default_template=PromptTemplate(metadata={'prompt_type': <PromptType.QUESTION_ANSWER: 'text_qa'>}, template_vars=['context_str', 'query_str'], kwargs={}, output_parser=None, template_var_mappings=None, function_mappings=None, template='Context information is below.\n---------------------\n{context_str}\n---------------------\nGiven the context information and not prior knowledge, answer the query.\nQuery: {query_str}\nAnswer: '), conditionals=[(<function is_chat_model at 0x7ffb89e15b40>, ChatPromptTemplate(metadata={'prompt_type': <PromptType.CUSTOM: 'custom'>}, template_vars=['context_str', 'query_str'], kwargs={}, output_parser=None, template_var_mappings=None, function_mappings=None, message_templates=[ChatMessage(role=<MessageRole.

In [107]:
# Run the question through the query engine; the answer text is read from `response.response` below.
response = query_engine.query("why did paul graham start YC?")

In [108]:
import html

from IPython.display import display, HTML

# The answer is model-generated free text, so escape it before embedding it in
# markup: an unescaped `<`, `>` or `&` would be interpreted as HTML and could
# garble (or inject into) the rendered output. `str()` keeps a missing answer
# rendering as "None", as the plain f-string did.
display(HTML(f'<p style="font-size:20px">{html.escape(str(response.response))}</p>'))

# Building a QA System with an Open-Source LLM and Open-Source Embeddings

In [109]:
from llama_index.embeddings import HuggingFaceEmbedding

# LLM: the 13B Llama-2 chat model, authenticated with the Anyscale token
# defined earlier in the notebook.
llm = Anyscale(
    api_key=ANYSCALE_ENDPOINT_TOKEN,
    model="meta-llama/Llama-2-13b-chat-hf",
)

# Embeddings: BAAI/bge-small-en-v1.5 loaded through the HuggingFace wrapper
embed_model = HuggingFaceEmbedding(model_name="BAAI/bge-small-en-v1.5")

# Bundle the LLM and the embedding model into one service context
service_context = ServiceContext.from_defaults(embed_model=embed_model, llm=llm)

# Build the vector index over the loaded documents with that context
index = VectorStoreIndex.from_documents(
    documents,
    service_context=service_context,
)

In [110]:
# Query engine using the "tree_summarize" response mode with verbose output.
# Streaming is left disabled (add streaming=True to turn it on).
query_engine = index.as_query_engine(verbose=True, response_mode="tree_summarize")

In [111]:
# Inspect the prompt templates of the tree_summarize engine (note the summary template in the output).
query_engine.get_prompts()

{'response_synthesizer:summary_template': SelectorPromptTemplate(metadata={'prompt_type': <PromptType.SUMMARY: 'summary'>}, template_vars=['context_str', 'query_str'], kwargs={}, output_parser=None, template_var_mappings={}, function_mappings={}, default_template=PromptTemplate(metadata={'prompt_type': <PromptType.SUMMARY: 'summary'>}, template_vars=['context_str', 'query_str'], kwargs={}, output_parser=None, template_var_mappings=None, function_mappings=None, template='Context information from multiple sources is below.\n---------------------\n{context_str}\n---------------------\nGiven the information from multiple sources and not prior knowledge, answer the query.\nQuery: {query_str}\nAnswer: '), conditionals=[(<function is_chat_model at 0x7ffb89e15b40>, ChatPromptTemplate(metadata={'prompt_type': <PromptType.CUSTOM: 'custom'>}, template_vars=['context_str', 'query_str'], kwargs={}, output_parser=None, template_var_mappings=None, function_mappings=None, message_templates=[ChatMessag

In [26]:
# Setup Query Engine
# query_engine = index.as_query_engine()

In [34]:
# Optional: uncomment the cell magic below to time a single run of this cell.
# %%timeit -r 1 -n 1

# Ask the question and print the plain-text answer.
response = query_engine.query("why did paul graham start YC?")
print(response.response)
# Alternative output call, presumably for an engine created with streaming=True — TODO confirm:
# response.print_response_stream()


  Based on the context information provided, Paul Graham started Y Combinator (YC) for the following reasons:

1. To create a long-lasting organization: Graham wanted YC to last for a long time, and to do that, it couldn't be controlled by the founders. He wanted to hand over the reins to someone else.
2. To engage in varied and engaging work: Working at YC provided Graham with a diverse range of problems to solve, which he found engaging.
3. To learn as much as possible about startups in a short amount of time: YC provided Graham with the opportunity to learn about startups in a hands-on manner.
4. To work hard and set an example for others: Graham believed that if he worked hard, it would set the upper bound for how hard everyone else worked.
5. To make sure YC wasn't the last cool thing he did: Graham was advised by Robert Morris to make sure YC wasn't the last cool thing he did, which made him think about leaving YC eventually.

In summary, Graham started YC to create a long-lastin

# Customizing the chunk size, chunk overlap, LLM context window, and number of output tokens

In [96]:
from llama_index import ServiceContext, LLMPredictor, PromptHelper, VectorStoreIndex
# from llama_index.llms import OpenAI
from llama_index.node_parser import SimpleNodeParser
from llama_index.embeddings import HuggingFaceEmbedding
from llama_index.llms.anyscale import Anyscale

# Define LLM. The Anyscale token is the ANYSCALE_ENDPOINT_TOKEN variable set
# earlier in the notebook; never paste a token into this cell, not even inside
# a comment.
llm = Anyscale(
    model="meta-llama/Llama-2-13b-chat-hf",
    api_key=ANYSCALE_ENDPOINT_TOKEN,
)

# loads BAAI/bge-small-en-v1.5
embed_model = HuggingFaceEmbedding(model_name="BAAI/bge-small-en-v1.5")

# default chunk size is 1024, while the default chunk overlap is 20.

# Create Node Parser with a larger chunk size and overlap than the defaults
node_parser = SimpleNodeParser.from_defaults(chunk_size=2000, chunk_overlap=100)

# Create PromptHelper describing the LLM context window and the output budget
prompt_helper = PromptHelper(
    context_window=4096,
    num_output=512,
    chunk_overlap_ratio=0.1,
)

# Customise LLM, embedding model, node parser and prompt helper.
# The node parser and prompt helper are passed in here; they were previously
# built but left out of the service context, so the customisation this section
# demonstrates never took effect.
service_context = ServiceContext.from_defaults(
    llm=llm,
    embed_model=embed_model,
    node_parser=node_parser,
    prompt_helper=prompt_helper,
)

# Create Index
index = VectorStoreIndex.from_documents(documents, service_context=service_context)

In [97]:
# Create a query engine over `index` using the default settings.
query_engine = index.as_query_engine()
# Optional variant: pass similarity_top_k=4 to as_query_engine().

In [99]:
# Print every prompt name together with its template. The prompt dict is
# fetched once and iterated with .items() instead of calling get_prompts()
# again for each key.
for prompt_name, prompt_template in query_engine.get_prompts().items():
    print(prompt_name, prompt_template)

response_synthesizer:text_qa_template metadata={'prompt_type': <PromptType.QUESTION_ANSWER: 'text_qa'>} template_vars=['context_str', 'query_str'] kwargs={} output_parser=None template_var_mappings={} function_mappings={} default_template=PromptTemplate(metadata={'prompt_type': <PromptType.QUESTION_ANSWER: 'text_qa'>}, template_vars=['context_str', 'query_str'], kwargs={}, output_parser=None, template_var_mappings=None, function_mappings=None, template='Context information is below.\n---------------------\n{context_str}\n---------------------\nGiven the context information and not prior knowledge, answer the query.\nQuery: {query_str}\nAnswer: ') conditionals=[(<function is_chat_model at 0x7ffb89e15b40>, ChatPromptTemplate(metadata={'prompt_type': <PromptType.CUSTOM: 'custom'>}, template_vars=['context_str', 'query_str'], kwargs={}, output_parser=None, template_var_mappings=None, function_mappings=None, message_templates=[ChatMessage(role=<MessageRole.SYSTEM: 'system'>, content="You ar

In [59]:
import html

from IPython.display import display, HTML

# Ask the question against the current query engine.
response = query_engine.query("why did paul graham start YC?")

# The answer is model-generated free text, so escape it before embedding it in
# markup; unescaped `<`, `>` or `&` would be interpreted as HTML. `str()` keeps
# a missing answer rendering as "None".
display(HTML(f'<p style="font-size:20px">{html.escape(str(response.response))}</p>'))

# Saving and Loading the Index

 Storage context.

The storage context is a utility container for storing nodes, indices, and vectors. It contains the following:
 - docstore: BaseDocumentStore
 - index_store: BaseIndexStore
 - vector_store: VectorStore
 - graph_store: GraphStore

In [60]:
from llama_index import StorageContext, load_index_from_storage
from llama_index.node_parser import SimpleNodeParser

# Split the documents into nodes (chunk size 2000, overlap 100)
node_parser = SimpleNodeParser.from_defaults(chunk_overlap=100, chunk_size=2000)
nodes = node_parser.get_nodes_from_documents(documents)

# Storage context backed by the default stores
storage_context = StorageContext.from_defaults()

# Build the vector index directly from the pre-parsed nodes
index = VectorStoreIndex(
    nodes,
    service_context=service_context,
    storage_context=storage_context,
)

# Write the index's stores to the local "storage" directory
index.storage_context.persist(persist_dir="storage")

In [63]:
import json

# Load the persisted vector store and show its top-level keys. The file is
# opened in a `with` block so the handle is closed, and the path is relative
# so it matches persist_dir="storage" rather than a Colab-only /content path.
with open("storage/default__vector_store.json", "r", encoding="utf-8") as fh:
    vector_store_json = json.load(fh)

vector_store_json.keys()

dict_keys(['embedding_dict', 'text_id_to_ref_doc_id', 'metadata_dict'])

In [66]:
# Show the node IDs that have a stored embedding. The file is opened in a
# `with` block so the handle is closed, and the path is relative so it matches
# persist_dir="storage".
with open("storage/default__vector_store.json", "r", encoding="utf-8") as fh:
    vector_store_json = json.load(fh)

vector_store_json["embedding_dict"].keys()

dict_keys(['eb100729-91aa-4c64-bba5-c2ee79fd52d6', '710fa5a8-67e6-4c49-a294-ceeb7b4fd9e7', 'd12ea977-84c0-42f7-aacc-4e3aeabdeef1', 'a397da08-66ba-41b5-a143-431a9e2eb10f', '8c05e3ef-da0e-4cbe-9a3f-aaad2568b0c6', '962c03e1-eb7e-4bfc-a864-26a5d136e085', 'a1569fda-558b-4a1a-9b42-3bdf79d72ee7', '6c67ddbb-d04f-42f1-b819-4cc75a3e00a0', '0404d9f7-0589-4219-bea1-1192a1d45d23', '92f71346-9ca0-484b-b492-5475c1ac18bd'])

In [67]:
# Show the mapping from node ID to the ID of the document it came from. The
# file is opened in a `with` block so the handle is closed, and the path is
# relative so it matches persist_dir="storage".
with open("storage/default__vector_store.json", "r", encoding="utf-8") as fh:
    vector_store_json = json.load(fh)

vector_store_json["text_id_to_ref_doc_id"]

{'eb100729-91aa-4c64-bba5-c2ee79fd52d6': '50fc1efd-10d8-41d4-91a0-f1acf963cc75',
 '710fa5a8-67e6-4c49-a294-ceeb7b4fd9e7': '50fc1efd-10d8-41d4-91a0-f1acf963cc75',
 'd12ea977-84c0-42f7-aacc-4e3aeabdeef1': '50fc1efd-10d8-41d4-91a0-f1acf963cc75',
 'a397da08-66ba-41b5-a143-431a9e2eb10f': '50fc1efd-10d8-41d4-91a0-f1acf963cc75',
 '8c05e3ef-da0e-4cbe-9a3f-aaad2568b0c6': '50fc1efd-10d8-41d4-91a0-f1acf963cc75',
 '962c03e1-eb7e-4bfc-a864-26a5d136e085': '50fc1efd-10d8-41d4-91a0-f1acf963cc75',
 'a1569fda-558b-4a1a-9b42-3bdf79d72ee7': '50fc1efd-10d8-41d4-91a0-f1acf963cc75',
 '6c67ddbb-d04f-42f1-b819-4cc75a3e00a0': '50fc1efd-10d8-41d4-91a0-f1acf963cc75',
 '0404d9f7-0589-4219-bea1-1192a1d45d23': '50fc1efd-10d8-41d4-91a0-f1acf963cc75',
 '92f71346-9ca0-484b-b492-5475c1ac18bd': '50fc1efd-10d8-41d4-91a0-f1acf963cc75'}

In [69]:
# Show the node IDs that have stored metadata. The file is opened in a `with`
# block so the handle is closed, and the path is relative so it matches
# persist_dir="storage".
with open("storage/default__vector_store.json", "r", encoding="utf-8") as fh:
    vector_store_json = json.load(fh)

vector_store_json["metadata_dict"].keys()

dict_keys(['eb100729-91aa-4c64-bba5-c2ee79fd52d6', '710fa5a8-67e6-4c49-a294-ceeb7b4fd9e7', 'd12ea977-84c0-42f7-aacc-4e3aeabdeef1', 'a397da08-66ba-41b5-a143-431a9e2eb10f', '8c05e3ef-da0e-4cbe-9a3f-aaad2568b0c6', '962c03e1-eb7e-4bfc-a864-26a5d136e085', 'a1569fda-558b-4a1a-9b42-3bdf79d72ee7', '6c67ddbb-d04f-42f1-b819-4cc75a3e00a0', '0404d9f7-0589-4219-bea1-1192a1d45d23', '92f71346-9ca0-484b-b492-5475c1ac18bd'])

In [70]:
# Show the metadata stored for one node. The file is opened in a `with` block
# so the handle is closed, and the path is relative so it matches
# persist_dir="storage".
with open("storage/default__vector_store.json", "r", encoding="utf-8") as fh:
    metadata_dict = json.load(fh)["metadata_dict"]

# The node IDs look like per-run UUIDs, so a hard-coded ID raises KeyError as
# soon as the index is rebuilt; take the first stored ID instead.
first_node_id = next(iter(metadata_dict))
metadata_dict[first_node_id]

{'file_path': 'data/paul_graham/paul_graham_essay.txt',
 'file_name': 'paul_graham_essay.txt',
 'file_type': 'text/plain',
 'file_size': 75042,
 'creation_date': '2024-02-11',
 'last_modified_date': '2024-02-11',
 'last_accessed_date': '2024-02-11',
 '_node_type': 'TextNode',
 'document_id': '50fc1efd-10d8-41d4-91a0-f1acf963cc75',
 'doc_id': '50fc1efd-10d8-41d4-91a0-f1acf963cc75',
 'ref_doc_id': '50fc1efd-10d8-41d4-91a0-f1acf963cc75'}

In [73]:
# Show the top-level keys of the persisted docstore. The file is opened in a
# `with` block so the handle is closed, and the path is relative so it matches
# persist_dir="storage".
with open("storage/docstore.json", "r", encoding="utf-8") as fh:
    docstore_json = json.load(fh)

docstore_json.keys()

dict_keys(['docstore/data', 'docstore/metadata', 'docstore/ref_doc_info'])

In [75]:
# Show the node IDs held in the docstore. The file is opened in a `with` block
# so the handle is closed, and the path is relative so it matches
# persist_dir="storage".
with open("storage/docstore.json", "r", encoding="utf-8") as fh:
    docstore_json = json.load(fh)

docstore_json["docstore/data"].keys()

dict_keys(['eb100729-91aa-4c64-bba5-c2ee79fd52d6', '710fa5a8-67e6-4c49-a294-ceeb7b4fd9e7', 'd12ea977-84c0-42f7-aacc-4e3aeabdeef1', 'a397da08-66ba-41b5-a143-431a9e2eb10f', '8c05e3ef-da0e-4cbe-9a3f-aaad2568b0c6', '962c03e1-eb7e-4bfc-a864-26a5d136e085', 'a1569fda-558b-4a1a-9b42-3bdf79d72ee7', '6c67ddbb-d04f-42f1-b819-4cc75a3e00a0', '0404d9f7-0589-4219-bea1-1192a1d45d23', '92f71346-9ca0-484b-b492-5475c1ac18bd'])

In [77]:
# Show the full stored record of one node. The file is opened in a `with`
# block so the handle is closed, and the path is relative so it matches
# persist_dir="storage".
with open("storage/docstore.json", "r", encoding="utf-8") as fh:
    docstore_data = json.load(fh)["docstore/data"]

# The node IDs look like per-run UUIDs, so a hard-coded ID raises KeyError as
# soon as the index is rebuilt. Select the second stored node by position (the
# node the original cell displayed); this assumes at least two nodes exist.
second_node_id = list(docstore_data)[1]
docstore_data[second_node_id]

{'__data__': {'id_': '710fa5a8-67e6-4c49-a294-ceeb7b4fd9e7',
  'embedding': None,
  'metadata': {'file_path': 'data/paul_graham/paul_graham_essay.txt',
   'file_name': 'paul_graham_essay.txt',
   'file_type': 'text/plain',
   'file_size': 75042,
   'creation_date': '2024-02-11',
   'last_modified_date': '2024-02-11',
   'last_accessed_date': '2024-02-11'},
  'excluded_embed_metadata_keys': ['file_name',
   'file_type',
   'file_size',
   'creation_date',
   'last_modified_date',
   'last_accessed_date'],
  'excluded_llm_metadata_keys': ['file_name',
   'file_type',
   'file_size',
   'creation_date',
   'last_modified_date',
   'last_accessed_date'],
  'relationships': {'1': {'node_id': '50fc1efd-10d8-41d4-91a0-f1acf963cc75',
    'node_type': '4',
    'metadata': {'file_path': 'data/paul_graham/paul_graham_essay.txt',
     'file_name': 'paul_graham_essay.txt',
     'file_type': 'text/plain',
     'file_size': 75042,
     'creation_date': '2024-02-11',
     'last_modified_date': '2024-0

In [78]:
# Show the persisted index store. The file is opened in a `with` block so the
# handle is closed, and the path is relative so it matches
# persist_dir="storage".
with open("storage/index_store.json", "r", encoding="utf-8") as fh:
    index_store_json = json.load(fh)

index_store_json

{'index_store/data': {'eea2b972-a692-462b-b5c1-6ac197744ac6': {'__type__': 'vector_store',
   '__data__': '{"index_id": "eea2b972-a692-462b-b5c1-6ac197744ac6", "summary": null, "nodes_dict": {"eb100729-91aa-4c64-bba5-c2ee79fd52d6": "eb100729-91aa-4c64-bba5-c2ee79fd52d6", "710fa5a8-67e6-4c49-a294-ceeb7b4fd9e7": "710fa5a8-67e6-4c49-a294-ceeb7b4fd9e7", "d12ea977-84c0-42f7-aacc-4e3aeabdeef1": "d12ea977-84c0-42f7-aacc-4e3aeabdeef1", "a397da08-66ba-41b5-a143-431a9e2eb10f": "a397da08-66ba-41b5-a143-431a9e2eb10f", "8c05e3ef-da0e-4cbe-9a3f-aaad2568b0c6": "8c05e3ef-da0e-4cbe-9a3f-aaad2568b0c6", "962c03e1-eb7e-4bfc-a864-26a5d136e085": "962c03e1-eb7e-4bfc-a864-26a5d136e085", "a1569fda-558b-4a1a-9b42-3bdf79d72ee7": "a1569fda-558b-4a1a-9b42-3bdf79d72ee7", "6c67ddbb-d04f-42f1-b819-4cc75a3e00a0": "6c67ddbb-d04f-42f1-b819-4cc75a3e00a0", "0404d9f7-0589-4219-bea1-1192a1d45d23": "0404d9f7-0589-4219-bea1-1192a1d45d23", "92f71346-9ca0-484b-b492-5475c1ac18bd": "92f71346-9ca0-484b-b492-5475c1ac18bd"}, "doc_id

In [79]:
import html

# To load the index later, first rebuild the storage context from the
# persisted directory; this loads the persisted stores from persist_dir.
storage_context = StorageContext.from_defaults(persist_dir="storage")

# Then load the index object. No index id is passed here; the index_store.json
# written above holds a single index.
loaded_index = load_index_from_storage(storage_context=storage_context, service_context=service_context)

# Set up the query engine on the reloaded index and ask the question
query_engine = loaded_index.as_query_engine(similarity_top_k=3)
response = query_engine.query("why did paul graham start YC?")

# Display the synthesized response. The answer is model-generated free text,
# so escape it before embedding it in markup; `str()` keeps a missing answer
# rendering as "None".
display(HTML(f'<p style="font-size:20px">{html.escape(str(response.response))}</p>'))

# Count Prompt Tokens and Checking underlying Prompt

 The CallbackManager provides a way to call handlers on event starts/ends and traces the current stack of events. It is used to help debug, track, and trace the inner workings of the library. It allows the addition of as many callbacks as needed and can be used to log data related to events, track the duration and number of occurrences of each event, and record a trace map of events. Additionally, it can be used to manage callback functions in applications, enabling developers to organize and handle callback functions more efficiently.

In [80]:
from llama_index import set_global_service_context
from llama_index.callbacks import CallbackManager, TokenCountingHandler
import tiktoken
from llama_index.llms import OpenAI


# Count tokens with the tiktoken encoding that belongs to the chat model used below
gpt_encoding = tiktoken.encoding_for_model("gpt-3.5-turbo")
token_counter = TokenCountingHandler(tokenizer=gpt_encoding.encode)

# Register the counter as the only callback handler
callback_manager = CallbackManager([token_counter])

llm = OpenAI(model='gpt-3.5-turbo')

# Service context that carries both the LLM and the callback manager
service_context = ServiceContext.from_defaults(
    callback_manager=callback_manager,
    llm=llm,
)

# Make this service context the global default
set_global_service_context(service_context)

In [81]:
# No service_context is passed here; presumably the global default registered
# with set_global_service_context is used — confirm against the library docs.
index = VectorStoreIndex.from_documents(documents)

In [82]:
# Embedding tokens recorded by the counter so far (i.e. while building the index).
print(token_counter.total_embedding_token_count)

20723


Let's reset the token counts, including the embedding count.



In [83]:
# Clear the counter so the query below is measured separately from index construction.
token_counter.reset_counts()

In [84]:
# Confirm the reset: the embedding token count should now read 0.
print(token_counter.total_embedding_token_count)

0


In [85]:
# Query with similarity_top_k=4; the token counter records the usage of this call.
query_engine = index.as_query_engine(similarity_top_k=4)
response = query_engine.query("Why did author start YC?")

In [86]:
# Label/value pairs for the token usage recorded since the last reset
token_report = [
    ("Embedding Tokens: ", token_counter.total_embedding_token_count),
    ("LLM Prompt Tokens: ", token_counter.prompt_llm_token_count),
    ("LLM Completion Tokens: ", token_counter.completion_llm_token_count),
    ("Total LLM Token Count: ", token_counter.total_llm_token_count),
]

# Emit everything through one print call as label, value, "\n" triples, so
# the separator spacing of the output stays exactly as before
print(*(part for label, count in token_report for part in (label, count, "\n")))

Embedding Tokens:  7 
 LLM Prompt Tokens:  4561 
 LLM Completion Tokens:  137 
 Total LLM Token Count:  4698 



In [92]:
# Show the prompt of the second recorded LLM call (index 1).
# NOTE(review): assumes the query produced at least two LLM calls; with a single call this raises IndexError.
print(token_counter.llm_token_counts[1].prompt)

user: You are an expert Q&A system that strictly operates in two modes when refining existing answers:
1. **Rewrite** an original answer using the new context.
2. **Repeat** the original answer if the new context isn't useful.
Never reference the original answer or context directly in your answer.
When in doubt, just repeat the original answer.New Context: white V on a red circle, so I made the YC logo a white Y on an orange square.

[14] YC did become a fund for a couple years starting in 2009, because it was getting so big I could no longer afford to fund it personally. But after Heroku got bought we had enough money to go back to being self-funded.

[15] I've never liked the term "deal flow," because it implies that the number of new startups at any given time is fixed. This is not only false, but it's the purpose of YC to falsify it, by causing startups to be founded that would not otherwise have existed.

[16] She reports that they were all different shapes and sizes, because ther

In [87]:
# Look up the first recorded LLM call once and report its details
first_call = token_counter.llm_token_counts[0]

print("prompt: ", first_call.prompt, "...\n")
print("prompt token count: ", first_call.prompt_token_count, "\n")

print("completion: ", first_call.completion, "...\n")
print("completion token count: ", first_call.completion_token_count, "\n")

print("total token count", first_call.total_token_count)

prompt:  system: You are an expert Q&A system that is trusted around the world.
Always answer the query using the provided context information, and not prior knowledge.
Some rules to follow:
1. Never directly reference the given context in your answer.
2. Avoid statements like 'Based on the context, ...' or 'The context information ...' or anything along those lines.
user: Context information is below.
---------------------
file_path: data/paul_graham/paul_graham_essay.txt

When I was dealing with some urgent problem during YC, there was about a 60% chance it had to do with HN, and a 40% chance it had do with everything else combined. [17]

As well as HN, I wrote all of YC's internal software in Arc. But while I continued to work a good deal in Arc, I gradually stopped working on Arc, partly because I didn't have time to, and partly because it was a lot less attractive to mess around with the language now that we had all this infrastructure depending on it. So now my three projects wer