In [1]:
# autoreload
%load_ext autoreload
%autoreload 2

In [2]:
import logging 

logging.basicConfig(level=logging.DEBUG)

In [3]:
import json

In [4]:
import json
with open('../data/llama_index.json') as f:
    raw_json = json.load(f)

In [5]:
filter_json = {k: v for k, v in raw_json.items() if "https://docs.llamaindex.ai" in k}
len(filter_json)

441

In [6]:
from askharrison.crawl.html_to_text import html_to_text

In [7]:
example_json = {k: html_to_text(v) for k, v in raw_json.items() if "example" in k in k}
nonexample_json = {k:  html_to_text(v) for k,v in raw_json.items() if "example" not in k}

len(example_json), len(nonexample_json)

(224, 252)

In [8]:
# export both jsons to data/llama_index_doc
with open('../data/llama_index_doc/nonexample.json', 'w') as f:
    json.dump(nonexample_json, f, indent=4)

with open('../data/llama_index_doc/example.json', 'w') as f:
    json.dump(example_json, f, indent=4)

In [9]:
nonexample_json_content = {k: v.split(" table of contents sidebar")[-1] for k, v in nonexample_json.items() if len(v) > 0}
with open('../data/llama_index_doc/nonexample_content.json', 'w') as f:
    json.dump(nonexample_json_content, f, indent=4)

In [10]:
import chromadb
from llama_index import VectorStoreIndex, SimpleDirectoryReader
from llama_index.vector_stores import ChromaVectorStore
from llama_index.storage.storage_context import StorageContext


# initialize client, setting path to save data
db = chromadb.PersistentClient(path="../data/chroma_db")



INFO:chromadb.telemetry.product.posthog:Anonymized telemetry enabled. See                     https://docs.trychroma.com/telemetry for more information.
DEBUG:chromadb.config:Starting component System
DEBUG:chromadb.config:Starting component Posthog
DEBUG:chromadb.config:Starting component OpenTelemetryClient
DEBUG:chromadb.config:Starting component SimpleAssignmentPolicy
DEBUG:chromadb.config:Starting component SqliteDB
DEBUG:chromadb.config:Starting component LocalSegmentManager
DEBUG:chromadb.config:Starting component SegmentAPI


In [None]:
# id, document, embedding, metadata 

In [31]:
from sentence_transformers import SentenceTransformer
model = SentenceTransformer('BAAI/bge-base-en')
instruction = "Represent this sentence for searching relevant passages: "

INFO:sentence_transformers.SentenceTransformer:Load pretrained SentenceTransformer: BAAI/bge-base-en
INFO:sentence_transformers.SentenceTransformer:Use pytorch device: cpu


In [32]:
embbeding_function = lambda content: model.encode(q+content).tolist()

In [33]:
# create collection
chroma_collection = db.get_or_create_collection("llama_index", embedding_function=embbeding_function)

DEBUG:urllib3.connectionpool:Resetting dropped connection: app.posthog.com
DEBUG:urllib3.connectionpool:https://app.posthog.com:443 "POST /batch/ HTTP/1.1" 200 None


In [34]:
chroma_collection.count()

0

In [35]:
from llama_index import JSONReader
reader = JSONReader()
data = reader.load_data('../data/llama_index_doc/nonexample_content.json')


documents = data

In [36]:
type(data), type(data[0])

(list, llama_index.schema.Document)

In [37]:
# import markdown textsplitter from langchain
from langchain.text_splitter import MarkdownTextSplitter
markdown_splitter = MarkdownTextSplitter.from_tiktoken_encoder(chunk_size=512)

In [38]:
from llama_index import SimpleDirectoryReader, VectorStoreIndex, ServiceContext
from llama_index.node_parser import SimpleNodeParser


node_parser = SimpleNodeParser.from_defaults(text_splitter=markdown_splitter)
service_context = ServiceContext.from_defaults(node_parser=node_parser)

In [39]:
vector_store = ChromaVectorStore(chroma_collection=chroma_collection)
storage_context = StorageContext.from_defaults(vector_store=vector_store)

In [40]:
index = VectorStoreIndex.from_documents(documents, storage_context=storage_context)

DEBUG:llama_index.node_parser.node_utils:> Adding chunk: "https://docs.llamaindex.ai/en/stable/index.htm...
DEBUG:llama_index.node_parser.node_utils:> Adding chunk: This could be LangChain, Flask, Docker, ChatGPT...
DEBUG:llama_index.node_parser.node_utils:> Adding chunk: Use the environment variable “LLAMA_INDEX_CACHE...
DEBUG:llama_index.node_parser.node_utils:> Adding chunk: If you prefer JavaScript, we recommend\ntrying ...
DEBUG:llama_index.node_parser.node_utils:> Adding chunk: You will see\nreferences to RAG frequently in t...
DEBUG:llama_index.node_parser.node_utils:> Adding chunk: Your retrieval strategy is key to the relevancy...
DEBUG:llama_index.node_parser.node_utils:> Adding chunk: If you run into terms you don’t\nrecognize, che...
DEBUG:llama_index.node_parser.node_utils:> Adding chunk: You can configure the retriever and query engin...
DEBUG:llama_index.node_parser.node_utils:> Adding chunk: You can learn more about retrievers\n</module_g...
DEBUG:llama_index.node_parse

DEBUG:urllib3.connectionpool:https://app.posthog.com:443 "POST /batch/ HTTP/1.1" 200 None


In [41]:
chroma_collection.count()

1272

In [66]:
answer_enginer = index.as_query_engine().query("what are key concepts in llama index?")

DEBUG:openai:message='Request to OpenAI API' method=post path=https://api.openai.com/v1/embeddings
DEBUG:openai:api_version=None data='{"input": ["what are key concepts in llama index?"], "model": "text-embedding-ada-002", "encoding_format": "base64"}' message='Post details'
DEBUG:urllib3.connectionpool:https://api.openai.com:443 "POST /v1/embeddings HTTP/1.1" 200 None
DEBUG:openai:message='OpenAI API response' path=https://api.openai.com/v1/embeddings processing_ms=17 request_id=fb79807c6c6d30e3a77ebec6f31231dc response_code=200
DEBUG:llama_index.vector_stores.chroma:> Top 1 nodes:
DEBUG:llama_index.vector_stores.chroma:> [Node 01fa0ea8-e496-4fdd-856c-d971b72be69b] [Similarity score: 0.7287397870775398] )\n\nOnce you’re up and running, [High-Level\nConcepts](./getting_started/concepts.html) has an o...
DEBUG:llama_index.vector_stores.chroma:> [Node f2180762-5f7a-494f-bc11-a9c3ae784a70] [Similarity score: 0.7174699479240734] You’ll learn about:\n\n  * **[Using LLMs](using_llms/using_ll

In [67]:
answer_enginer.response

'The key concepts in LlamaIndex include using LLMs, loading data from various sources, indexing data, storing data in indexed form, querying data, putting all the components together, tracing and debugging, and evaluating the performance of the application.'

In [68]:
len(answer_enginer.source_nodes)

2

In [69]:
from pprint import pprint

In [70]:
print(answer_enginer.source_nodes[0].get_text())

)\n\nOnce you’re up and running, [High-Level\nConcepts](./getting_started/concepts.html) has an overview of LlamaIndex’s\nmodular architecture. For more hands-on practical examples, look through our\n[End-to-End Tutorials](./end_to_end_tutorials/use_cases.html) or learn how to\n[customize](./getting_started/customization.html) components to fit your\nspecific needs.\n\n**NOTE** : We have a Typescript package too! [Repo](https://github.com/run-\nllama/LlamaIndexTS), [Docs](https://ts.llamaindex.ai/)\n\n## 🗺️ Ecosystem\n\nTo download or contribute, find LlamaIndex on:\n\n  * Github: <https://github.com/jerryjliu/llama_index>\n\n  * PyPi:\n\n    * LlamaIndex: <https://pypi.org/project/llama-index/>.\n\n    * GPT Index (duplicate): <https://pypi.org/project/gpt-index/>.\n\n  * NPM (Typescript/Javascript):\n    \n    * Github: <https://github.com/run-llama/LlamaIndexTS>\n\n    * Docs: <https://ts.llamaindex.ai/>\n\n    * LlamaIndex.TS: <https://www.npmjs.com/package/llamaindex>\n\n### Comm

In [71]:
pprint(answer_enginer.source_nodes[1].get_text())

('You’ll learn about:\\n\\n  * **[Using LLMs](using_llms/using_llms.html)** : '
 'whether it’s OpenAI or any number of hosted LLMs or a locally-run model of '
 'your own, LLMs are used at every step of the way, from indexing and storing '
 'to querying and parsing your data. LlamaIndex comes with a huge number of '
 'reliable, tested prompts and we’ll also show you how to customize your '
 'own.\\n\\n  * **[Loading](loading/loading.html)** : getting your data from '
 'wherever it lives, whether that’s unstructured text, PDFs, databases, or '
 'APIs to other applications. LlamaIndex has hundreds of connectors to every '
 'data source over at [LlamaHub](https://llamahub.ai/).\\n\\n  * '
 '**[Indexing](indexing/indexing.html)** : once you’ve got your data there are '
 'an infinite number of ways to structure access to that data to ensure your '
 'applications is always working with the most relevant data. LlamaIndex has a '
 'huge number of these strategies built-in and can help you selec