In [13]:
import requests
import os
from glob import glob
from pprint import pprint

from dotenv import load_dotenv
load_dotenv()

import nest_asyncio
nest_asyncio.apply()

from llama_parse import LlamaParse
from llama_index.core import (
    VectorStoreIndex,
    SimpleKeywordTableIndex,
    SimpleDirectoryReader,
)
from llama_index.core import SummaryIndex, Settings
from llama_index.core.schema import IndexNode
from llama_index.core.tools import QueryEngineTool, ToolMetadata
from llama_index.llms.openai import OpenAI
from llama_index.embeddings.openai import OpenAIEmbedding
from llama_index.core.callbacks import CallbackManager

In [19]:
# --- Source locations -------------------------------------------------------
# Official Python documentation for the pty (pseudo-terminal) module.
pty_docs_url = "https://docs.python.org/3/library/pty.html"

# Blog post with a worked example of a dummy serial port built on pty.
pty_blog_example_url = (
    "https://allican.be/blog/2017/01/15/python-dummy-serial-port.html"
)

# Directory (relative to this notebook) holding the manuals to be parsed;
# the downloaded HTML pages are written here too.
dlpc9000_docs_dirpath = "../../manuals/dmd"

In [6]:
# Download the pty reference pages next to the local manuals so they are
# picked up by the same parsing step.
def _download_page(url, dest_path, timeout=30):
    """Fetch ``url`` and write the response body to ``dest_path`` as UTF-8 text.

    Args:
        url: Page to download.
        dest_path: File path the HTML text is written to (overwritten if present).
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The ``requests.Response`` for the download.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status, so an
            error page is never saved (and later indexed) as documentation.
    """
    response = requests.get(url, timeout=timeout)
    response.raise_for_status()
    # Explicit encoding: the platform default may not be able to encode
    # every character in the page.
    with open(dest_path, "w", encoding="utf-8") as f:
        f.write(response.text)
    return response


# The target directory may not exist yet on a fresh checkout.
os.makedirs(dlpc9000_docs_dirpath, exist_ok=True)

pty_docs_response = _download_page(
    pty_docs_url, os.path.join(dlpc9000_docs_dirpath, "pty_docs.html")
)
pty_blog_response = _download_page(
    pty_blog_example_url, os.path.join(dlpc9000_docs_dirpath, "pty_blog.html")
)


In [26]:
# set up parser
parser = LlamaParse(
    result_type="markdown"  # "markdown" and "text" are available
)

# Route both the PDF manuals and the downloaded HTML pages through LlamaParse.
# Built once: the mapping does not change between files.
file_extractor = {".pdf": parser, ".html": parser}

# Sort the paths so that positions in dmd_docs (e.g. dmd_docs[0]) are stable
# across runs and machines -- glob() returns entries in arbitrary order.
# Sub-directories are skipped because only regular files can be parsed.
manual_paths = sorted(
    path
    for path in glob(os.path.join(dlpc9000_docs_dirpath, "*"))
    if os.path.isfile(path)
)

# use SimpleDirectoryReader to parse manuals
# dmd_docs has one entry per file; each entry is the list returned by
# load_data() for that file.
dmd_docs = [
    SimpleDirectoryReader(input_files=[path], file_extractor=file_extractor).load_data()
    for path in manual_paths
]


Started parsing the file under job_id 138159ab-4a0d-4be3-baae-57f265f12b4d
Started parsing the file under job_id 20930fac-5618-43f4-90a5-16066ebc2500
Started parsing the file under job_id d8e062b4-b842-4690-82a1-da983efd31cd
Started parsing the file under job_id 5e9ffadc-453d-4dfb-8ca6-f603a1c53829


In [28]:
# Global LlamaIndex defaults used by the indexes and query engines below.
# temperature=0 is passed to the LLM constructor.
Settings.llm = OpenAI(
    model="gpt-3.5-turbo",
    temperature=0,
)
Settings.embed_model = OpenAIEmbedding(
    model="text-embedding-ada-002",
)

In [29]:
# Experiment: ask the LLM for a summary of a single document. The summary
# text is a candidate description for seeding a QueryEngineTool.
summary_prompt = "Summarize this document so a tool built upon it knows how to use it."

# Index only the first parsed file, then query that index for the summary.
docs_index = VectorStoreIndex.from_documents(dmd_docs[0])
first_doc_engine = docs_index.as_query_engine()
summary = first_doc_engine.query(summary_prompt)

In [35]:
# Pretty-print the summary text produced by the query above.
summary_text = summary.response
pprint(summary_text)

('This document explains how to create dummy serial ports in Python using '
 'pseudoterminals. It provides code examples that demonstrate setting up a '
 'master and slave pair using the pty module. The script includes a listener '
 'function that continuously listens for commands on the master device and '
 'responds accordingly. The test_serial function initiates the listener thread '
 'and establishes a pySerial connection to the slave, sending commands and '
 'reading responses. The document serves as a guide on utilizing '
 'pseudoterminals for creating dummy serial connections in Python.')


In [15]:
# build index over every parsed document
# dmd_docs holds one list of documents per source file; flatten it into a
# single list so the index covers all files. (Previously this cell relied on
# a `documents` variable that is not defined anywhere in the notebook and
# failed with NameError on a fresh kernel.)
documents = [doc for file_docs in dmd_docs for doc in file_docs]
docs_index = VectorStoreIndex.from_documents(documents)

In [16]:
# Save the index's storage context to disk under ./storage.
index_persist_dir = "./storage/dmd_docs_index"
docs_index.storage_context.persist(persist_dir=index_persist_dir)

In [None]:
# Query engine over the docs index, created with similarity_top_k=3.
engine_options = {"similarity_top_k": 3}
dmd_docs_engine = docs_index.as_query_engine(**engine_options)