In [1]:
from langchain.document_loaders import TextLoader
from langchain.text_splitter import CharacterTextSplitter
from langchain.chains import RetrievalQA

In [3]:
from langchain.document_loaders import WebBaseLoader
from langchain.chains import RetrievalQAWithSourcesChain
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.llms import AzureOpenAI
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.vectorstores import Chroma

# Pinned raw gist URL of the kube-proxy.json document that the questions in the
# following cells are asked against.
url = "https://gist.githubusercontent.com/sozercan/962756396fa3bb200fea00995bbf0738/raw/ca8a28e071916e39b3302d04ffa75b03de5c257f/kube-proxy.json"

# Fetch the document over HTTP and wrap it as LangChain documents.
loader = WebBaseLoader(url)
documents = loader.load()

# Split into chunks of at most 8000 units (characters with the splitter's default
# length function) and no overlap between neighbouring chunks.
text_splitter = RecursiveCharacterTextSplitter(chunk_size=8000, chunk_overlap=0)

texts = text_splitter.split_documents(documents)

# temperature=0 asks for the least random completions. The deployment name must
# match a deployment that exists in the Azure OpenAI resource being used.
llm = AzureOpenAI(temperature=0, deployment_name="text-davinci-003")

# select which embeddings we want to use
# NOTE(review): chunk_size=1 presumably limits each embedding request to a single
# input to satisfy an Azure OpenAI batch limit -- confirm.
embeddings = OpenAIEmbeddings(chunk_size=1)

# create the vectorstore to use as the index
# No persist directory is given, so the index is transient (see the DuckDB notice
# printed under this cell) and is rebuilt on every run.
db = Chroma.from_documents(texts, embeddings)

# expose this index in a retriever interface
# k=4 is LangChain's documented default for similarity search, i.e. the same number
# of chunks an unconfigured db.as_retriever() hands to the chain. Tune k here; this
# is the one retriever the chain uses.
retriever = db.as_retriever(search_type="similarity", search_kwargs={"k": 4})

# map_reduce queries the LLM once per retrieved chunk and then combines the partial
# answers. Pass the retriever configured above rather than building a second,
# separately configured one inline, so the settings above actually take effect.
chain = RetrievalQAWithSourcesChain.from_chain_type(llm, chain_type="map_reduce", retriever=retriever)
# Optional chain kwargs, left disabled by the original author:
#reduce_k_below_max_tokens=True, max_tokens_limit=4097

Using embedded DuckDB without persistence: data will be transient


In [6]:
# Ask the retrieval chain which packages the loaded kube-proxy.json reports.
packagesQuery = "list all the installed packages"

# return_only_outputs=True keeps the echoed input question out of the result dict;
# the call is the cell's last expression so the result is displayed.
chain(
    dict(question=packagesQuery),
    return_only_outputs=True,
)

{'answer': ' The installed packages are: libipset13, libkmod2, liblzma5, libmnl0, libnetfilter-conntrack3, libnfnetlink0, libnftnl11, libpcre3, libssl1.1, libxtables12, lsb-base, netbase, openssl, sigs.k8s.io/apiserver-network-proxy/konnectivity-client, sigs.k8s.io/json, sigs.k8s.io/structured-merge-diff/v4, kmod, libc6, libip4tc2, libip6tc2, iptables, k8s.io/klog/v2, k8s.io/kube-openapi, k8s.io/kubernetes, k8s.io/release/images/build/go-runner, k8s.io/utils.\n',
 'sources': 'https://gist.githubusercontent.com/sozercan/962756396fa3bb200fea00995bbf0738/raw/ca8a28'}

In [5]:
# Ask the retrieval chain which licenses apply to the installed packages.
licensesQuery = "What are the licenses of packages installed?"

# Same call pattern as the other question cells: only the chain outputs are
# returned, and the bare expression renders them as the cell output.
chain(
    dict(question=licensesQuery),
    return_only_outputs=True,
)

{'answer': ' The licenses of the packages installed are GPL-2.0-only, GPL-2.0-or-later, GPL-3.0-only, LGPL-2.0-only, LGPL-2.1-only, LGPL-2.1+, FSFAP, FSFUL, FSFULLR, GPL-3.0-or-later, BSD-3-Clause, and NONE.\n',
 'sources': 'https://gist.githubusercontent.com/sozercan/962756396fa3bb200fea00995bbf0738/raw/ca8a28e071916e39b3302d04ffa75b03de5c257f/kube-proxy.json'}

In [7]:
# Ask the retrieval chain for the version of one specific Go module.
# The question gets its own name so it neither mislabels a version lookup as a
# licenses query nor overwrites the licenses question held in `licensesQuery`.
versionQuery = "What is the golang.org/x/net version?"

# return_only_outputs=True keeps the echoed input question out of the result dict.
chain({"question": versionQuery}, return_only_outputs=True)

{'answer': ' The golang.org/x/net version is v0.0.0-20180826012351-8a410e7b638d.\n',
 'sources': 'https://gist.githubusercontent.com/sozercan/962756396fa3bb200fea00995bbf0738/raw/ca8a28e071916e39b3302d04ffa75b03de5c257f/kube-proxy.json'}