In [1]:
!pip install -Uq nest_asyncio langchain openai lxml bs4 supabase tiktoken python-dotenv

In [22]:
from langchain.document_loaders.sitemap import SitemapLoader
# fixes a bug with asyncio and jupyter
import nest_asyncio

# Patch asyncio so the loader's async fetching can run inside the notebook's event loop.
nest_asyncio.apply()

sitemap_loader = SitemapLoader(
    web_path="https://stripe.com/sitemap/partition-2.xml",
    # Raw string: the previous non-raw literal contained "\/" and "\.", which are
    # invalid string escape sequences (a SyntaxWarning on recent Python versions).
    # "/" needs no escaping in a Python regex, so the pattern matches the same URLs.
    filter_urls=[r"^https://stripe\.com/docs"],
)

# Throttle fetching to 2 requests per second.
sitemap_loader.requests_per_second = 2

In [23]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Splitter configured with chunk_size=500 and chunk_overlap=100.
text_splitter = RecursiveCharacterTextSplitter(chunk_overlap=100, chunk_size=500)

# Fetch every page selected by the sitemap loader, then cut the loaded
# documents into chunks with the splitter above.
data = text_splitter.split_documents(sitemap_loader.load())

Fetching pages: 100%|##########| 1288/1288 [19:22<00:00,  1.11it/s]


In [24]:
# Sanity check: number of chunks returned by load_and_split above.
len(data)

29997

In [1]:
from supabase.client import Client, create_client
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.vectorstores.supabase import SupabaseVectorStore
import os
from dotenv import load_dotenv

# Load variables from a local .env file (if present) into the process environment.
load_dotenv()

supabase_url = os.environ.get("SUPABASE_URL")
supabase_key = os.environ.get("SUPABASE_SERVICE_KEY")

# Fail fast and name the missing variable(s) instead of passing None on to
# create_client, where the resulting error would not say which setting is absent.
missing_settings = [
    name
    for name, value in (
        ("SUPABASE_URL", supabase_url),
        ("SUPABASE_SERVICE_KEY", supabase_key),
    )
    if not value
]
if missing_settings:
    raise RuntimeError(
        "Missing required environment variable(s): " + ", ".join(missing_settings)
    )

supabase: Client = create_client(supabase_url, supabase_key)

# NOTE(review): no API key is passed explicitly; presumably OpenAIEmbeddings
# reads it from the environment loaded above — confirm.
embeddings = OpenAIEmbeddings()

In [2]:
# INFO: The commented-out block below is only needed for the initial insertion of data.

# vector_store = SupabaseVectorStore.from_documents(
#     documents=data[27000:],
#     embedding=embeddings,
#     client=supabase,
#     table_name="documents",
#     query_name="match_documents",
#     chunk_size=100
# )

# Build a vector store handle on the "documents" table; similarity search
# goes through the "match_documents" query name configured here.
vector_store = SupabaseVectorStore(
    embedding=embeddings,
    client=supabase,
    query_name="match_documents",
    table_name="documents",
)

In [23]:
from langchain.chat_models import ChatOpenAI
from langchain.chains import ConversationalRetrievalChain, LLMChain
from langchain.retrievers import RePhraseQueryRetriever
from langchain.memory import ConversationBufferMemory
from langchain.prompts import PromptTemplate

# Conversation memory stored under "chat_history" as message objects.
# NOTE(review): output_key="answer" presumably tells the memory which chain
# output to record, since the chain below also returns source documents — confirm.
memory = ConversationBufferMemory(
    output_key="answer",
    memory_key="chat_history",
    return_messages=True,
)

llm = ChatOpenAI(temperature=0.1)

# Retriever that has the LLM re-phrase the incoming question before it is
# passed to the vector store retriever.
retriever_from_llm = RePhraseQueryRetriever.from_llm(
    llm=llm,
    retriever=vector_store.as_retriever(),
)

# docs = retriever_from_llm.get_relevant_documents("What are the first steps to setup the Stripe SDK in JS?")

# Conversational QA chain combining the LLM, the re-phrasing retriever and the memory.
qa = ConversationalRetrievalChain.from_llm(
    llm=llm,
    memory=memory,
    retriever=retriever_from_llm,
    return_source_documents=True,
)


In [25]:
# Alternative example question:
# qa({ "question": "What are the first steps to setup the Stripe SDK in JS?"})

# Ask the chain a question; the cell's last expression is displayed as its output.
# NOTE(review): the recorded run of this cell ended in a ReadTimeout after the
# question was re-phrased — presumably raised during retrieval; confirm which
# client timed out before re-running.
qa({ "question": "What is Stripe?" })

2023-11-06 13:30:02,345:INFO - Re-phrased question: Query for vectorstore: "Stripe"


ReadTimeout: The read operation timed out