In [1]:
from langchain.document_loaders.web_base import WebBaseLoader

In [2]:
# Smoke test: fetch only the documentation landing page and display the
# Document list that the loader returns for it.
loader = WebBaseLoader(web_path="https://documentation.researchspace.com")
loader.load()

[Document(page_content='\n\n\nRSpace Help\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n                        Contact us\n                      \n\n\n\n                        RSpace Website\n                      \n\n\nContact\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n              \t\t\t\tTop Articles\n              \t\t\t\n\n\n\n\n\n\n            \t\t\t\t            Getting Started with RSpace ELN\n              \t\t\t\t\t\t\t\t\nThis document provides guidance on help articles relevant for getting started, and where to access help resources. If you need further support, please get in touch through Intercom ("Chat with us" opâ€¦\n\n\n\n\n\n\n\n\n\n            \t\t\t\t            Setting up Single Sign-On Authentication\n              \t\t\t\t\t\t\t\t\nThis article provides detailed documentation on setting up SSO for RSpace. It applies equally to both on-premises and SaaS deployments. RSpace supports two SSO technolog

In [11]:
import requests
from bs4 import BeautifulSoup
from typing import List, Set
 
# Root of the help site; site-relative paths (e.g. "/article/...") are appended to it.
base_url = 'https://documentation.researchspace.com'

def get_links_from(path: str, timeout: float = 30.0) -> list:
    """Fetch one page of the help site and return every anchor element on it.

    Args:
        path: Site-relative path (for example "/" or "/category/...") that is
            appended to the module-level ``base_url``.
        timeout: Seconds to wait for the server before giving up, so a stalled
            connection cannot hang the crawl indefinitely.

    Returns:
        The list-like result of ``soup.find_all('a')``: one BeautifulSoup tag
        per ``<a>`` element (not plain strings). Read the link target with
        ``tag.get('href')``.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
        requests.RequestException: On connection errors or timeouts.
    """
    url = f"{base_url}{path}"
    response = requests.get(url, timeout=timeout)
    # Fail loudly rather than silently harvesting links from an error page.
    response.raise_for_status()
    soup = BeautifulSoup(response.text, 'html.parser')
    return soup.find_all('a')

def sort_pages(links, cat_urls, articles_urls, art_seen: Set, cat_seen: Set):
    """Sort anchors into article and category URL lists, skipping repeats.

    All four collections are mutated in place; nothing is returned.

    Args:
        links: Iterable of objects exposing ``get('href')``, such as the
            anchor tags produced by BeautifulSoup's ``find_all('a')``.
        cat_urls: List that receives each new href starting with "/category".
        articles_urls: List that receives each new href starting with "/article".
        art_seen: Set of article hrefs already recorded; updated in place.
        cat_seen: Set of category hrefs already recorded; updated in place.
    """
    for link in links:
        url = link.get('href')
        # Anchors without an href (e.g. <a name="...">) yield None; skip them
        # instead of crashing on None.startswith.
        if not url:
            continue
        if url.startswith("/article") and url not in art_seen:
            articles_urls.append(url)
            art_seen.add(url)
        elif url.startswith("/category") and url not in cat_seen:
            cat_urls.append(url)
            cat_seen.add(url)

# Seed the crawl from the landing page: collect the category and article
# links it exposes, using the *_seen sets to avoid recording a URL twice.
links = get_links_from("/")
articles_urls = []
cat_urls = []
art_seen =set()
cat_seen =set()
sort_pages(links, cat_urls, articles_urls, art_seen, cat_seen)
print (cat_urls)

# Visit every known category page. sort_pages may append newly discovered
# categories to cat_urls while this loop is iterating over it, so those
# pages are visited as well; cat_seen keeps the list from growing forever.
found_articles = 0
for link in cat_urls:
    print(f"following {link}. Current article count is {found_articles}")
    links = get_links_from(link)
    sort_pages(links, cat_urls, articles_urls, art_seen, cat_seen)
    # Only report (and advance the running total) when this page added articles.
    if len(articles_urls) > found_articles:
        print (f"found {len(articles_urls) - found_articles} new articles, continuing")
        found_articles = len(articles_urls)
print (articles_urls)
    

['/category/24blgg0ojn-for-researchers', '/category/2nmldxv391-for-admins', '/category/d7noxvp16k-for-pis', '/category/ifpi5pwbck-for-developers', '/category/l69h6lsuk4-faqs', '/category/m5118puefw-videos', '/category/01era8o0gc-archive', '/category/4yye22t49h-for-technical-staff', '/category/zpizk20kgx-inventory']
following /category/24blgg0ojn-for-researchers. Current article count is 0
found 8 new articles, continuing
following /category/2nmldxv391-for-admins. Current article count is 8
found 11 new articles, continuing
following /category/d7noxvp16k-for-pis. Current article count is 19
found 2 new articles, continuing
following /category/ifpi5pwbck-for-developers. Current article count is 21
found 7 new articles, continuing
following /category/l69h6lsuk4-faqs. Current article count is 28
found 14 new articles, continuing
following /category/m5118puefw-videos. Current article count is 42
following /category/01era8o0gc-archive. Current article count is 42
found 1 new articles, contin

In [12]:
# Turn each site-relative article path into an absolute URL.
full_urls = [f"{base_url}{article_path}" for article_path in articles_urls]

In [18]:
# Load every article page in one go. Note that this rebinds `loader`, which
# previously pointed at the single-page loader for the landing page.
loader = WebBaseLoader(web_paths=full_urls)
docs = loader.load()

In [19]:
# Number of Document objects returned by the loader.
len(docs)


279

In [20]:
# Number of distinct article URLs collected, for comparison with len(docs).
len(set(articles_urls))

279

In [21]:
# Total number of characters across the text of all loaded pages.
x = sum(len(document.page_content) for document in docs)

In [27]:
# Spot-check the raw text of one loaded page.
# NOTE(review): index 56 looks like an arbitrary sample — confirm.
docs[56].page_content

'\n\n\nUsing Inventory Lists of Materials in the ELN ðŸ§ª - RSpace Help\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n                        Contact us\n                      \n\n\n\n                        RSpace Website\n                      \n\n\nContact\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n                    All Categories\n                  \n                  \u200b > \u200b\n                    \n\n\u200bInventory\n\n\n                  \u200b > \u200b\n                    \xa0\n                  \xa0\xa0Using Inventory Lists of Materials in the ELN ðŸ§ª\n                \n\n\n\n\n\n\nUsing Inventory Lists of Materials in the ELN ðŸ§ª\n\n\n\n\n                            Updated 5 months ago\n                          \n\n                            by\n                            Vaida\n                          \n\n\nOverviewAdd a List of Materials to an ELN document fieldAdd Items to a List of Mat

In [32]:
from langchain.text_splitter import CharacterTextSplitter, RecursiveCharacterTextSplitter
from langchain.vectorstores.utils import filter_complex_metadata
# Strip metadata values that a vector store cannot hold (see
# filter_complex_metadata) before the documents are chunked.
filtered = filter_complex_metadata(docs)

# Split each page into overlapping chunks; chunk_size and chunk_overlap are
# measured with the splitter's length function (characters by default).
r = RecursiveCharacterTextSplitter(chunk_size=400, chunk_overlap=50)
all_chunks = r.split_documents(filtered)

# Only the last expression of a cell is displayed, so show the chunk count
# here; inspect a sample chunk (e.g. all_chunks[0]) in a cell of its own.
len(all_chunks)


3448

In [33]:
from langchain.embeddings import OpenAIEmbeddings
from langchain.vectorstores.chroma import Chroma

In [34]:
# Embed every chunk with OpenAIEmbeddings and index it in a Chroma collection
# stored under the "rspace-helpdocs" directory.
# NOTE(review): OpenAIEmbeddings presumably needs an OpenAI API key configured
# outside this notebook — confirm before a fresh run.
v_store = Chroma.from_documents(all_chunks, embedding=OpenAIEmbeddings(), persist_directory="rspace-helpdocs")

In [62]:
# Retrieve the chunks most similar to a sample question and show the top hit.
query ="how do i connect inventory samples to eln"
search_results=v_store.similarity_search(query )
search_results[0]

Document(page_content='show which samples have been used within your ELN, or from the Inventory side, users can quickly determine which experiments a sample has been used in. Efficient mechanisms for bulk import and export and a powerful "API first" design facillitiate exchange of sample information to or from other systems.The design draws on principals of visual cognition to make maximum use of the human ability to', metadata={'description': 'Quick links to relevant help content for getting started with RSpace Inventory, our inventory and sample management system!', 'language': 'en', 'source': 'https://documentation.researchspace.com/article/tffkwcpizj-get-started-with-inventory', 'title': 'Get Started with Inventory ðŸ§ª - RSpace Help'})

In [44]:
# Re-open the persisted collection the way a fresh session would. Pass the
# same embedding model that built the index so that similarity queries on
# v_store2 embed the question consistently with the stored vectors; without
# it the wrapper has no embedding function of its own to apply to queries.
v_store2 = Chroma(persist_directory="rspace-helpdocs", embedding_function=OpenAIEmbeddings())

In [45]:
# Sanity check: number of entries in the reloaded collection.
# NOTE(review): _collection is a private attribute of the wrapper and may
# change between library versions.
v_store2._collection.count()

3448

In [46]:
from langchain.chains.question_answering import load_qa_chain
from langchain.llms import OpenAI
from langchain.prompts import PromptTemplate
from langchain.memory import ConversationBufferMemory

In [51]:
# Prompt with three placeholders: {context}, {chat_history} and {human_input}.
# NOTE(review): {context} is presumably filled by the "stuff" chain with the
# text of the input documents — confirm against the chain's documentation.
template = """You are a chatbot having a conversation with a human.

Given the following extracted parts of a long document and a question, create a final answer.

{context}

{chat_history}
Human: {human_input}
Chatbot:"""

prompt = PromptTemplate(
    input_variables=["chat_history", "human_input", "context"], template=template
)
# The memory supplies the {chat_history} slot (memory_key) and records the
# user's turn from the "human_input" key (input_key).
memory = ConversationBufferMemory(memory_key="chat_history", input_key="human_input")
# temperature=0 is passed to the LLM; the chain shares the memory object above,
# so successive calls see the earlier turns.
chain = load_qa_chain(
    OpenAI(temperature=0), chain_type="stuff", memory=memory, prompt=prompt
)

In [63]:

# Answer the current `query` using the retrieved chunks as input documents;
# the chain was built with `memory`, so the exchange is added to the history.
chain({"input_documents": search_results, "human_input": query}, return_only_outputs=True)

{'output_text': ' You can connect Inventory samples to ELN documents by adding Lists of Materials to ELN document fields. This allows you to easily associate Inventory items to experiments, as well as update the quantities of items as you use them up from within the ELN.'}

In [58]:
# Follow-up question that only makes sense in light of the previous exchange.
query="can you rephrase that"

In [59]:
# Re-run the chain with the follow-up question. `search_results` is not
# refreshed for the new query, so the input documents are whatever the last
# similarity search returned; the follow-up relies on the chat history.
chain({"input_documents": search_results, "human_input": query}, return_only_outputs=True)

{'output_text': " To delete tags, click on the 'x' in the tag pill or remove them from the â€˜Tagsâ€™ textbox at the top-right-hand side of the document view."}

In [60]:
# Display the conversation history accumulated by the chain so far.
memory

ConversationBufferMemory(chat_memory=ChatMessageHistory(messages=[HumanMessage(content='how do i delete tags'), AIMessage(content=" To delete tags, click on the 'x' in the tag pill. You can also click on the 'edit' symbol and remove them as a comma-separated list in the â€˜Tagsâ€™ textbox at the top-right-hand side of the document view."), HumanMessage(content='can you rephrase that'), AIMessage(content=" To delete tags, click on the 'x' in the tag pill or remove them from the â€˜Tagsâ€™ textbox at the top-right-hand side of the document view.")]), input_key='human_input', memory_key='chat_history')