In [None]:
from langchain.document_loaders.web_base import WebBaseLoader

In [None]:
# Trial run of WebBaseLoader against the documentation site's landing page only.
# `loader` is rebound further down, once the full list of article URLs is known.
loader = WebBaseLoader(web_path="https://documentation.researchspace.com")
loader.load()

In [4]:
import requests
from bs4 import BeautifulSoup
from typing import List, Set
 
# Root of the documentation site. Site-relative paths (which start with "/")
# are appended to it directly, so it must not end with a slash.
base_url = 'https://documentation.researchspace.com'

def get_links_from(path: str, timeout: float = 30.0) -> list:
    """Fetch one page of the documentation site and return its anchor elements.

    Args:
        path: Site-relative path starting with "/" (for example "/" or a
            "/category/..." path). It is appended to ``base_url`` unchanged.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests.get`` may block forever on a stalled
            connection, which would hang the whole crawl.

    Returns:
        The result of ``soup.find_all('a')``: a list-like collection of
        BeautifulSoup tag objects, not plain strings. Read the link target
        of each element with ``.get('href')``.

    Raises:
        requests.exceptions.RequestException: if the request fails or the
            server does not answer within ``timeout`` seconds.
    """
    url = f"{base_url}{path}"
    reqs = requests.get(url, timeout=timeout)
    soup = BeautifulSoup(reqs.text, 'html.parser')
    return soup.find_all('a')

def sort_pages(links, cat_urls, articles_urls, art_seen: Set, cat_seen: Set):
    """Sort anchors into article and category URL lists, skipping duplicates.

    All four collections are updated in place; nothing is returned.

    Args:
        links: Iterable of objects exposing ``.get('href')`` (for example the
            anchor elements returned by ``get_links_from``).
        cat_urls: List that receives each newly seen href starting with
            "/category", in discovery order.
        articles_urls: List that receives each newly seen href starting with
            "/article", in discovery order.
        art_seen: Set of article hrefs already recorded.
        cat_seen: Set of category hrefs already recorded.
    """
    for link in links:
        url = link.get('href')
        # Anchors without an href attribute yield None; calling
        # .startswith on that would raise AttributeError, so skip them.
        if not isinstance(url, str):
            continue
        if url.startswith("/article"):
            if url not in art_seen:
                articles_urls.append(url)
                art_seen.add(url)
        elif url.startswith("/category"):
            if url not in cat_seen:
                cat_urls.append(url)
                cat_seen.add(url)

# Seed the crawl from the landing page: collect its links, start with empty
# result lists and "already seen" sets, then classify the landing-page links.
links = get_links_from("/")
articles_urls, cat_urls = [], []
art_seen, cat_seen = set(), set()
sort_pages(links, cat_urls, articles_urls, art_seen, cat_seen)
print(cat_urls)

# Visit every category page in discovery order. sort_pages appends newly
# found categories to cat_urls while this loop is iterating over it, so
# categories discovered along the way are visited too.
found_articles = 0
for link in cat_urls:
    print(f"following {link}. Current article count is {found_articles}")
    links = get_links_from(link)
    sort_pages(links, cat_urls, articles_urls, art_seen, cat_seen)
    if len(articles_urls) <= found_articles:
        # This page contributed no new articles; nothing to report.
        continue
    print (f"found {len(articles_urls) - found_articles} new articles, continuing")
    found_articles = len(articles_urls)
print(articles_urls)
    

['/category/24blgg0ojn-for-researchers', '/category/2nmldxv391-for-admins', '/category/d7noxvp16k-for-pis', '/category/ifpi5pwbck-for-developers', '/category/l69h6lsuk4-faqs', '/category/m5118puefw-videos', '/category/01era8o0gc-archive', '/category/4yye22t49h-for-technical-staff', '/category/zpizk20kgx-inventory']
following /category/24blgg0ojn-for-researchers. Current article count is 0
found 8 new articles, continuing
following /category/2nmldxv391-for-admins. Current article count is 8
found 13 new articles, continuing
following /category/d7noxvp16k-for-pis. Current article count is 21
found 2 new articles, continuing
following /category/ifpi5pwbck-for-developers. Current article count is 23
found 7 new articles, continuing
following /category/l69h6lsuk4-faqs. Current article count is 30
found 13 new articles, continuing
following /category/m5118puefw-videos. Current article count is 43
following /category/01era8o0gc-archive. Current article count is 43
found 1 new articles, contin

In [6]:
# Turn the site-relative article paths into absolute URLs and show how many
# there are.
full_urls = [base_url + url for url in articles_urls]
len(full_urls)

280

In [7]:
# Load all collected article URLs in one go. This rebinds `loader`, which was
# first created for the single landing-page trial; `docs` holds the result.
loader = WebBaseLoader(web_paths=full_urls)
docs = loader.load()

In [8]:
# Inspect the first loaded document as a sanity check.
docs[0]

Document(page_content='\n\n\nGetting Started with RSpace ELN - RSpace Help\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n                        Contact us\n                      \n\n\n\n                        RSpace Website\n                      \n\n\nContact\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n                    All Categories\n                  \n                  \u200b > \u200b\n                    \n\n\u200bFor Researchers\n\n\n                  \u200b > \u200b\n                    \xa0\n                  \xa0\xa0Getting Started with RSpace ELN\n                \n\n\n\n\n\n\nGetting Started with RSpace ELN\n\n\n\n\n                            Updated 4 weeks ago\n                          \n\n                            by\n                            Vaida\n                          \n\n\nThis document provides guidance on help articles relevant for getting started, and where to access help resources. I

In [9]:
import pickle

# Cache the loaded documents on disk so they can be restored without
# re-downloading every page. Only unpickle this file if it comes from a
# trusted source.
with open('notebooks.pickle', mode='wb') as writer:
    pickle.dump(docs, file=writer)


In [11]:
# Write the text of every document to helptext.txt, one document per line
# (newlines inside a document are stripped), each followed by a separator.
# - Mode 'w' rather than 'a': re-running the cell replaces the file instead
#   of appending a second copy of every document.
# - Explicit UTF-8: the page text contains non-ASCII characters such as
#   \u200b and \xa0, which a platform-default encoding may fail to encode.
# - write() rather than writelines(): each argument is a single string, not
#   a sequence of lines.
with open('helptext.txt', 'w', encoding='utf-8') as writer:
    for d in docs:
        writer.write(d.page_content.replace("\n", ""))
        writer.write("\n-------------\n")
    

In [None]:
# Total number of characters across all loaded documents.
x = sum(len(doc.page_content) for doc in docs)

In [None]:
# Look at the raw text of one specific document.
# NOTE(review): index 56 is hard-coded with no stated reason, and raises
# IndexError if fewer than 57 documents were loaded — confirm it is still wanted.
docs[56].page_content

In [None]:
from langchain.text_splitter import CharacterTextSplitter, RecursiveCharacterTextSplitter
from langchain.vectorstores.utils import filter_complex_metadata
# Pass the documents through filter_complex_metadata before splitting them.
filtered = filter_complex_metadata(docs)

# Split the filtered documents with chunk_size=400 and chunk_overlap=50.
r = RecursiveCharacterTextSplitter(
    chunk_overlap=50,
    chunk_size=400,
)
all_chunks = r.split_documents(filtered)
# NOTE(review): this expression is not the last line of the cell, so its
# value is not displayed; only the chunk count below is shown.
all_chunks[0]
len(all_chunks)


In [None]:
from langchain.embeddings import OpenAIEmbeddings
from langchain.vectorstores.chroma import Chroma

In [None]:
# Build a Chroma vector store from all chunks using OpenAI embeddings, with
# "rspace-helpdocs" as the persistence directory.
# NOTE(review): presumably this calls the OpenAI API for every chunk and needs
# an API key in the environment — confirm before re-running.
v_store = Chroma.from_documents(all_chunks, embedding=OpenAIEmbeddings(), persist_directory="rspace-helpdocs")

In [None]:
# Example question used to exercise the vector store.
query ="how do i connect inventory samples to eln"
# Retrieve the chunks most similar to the question (result count left at the
# library default).
search_results=v_store.similarity_search(query )
# Show the first result (presumably the best-ranked match).
search_results[0]

In [None]:
# Reopen the store persisted in "rspace-helpdocs". The collection was built
# with OpenAI embeddings, so the same embedding function must be supplied
# here; without it, text queries against the reopened store cannot be
# embedded consistently with the stored vectors.
v_store2 = Chroma(persist_directory="rspace-helpdocs", embedding_function=OpenAIEmbeddings())

In [None]:
# Count the entries in the reopened collection to check that it is populated.
# NOTE(review): `_collection` is a private attribute (leading underscore) and
# may change between library versions.
v_store2._collection.count()

In [None]:
from langchain.chains.question_answering import load_qa_chain
from langchain.llms import OpenAI
from langchain.prompts import PromptTemplate
from langchain.memory import ConversationBufferMemory

In [None]:
# Prompt text with three placeholders: the retrieved document extracts
# ({context}), the running conversation ({chat_history}) and the user's
# latest message ({human_input}).
template = """You are a chatbot having a conversation with a human.

Given the following extracted parts of a long document and a question, create a final answer.

{context}

{chat_history}
Human: {human_input}
Chatbot:"""

# Conversation memory: it is keyed on "human_input" and its contents are
# exposed to the prompt under the "chat_history" key.
memory = ConversationBufferMemory(
    input_key="human_input",
    memory_key="chat_history",
)
prompt = PromptTemplate(
    template=template,
    input_variables=["chat_history", "human_input", "context"],
)
# Question-answering chain of type "stuff", wired to the prompt and memory
# defined above and using an OpenAI LLM at temperature 0.
chain = load_qa_chain(
    OpenAI(temperature=0),
    chain_type="stuff",
    prompt=prompt,
    memory=memory,
)

In [None]:

# Ask the chain the question held in `query`, passing the retrieved chunks as
# the input documents.
chain({"input_documents": search_results, "human_input": query}, return_only_outputs=True)

In [None]:
# Follow-up message that only makes sense given the previous exchange.
# Note that this rebinds `query`, replacing the original question.
query="can you rephrase that"

In [None]:
# Send the follow-up through the chain. `search_results` is not recomputed,
# so the documents passed are still those retrieved for the earlier question.
chain({"input_documents": search_results, "human_input": query}, return_only_outputs=True)

In [None]:
# Display the memory object that was attached to the chain.
memory