# Web crawling RAG

In [2]:
import sys
sys.path.append("../")
from tqdm.autonotebook import trange
import nest_asyncio
from bs4 import BeautifulSoup
from urllib.parse import urljoin, urlparse, urldefrag
from dotenv import load_dotenv
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings import HuggingFaceInstructEmbeddings
from langchain.vectorstores import FAISS
from langchain.chains import RetrievalQA
from langchain.prompts import PromptTemplate
from langchain.document_loaders import AsyncHtmlLoader
from langchain.document_transformers import Html2TextTransformer
from models.sambanova_endpoint import SambaNovaEndpoint

nest_asyncio.apply()


## Functions

In [3]:

def load_htmls(urls):
    """Download the raw HTML of every URL in ``urls``.

    Each URL gets its own ``AsyncHtmlLoader`` (created with
    ``verify_ssl=False``) and the documents it loads are appended to a single
    result list in iteration order.

    Args:
        urls: Iterable of URL strings to fetch.

    Returns:
        list: The documents returned by the loaders, concatenated.
    """
    fetched_docs = []
    for page_url in urls:
        page_loader = AsyncHtmlLoader(page_url, verify_ssl=False)
        fetched_docs += page_loader.load()
    return fetched_docs

def link_filter(all_links, excluded_links):
    """Return the links that do not match any excluded link.

    Every entry of ``excluded_links`` is reduced to ``netloc + path`` (scheme,
    query and fragment are dropped) and a link is rejected when that text
    occurs anywhere inside it. Matching is by substring, so excluding a site
    root such as ``"example.com"`` rejects every URL that contains it.

    Args:
        all_links: Iterable of candidate URL strings.
        excluded_links: Iterable of URLs or bare domains to exclude.

    Returns:
        set: The candidate links that matched none of the exclusions.
    """
    excluded_patterns = set()
    for excluded_link in excluded_links:
        parsed_link = urlparse(excluded_link)
        pattern = parsed_link.netloc + parsed_link.path
        # An empty pattern (from "", "#frag", "?q=1", ...) is a substring of
        # every link and would silently filter out everything, so skip it.
        if pattern:
            excluded_patterns.add(pattern)

    return {
        link
        for link in all_links
        if not any(pattern in link for pattern in excluded_patterns)
    }

def find_links(docs, excluded_links=None):
    """Collect the outgoing links found in the main content of each document.

    For every document the HTML in ``page_content`` is parsed and the first
    of ``<main>``, ``<article>`` or ``<div class="content">`` is searched for
    ``<a href=...>`` tags. Anchor, ``data:`` and ``javascript:`` hrefs and
    hrefs ending in one of the excluded suffixes are skipped. The remaining
    hrefs are resolved against the document's ``metadata["source"]`` URL and
    stripped of their fragment. Each document's own source URL is also
    treated as excluded.

    Args:
        docs: Iterable of documents exposing ``page_content`` (HTML) and
            ``metadata["source"]`` (the page URL).
        excluded_links: Optional iterable of URLs/domains to drop from the
            result. It is not modified.

    Returns:
        set: Absolute, fragment-free URLs that passed ``link_filter``.
    """
    # Work on a private copy: the original appended every source URL to the
    # caller's list (and failed outright when given a set or tuple).
    excluded = set(excluded_links) if excluded_links is not None else set()
    all_links = set()
    excluded_link_suffixes = (".ico", ".svg", ".jpg", ".png", ".jpeg", ".")
    skipped_prefixes = ('#', 'data:', 'javascript:')

    for doc in docs:
        base_url = doc.metadata["source"]
        excluded.add(base_url)
        soup = BeautifulSoup(doc.page_content, 'html.parser')
        # Identify the main content section (customize based on HTML structure)
        main_content = soup.find('main') or soup.find('article') or soup.find('div', class_='content')
        if not main_content:
            continue

        for link in main_content.find_all('a', href=True):
            href = link['href']
            if href.startswith(skipped_prefixes) or href.endswith(excluded_link_suffixes):
                continue
            # Resolve relative hrefs and drop "#fragment" so that the same
            # page is not collected once per in-page anchor.
            full_url, _ = urldefrag(urljoin(base_url, href))
            all_links.add(full_url)

    return link_filter(all_links, excluded)

def clean_docs(docs):
    """Run the documents through ``Html2TextTransformer``.

    Args:
        docs: Documents to transform.

    Returns:
        The documents returned by ``Html2TextTransformer.transform_documents``.
    """
    return Html2TextTransformer().transform_documents(documents=docs)

def web_crawl(urls, excluded_links=None, depth=1):
    """Crawl ``urls`` level by level and return the cleaned pages.

    Level 1 loads the given URLs; each further level loads the links that
    ``find_links`` discovered on the previous level, skipping excluded links
    and URLs that were already scraped.

    Args:
        urls: Iterable of seed URL strings.
        excluded_links: Optional iterable of URLs/domains that must not be
            followed. It is not modified.
        depth: Number of levels to crawl. Values above 3 are capped at 3;
            values of 0 or less crawl nothing.

    Returns:
        tuple: ``(docs, scrapped_urls)`` where ``docs`` are the cleaned
        documents of every crawled level and ``scrapped_urls`` is the list of
        URLs that were loaded, in crawl order.
    """
    # Copy so the caller's exclusion list is not polluted with crawled URLs.
    excluded = list(excluded_links) if excluded_links is not None else []
    depth = min(depth, 3)

    scrapped_urls = []
    raw_docs = []
    current_urls = list(urls)
    for _ in range(depth):
        if not current_urls:
            # Nothing new was discovered on the previous level.
            break
        scraped_docs = load_htmls(current_urls)
        scrapped_urls.extend(current_urls)
        raw_docs.extend(scraped_docs)
        # Never revisit a page that has already been loaded.
        excluded.extend(current_urls)
        # find_links returns a set; sort it so crawl order is reproducible.
        current_urls = sorted(find_links(scraped_docs, excluded))

    # Clean every level's pages, not just the last one, so the returned
    # documents correspond to the returned URLs.
    docs = clean_docs(raw_docs)
    return docs, scrapped_urls

def get_text_chunks(text):
    """Split documents into overlapping chunks.

    Uses a ``RecursiveCharacterTextSplitter`` configured with
    ``chunk_size=1000``, ``chunk_overlap=200`` and ``len`` as the length
    function.

    Args:
        text: The documents to split (passed to ``split_documents``).

    Returns:
        The chunks produced by the splitter.
    """
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=1000,
        chunk_overlap=200,
        length_function=len,
    )
    return splitter.split_documents(text)

def get_vectorstore(text_chunks):
    """Embed the chunks and index them in a FAISS vector store.

    Embeddings come from ``HuggingFaceInstructEmbeddings`` with the
    ``BAAI/bge-large-en`` model and ``normalize_embeddings`` enabled.

    Args:
        text_chunks: Documents to embed and index.

    Returns:
        The FAISS vector store built from ``text_chunks``.
    """
    embedding_model = HuggingFaceInstructEmbeddings(
        model_name="BAAI/bge-large-en",
        # no instruction is needed for candidate passages
        embed_instruction="",
        query_instruction="Represent this paragraph for searching relevant passages: ",
        encode_kwargs={"normalize_embeddings": True},
    )
    return FAISS.from_documents(documents=text_chunks, embedding=embedding_model)

def get_custom_prompt():
    """Build the prompt template used for question answering.

    The template wraps the instructions in ``[INST]`` / ``<<SYS>>`` tags and
    exposes two input variables: ``context`` and ``question``.

    Returns:
        PromptTemplate: The template with ``context`` and ``question`` inputs.
    """
    # NOTE(review): the template text is kept exactly as-is (including the
    # unbalanced quote after <<SYS>> and "than"), since editing it would
    # change what is sent to the model.
    template_text = """<s>[INST] <<SYS>>\n"Use the following pieces of context to answer the question at the end. 
        If the answer is not in context for answering, say that you don't know, don't try to make up an answer or provide an answer not extracted from provided context. 
        Cross check if the answer is contained in provided context. If not than say "I do not have information regarding this." 

        context
        {context}
        end of context
        <</SYS>>

        Question: {question}
        Helpful Answer: [/INST]"""

    return PromptTemplate(
        template=template_text,
        input_variables=["context", "question"],
    )

def get_retriever_qa(vectorstore):
    """Create the RetrievalQA chain on top of ``vectorstore``.

    The chain uses a ``SambaNovaEndpoint`` LLM (sampling disabled,
    temperature 0.0) and a ``similarity_score_threshold`` retriever with
    ``score_threshold=0.5`` and ``k=4``. It reads the ``question`` key,
    writes the ``answer`` key and also returns the source documents.

    Args:
        vectorstore: Vector store providing ``as_retriever``.

    Returns:
        The configured ``RetrievalQA`` chain with the custom prompt installed.
    """
    generation_kwargs = {"do_sample": False, "temperature": 0.0}
    llm = SambaNovaEndpoint(model_kwargs=generation_kwargs)

    search_settings = {"score_threshold": 0.5, "k": 4}
    retriever = vectorstore.as_retriever(
        search_type="similarity_score_threshold",
        search_kwargs=search_settings,
    )

    qa_chain = RetrievalQA.from_llm(
        llm=llm,
        retriever=retriever,
        return_source_documents=True,
        input_key="question",
        output_key="answer",
    )
    # Swap the chain's default prompt for the custom one.
    qa_chain.combine_documents_chain.llm_chain.prompt = get_custom_prompt()
    return qa_chain

## Scrape sites

In [10]:
# Social/sharing domains that the crawler must never follow.
filtered_sites = ["facebook.com", "twitter.com", "instagram.com", "linkedin.com", "telegram.me", "reddit.com", "whatsapp.com", "wa.me"]
# Seed pages. Kept under their own name so that re-running this cell does not
# feed the previously crawled URLs back in as seeds.
seed_urls = ["https://www.espn.com", "https://lilianweng.github.io/posts/2023-06-23-agent/", "https://sambanova.ai/"]
# Pass a copy of the exclusion list so the crawl cannot alter filtered_sites.
docs, urls = web_crawl(seed_urls, excluded_links=list(filtered_sites), depth=1)

Fetching pages:   0%|          | 0/1 [00:00<?, ?it/s]

Fetching pages: 100%|##########| 1/1 [00:00<00:00,  1.77it/s]
Fetching pages: 100%|##########| 1/1 [00:00<00:00,  2.94it/s]
Fetching pages: 100%|##########| 1/1 [00:00<00:00,  1.49it/s]


In [11]:
# Display the list of URLs that web_crawl reported as scraped.
urls

['https://www.espn.com',
 'https://lilianweng.github.io/posts/2023-06-23-agent/',
 'https://sambanova.ai/']

## Chunk the text

In [12]:
# Split the crawled documents into chunks and report how many were produced.
text_chunks = get_text_chunks(text=docs)
print(len(text_chunks))

97


## Create a vector store 

In [16]:
# Embed the chunks and index them for similarity search.
vectorstore = get_vectorstore(text_chunks=text_chunks)

load INSTRUCTOR_Transformer
max_seq_length  512


## Initialize the language model and the RetrievalQA chain

In [20]:
# Load environment variables from the env file before the chain is created.
load_dotenv(dotenv_path="../../web_crawled_data_retriever/export.env")
retrieval_chain = get_retriever_qa(vectorstore=vectorstore)

## Ask a question

In [21]:
# Other sample questions for the crawled sites (uncomment one to try it):
# user_question = "which are the mars expeditions?"
# user_question = "what does planning mean in an llm agent?"
# user_question = "which are the games for today?"
# user_question = "what is the SN40?"
user_question = "which kind of memory can an agent have?"
# Use invoke(): calling the chain object directly is the deprecated entry
# point and emits a deprecation warning.
response = retrieval_chain.invoke({"question": user_question})

  warn_deprecated(


In [22]:
# Show only the generated answer, not the returned source documents.
answer_text = response["answer"]
print(f'Response ={answer_text}')

Response = According to the provided context, an agent can have the following types of memory:

* Short-term memory: This type of memory is used for in-context learning and is restricted by the finite context window length of the Transformer.
* Long-term memory: This type of memory is provided by an external vector store that the agent can attend to at query time, accessible via fast retrieval.

Therefore, the answer to the question is: An agent can have
