# Web crawling RAG

In [1]:
import os
import sys

# Derive two ancestor directories from the notebook's working directory:
# kit_dir is its parent and repo_dir is the parent of kit_dir.
current_dir = os.getcwd()
kit_dir = os.path.abspath(os.path.join(current_dir, '..'))
repo_dir = os.path.abspath(os.path.join(kit_dir, '..'))

# Put both directories on the import path — presumably so that the project-local
# `utils` package imported below resolves; confirm against the repository layout.
sys.path.append(kit_dir)
sys.path.append(repo_dir)

from tqdm.autonotebook import trange
import nest_asyncio
from bs4 import BeautifulSoup
from urllib.parse import urljoin, urlparse, urldefrag
from dotenv import load_dotenv
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings import HuggingFaceInstructEmbeddings
from langchain.vectorstores import FAISS
from langchain.chains import RetrievalQA
from langchain.prompts import PromptTemplate
from langchain.document_loaders import AsyncHtmlLoader
from langchain.document_transformers import Html2TextTransformer
from langchain_community.llms.sambanova import SambaStudio
from langchain_community.document_loaders import UnstructuredURLLoader

from utils.model_wrappers.langchain_llms import SambaNovaCloud

# Patch asyncio so event loops can be nested — presumably needed because the notebook
# kernel already runs a loop while AsyncHtmlLoader starts its own; TODO confirm.
nest_asyncio.apply()

  from tqdm.autonotebook import trange
USER_AGENT environment variable not set, consider setting it to identify your requests.


## Functions

In [2]:
def load_remote_pdf(url):
    """
    Fetch one remote PDF and parse it into documents.
    Args:
        url (str): Address of the PDF file to load.
    Returns:
        list: The documents produced by UnstructuredURLLoader for that URL.
    """
    return UnstructuredURLLoader(urls=[url]).load()


def load_htmls(urls, extra_loaders=None):
    """
    Load documents from the given URLs.

    A URL is treated as a PDF when the path component ends in ``.pdf``; the check is
    case-insensitive and ignores any query string or fragment, so ``REPORT.PDF`` and
    ``file.pdf?download=1`` are recognised too. PDFs are parsed with
    ``load_remote_pdf`` only when ``'pdf'`` is listed in ``extra_loaders``; otherwise
    they are skipped. Every other URL is fetched with ``AsyncHtmlLoader``.
    Args:
        urls (iterable of str): URLs to load.
        extra_loaders (list, optional): Names of optional loaders to enable. Only
            'pdf' is checked by this function. Defaults to None (no extra loaders).
    Returns:
        list: The loaded documents, in the order the URLs were visited.
    """
    if extra_loaders is None:
        extra_loaders = []
    docs = []
    for url in urls:
        # Inspect only the path so query strings / fragments cannot hide the extension.
        if urlparse(url).path.lower().endswith('.pdf'):
            if 'pdf' in extra_loaders:
                docs.extend(load_remote_pdf(url))
            # A PDF is never handed to the HTML loader, whether it was parsed or skipped.
            continue
        # NOTE: TLS certificate verification is disabled for these requests.
        loader = AsyncHtmlLoader(url, verify_ssl=False)
        docs.extend(loader.load())
    return docs


def link_filter(all_links, excluded_links):
    """
    Drop every link that contains one of the excluded links.

    Each excluded link is reduced to ``netloc + path`` (scheme, query string and
    fragment are discarded) and a link is removed when that text occurs anywhere
    inside it. Excluded entries that reduce to an empty string (for example ``''``
    or ``'#top'``) are ignored, because the empty string is a substring of every
    link and would otherwise remove all of them.
    Args:
        all_links (Iterable[str]): Links to filter.
        excluded_links (Iterable[str]): Links or bare domains to exclude.
    Returns:
        Set[str]: The links that match none of the exclusions.
    """
    excluded_patterns = set()
    for excluded_link in excluded_links:
        parsed_link = urlparse(excluded_link)
        pattern = parsed_link.netloc + parsed_link.path
        if pattern:
            excluded_patterns.add(pattern)
    return {
        link
        for link in all_links
        if not any(pattern in link for pattern in excluded_patterns)
    }


def find_links(docs, excluded_links=None):
    """
    Collect the crawlable links found in the main content of the given HTML documents.

    For each document only the first ``<main>``, ``<article>`` or
    ``<div class="content">`` element is searched; documents without one contribute
    no links. Each href is resolved against the document's ``source`` metadata and its
    fragment is removed. A link is discarded when it:
      * starts with ``#``, ``data:`` or ``javascript:``;
      * does not resolve to an ``http``/``https`` URL (e.g. ``mailto:`` or ``tel:``),
        since such links cannot be fetched as pages;
      * has a path ending (case-insensitively) in one of the excluded suffixes;
      * matches one of ``excluded_links`` (see ``link_filter``).
    Args:
        docs (list): Documents with HTML in ``page_content`` and the page URL in
            ``metadata['source']``.
        excluded_links (list, optional): Links to exclude from the result. Defaults to None.
    Returns:
        set: The unique links found in the documents.
    """
    if excluded_links is None:
        excluded_links = []
    # A tuple so it can be passed directly to str.endswith.
    excluded_link_suffixes = ('.ico', '.svg', '.jpg', '.png', '.jpeg', '.', '.docx', '.xls', '.xlsx')
    all_links = set()
    for doc in docs:
        base_url = doc.metadata['source']
        soup = BeautifulSoup(doc.page_content, 'html.parser')
        # Identify the main content section (customize based on HTML structure)
        main_content = soup.find('main') or soup.find('article') or soup.find('div', class_='content')
        if not main_content:
            continue
        for link in main_content.find_all('a', href=True):
            href = link['href']
            if href.startswith(('#', 'data:', 'javascript:')):
                continue
            full_url, _ = urldefrag(urljoin(base_url, href))
            parsed_url = urlparse(full_url)
            # Only web pages can be crawled; this drops mailto:, tel:, ftp:, ...
            if parsed_url.scheme not in ('http', 'https'):
                continue
            # Test the path so a query string cannot hide an excluded extension.
            if parsed_url.path.lower().endswith(excluded_link_suffixes):
                continue
            all_links.add(full_url)
    return link_filter(all_links, set(excluded_links))


def clean_docs(docs):
    """
    Convert HTML documents to plain text.
    Args:
        docs (list): Langchain documents whose content is HTML.
    Returns:
        list: The documents returned by Html2TextTransformer.transform_documents.
    """
    return Html2TextTransformer().transform_documents(documents=docs)


def web_crawl(urls, excluded_links=None, depth=1):
    """
    Crawl the given URLs level by level and return the cleaned documents of every level.

    At each level the current URLs are loaded (PDF loading enabled), the links found
    in those pages become the URLs of the next level, and the loaded documents are
    accumulated. After the last level all accumulated documents are converted to
    plain text.
    Args:
        urls (list): URLs to start crawling from.
        excluded_links (list, optional): Links to exclude from crawling. The list is
            copied, so the caller's object is never modified. Defaults to None.
        depth (int, optional): Number of levels to crawl, capped at 3. A value below 1
            crawls nothing and returns two empty lists. Defaults to 1.
    Returns:
        tuple: The cleaned langchain documents (list) and the scrapped URLs (list).
    """
    max_depth = 3
    # Private copy: extending it below must not leak into the caller's list.
    excluded = list(excluded_links) if excluded_links is not None else []
    depth = min(depth, max_depth)
    scrapped_urls = []
    raw_docs = []
    for _ in range(depth):
        scraped_docs = load_htmls(urls, extra_loaders=['pdf'])
        scrapped_urls.extend(urls)
        raw_docs.extend(scraped_docs)
        # Links are discovered with the exclusions known so far; the URLs scraped up
        # to this level are added only afterwards, so they apply from the next level on.
        urls = find_links(scraped_docs, excluded)
        excluded.extend(scrapped_urls)
        if not urls:
            # Nothing left to follow; further levels would load no pages.
            break
    # Clean the documents of every level, not just the last one.
    docs = clean_docs(raw_docs)
    return docs, scrapped_urls


def get_text_chunks(docs):
    """
    Split the given documents into smaller, overlapping chunks.
    Args:
        docs (list): The documents to split.
    Returns:
        list: Documents holding the text chunks (chunk size 1000, overlap 200,
        both measured with len).
    """
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=1000,
        chunk_overlap=200,
        length_function=len,
    )
    return splitter.split_documents(docs)


def get_vectorstore(text_chunks):
    """
    Build a FAISS vector store from a collection of chunked documents.
    The chunks are embedded with the 'BAAI/bge-large-en' model (normalized embeddings)
    and indexed so they can be searched by similarity.
    Args:
        text_chunks (list): The chunked documents to embed and index; they are passed
            to FAISS.from_documents.
    Returns:
        FAISS: A vector store containing the embeddings of the input chunks.
    """
    embedding_model = HuggingFaceInstructEmbeddings(
        model_name='BAAI/bge-large-en',
        embed_instruction='',  # no instruction is needed for candidate passages
        query_instruction='Represent this paragraph for searching relevant passages: ',
        encode_kwargs={'normalize_embeddings': True},
    )
    return FAISS.from_documents(documents=text_chunks, embedding=embedding_model)


def get_custom_prompt():
    """
    Build the prompt template used for context-grounded question answering.
    The template tells the model to answer only from the supplied context and to say
    it does not know otherwise. It exposes two placeholders, ``context`` and
    ``question``, which are filled in when the prompt is rendered.
    Returns:
        PromptTemplate: The prompt template with 'context' and 'question' inputs.
    """
    template_text = """<s>[INST] <<SYS>>\n"Use the following pieces of context to answer the question at the end. 
        If the answer is not in context for answering, say that you don't know, don't try to make up an answer or provide an answer not extracted from provided context. 
        Cross check if the answer is contained in provided context. If not than say "I do not have information regarding this." 

        context
        {context}
        end of context
        <</SYS>>

        Question: {question}
        Helpful Answer: [/INST]"""

    return PromptTemplate(template=template_text, input_variables=['context', 'question'])


def get_retriever_qa(vectorstore):
    """
    Build a RetrievalQA chain on top of the given vector store.
    The chain uses a SambaNovaCloud LLM, retrieves context from the vector store with
    a similarity score threshold, and answers with the prompt from get_custom_prompt.
    Args:
        vectorstore (FAISS): Vector store holding the embedded text chunks used as context.
    Returns:
        RetrievalQA: A memory-less QA chain that reads its input from 'question',
        writes its output to 'answer' and also returns the source documents.
    """

    # SambaNova Cloud LLM
    cloud_llm = SambaNovaCloud(max_tokens=1200, model='llama3-70b')

    # SambaStudio LLM (alternative backend; swap in for cloud_llm if needed)
    # cloud_llm = SambaStudio(
    #    model_kwargs={
    #        'max_tokens': 1200,
    #        'model': 'Meta-Llama-3-70B-Instruct',
    #        'process_prompt': False,
    #    },
    # )

    # Retrieval settings: score threshold 0.5, k = 4.
    search_settings = {'score_threshold': 0.5, 'k': 4}
    doc_retriever = vectorstore.as_retriever(
        search_type='similarity_score_threshold',
        search_kwargs=search_settings,
    )

    qa_chain = RetrievalQA.from_llm(
        llm=cloud_llm,
        retriever=doc_retriever,
        return_source_documents=True,
        input_key='question',
        output_key='answer',
    )
    # Replace the chain's default prompt with the custom one.
    qa_chain.combine_documents_chain.llm_chain.prompt = get_custom_prompt()
    return qa_chain

## Scrape sites

In [3]:
# Domains that must never be followed while crawling.
filtered_sites = [
    'facebook.com',
    'twitter.com',
    'instagram.com',
    'linkedin.com',
    'telegram.me',
    'reddit.com',
    'whatsapp.com',
    'wa.me',
]
# Starting points of the crawl. They keep their own name so that re-running this cell
# always starts from the same seeds instead of from the previously scraped URLs.
seed_urls = ['https://www.espn.com', 'https://lilianweng.github.io/posts/2023-06-23-agent/', 'https://sambanova.ai/']
# A copy of the exclusion list is passed so filtered_sites itself stays unchanged.
# `urls` holds the URLs that were actually scraped.
docs, urls = web_crawl(seed_urls, excluded_links=list(filtered_sites), depth=1)

Fetching pages:   0%|          | 0/1 [00:00<?, ?it/s]

Fetching pages: 100%|##########| 1/1 [00:01<00:00,  1.04s/it]
Fetching pages: 100%|##########| 1/1 [00:00<00:00,  3.06it/s]
Fetching pages: 100%|##########| 1/1 [00:01<00:00,  1.06s/it]


In [4]:
# Show the scraped URLs returned by web_crawl.
urls

['https://www.espn.com',
 'https://lilianweng.github.io/posts/2023-06-23-agent/',
 'https://sambanova.ai/']

## Chunk the text

In [5]:
# Split the crawled documents into chunks and report how many were produced.
text_chunks = get_text_chunks(docs)
print(len(text_chunks))

93


## Create a vector store 

In [6]:
# Embed the chunks and index them in a FAISS vector store.
vectorstore = get_vectorstore(text_chunks)

load INSTRUCTOR_Transformer
max_seq_length  512


## Initialize the language model and the RetrievalQA chain

In [7]:
# Load environment variables from the .env file in repo_dir — presumably the
# credentials the LLM client needs; confirm which variables are expected.
load_dotenv(os.path.join(repo_dir, '.env'))
# Build the RetrievalQA chain on top of the vector store.
retrieval_chain = get_retriever_qa(vectorstore)

## Ask a question

In [9]:
# Alternative example questions; uncomment one to try it instead.
# user_question = "which are the mars expeditions?"
# user_question = "what it means planning in an llm agent"
# user_question = "which are the games for today?"
# user_question = "what is the SN40?"
user_question = 'which kinds of memory can an agent have?'
# 'question' is the input key the chain was configured with in get_retriever_qa.
response = retrieval_chain.invoke({'question': user_question})

In [10]:
# 'answer' is the output key the chain was configured with in get_retriever_qa.
print(f'Response ={response["answer"]}')

Response =According to the provided context, an agent can have the following kinds of memory:

1. **Short-term memory**: This is equivalent to in-context learning, which is short and finite, restricted by the finite context window length of the Transformer.
2. **Long-term memory**: This is an external vector store that the agent can attend to at query time, accessible via fast retrieval.

Additionally, the context also mentions **Sensory memory**, which is equivalent to learning embedding representations for raw inputs, including text, image, or other modalities. However, this is not explicitly stated as a type of memory that an agent can have, but rather as a rough mapping to human memory.
