# Search Assitant

In [11]:
import os
import re
import sys
import yaml
from pprint import pprint

current_dir = os.getcwd()
kit_dir = os.path.abspath(os.path.join(current_dir, ".."))
repo_dir = os.path.abspath(os.path.join(kit_dir, ".."))

sys.path.append(kit_dir)
sys.path.append(repo_dir)

import requests
import json
from dotenv import load_dotenv
from serpapi import GoogleSearch

from langchain.prompts import PromptTemplate, load_prompt
from langchain.pydantic_v1 import BaseModel, Field
from langchain_community.document_loaders import UnstructuredURLLoader
from langchain.document_loaders import AsyncHtmlLoader
from langchain.document_transformers import Html2TextTransformer
from urllib.parse import urljoin, urlparse, urldefrag

from utils.sambanova_endpoint import SambaNovaEndpoint, SambaverseEndpoint

load_dotenv("../../.env")

from langchain.globals import set_debug

set_debug(False)

## Define the LLM

In [30]:
#sambeverse llm
llm = SambaverseEndpoint(
            sambaverse_model_name="Meta/llama-2-70b-chat-hf",
            model_kwargs={
                "do_sample": False, 
                "max_tokens_to_generate": 500,
                "temperature": 0.01,
                "top_p": 1,
                "process_prompt": True,
                "select_expert": "llama-2-70b-chat-hf"
            }
        )

#sambastudio llm
#llm = SambaNovaEndpoint(
#    model_kwargs={"do_sample": False, "temperature": 0.0},
#)

## Search tools

In [132]:
# Only admits Google Search
def querySerper(query: str, limit: int = 5, do_analysis: bool = True ,include_site_links: bool = False):
    """A search engine. Useful for when you need to answer questions about current events. Input should be a search query."""
    url = "https://google.serper.dev/search"
    payload = json.dumps({
        "q": query,
        "num": limit
    })
    headers = {
        'X-API-KEY': os.environ.get("SERPER_API_KEY"),
        'Content-Type': 'application/json'
    }

    response = requests.post(url, headers=headers, data=payload).json()
    results=response["organic"]
    links = [r["link"] for r in results]
    if include_site_links:
        sitelinks = []
        for r in [r.get("sitelinks",[]) for r in results]:
            sitelinks.extend([site.get("link", None) for site in r])
        links.extend(sitelinks)
    links=list(filter(lambda x: x is not None, links))
    
    if do_analysis:
        prompt = load_prompt(os.path.join(kit_dir, "prompts/llama70b-SerperSearchAnalysis.yaml"))
        formatted_prompt = prompt.format(question=query, context=json.dumps(results))
        return llm.invoke(formatted_prompt), links
    else:
        return response, links
   

In [65]:
querySerper("who is the president of America", do_analysis=True)

(" Based on the context provided, the answer to the user's question is:\n\nThe President of America is Joe Biden.",
 ['https://www.whitehouse.gov/about-the-white-house/presidents/',
  'https://www.whitehouse.gov/administration/president-biden/',
  'https://www.instagram.com/potus/?hl=en',
  'https://www.facebook.com/POTUS/',
  'https://www.usa.gov/presidents',
  'https://www.whitehouse.gov/administration/president-biden/',
  'https://www.whitehouse.gov/about-the-white-house/presidents/george-washington/',
  'https://www.whitehouse.gov/about-the-white-house/presidents/george-w-bush/',
  'https://www.whitehouse.gov/about-the-white-house/presidents/abraham-lincoln/',
  'https://www.instagram.com/potus/reel/C4YZoUWOkgw/',
  'https://www.instagram.com/potus/reel/Czm47Afxj2k/',
  'https://www.instagram.com/potus/p/C3_hr8PrzqS/',
  'https://www.instagram.com/potus/p/C4GkkSBOKFM/'])

In [76]:
def queryOpenSerp(query: str, limit: int = 5, do_analysis: bool = True, engine="google") -> str:
    """A search engine. Useful for when you need to answer questions about current events. Input should be a search query."""
    if engine not in ["google","yandex","baidu"]:
        raise ValueError("engine must be either google, yandex or baidu")
    url = f"http://127.0.0.1:7000/{engine}/search"
    params = {
        "lang": "EN",
        "limit": limit,
        "text": query
    }

    results = requests.get(url, params=params).json()
    
    links = [r["url"] for r in results]
    if do_analysis:
        prompt = load_prompt(os.path.join(kit_dir, "prompts/llama70b-OpenSearchAnalysis.yaml"))
        formatted_prompt = prompt.format(question=query, context=json.dumps(results))
        return llm.invoke(formatted_prompt), links
    else:
        return results, links

In [133]:
queryOpenSerp("who is the president of America", do_analysis=True, engine="google")

(' The President of the United States is Joe Biden. He has been in office since January 20, 2021.',
 ['https://en.wikipedia.org/wiki/President_of_the_United_States',
  'https://www.whitehouse.gov/about-the-white-house/presidents/',
  'https://en.wikipedia.org/wiki/List_of_presidents_of_the_United_States',
  'https://www.whitehouse.gov/administration/president-biden/'])

In [19]:
def remove_links(text):
    url_pattern = r'https?://\S+|www\.\S+'
    return re.sub(url_pattern, '', text)

def querySerpapi(query: str, limit: int = 5, do_analysis: bool = True, engine="google") -> str:
    if engine not in ["google", "bing"]:
        raise ValueError("engine must be either google or bing")
    params = {
        "q": query,
        "num": limit,
        "engine":engine,
        "api_key": os.environ.get("SERPAPI_API_KEY")
        }

    search = GoogleSearch(params)
    response= search.get_dict()
    
    knowledge_graph = response.get("knowledge_graph", None)
    results =  response.get("organic_results",None)

    links = []
    links = [r["link"] for r in results]
    
    
    if do_analysis:
        prompt = load_prompt(os.path.join(kit_dir, "prompts/llama70b-SerpapiSearchAnalysis.yaml"))
        if knowledge_graph:
            knowledge_graph_str = json.dumps(knowledge_graph)
            #knowledge_graph = remove_links(knowledge_graph_str)
            print(knowledge_graph)
            formatted_prompt = prompt.format(question=query, context=json.dumps(knowledge_graph))
        else:
            results_str = json.dumps(results)
            results_str = remove_links(results_str)
            formatted_prompt = prompt.format(question=query, context=json.dumps(results))
        return llm.invoke(formatted_prompt), links
    else:
        return response, links
    

In [135]:
pprint(querySerpapi("Who is the president of USA", engine="bing"))

{'type': 'President of the United States', 'header_images': [{'image': 'https://serpapi.com/searches/661695eca3f4ef8a0ffe397c/images/91ad5e92700695dbc94046ed713a5ffb60510981acb9f9524169d31c6274fd4e8ad7a7acdce0a33a.jpeg', 'source': 'https://www.bing.com/images/search?q=joe+biden&cbn=KnowledgeCard&stid=cad484f9-be75-7a78-12dd-16233f823cd7&thid=OSK.HEROCLICKTHROUGHtghu3TmOsKIP8BgOKF9h6gt-VTffvJ1n7xD2jmHV8zs&FORM=KCHIMM'}], 'thumbnails': [{'image': 'https://serpapi.com/searches/661695eca3f4ef8a0ffe397c/images/91ad5e92700695dbc94046ed713a5ffbc0dccc06f43b9b65a22cad543e7c3adcc4776eba038e3889.jpeg', 'source': 'https://www.bing.com/images/search?q=joe+biden&cbn=KnowledgeCard&stid=cad484f9-be75-7a78-12dd-16233f823cd7&thid=OSK.HEROCLICKTHROUGHtghu3TmOsKIP8BgOKF9h6gt-VTffvJ1n7xD2jmHV8zs&FORM=KCHIMM'}], 'title': 'Joe Biden', 'description': 'Joseph Robinette Biden Jr. is an American politician who is the 46th and current president of the United States. A member of the Democratic Party, he previously

In [131]:
pprint(querySerpapi("Who is the president of USA", engine="google"))

{"title": "Joe Biden", "type": "46th U.S. President", "entity_type": "people, athlete, people", "kgmid": "/m/012gx2", "knowledge_graph_search_link": " Biden", "serpapi_knowledge_graph_search_link": " "tabs": [{"text": "All"}, {"text": "Images", "link": " "serpapi_link": " "website": " "description": "Joseph Robinette Biden Jr. is an American politician who is the 46th and current president of the United States. A member of the Democratic Party, he previously served as the 47th vice president from 2009 to 2017 under President Barack Obama and represented Delaware in the United States Senate from 1973 to 2009.", "source": {"name": "Wikipedia", "link": " "born": "November 20, 1942 (age 81 years), Scranton, PA", "born_links": [{"text": "Scranton, PA", "link": " "edited_works": "Dirty Bombs and Basement Nukes - The Terrorist Nuclear Threat - Congressional Hearing", "edited_works_links": [{"text": "Dirty Bombs and Basement Nukes: The Terrorist Nuclear Threat - Congressional Hearing", "link":

# Web scrapaing methods

In [12]:
CONFIG_PATH = os.path.join(kit_dir,"config.yaml")

In [13]:
def get_config_info():
    """
    Loads json config file
    """
    # Read config file
    with open(CONFIG_PATH, 'r') as yaml_file:
        config = yaml.safe_load(yaml_file)
    api_info = config["api"]
    llm_info =  config["llm"]
    retrieval_info = config["retrieval"]
    web_crawling_params = config["web_crawling"]
    extra_loaders = config["extra_loaders"]
    
    
    return api_info, llm_info, retrieval_info, web_crawling_params, extra_loaders

In [14]:
def load_remote_pdf(url):
    """
    Load PDF files from the given URL.
    Args:
        url (str): URL to load pdf document from.
    Returns:
        list: A list of loaded pdf documents.
    """
    loader = UnstructuredURLLoader(urls=[url])
    docs = loader.load()
    return docs

In [15]:
def load_htmls(urls, extra_loaders=None):
    """
    Load HTML documents from the given URLs.
    Args:
        urls (list): A list of URLs to load HTML documents from.
    Returns:
        list: A list of loaded HTML documents.
    """
    if extra_loaders is None:
        extra_loaders = []
    docs=[]
    for url in urls:
        if url.endswith(".pdf"):
            if "pdf" in extra_loaders:
                docs.extend(load_remote_pdf(url))
            else:
                continue
        else:
            loader = AsyncHtmlLoader(url, verify_ssl=False)
            docs.extend(loader.load())
    return docs

In [16]:
def link_filter(all_links, excluded_links):
    """
    Filters a list of links based on a list of excluded links.
    Args:
        all_links (List[str]): A list of links to filter.
        excluded_links (List[str]): A list of excluded links.
    Returns:
        Set[str]: A list of filtered links.
    """
    clean_excluded_links=set()
    for excluded_link in excluded_links:
        parsed_link=urlparse(excluded_link)
        clean_excluded_links.add(parsed_link.netloc + parsed_link.path)
    filtered_links = set()
    for link in all_links:
        # Check if the link contains any of the excluded links
        if not any(excluded_link in link for excluded_link in clean_excluded_links):
            filtered_links.add(link)
    return filtered_links

In [17]:
def clean_docs(docs):
    """
    Clean the given HTML documents by transforming them into plain text.
    Args:
        docs (list): A list of langchain documents with html content to clean.
    Returns:
        list: A list of cleaned plain text documents.
    """
    html2text_transformer = Html2TextTransformer()
    docs=html2text_transformer.transform_documents(documents=docs)
    return docs

In [21]:
def web_crawl(urls, excluded_links=None):
    """
    Perform web crawling, retrieve and clean HTML documents from the given URLs, with specified depth of exploration.
    Args:
        urls (list): A list of URLs to crawl.
        excluded_links (list, optional): A list of links to exclude from crawling. Defaults to None.
        depth (int, optional): The depth of crawling, determining how many layers of internal links to explore. Defaults to 1
    Returns:
        tuple: A tuple containing the langchain documents (list) and the scrapped URLs (list).
    """
    *_, web_crawling_params, extra_loaders = get_config_info()
    if excluded_links is None:
        excluded_links = []
    excluded_links.extend(["facebook.com", "twitter.com", "instagram.com", "linkedin.com", "telagram.me", "reddit.com", "whatsapp.com", "wa.me"])
    excluded_link_suffixes = {".ico", ".svg", ".jpg", ".png", ".jpeg", ".", ".docx", ".xls", ".xlsx"}
    scrapped_urls=[]
    
    urls = [url for url in urls if not url.endswith(tuple(excluded_link_suffixes))]
    urls = link_filter(urls, set(excluded_links)) 
    urls = list(urls)[:web_crawling_params["max_scraped_websites"]]   
        
    scraped_docs = load_htmls(urls, extra_loaders)
    scrapped_urls.append(urls)
        
    docs=clean_docs(scraped_docs)
    return docs, scrapped_urls

In [23]:
_,links=querySerpapi("Who is the president of USA", engine="google", do_analysis=False)
web_crawl(links)

Fetching pages: 100%|##########| 1/1 [00:01<00:00,  1.17s/it]
Fetching pages: 100%|##########| 1/1 [00:00<00:00,  2.30it/s]
Fetching pages: 100%|##########| 1/1 [00:00<00:00,  2.96it/s]


([Document(page_content='We’re sorry, this site is currently experiencing technical difficulties.  \nPlease try again in a few moments. Exception: request blocked\n\n', metadata={'source': 'https://usun.usmission.gov/our-leaders/the-president-of-the-united-states/', 'title': 'Technical Difficulties', 'language': 'en'}),
  Document(page_content='Skip to content\n\nThe White House\n\nThe White House\n\nThe White House\n\n  * Home \n\n  * Administration\n  * Priorities\n  * The Record\n  * Briefing Room\n  * Español\n\n  * InstagramOpens in a new window\n  * FacebookOpens in a new window\n  * XOpens in a new window\n  * YouTubeOpens in a new window\n\n  * Contact Us\n  * Privacy Policy\n  * Copyright Policy\n  * Accessibility Statement\n\nMenu Close\n\nTo search this site, enter a search term Search\n\n## Mobile Menu Overlay\n\n  * Administration Show submenu for "Administration"”\n    * President Joe Biden\n    * Vice President Kamala Harris\n    * First Lady Dr. Jill Biden\n    * Second