# Search Assistant

In [None]:
import os
import re
import sys
import yaml
from pprint import pprint

# Resolve the kit directory (parent of the current working directory) and the
# repository root (parent of the kit directory).
current_dir = os.getcwd()
kit_dir = os.path.abspath(os.path.join(current_dir, ".."))
repo_dir = os.path.abspath(os.path.join(kit_dir, ".."))

# Add both directories to the import path; presumably this is what lets the
# project-local `utils` and `vectordb` packages be imported below - verify.
sys.path.append(kit_dir)
sys.path.append(repo_dir)

import requests
import json
from dotenv import load_dotenv
from serpapi import GoogleSearch

from langchain.prompts import PromptTemplate, load_prompt
from langchain.pydantic_v1 import BaseModel, Field
from langchain_community.document_loaders import UnstructuredURLLoader
from langchain.document_loaders import AsyncHtmlLoader
from langchain.document_transformers import Html2TextTransformer
from urllib.parse import urljoin, urlparse, urldefrag

from utils.sambanova_endpoint import SambaNovaEndpoint, SambaverseEndpoint

load_dotenv("../../.env")

from langchain.globals import set_debug

set_debug(False)

## Define the LLM

In [None]:
# Sambaverse LLM: the module-level `llm` object used by the search and
# retrieval functions in this notebook.
llm = SambaverseEndpoint(
            sambaverse_model_name="Meta/llama-2-70b-chat-hf",
            model_kwargs={
                "do_sample": False, 
                "max_tokens_to_generate": 500,
                "temperature": 0.01,
                "top_p": 1,
                "process_prompt": True,
                "select_expert": "llama-2-70b-chat-hf"
            }
        )

#sambastudio llm
#llm = SambaNovaEndpoint(
#    model_kwargs={"do_sample": False, "temperature": 0.0},
#)

## Search tools

In [None]:
# Only admits Google Search
def querySerper(query: str, limit: int = 5, do_analysis: bool = True, include_site_links: bool = False):
    """A search engine. Useful for when you need to answer questions about current events. Input should be a search query.

    Sends the query to the Serper Google Search endpoint and collects the
    links of the organic results.

    Args:
        query (str): The search query.
        limit (int): Number of results requested from Serper (sent as "num").
        do_analysis (bool): When True, the organic results are passed to the
            module-level ``llm`` through the Serper analysis prompt and the
            LLM answer is returned. When False, the raw JSON response is returned.
        include_site_links (bool): When True, the "sitelinks" of each organic
            result are appended to the returned links.

    Returns:
        tuple: ``(answer_or_raw_response, links)`` where ``links`` is a list of URLs.

    Raises:
        requests.HTTPError: If Serper answers with an error status code
            (for example when the API key is missing or invalid).
    """
    url = "https://google.serper.dev/search"
    payload = json.dumps({
        "q": query,
        "num": limit
    })
    headers = {
        'X-API-KEY': os.environ.get("SERPER_API_KEY"),
        'Content-Type': 'application/json'
    }

    # A timeout prevents the call from hanging forever on a stalled connection.
    http_response = requests.post(url, headers=headers, data=payload, timeout=30)
    # Fail with a clear HTTP error instead of a KeyError on an error payload.
    http_response.raise_for_status()
    response = http_response.json()

    # "organic" may be absent when the search yields no organic results.
    results = response.get("organic", [])
    links = [r.get("link") for r in results]
    if include_site_links:
        for sitelinks in (r.get("sitelinks", []) for r in results):
            links.extend(site.get("link") for site in sitelinks)
    # Drop entries that carried no "link" field.
    links = [link for link in links if link is not None]

    if do_analysis:
        prompt = load_prompt(os.path.join(kit_dir, "prompts/llama70b-SerperSearchAnalysis.yaml"))
        formatted_prompt = prompt.format(question=query, context=json.dumps(results))
        return llm.invoke(formatted_prompt), links
    return response, links
   

In [None]:
# Example: run a Serper search and let the LLM analyse the organic results.
querySerper("who is the president of America", do_analysis=True)

In [None]:
def queryOpenSerp(query: str, limit: int = 5, do_analysis: bool = True, engine="google") -> tuple:
    """A search engine. Useful for when you need to answer questions about current events. Input should be a search query.

    Queries an OpenSERP server expected at ``http://127.0.0.1:7000`` and
    collects the URLs of the returned results.

    Args:
        query (str): The search query.
        limit (int): Maximum number of results requested.
        do_analysis (bool): When True, the results are passed to the
            module-level ``llm`` through the OpenSearch analysis prompt and the
            LLM answer is returned. When False, the raw results are returned.
        engine (str): One of "google", "yandex" or "baidu".

    Returns:
        tuple: ``(answer_or_raw_results, links)`` where ``links`` is a list of URLs.

    Raises:
        ValueError: If ``engine`` is not one of the supported engines.
        requests.HTTPError: If the server answers with an error status code.
    """
    if engine not in ["google", "yandex", "baidu"]:
        raise ValueError("engine must be either google, yandex or baidu")
    url = f"http://127.0.0.1:7000/{engine}/search"
    params = {
        "lang": "EN",
        "limit": limit,
        "text": query
    }

    # A timeout prevents the call from hanging forever if the local server stalls.
    http_response = requests.get(url, params=params, timeout=30)
    http_response.raise_for_status()
    # Treat a null/empty JSON body as "no results" instead of failing on iteration.
    results = http_response.json() or []

    # Skip entries that carry no URL rather than raising KeyError.
    links = [r["url"] for r in results if r.get("url")]
    if do_analysis:
        prompt = load_prompt(os.path.join(kit_dir, "prompts/llama70b-OpenSearchAnalysis.yaml"))
        formatted_prompt = prompt.format(question=query, context=json.dumps(results))
        return llm.invoke(formatted_prompt), links
    return results, links

In [None]:
# Example: requires an OpenSERP server listening on http://127.0.0.1:7000.
queryOpenSerp("who is the president of America", do_analysis=True, engine="google")

In [None]:
def remove_links(text):
    """
    Strip URLs from a piece of text.
    Args:
        text (str): Text that may contain "http://", "https://" or "www." links.
    Returns:
        str: The text with every matched link replaced by an empty string.
    """
    # A link runs from its scheme / "www." prefix up to the next whitespace.
    link_regex = re.compile(r'https?://\S+|www\.\S+')
    return link_regex.sub('', text)

def querySerpapi(query: str, limit: int = 5, do_analysis: bool = True, engine="google") -> tuple:
    """
    Search through SerpApi and optionally let the LLM analyse the results.

    Args:
        query (str): The search query.
        limit (int): Number of results requested (sent as "num").
        do_analysis (bool): When True, the knowledge graph (if present) or
            otherwise the organic results are stripped of links and passed to
            the module-level ``llm`` through the SerpApi analysis prompt. When
            False, the raw SerpApi response is returned.
        engine (str): Either "google" or "bing".

    Returns:
        tuple: ``(answer_or_raw_response, links)`` where ``links`` is the list
        of organic result URLs.

    Raises:
        ValueError: If ``engine`` is not "google" or "bing".
    """
    if engine not in ["google", "bing"]:
        raise ValueError("engine must be either google or bing")
    params = {
        "q": query,
        "num": limit,
        "engine": engine,
        "api_key": os.environ.get("SERPAPI_API_KEY")
    }

    response = GoogleSearch(params).get_dict()

    knowledge_graph = response.get("knowledge_graph")
    # "organic_results" can be missing (e.g. on an error response); treat that as no results.
    results = response.get("organic_results") or []
    links = [r["link"] for r in results if r.get("link")]

    if not do_analysis:
        return response, links

    prompt = load_prompt(os.path.join(kit_dir, "prompts/llama70b-SerpapiSearchAnalysis.yaml"))
    # Prefer the knowledge graph as context when the response has one.
    context_source = knowledge_graph if knowledge_graph else results
    # Serialize once, then strip links, so the prompt receives the cleaned text
    # (not a re-encoded string, and not the raw results with links still in them).
    context = remove_links(json.dumps(context_source))
    formatted_prompt = prompt.format(question=query, context=context)
    return llm.invoke(formatted_prompt), links
    

In [None]:
# Example: SerpApi search with the Bing engine.
# NOTE(review): querySerpapi always goes through serpapi.GoogleSearch and sends "num";
# confirm both are honored when engine="bing".
pprint(querySerpapi("Who is the president of USA", engine="bing"))

In [None]:
# Example: SerpApi search with the Google engine.
pprint(querySerpapi("Who is the president of USA", engine="google"))

# Web scraping methods

In [None]:
# Path of the kit's YAML configuration file, read by get_config_info().
CONFIG_PATH = os.path.join(kit_dir,"config.yaml")

In [None]:
def get_config_info():
    """
    Read the YAML config file at CONFIG_PATH and return its main sections.
    Returns:
        tuple: The "api", "llm", "retrieval", "web_crawling" and
        "extra_loaders" sections of the config, in that order.
    """
    with open(CONFIG_PATH, 'r') as config_file:
        settings = yaml.safe_load(config_file)

    # The order of the names defines the order of the returned tuple.
    section_names = ("api", "llm", "retrieval", "web_crawling", "extra_loaders")
    return tuple(settings[name] for name in section_names)

In [None]:
def load_remote_pdf(url):
    """
    Fetch a remote PDF and load it as documents.
    Args:
        url (str): Address of the PDF document.
    Returns:
        list: The documents produced by UnstructuredURLLoader for that URL.
    """
    return UnstructuredURLLoader(urls=[url]).load()

In [None]:
def load_htmls(urls, extra_loaders=None):
    """
    Download the documents behind a collection of URLs.
    Args:
        urls (list): URLs to load.
        extra_loaders (list, optional): Names of additional loaders to enable.
            URLs ending in ".pdf" are only loaded when "pdf" is listed here;
            otherwise they are skipped. Defaults to None (no extra loaders).
    Returns:
        list: The loaded documents, in the order the URLs were processed.
    """
    enabled_loaders = extra_loaders if extra_loaders is not None else []
    loaded = []
    for address in urls:
        if not address.endswith(".pdf"):
            # Regular page: fetched as HTML, without SSL verification.
            loaded += AsyncHtmlLoader(address, verify_ssl=False).load()
        elif "pdf" in enabled_loaders:
            loaded += load_remote_pdf(address)
        # PDFs are silently skipped when the pdf loader is not enabled.
    return loaded

In [None]:
def link_filter(all_links, excluded_links):
    """
    Keep only the links that do not match any excluded link.
    Args:
        all_links (List[str]): Candidate links.
        excluded_links (List[str]): Links (or bare domains) to exclude.
    Returns:
        Set[str]: The candidate links that contain none of the excluded entries.
    """
    # Reduce every excluded entry to "host + path" so the scheme, query and
    # fragment do not take part in the match.
    blocked_fragments = {
        parts.netloc + parts.path for parts in map(urlparse, excluded_links)
    }
    # A link is rejected as soon as one blocked fragment occurs inside it.
    return {
        candidate
        for candidate in all_links
        if not any(fragment in candidate for fragment in blocked_fragments)
    }

In [None]:
def clean_docs(docs):
    """
    Convert HTML documents to plain text.
    Args:
        docs (list): Langchain documents whose content is HTML.
    Returns:
        list: The documents as returned by Html2TextTransformer.
    """
    return Html2TextTransformer().transform_documents(documents=docs)

In [None]:
def web_crawl(urls, excluded_links=None):
    """
    Retrieve and clean the documents behind the given URLs.

    URLs with an excluded suffix or matching an excluded link are dropped, the
    remaining ones are de-duplicated (keeping their input order) and truncated
    to the "max_scraped_websites" value of the web crawling config.

    Args:
        urls (list): A list of URLs to crawl.
        excluded_links (list, optional): Extra links to exclude from crawling,
            on top of the built-in social media exclusions. The list passed in
            is not modified. Defaults to None.
    Returns:
        tuple: The cleaned langchain documents (list) and the scrapped URLs,
        given as a list that contains one list with the URLs that were loaded.
    """
    *_, web_crawling_params, extra_loaders = get_config_info()

    default_excluded_links = [
        "facebook.com", "twitter.com", "instagram.com", "linkedin.com",
        "telegram.me", "reddit.com", "whatsapp.com", "wa.me",
    ]
    # Build a new list so the caller's list is never mutated.
    all_excluded_links = list(excluded_links or []) + default_excluded_links
    excluded_link_suffixes = (".ico", ".svg", ".jpg", ".png", ".jpeg", ".", ".docx", ".xls", ".xlsx")

    candidates = [url for url in urls if not url.endswith(excluded_link_suffixes)]
    allowed = link_filter(candidates, set(all_excluded_links))
    # link_filter returns a set; walk the original sequence so the input order
    # (e.g. search ranking) decides which URLs survive the truncation below.
    urls = [url for url in dict.fromkeys(candidates) if url in allowed]
    urls = urls[:web_crawling_params["max_scraped_websites"]]

    scraped_docs = load_htmls(urls, extra_loaders)
    # Kept as a nested list to preserve the shape existing callers receive.
    scrapped_urls = [urls]

    docs = clean_docs(scraped_docs)
    return docs, scrapped_urls

In [None]:
# Example: fetch result links from SerpApi without LLM analysis, then crawl them.
_,links=querySerpapi("Who is the president of USA", engine="google", do_analysis=False)
# `links` is rebound here to the scrapped URLs returned by web_crawl.
docs, links = web_crawl(links)

# Retrieval and vector database creation

In [None]:
from vectordb.vector_db import VectorDb
# Shared VectorDb helper used by the functions below for embeddings, chunking and store handling.
vectordb=VectorDb()
# NOTE(review): "NoneDirectory" looks like a placeholder for "no persisted store" - confirm.
config={"persist_directory":"NoneDirectory"}
# Documents produced by the web crawl cell; read as a global by the functions below.
documents =  docs
def create_load_vector_store(force_reload: bool = False, update: bool = False):
    """
    Load, update or create the vector store for the module-level ``documents``.

    The directory comes from ``config["persist_directory"]``:
      * directory exists and ``update`` is set: the documents are chunked and
        the store is updated with them;
      * directory exists, no ``update`` and no ``force_reload``: the store is
        loaded as is;
      * otherwise: a new store is created from the chunked documents, without
        a persist directory.

    Args:
        force_reload (bool): Build a new store even if the directory exists.
        update (bool): Update the existing store with the current documents.
    Returns:
        The vector store object returned by the VectorDb helper.
    """
    *_, retrieval_info, _, _ = get_config_info()
    persist_directory = config.get("persist_directory", "NoneDirectory")
    embeddings = vectordb.load_embedding_model()

    def split_documents():
        # Chunk the global documents with the configured size and overlap.
        return vectordb.get_text_chunks(
            documents, retrieval_info["chunk_size"], retrieval_info["chunk_overlap"]
        )

    store_on_disk = os.path.exists(persist_directory)

    if store_on_disk and update:
        chunks = split_documents()
        # The loaded store is not used further: the update result supersedes it.
        vectordb.load_vdb(persist_directory, embeddings, db_type=retrieval_info["db_type"])
        return vectordb.update_vdb(chunks, embeddings, retrieval_info["db_type"], persist_directory)

    if store_on_disk and not force_reload:
        return vectordb.load_vdb(persist_directory, embeddings, db_type=retrieval_info["db_type"])

    chunks = split_documents()
    return vectordb.create_vector_store(chunks, embeddings, retrieval_info["db_type"], None)
    
def create_and_save_local(self, input_directory, persist_directory, update=False):
    """
    Chunk the module-level ``documents`` and create or update a vector store.

    Args:
        self: Not used by the body.
        input_directory: Passed to ``update_vdb`` when ``update`` is True.
        persist_directory: Passed to ``create_vector_store`` or ``update_vdb``.
        update (bool): When True, ``config["update"]`` is set to True and the
            store is updated; otherwise a new store is created.
    Returns:
        The vector store object returned by the VectorDb helper.
    """
    *_, retrieval_info, _, _ = get_config_info()

    chunks = vectordb.get_text_chunks(
        documents, retrieval_info["chunk_size"], retrieval_info["chunk_overlap"]
    )
    embeddings = vectordb.load_embedding_model()

    if not update:
        return vectordb.create_vector_store(
            chunks, embeddings, retrieval_info["db_type"], persist_directory
        )

    # Record the update in the module-level config before updating the store.
    config["update"] = True
    return vectordb.update_vdb(
        chunks, embeddings, retrieval_info["db_type"], input_directory, persist_directory
    )

# Build (or load) the vector store with the default flags; used by retrieval_qa_chain().
vector_store=create_load_vector_store()

In [None]:
from langchain.chains import RetrievalQA

def retrieval_qa_chain():
    """
    Build a RetrievalQA chain over the module-level ``vector_store``.

    The retriever uses a similarity score threshold search configured from the
    "retrieval" section of the config, and the chain answers with the
    module-level ``llm`` using the web scraped data retriever prompt.

    Returns:
        The RetrievalQA chain; it takes its input under "question" and returns
        the answer under "answer" together with the source documents.
    """
    *_, retrieval_info, _, _ = get_config_info()

    prompt_path = os.path.join(kit_dir,"prompts/llama7b-web_scraped_data_retriever.yaml")
    qa_prompt = load_prompt(prompt_path)

    # Config key names are used exactly as spelled in the config file.
    search_settings = {
        "score_threshold": retrieval_info["score_treshold"],
        "k": retrieval_info["k_retrieved_documents"],
    }
    doc_retriever = vector_store.as_retriever(
        search_type="similarity_score_threshold",
        search_kwargs=search_settings,
    )

    return RetrievalQA.from_llm(
        llm=llm,
        retriever=doc_retriever,
        return_source_documents=True,
        verbose=True,
        input_key="question",
        output_key="answer",
        prompt=qa_prompt,
    )

In [None]:
# Instantiate the retrieval QA chain once so it can be reused across questions.
chain = retrieval_qa_chain() 

In [None]:
# Example: ask a question against the crawled documents.
chain.invoke("who is joe biden")