# Search Assitant

In [123]:
import os
import re
import sys
from pprint import pprint

current_dir = os.getcwd()
kit_dir = os.path.abspath(os.path.join(current_dir, ".."))
repo_dir = os.path.abspath(os.path.join(kit_dir, ".."))

sys.path.append(kit_dir)
sys.path.append(repo_dir)

import requests
import json
from dotenv import load_dotenv
from serpapi import GoogleSearch

from langchain.prompts import PromptTemplate, load_prompt
from langchain.pydantic_v1 import BaseModel, Field

from utils.sambanova_endpoint import SambaNovaEndpoint, SambaverseEndpoint

load_dotenv("../../.env")

from langchain.globals import set_debug

set_debug(False)

## Define the LLM

In [30]:
#sambeverse llm
llm = SambaverseEndpoint(
            sambaverse_model_name="Meta/llama-2-70b-chat-hf",
            model_kwargs={
                "do_sample": False, 
                "max_tokens_to_generate": 500,
                "temperature": 0.01,
                "top_p": 1,
                "process_prompt": True,
                "select_expert": "llama-2-70b-chat-hf"
            }
        )

#sambastudio llm
#llm = SambaNovaEndpoint(
#    model_kwargs={"do_sample": False, "temperature": 0.0},
#)

## Search tools

In [132]:
# Only admits Google Search
def querySerper(query: str, limit: int = 5, do_analysis: bool = True ,include_site_links: bool = False):
    """A search engine. Useful for when you need to answer questions about current events. Input should be a search query."""
    url = "https://google.serper.dev/search"
    payload = json.dumps({
        "q": query,
        "num": limit
    })
    headers = {
        'X-API-KEY': os.environ.get("SERPER_API_KEY"),
        'Content-Type': 'application/json'
    }

    response = requests.post(url, headers=headers, data=payload).json()
    results=response["organic"]
    links = [r["link"] for r in results]
    if include_site_links:
        sitelinks = []
        for r in [r.get("sitelinks",[]) for r in results]:
            sitelinks.extend([site.get("link", None) for site in r])
        links.extend(sitelinks)
    links=list(filter(lambda x: x is not None, links))
    
    if do_analysis:
        prompt = load_prompt(os.path.join(kit_dir, "prompts/llama70b-SerperSearchAnalysis.yaml"))
        formatted_prompt = prompt.format(question=query, context=json.dumps(results))
        return llm.invoke(formatted_prompt), links
    else:
        return response, links
   

In [65]:
querySerper("who is the president of America", do_analysis=True)

(" Based on the context provided, the answer to the user's question is:\n\nThe President of America is Joe Biden.",
 ['https://www.whitehouse.gov/about-the-white-house/presidents/',
  'https://www.whitehouse.gov/administration/president-biden/',
  'https://www.instagram.com/potus/?hl=en',
  'https://www.facebook.com/POTUS/',
  'https://www.usa.gov/presidents',
  'https://www.whitehouse.gov/administration/president-biden/',
  'https://www.whitehouse.gov/about-the-white-house/presidents/george-washington/',
  'https://www.whitehouse.gov/about-the-white-house/presidents/george-w-bush/',
  'https://www.whitehouse.gov/about-the-white-house/presidents/abraham-lincoln/',
  'https://www.instagram.com/potus/reel/C4YZoUWOkgw/',
  'https://www.instagram.com/potus/reel/Czm47Afxj2k/',
  'https://www.instagram.com/potus/p/C3_hr8PrzqS/',
  'https://www.instagram.com/potus/p/C4GkkSBOKFM/'])

In [76]:
def queryOpenSerp(query: str, limit: int = 5, do_analysis: bool = True, engine="google") -> str:
    """A search engine. Useful for when you need to answer questions about current events. Input should be a search query."""
    if engine not in ["google","yandex","baidu"]:
        raise ValueError("engine must be either google, yandex or baidu")
    url = f"http://127.0.0.1:7000/{engine}/search"
    params = {
        "lang": "EN",
        "limit": limit,
        "text": query
    }

    results = requests.get(url, params=params).json()
    
    links = [r["url"] for r in results]
    if do_analysis:
        prompt = load_prompt(os.path.join(kit_dir, "prompts/llama70b-OpenSearchAnalysis.yaml"))
        formatted_prompt = prompt.format(question=query, context=json.dumps(results))
        return llm.invoke(formatted_prompt), links
    else:
        return results, links

In [79]:
queryOpenSerp("who is the president of America", do_analysis=True, engine="google")

(' The President of the United States is Joe Biden. He has been in office since January 20, 2021.',
 ['https://en.wikipedia.org/wiki/President_of_the_United_States',
  'https://www.whitehouse.gov/about-the-white-house/presidents/',
  'https://en.wikipedia.org/wiki/List_of_presidents_of_the_United_States',
  'https://www.instagram.com/potus/?hl=en'])

In [129]:
def remove_links(text):
    url_pattern = r'https?://\S+|www\.\S+'
    return re.sub(url_pattern, '', text)

def querySerpapi(query: str, limit: int = 5, do_analysis: bool = True, engine="google") -> str:
    if engine not in ["google", "bing"]:
        raise ValueError("engine must be either google or bing")
    params = {
        "q": query,
        "num": limit,
        "engine":engine,
        "api_key": os.environ.get("SERPAPI_API_KEY")
        }

    search = GoogleSearch(params)
    response= search.get_dict()
    
    knowledge_graph = response.get("knowledge_graph", None)
    results =  response.get("organic_results",None)

    links = []
    links = [r["link"] for r in results]
    
    
    if do_analysis:
        prompt = load_prompt(os.path.join(kit_dir, "prompts/llama70b-SerpapiSearchAnalysis.yaml"))
        if knowledge_graph:
            knowledge_graph_str = json.dumps(knowledge_graph)
            knowledge_graph = remove_links(knowledge_graph_str)
            print(knowledge_graph)
            formatted_prompt = prompt.format(question=query, context=json.dumps(knowledge_graph))
        else:
            print(results)
            results_str = json.dumps(results)
            results_str = remove_links(results_str)
            formatted_prompt = prompt.format(question=query, context=json.dumps(results))
        return llm.invoke(formatted_prompt), links
    else:
        return response, links
    

In [130]:
pprint(querySerpapi("Who is the president of USA", engine="bing"))

{"type": "President of the United States", "header_images": [{"image": " "source": " "thumbnails": [{"image": " "source": " "title": "Joe Biden", "description": "Joseph Robinette Biden Jr. is an American politician who is the 46th and current president of the United States. A member of the Democratic Party, he previously served as the 47th vice preside\u2026New content will be added above the current area of focus upon selectionJoseph Robinette Biden Jr. is an American politician who is the 46th and current president of the United States. A member of the Democratic Party, he previously served as the 47th vice president from 2009 to 2017 under President Barack Obama and represented Delaware in the United States Senate from 1973 to 2009.Wikipediajoebiden.com", "quote": {"title": "Failure at some point in your life is inevitable, but giving up is unforgivable.", "link": " "facts": [{"title": "He served on the Senate\u2019s Foreign Relations Committee, twice as its chair (2001\u201303; 200

In [131]:
pprint(querySerpapi("Who is the president of USA", engine="google"))

{"title": "Joe Biden", "type": "46th U.S. President", "entity_type": "people, athlete, people", "kgmid": "/m/012gx2", "knowledge_graph_search_link": " Biden", "serpapi_knowledge_graph_search_link": " "tabs": [{"text": "All"}, {"text": "Images", "link": " "serpapi_link": " "website": " "description": "Joseph Robinette Biden Jr. is an American politician who is the 46th and current president of the United States. A member of the Democratic Party, he previously served as the 47th vice president from 2009 to 2017 under President Barack Obama and represented Delaware in the United States Senate from 1973 to 2009.", "source": {"name": "Wikipedia", "link": " "born": "November 20, 1942 (age 81 years), Scranton, PA", "born_links": [{"text": "Scranton, PA", "link": " "edited_works": "Dirty Bombs and Basement Nukes - The Terrorist Nuclear Threat - Congressional Hearing", "edited_works_links": [{"text": "Dirty Bombs and Basement Nukes: The Terrorist Nuclear Threat - Congressional Hearing", "link":