# Search Assitant

In [7]:
import os
import re
import sys
import yaml
from pprint import pprint

import requests
import json
from dotenv import load_dotenv
from serpapi import GoogleSearch

from langchain_classic.prompts import PromptTemplate, load_prompt
from langchain_community.document_loaders import UnstructuredURLLoader
from langchain_classic.document_loaders import AsyncHtmlLoader
from langchain_classic.document_transformers import Html2TextTransformer
from langchain_classic.prompts import ChatPromptTemplate
from urllib.parse import urljoin, urlparse, urldefrag
from langchain_classic.embeddings import HuggingFaceInstructEmbeddings

current_dir = os.getcwd()
kit_dir = os.path.abspath(os.path.join(current_dir, '..'))
repo_dir = os.path.abspath(os.path.join(kit_dir, '..'))

sys.path.append(kit_dir)
sys.path.append(repo_dir)

from langchain_sambanova import ChatSambaNova

load_dotenv(os.path.join(repo_dir, '.env'), override=True)

from langchain_classic.globals import set_debug

set_debug(False)

## Define the LLM

In [8]:
llm = ChatSambaNova(
    api_key=os.environ.get("SAMBANOVA_API_KEY"),
    base_url="https://api.sambanova.ai/v1",
    model="gpt-oss-120b"
)

## Search tools

In [9]:
def load_chat_prompt(path: str) -> ChatPromptTemplate:
    """Load chat prompt from yaml file"""

    with open(path, 'r') as file:
        config = yaml.safe_load(file)

    config.pop('_type')

    template = config.pop('template')

    if not template:
        msg = "Can't load chat prompt without template"
        raise ValueError(msg)

    messages = []
    if isinstance(template, str):
        messages.append(('human', template))

    elif isinstance(template, list):
        for item in template:
            messages.append((item['role'], item['content']))

    return ChatPromptTemplate(messages=messages, **config)

In [10]:
# Only admits Google Search
def querySerper(query: str, limit: int = 5, do_analysis: bool = True, include_site_links: bool = False):
    """A search engine. Useful for when you need to answer questions about current events. Input should be a search query."""
    url = 'https://google.serper.dev/search'
    payload = json.dumps({'q': query, 'num': limit})
    headers = {'X-API-KEY': os.environ.get('SERPER_API_KEY'), 'Content-Type': 'application/json'}

    response = requests.post(url, headers=headers, data=payload).json()
    results = response['organic']
    links = [r['link'] for r in results]
    if include_site_links:
        sitelinks = []
        for r in [r.get('sitelinks', []) for r in results]:
            sitelinks.extend([site.get('link', None) for site in r])
        links.extend(sitelinks)
    links = list(filter(lambda x: x is not None, links))

    if do_analysis:
        prompt = load_chat_prompt(os.path.join(kit_dir, 'prompts/serp_analysis.yaml'))
        formatted_prompt = prompt.format(question=query, context=json.dumps(results))
        return llm.invoke(formatted_prompt), links
    else:
        return response, links

In [11]:
querySerper('who is the president of America', do_analysis=True)

(AIMessage(content='Donald Trump is the current President of the United States (America)【reference:1】.', additional_kwargs={'reasoning_content': 'We need to answer using the provided contexts. The question: "who is the president of America". The contexts include Wikipedia snippet: "Donald Trump is the 47th and current president since January 20, 2025." Also other sources mention Donald J. Trump as president. So answer: Donald Trump is the current President of the United States (America). Cite reference. Use [reference:1] for Wikipedia snippet. Also could cite [reference:2] etc. Provide concise answer.\n\n'}, response_metadata={'token_usage': {'acceptance_rate': None, 'completion_tokens': 128, 'completion_tokens_after_first_per_sec': 530.3008601465435, 'completion_tokens_after_first_per_sec_first_ten': 532.2428300934738, 'completion_tokens_after_first_per_sec_graph': 532.2428300934738, 'completion_tokens_per_sec': 433.99254517396616, 'end_time': 1765325895.6541054, 'is_last_response': T

In [13]:
def queryOpenSerp(query: str, limit: int = 5, do_analysis: bool = True, engine='google') -> str:
    """A search engine. Useful for when you need to answer questions about current events. Input should be a search query."""
    if engine not in ['google', 'yandex', 'baidu']:
        raise ValueError('engine must be either google, yandex or baidu')
    url = f'http://127.0.0.1:7000/{engine}/search'
    params = {'lang': 'EN', 'limit': limit, 'text': query}

    results = requests.get(url, params=params).json()

    links = [r['url'] for r in results]
    if do_analysis:
        prompt = load_chat_prompt(os.path.join(kit_dir, 'prompts/serp_analysis.yaml'))
        formatted_prompt = prompt.format(question=query, context=json.dumps(results))
        return llm.invoke(formatted_prompt), links
    else:
        return results, links

In [None]:
queryOpenSerp('who is the president of America', do_analysis=True, engine='google')

In [16]:
def remove_links(text):
    url_pattern = r'https?://\S+|www\.\S+'
    return re.sub(url_pattern, '', text)


def querySerpapi(query: str, limit: int = 5, do_analysis: bool = True, engine='google') -> str:
    if engine not in ['google', 'bing']:
        raise ValueError('engine must be either google or bing')
    params = {'q': query, 'num': limit, 'engine': engine, 'api_key': os.environ.get('SERPAPI_API_KEY')}

    search = GoogleSearch(params)
    response = search.get_dict()

    knowledge_graph = response.get('knowledge_graph', None)
    results = response.get('organic_results', None)

    links = []
    links = [r['link'] for r in results]

    if do_analysis:
        prompt = load_chat_prompt(os.path.join(kit_dir, 'prompts/serp_analysis.yaml'))
        if knowledge_graph:
            knowledge_graph_str = json.dumps(knowledge_graph)
            knowledge_graph = remove_links(knowledge_graph_str)
            print(knowledge_graph)
            formatted_prompt = prompt.format(question=query, context=json.dumps(knowledge_graph))
        else:
            results_str = json.dumps(results)
            results_str = remove_links(results_str)
            formatted_prompt = prompt.format(question=query, context=json.dumps(results))
        return llm.invoke(formatted_prompt), links
    else:
        return response, links

In [17]:
pprint(querySerpapi('Who is the president of USA', engine='bing'))

(AIMessage(content='Donald\u202fJ.\u202fTrump is the current President of the United States\u202f[reference:4][reference:7]', additional_kwargs={'reasoning_content': 'We need to answer based on provided contexts. Several snippets indicate as of 2025, the President is Donald J. Trump. Cite reference 4 (All About America) says "As of 2025, the President of the United States is Donald J. Trump". Also reference 7,8,9 indicate Trump sworn in as 47th president in Jan 2025. So answer: Donald J. Trump. Cite multiple.'}, response_metadata={'token_usage': {'acceptance_rate': None, 'completion_tokens': 122, 'completion_tokens_after_first_per_sec': 501.7938467044957, 'completion_tokens_after_first_per_sec_first_ten': 503.16893922344093, 'completion_tokens_after_first_per_sec_graph': 503.16893922344093, 'completion_tokens_per_sec': 233.37002577214975, 'end_time': 1765325952.3212404, 'is_last_response': True, 'prompt_tokens': 4218, 'prompt_tokens_details': {'cached_tokens': 0}, 'start_time': 1765325

In [18]:
pprint(querySerpapi('Who is the president of USA', engine='google'))

{"title": "Donald Trump", "type": "45th and 47th U.S. President", "kgmid": "/m/0cqt90", "knowledge_graph_search_link": " "serpapi_knowledge_graph_search_link": " "header_images": [{"image": " "source": " {"image": " "source": " {"image": " "source": " "description": "Donald John Trump is an American politician, media personality, and businessman who is the 47th president of the United States. A member of the Republican Party, he served as the 45th president from 2017 to 2021.", "source": {"name": "\u00a0Wikipedia", "link": " "previous_campaigns": "2024 United States presidential election, United States Presidential election, 2020", "previous_campaigns_links": [{"text": "2024 United States presidential election", "link": " {"text": "United States Presidential election, 2020", "link": " "children": "Barron Trump, Ivanka Trump, Tiffany Trump, Donald Trump Jr., Eric Trump", "children_links": [{"text": "Barron Trump", "link": " {"text": "Ivanka Trump", "link": " {"text": "Tiffany Trump", "l

# Web scrapping methods

In [19]:
CONFIG_PATH = os.path.join(kit_dir, 'config.yaml')

In [27]:
def get_config_info():
    """
    Loads json config file
    """
    # Read config file
    with open(CONFIG_PATH, 'r') as yaml_file:
        config = yaml.safe_load(yaml_file)
    llm_info = config['llm']
    retrieval_info = config['retrieval']
    web_crawling_params = config['web_crawling']
    extra_loaders = config['extra_loaders']

    return llm_info, retrieval_info, web_crawling_params, extra_loaders

In [28]:
def load_remote_pdf(url):
    """
    Load PDF files from the given URL.
    Args:
        url (str): URL to load pdf document from.
    Returns:
        list: A list of loaded pdf documents.
    """
    loader = UnstructuredURLLoader(urls=[url])
    docs = loader.load()
    return docs

In [29]:
def load_htmls(urls, extra_loaders=None):
    """
    Load HTML documents from the given URLs.
    Args:
        urls (list): A list of URLs to load HTML documents from.
    Returns:
        list: A list of loaded HTML documents.
    """
    if extra_loaders is None:
        extra_loaders = []
    docs = []
    for url in urls:
        if url.endswith('.pdf'):
            if 'pdf' in extra_loaders:
                docs.extend(load_remote_pdf(url))
            else:
                continue
        else:
            loader = AsyncHtmlLoader(url, verify_ssl=False)
            docs.extend(loader.load())
    return docs

In [30]:
def link_filter(all_links, excluded_links):
    """
    Filters a list of links based on a list of excluded links.
    Args:
        all_links (List[str]): A list of links to filter.
        excluded_links (List[str]): A list of excluded links.
    Returns:
        Set[str]: A list of filtered links.
    """
    clean_excluded_links = set()
    for excluded_link in excluded_links:
        parsed_link = urlparse(excluded_link)
        clean_excluded_links.add(parsed_link.netloc + parsed_link.path)
    filtered_links = set()
    for link in all_links:
        # Check if the link contains any of the excluded links
        if not any(excluded_link in link for excluded_link in clean_excluded_links):
            filtered_links.add(link)
    return filtered_links

In [31]:
def clean_docs(docs):
    """
    Clean the given HTML documents by transforming them into plain text.
    Args:
        docs (list): A list of langchain documents with html content to clean.
    Returns:
        list: A list of cleaned plain text documents.
    """
    html2text_transformer = Html2TextTransformer()
    docs = html2text_transformer.transform_documents(documents=docs)
    return docs

In [32]:
def web_crawl(urls, excluded_links=None):
    """
    Perform web crawling, retrieve and clean HTML documents from the given URLs, with specified depth of exploration.
    Args:
        urls (list): A list of URLs to crawl.
        excluded_links (list, optional): A list of links to exclude from crawling. Defaults to None.
        depth (int, optional): The depth of crawling, determining how many layers of internal links to explore. Defaults to 1
    Returns:
        tuple: A tuple containing the langchain documents (list) and the scrapped URLs (list).
    """
    *_, web_crawling_params, extra_loaders = get_config_info()
    if excluded_links is None:
        excluded_links = []
    excluded_links.extend(
        [
            'facebook.com',
            'twitter.com',
            'instagram.com',
            'linkedin.com',
            'telagram.me',
            'reddit.com',
            'whatsapp.com',
            'wa.me',
        ]
    )
    excluded_link_suffixes = {'.ico', '.svg', '.jpg', '.png', '.jpeg', '.', '.docx', '.xls', '.xlsx'}
    scrapped_urls = []

    urls = [url for url in urls if not url.endswith(tuple(excluded_link_suffixes))]
    urls = link_filter(urls, set(excluded_links))
    urls = list(urls)[: web_crawling_params['max_scraped_websites']]

    scraped_docs = load_htmls(urls, extra_loaders)
    scrapped_urls.append(urls)

    docs = clean_docs(scraped_docs)
    return docs, scrapped_urls

In [33]:
_, links = querySerpapi('Who is the president of USA', engine='google', do_analysis=False)
docs, links = web_crawl(links)

Fetching pages: 100%|##########| 1/1 [00:00<00:00,  1.38it/s]
Fetching pages: 100%|##########| 1/1 [00:00<00:00,  1.73it/s]
Fetching pages: 100%|##########| 1/1 [00:00<00:00,  3.25it/s]


# retrieval and vdb creation

In [34]:
from utils.vectordb.vector_db import VectorDb
from langchain_sambanova import SambaNovaEmbeddings

vectordb = VectorDb()
config = {'persist_directory': 'NoneDirectory'}
documents = docs

def create_and_save_local(input_directory=None, persist_directory=None, update=False):
    *_, retrieval_info, _, _ = get_config_info()
    persist_directory = config.get('persist_directory', 'NoneDirectory')

    chunks = vectordb.get_text_chunks(documents, retrieval_info['chunk_size'], retrieval_info['chunk_overlap'])
    encode_kwargs = {"normalize_embeddings": True}
    embeddings = SambaNovaEmbeddings(
        api_key=os.environ.get("SAMBANOVA_API_KEY"),
        base_url="https://api.sambanova.ai/v1",
        model="E5-Mistral-7B-Instruct"
    )
    if update and os.path.exists(persist_directory):
        config['update'] = True
        vector_store = vectordb.update_vdb(
            chunks, embeddings, retrieval_info['db_type'], input_directory, persist_directory
        )

    else:
        if os.path.exists(persist_directory):
            vector_store = vectordb.create_vector_store(
                chunks, embeddings, retrieval_info['db_type'], persist_directory
            )
        else:
            vector_store = vectordb.create_vector_store(chunks, embeddings, retrieval_info['db_type'], None)
    
    return vector_store


vector_store = create_and_save_local()

2025-12-09 19:22:03,587 [INFO] - Splitter: splitting documents
2025-12-09 19:22:03,590 [INFO] - Total 184 chunks created
2025-12-09 19:22:03,601 [INFO] - This is the collection name: collection_909d57c4-0e60-4c1d-9cb4-def11797d319
2025-12-09 19:22:03,618 [INFO] - Anonymized telemetry enabled. See                     https://docs.trychroma.com/telemetry for more information.
2025-12-09 19:22:05,718 [INFO] - HTTP Request: POST https://api.sambanova.ai/v1/embeddings "HTTP/1.1 200 OK"
2025-12-09 19:22:08,909 [INFO] - HTTP Request: POST https://api.sambanova.ai/v1/embeddings "HTTP/1.1 200 OK"
2025-12-09 19:22:14,137 [INFO] - HTTP Request: POST https://api.sambanova.ai/v1/embeddings "HTTP/1.1 200 OK"
2025-12-09 19:22:17,517 [INFO] - HTTP Request: POST https://api.sambanova.ai/v1/embeddings "HTTP/1.1 200 OK"
2025-12-09 19:22:21,098 [INFO] - HTTP Request: POST https://api.sambanova.ai/v1/embeddings "HTTP/1.1 200 OK"
2025-12-09 19:22:23,780 [INFO] - HTTP Request: POST https://api.sambanova.ai/v

In [35]:
from langchain_classic.chains import RetrievalQA


def retrieval_qa_chain():
    *_, retrieval_info, _, _ = get_config_info()
    prompt = load_chat_prompt(os.path.join(kit_dir, 'prompts/web_scraped_data_retriever.yaml'))
    retriever = vector_store.as_retriever(
        search_type='similarity_score_threshold',
        search_kwargs={
            'score_threshold': retrieval_info['score_treshold'],
            'k': retrieval_info['k_retrieved_documents'],
        },
    )
    qa_chain = RetrievalQA.from_llm(
        llm=llm,
        retriever=retriever,
        return_source_documents=True,
        verbose=True,
        input_key='question',
        output_key='answer',
        prompt=prompt,
    )
    return qa_chain

In [36]:
chain = retrieval_qa_chain()

In [37]:
chain.invoke('who is joe biden')



[1m> Entering new RetrievalQA chain...[0m


2025-12-09 19:22:30,345 [INFO] - HTTP Request: POST https://api.sambanova.ai/v1/embeddings "HTTP/1.1 200 OK"
2025-12-09 19:22:32,002 [INFO] - HTTP Request: POST https://api.sambanova.ai/v1/chat/completions "HTTP/1.1 200 OK"



[1m> Finished chain.[0m


{'question': 'who is joe biden',
 'answer': 'Joe\u202fBiden is listed as one of the Presidents of the United States【reference:1】.',
 'source_documents': [Document(id='b17c3106-b1d5-482b-9033-494d3a30d9e7', metadata={'title': 'President of the United States - Wikipedia', 'language': 'en', 'source': 'https://en.wikipedia.org/wiki/President_of_the_United_States'}, page_content='* House elections\n  * Senate elections\n  * Gubernatorial elections\n\n  \n  \n  * v\n  * t\n  * e\n\nMemorials to, and namesakes of, presidents of the United States  \n---  \nBy president|\n\n  * Washington\n  * J. Adams\n  * Jefferson\n  * Madison\n  * Monroe\n  * J. Q. Adams\n  * Jackson\n  * Van Buren\n  * W. H. Harrison\n  * Tyler\n  * Polk\n  * Taylor\n  * Fillmore\n  * Pierce\n  * Buchanan\n  * Lincoln\n  * Grant\n  * Hayes\n  * Garfield\n  * Arthur\n  * Cleveland\n  * B. Harrison\n  * McKinley\n  * T. Roosevelt\n  * Wilson\n  * Harding\n  * Coolidge\n  * Hoover\n  * F. D. Roosevelt\n  * Truman\n  * Eisenho