In [2]:
import aiohttp
import asyncio

async def scrape(path: str) -> str:
    """Fetch the text served at ``https://r.jina.ai/<path>``.

    Args:
        path: Value appended after the ``https://r.jina.ai/`` prefix. Callers in
            this notebook pass the full URL of the page they want to read.

    Returns:
        The response body as text, or an empty string when the request fails
        (connection error, non-2xx status, or timeout). The failure is printed
        rather than raised so callers can move on to the next candidate URL.
    """
    # Built outside the ``try`` so the error message below can always reference it.
    full_url = f"https://r.jina.ai/{path}"
    try:
        async with aiohttp.ClientSession() as session:
            request_timeout = aiohttp.ClientTimeout(total=10)
            async with session.get(full_url, timeout=request_timeout) as response:
                response.raise_for_status()
                return await response.text()
    except (aiohttp.ClientError, asyncio.TimeoutError) as e:
        # A total-timeout expiry raises asyncio.TimeoutError, which is not an
        # aiohttp.ClientError; without catching it a slow page would abort the
        # caller instead of yielding "". Its str() is empty, so fall back to
        # the exception class name to keep the message informative.
        reason = str(e) or type(e).__name__
        print(f"Error occurred while fetching {full_url}: {reason}")
        return ""

In [3]:
import logging
import re
from typing import Optional

# Configure the root logger so INFO-level (and higher) records are emitted.
logging.basicConfig(level=logging.INFO)
# Named logger shared by the helper functions defined in later cells.
logger = logging.getLogger("AI")

In [4]:
import os
from openai import AsyncOpenAI
from dotenv import load_dotenv

# Prefer a key that is already present in the process environment.
openai_api_key = os.environ.get('OPENAI_API_KEY')
if openai_api_key is None:
    # Variable is absent: let python-dotenv populate the environment
    # (by default from a .env file), then look the key up again.
    load_dotenv()
    openai_api_key = os.getenv('OPENAI_API_KEY')
# Fail fast when the key is still missing or is an empty string.
if not openai_api_key:
    raise EnvironmentError("OPENAI_API_KEY is missing in environment variables.")

def openai_client():
    """Build and return a new ``AsyncOpenAI`` client using the module-level API key."""
    client = AsyncOpenAI(api_key=openai_api_key)
    return client

In [5]:
async def llm(query: str) -> str:
    """Send ``query`` as a single user message to the chat model and return its reply.

    Args:
        query: Full prompt text to submit as the user message.

    Returns:
        The text of the first choice's message. An empty string is returned
        when the message carries no content, so the declared ``str`` return
        type always holds.

    Raises:
        RuntimeError: If the request or the response handling fails. The
            original exception is logged and attached as ``__cause__``.
    """
    try:
        chat_completion = await openai_client().chat.completions.create(
            model="gpt-4o-mini",
            temperature=0,
            messages=[
                {"role": "system", "content": "You are a helpful AI assistant."},
                {"role": "user", "content": str(query)},
            ])
        # ``content`` may be None; normalise to "" so downstream regex parsing
        # fails with its own ValueError rather than a TypeError.
        return chat_completion.choices[0].message.content or ""
    except Exception as e:
        logger.error(f"Error generating LLM response: {e}")
        # HTTPException is not defined anywhere in this notebook, so raising it
        # would itself fail with NameError. Raise a built-in exception instead
        # and chain the cause for debugging.
        raise RuntimeError("Failed to generate response") from e

In [6]:
async def split_input_prompt(input: str) -> str:
    """Build the LLM prompt that splits a user request into a URL and a question.

    Args:
        input: Raw user request, which may or may not contain a URL.

    Returns:
        The instruction text (with few-shot examples) followed by the user
        request wrapped in double quotes on a final ``Input:`` line. The model
        is asked to answer with ``URL: ...`` and ``Question: ...`` on separate
        lines.
    """
    # Every example keeps "Output:" on its own line so the demonstrated format
    # agrees with the "separate lines" rule stated at the end of the prompt.
    instruction = """
    You are given an input string containing a URL and a question. Your task is to split the input into two parts: the URL and the question.
    The extracted question must retain full context and meaning from the original input. Ensure that no relevant information from the question is removed.
    The URL must always start with "https://". If the input does not include "https://" explicitly, prepend it to the extracted URL.
    If no URL is present in the input, provide the query to search for the URL.

    Return the result in the following format, with no additional text or markdown:

    URL: <extracted_url or query_to_search>
    Question: <extracted_question>

    For example:
    Input: "Visit https://example.com and find out What is the purpose of this website?"
    Output:
    URL: https://example.com
    Question: What is the purpose of this website?

    Input: "What does https://evergrowadvisors.com/ do?"
    Output:
    URL: https://evergrowadvisors.com
    Question: What does evergrowadvisors do?

    Input: "What does Evergrow Advisors do?"
    Output:
    URL: Evergrow Advisors
    Question: What does Evergrow Advisors do?

    Input: "is quicksell.co a product based company?"
    Output:
    URL: https://quicksell.co
    Question: Is quicksell a product based company?

    Strictly ensure the format matches the example provided, with the extracted URL and question on separate lines prefixed by "URL:" and "Question:".
    """
    prompt = f"{instruction}\nInput: \"{input}\""
    return prompt

In [7]:
import re
async def extract_url_and_query(response: str):
    """Pull the URL (or search phrase) and the question out of the splitter reply.

    Args:
        response: Text expected to contain ``URL: ...`` immediately followed on
            the next line by ``Question: ...``.

    Returns:
        A ``(base_url, query)`` tuple with surrounding whitespace removed.

    Raises:
        ValueError: If the two-line pattern is not present in ``response``.
    """
    found = re.search(r"URL: (.+)\nQuestion: (.+)", response)
    # Guard clause: bail out early when the reply is not in the agreed format.
    if not found:
        raise ValueError("Response format does not match the expected pattern.")
    url_part, question_part = found.groups()
    return url_part.strip(), question_part.strip()

In [8]:
from googlesearch import search

async def google_search(query: str, num_results: int = 10) -> list:
    """Run ``search`` for ``query`` (English) and return its results as a list.

    Args:
        query: Search phrase or URL to look up.
        num_results: Passed through to ``search`` as ``num_results``.

    Returns:
        A list holding every item yielded by ``search``.
    """
    return [hit for hit in search(query, num_results=num_results, lang="en")]

In [9]:
async def generate_prompt(scraped_content: str, query: str) -> str:
    """Compose the question-answering prompt for one scraped page.

    Args:
        scraped_content: Text of the page that was just fetched.
        query: The question to answer from that text.

    Returns:
        A prompt embedding both values and instructing the model to reply with
        exactly an ``Answer:`` line followed by a ``Next URL:`` line.
    """
    return f"""
    Based on the scraped content provided below, answer the query strictly following the format outlined. 

    Query: '{query}'
    Scraped Content: '{scraped_content}'
    
    Return the answer. If you don't find the answer in the scraped content, return the next URL to scrape 
    and go into it to find the answer based on the text of the hyperlink and text around the link.

    Instructions:
    1. Return the response in plain text only. Do not use any special formatting such as markdowns or bullet points.
    2. The response must strictly follow this format:
       Answer: <your_answer>
       Next URL: <next_url or None>
    3. If the answer is found in the scraped content, provide it under 'Answer' and set 'Next URL' to 'None'.
    4. If the answer is not found in the scraped content but there is a next URL to explore, set 'Answer' to 'Not Found' and provide the 'Next URL' to explore further.
    5. If the answer is not found in the scraped content and there is no URL to go next, set:
       Answer: Not Found
       Next URL: None
    6. The response must adhere strictly to the format without any deviation.
    """

In [10]:
async def parse_response(response: str) -> tuple:
    """Extract the answer and the follow-up URL from the model's reply.

    Args:
        response: Text expected to contain ``Answer: ...`` followed later by a
            line starting with ``Next URL: ...``.

    Returns:
        An ``(answer, next_url)`` tuple of stripped strings. ``answer`` may
        span several lines; ``next_url`` is the remainder of the
        ``Next URL:`` line (the literal ``None`` when there is no link).

    Raises:
        ValueError: If the reply does not contain both fields in that order.
    """
    # DOTALL lets the lazy answer group cross line breaks, so a multi-line
    # answer is captured up to the first "Next URL:" line instead of being
    # rejected. The URL group stays on a single line.
    match = re.search(
        r"Answer:[ \t]*(.+?)\s*\nNext URL:[ \t]*([^\n]+)",
        response,
        re.DOTALL,
    )
    if match:
        answer = match.group(1).strip()
        next_url = match.group(2).strip()
        return answer, next_url
    raise ValueError("Response format does not match the expected pattern.")

In [11]:
async def scrape_and_query(google_results: list, query: str, skip_urls: list) -> Optional[str]:
    """Walk candidate URLs, asking the LLM to answer ``query`` from each page.

    For every starting URL the page is scraped and handed to the LLM together
    with the query. The LLM either supplies an answer or names a follow-up URL,
    which is then scraped in turn. A chain is abandoned when the URL was
    already visited, is listed in ``skip_urls``, cannot be scraped, yields
    neither an answer nor a link, points back at itself, or when the LLM call
    or parsing raises.

    Args:
        google_results: Starting URLs, tried in order.
        query: The question to answer.
        skip_urls: URLs that must never be scraped.

    Returns:
        The first answer the LLM reports together with ``Next URL: None``, or
        the fixed string ``"No Answer Found After Multiple Attempts."`` when
        every candidate is exhausted.
    """
    attempt_count = 0
    visited_urls = set()

    for current_url in google_results:

        while current_url:
            if current_url in visited_urls:
                logger.warning(f"URL {current_url} Has Already Been Visited. Skipping...\n")
                break

            if current_url in skip_urls:
                logger.warning(f"URL {current_url} Is In The Skip List. Skipping...\n")
                break

            attempt_count += 1
            logger.info("-" * 100)
            logger.info(f"Attempt {attempt_count}:\nScraping Content From URL: {current_url}")

            # Mark before scraping so a failing page is not retried later.
            visited_urls.add(current_url)

            scraped_content = await scrape(current_url)
            if not scraped_content:
                logger.error(f"Failed To Scrape Content From {current_url}. Trying Next URL.")
                break

            logger.info("Successfully Scraped Content.")
            logger.info("Generating LLM prompt...")

            prompt = await generate_prompt(scraped_content, query)

            try:
                logger.info("Sending Prompt To LLM...")
                response = await llm(prompt)
                answer, next_url = await parse_response(response)

                logger.info(f"Response From LLM:\nAnswer: {answer}\nNext URL: {next_url}")

                # Both sentinels are compared case-insensitively: a reply such
                # as "Not found" must not be mistaken for a real answer.
                answer_missing = _is_not_found(answer)
                no_next_url = next_url.lower() == "none"

                if no_next_url and answer_missing:
                    logger.warning("Answer Not Found In Content.\nTrying Next URL In Google Results.\n")
                    break

                if no_next_url:
                    # No link to follow and the answer is not the sentinel.
                    logger.info("Answer Successfully Found. Ending Process.")
                    return answer

                if next_url == current_url:
                    logger.warning(f"Next URL Is The Same As The Current URL: {next_url}.\nSkipping To Next URL In Google Results.\n")
                    break

                if answer_missing:
                    logger.warning(f"Answer Not Found.\nNavigating to next URL: {next_url}\n")

                # Follow the link suggested by the LLM on the next iteration.
                current_url = next_url

            except Exception as e:
                logger.error(f"Error Occurred During LLM Query Or Parsing: {e}\n")
                break

    logger.info("No Answer Found After Attempting All Google Search Results.")
    return "No Answer Found After Multiple Attempts."


def _is_not_found(answer: str) -> bool:
    """Tell whether ``answer`` is the 'Not Found' sentinel, ignoring case, padding and a trailing period."""
    return answer.strip().rstrip(".").strip().lower() == "not found"

In [13]:
async def main():
    """Run the demo: split the request, collect candidate URLs, and print the answer.

    The hard-coded request is split by the LLM into a URL (or search phrase)
    and a question. Candidate pages come from a Google search, with the
    extracted URL placed first when the LLM returned one. Errors are printed
    rather than propagated.
    """
    user_request = """Who Is The Founder Of HPC Links?"""
    skip_urls = []
    try:
        splitter_reply = await llm(await split_input_prompt(user_request))
        base_url, query = await extract_url_and_query(splitter_reply)

        if base_url.startswith("http"):
            # A real URL was extracted: try it first, then the search hits.
            search_hits = await google_search(base_url, num_results=10)
            google_results = [base_url, *search_hits]
        else:
            # Only a search phrase is available, so rely on the search alone.
            logger.info("Attempting To Find The URL Via Google Search...\n")
            google_results = await google_search(base_url, num_results=10)

        print("Base URL:", base_url)
        print("Query:", query)

        final_answer = await scrape_and_query(google_results, query, skip_urls)
        print("\nFinal Answer:", final_answer)

    except ValueError as e:
        print("Error During URL Or Query Extraction:", e)
    except Exception as e:
        print("An Unexpected Error Occurred:", e)

# Top-level await: valid in an IPython/Jupyter cell, where an event loop is
# already running; a plain script would need asyncio.run(main()) instead.
await main()

INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
INFO:AI:Attempting To Find The URL Via Google Search...

INFO:AI:----------------------------------------------------------------------------------------------------
INFO:AI:Attempt 1:
Scraping Content From URL: http://www.hpclinks.com/


Base URL: HPC Links
Query: Who is the founder of HPC Links?


INFO:AI:Successfully Scraped Content.
INFO:AI:Generating LLM prompt...
INFO:AI:Sending Prompt To LLM...
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
INFO:AI:Response From LLM:
Answer: Not Found
Next URL: http://www.hpclinks.com/about/index.shtml
Navigating to next URL: http://www.hpclinks.com/about/index.shtml

INFO:AI:----------------------------------------------------------------------------------------------------
INFO:AI:Attempt 2:
Scraping Content From URL: http://www.hpclinks.com/about/index.shtml
INFO:AI:Successfully Scraped Content.
INFO:AI:Generating LLM prompt...
INFO:AI:Sending Prompt To LLM...
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
INFO:AI:Response From LLM:
Answer: Ashwini Kumar Nanda, Ph.D.
Next URL: None
INFO:AI:Answer Successfully Found. Ending Process.



Final Answer: Ashwini Kumar Nanda, Ph.D.
