In [1]:
import aiohttp
import asyncio

async def scrape(path: str) -> str:
    """Fetch a page through the r.jina.ai reader proxy and return its text.

    Args:
        path: Target URL (or path) appended verbatim to ``https://r.jina.ai/``.

    Returns:
        The response body as text, or an empty string when the request fails
        (connection problem, non-2xx status, or timeout).
    """
    full_url = f"https://r.jina.ai/{path}"
    # An explicit ClientTimeout is preferred over a bare number, which aiohttp
    # only keeps for backward compatibility.
    timeout = aiohttp.ClientTimeout(total=10)
    try:
        async with aiohttp.ClientSession() as session:
            async with session.get(full_url, timeout=timeout) as response:
                response.raise_for_status()
                return await response.text()
    except (aiohttp.ClientError, asyncio.TimeoutError) as e:
        # Timeouts surface as asyncio.TimeoutError, which is not a ClientError,
        # so it must be listed explicitly to honour the "return '' on failure" contract.
        # Timeout exceptions usually have an empty message; fall back to the class name.
        detail = str(e) or type(e).__name__
        print(f"Error occurred while fetching {full_url}: {detail}")
        return ""

In [2]:
import os
from openai import AsyncOpenAI
from dotenv import load_dotenv

# Resolve the OpenAI API key: a value already present in the process environment
# wins; the .env file is consulted only when the variable is absent altogether.
if 'OPENAI_API_KEY' not in os.environ:
    load_dotenv()
openai_api_key = os.environ.get('OPENAI_API_KEY')

# Fail fast on a missing (or empty) key so later cells never run unauthenticated.
if not openai_api_key:
    raise EnvironmentError("OPENAI_API_KEY is missing in environment variables.")

def openai_client():
    """Build a new asynchronous OpenAI client authenticated with the module-level API key."""
    client = AsyncOpenAI(api_key=openai_api_key)
    return client

In [3]:
async def llm(query: str) -> str:
    """Send ``query`` as a single user message to the chat model and return the reply.

    Args:
        query: The complete prompt text.

    Returns:
        The text of the first completion choice (an empty string if the model
        returned no text content).

    Raises:
        RuntimeError: If the request or response handling fails for any reason.
            The underlying exception is chained as ``__cause__``.
    """
    try:
        chat_completion = await openai_client().chat.completions.create(
            model="gpt-4o-mini",
            temperature=0,
            messages=[
                {"role": "system", "content": "You are a helpful AI assistant."},
                {"role": "user", "content": f"{query}"},
            ])
        # `content` may be None; normalise so the declared `str` return type holds.
        return chat_completion.choices[0].message.content or ""
    except Exception as e:
        # Look the logger up locally: the module-level `logger` is created in a
        # later cell, and this notebook never imports an HTTPException type.
        import logging
        logging.getLogger("AI").error(f"Error generating LLM response: {e}")
        raise RuntimeError("Failed to generate response") from e

In [4]:
async def split_input_prompt(input: str) -> str:
    """Build the LLM prompt that asks for ``input`` to be split into a URL and a question.

    Args:
        input: Free-form user text containing a website reference and a question.

    Returns:
        The fixed instruction text followed by the quoted user input.
    """
    guidelines = """
    You are given an input string containing a URL and a question. Your task is to split the input into two parts: the URL and the question.
    The extracted question must retain full context and meaning from the original input. Ensure that no relevant information from the question is removed.
    The URL must always start with "https://". If the input does not include "https://" explicitly, prepend it to the extracted URL.

    Return the result in the following format, with no additional text or markdown:

    URL: <extracted_url>
    Question: <extracted_question>

    For example:
    Input: "Visit https://example.com and find out What is the purpose of this website?"
    Output:
    URL: https://example.com
    Question: What is the purpose of this website?

    Input: "What does https://evergrowadvisors.com/ do?"
    Output:
    URL: https://evergrowadvisors.com
    Question: What does evergrowadvisors do?

    Input: "is quicksell.co a product based company?"
    Output:
    URL: https://quicksell.co
    Question: Is quicksell a product based company?

    Strictly ensure the format matches the example provided, with the extracted URL and question on separate lines prefixed by "URL:" and "Question:".
    """
    # The user's text goes last, quoted the same way as the worked examples above.
    return guidelines + f"\nInput: \"{input}\""

In [5]:
import re
async def extract_url_and_query(response: str):
    """Pull the URL and question out of the LLM's ``URL:`` / ``Question:`` reply.

    Args:
        response: LLM output expected to contain a ``URL: ...`` line immediately
            followed by a ``Question: ...`` line.

    Returns:
        A ``(base_url, query)`` tuple with surrounding whitespace removed.

    Raises:
        ValueError: If the two-line pattern is not present in ``response``.
    """
    found = re.search(r"URL: (.+)\nQuestion: (.+)", response)
    if found is None:
        raise ValueError("Response format does not match the expected pattern.")

    base_url, query = (part.strip() for part in found.groups())
    return base_url, query

In [6]:
async def generate_prompt(scraped_content: str, query: str) -> str:
    """Build the prompt asking the LLM to answer ``query`` from ``scraped_content``.

    The prompt instructs the model to reply with an ``Answer:`` line and a
    ``Next URL:`` line, using 'Not Found' / 'None' as the sentinel values.

    Args:
        scraped_content: Text of the page that was just scraped.
        query: The question to answer.

    Returns:
        The fully rendered prompt string.
    """
    template = """
    Answer the query '{query}' based upon the scraped content '{scraped_content}'.
    Return the answer. If you don't find the answer in the scraped content, return the next URL to scrape 
    and go into it to find the answer based on the text of the hyperlink and text around the link.

    Format of response:
    Answer: <your_answer>
    Next URL: <next_url or None>

    If you found the answer, set 'Next URL' to 'None'.
    If you didn't find the answer, set 'Answer' to 'Not Found'.
    """
    # Only the two named placeholders are substituted; braces inside the
    # substituted values are inserted literally.
    return template.format(query=query, scraped_content=scraped_content)

In [7]:
async def parse_response(response: str) -> tuple:
    """Extract the ``Answer`` and ``Next URL`` fields from an LLM reply.

    The answer may span several lines and may be separated from the
    ``Next URL:`` line by blank lines; the next URL is taken from a single line.

    Args:
        response: Raw LLM output expected to contain ``Answer: ...`` followed
            later by a line starting with ``Next URL: ...``.

    Returns:
        An ``(answer, next_url)`` tuple with surrounding whitespace removed.

    Raises:
        ValueError: If either field is missing or empty.
    """
    # DOTALL lets the lazily-matched answer cross newlines, so a multi-line
    # answer is no longer rejected; the URL group stays confined to one line.
    match = re.search(
        r"Answer:[ \t]*(.+?)\s*\nNext URL:[ \t]*([^\r\n]+)",
        response,
        re.DOTALL,
    )
    if match:
        answer = match.group(1).strip()
        next_url = match.group(2).strip()
        if answer and next_url:
            return answer, next_url
    raise ValueError("Response format does not match the expected pattern.")

In [8]:
import logging
import re
from typing import Optional

# Named logger shared by the scraping/LLM pipeline in this notebook.
logger = logging.getLogger("AI")
# Emit INFO-level records (and above) through the root logger's default handler.
logging.basicConfig(level=logging.INFO)

async def scrape_and_query(base_url: str, query: str, max_attempts: int = 10) -> Optional[str]:
    """Follow links from ``base_url`` until the LLM can answer ``query``.

    Each round scrapes the current URL, asks the LLM for an answer, and either
    returns that answer or moves on to the URL the LLM suggests next.

    Args:
        base_url: URL to start scraping from.
        query: Question to answer from the scraped pages.
        max_attempts: Upper bound on the number of pages to scrape, so a chain
            of LLM-suggested links cannot loop forever.

    Returns:
        The answer text when found; otherwise one of the fixed messages
        "Answer not found in content." or "No answer found after multiple attempts.".
    """
    current_url = base_url
    visited = set()  # URLs already scraped, used to detect link cycles
    attempt_count = 0

    while current_url and attempt_count < max_attempts:
        attempt_count += 1
        visited.add(current_url)
        logger.info("-" * 100)
        logger.info(f"Attempt {attempt_count}: Scraping content from URL: {current_url}")

        scraped_content = await scrape(current_url)
        if not scraped_content:
            logger.error(f"Failed to scrape content from {current_url}. Ending process.")
            break

        logger.info("Successfully scraped content. Generating LLM prompt...")

        prompt = await generate_prompt(scraped_content, query)

        try:
            logger.info("Sending prompt to LLM...")
            response = await llm(prompt)
            answer, next_url = await parse_response(response)
        except Exception as e:
            logger.error(f"Error occurred during LLM query or Parsing: {e}")
            break

        logger.info(f"Response from LLM:\nAnswer: {answer}\nNext URL: {next_url}")

        # Compare the sentinel case-insensitively, matching how "None" is handled below.
        if answer.strip().lower() != "not found":
            # Return the answer even if the LLM also suggested a next URL;
            # previously that combination discarded the answer.
            logger.info("Answer successfully found. Ending process.")
            return answer

        if next_url.lower() == "none":
            logger.warning("Answer not found in content. Ending process.")
            return "Answer not found in content."

        if next_url in visited:
            logger.warning(f"Next URL {next_url} was already visited. Ending process.")
            break

        logger.warning(f"Answer not found. Navigating to next URL: {next_url}\n")
        current_url = next_url

    logger.info("No answer found after multiple attempts.")
    return "No answer found after multiple attempts."

In [11]:
# Demo request used when main() is called without an argument.
DEFAULT_USER_INPUT = "visit iitr.ac.in and find the email of Head of Department of Computer Science and Engineering"


async def main(user_input: str = DEFAULT_USER_INPUT):
    """Run the full pipeline: split the request, crawl, and print the answer.

    Args:
        user_input: Free-form text naming a website and asking a question
            about it. Defaults to the notebook's demo request.

    Errors are reported on stdout rather than raised, so the cell always completes.
    """
    try:
        # Ask the LLM to separate the site URL from the actual question.
        split_prompt = await split_input_prompt(user_input)
        split_response = await llm(split_prompt)

        base_url, query = await extract_url_and_query(split_response)
        print("Base URL:", base_url)
        print("Query:", query)

        final_answer = await scrape_and_query(base_url, query)
        print("\nFinal Answer:", final_answer)

    except ValueError as e:
        print("Error during URL or query extraction:", e)
    except Exception as e:
        print("An unexpected error occurred:", e)

# Run the pipeline. Top-level `await` is valid here only because the notebook
# kernel supports it; in a plain script this would be `asyncio.run(main())`.
await main()

INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
INFO:AI:----------------------------------------------------------------------------------------------------
INFO:AI:Attempt 1: Scraping content from URL: https://iitr.ac.in


Base URL: https://iitr.ac.in
Query: Find the email of Head of Department of Computer Science and Engineering.


INFO:AI:Successfully scraped content. Generating LLM prompt...
INFO:AI:Sending prompt to LLM...
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
INFO:AI:Response from LLM:
Answer: Not Found
Next URL: https://iitr.ac.in/Departments/index.html

INFO:AI:----------------------------------------------------------------------------------------------------
INFO:AI:Attempt 2: Scraping content from URL: https://iitr.ac.in/Departments/index.html
INFO:AI:Successfully scraped content. Generating LLM prompt...
INFO:AI:Sending prompt to LLM...
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
INFO:AI:Response from LLM:
Answer: Not Found
Next URL: https://iitr.ac.in/Departments/Computer%20Science%20and%20Engineering%20Department/index.html

INFO:AI:----------------------------------------------------------------------------------------------------
INFO:AI:Attempt 3: Scraping content from URL: https://iitr.ac.in/Departm


Final Answer: csed@iitr.ac.in
