<a href="https://colab.research.google.com/github/rmit-ir/Tutotrial-Practical-LLMs/blob/main/LLM_Tutorial_Challenge2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Installing Required Python Packages

This notebook demonstrates how to use the OpenRouter and SerpApi APIs to perform web search and content analysis. Before we begin, we need to install several essential Python packages:

1. **serpapi** - A client library for the SerpApi service, used to access Google search results programmatically
2. **selenium** - For web browser automation and content scraping
3. **webdriver-manager** - To help manage browser drivers for Selenium

These packages are necessary for:
- Retrieving search results from Google
- Scraping web content
- Processing and analyzing the retrieved data

In [25]:
# Install the SerpApi client library, used below to query Google search results (SERPs)
%pip install serpapi

# Install Selenium (browser automation used for scraping page content) together with
# webdriver-manager (a helper package for managing browser drivers)
%pip install selenium webdriver-manager



In [26]:
# Import the JSON module for handling JSON data
import json
import textwrap  # Used for formatting and wrapping text, useful for displaying text in a readable way

# Import pandas for data manipulation and analysis
import pandas as pd
# Import the requests library for making HTTP requests to APIs
import requests
# Import the SerpAPI client for Google search results
import serpapi
# Provides access to user-specific information in Google Colab, used to access the user's secret API key
from google.colab import (
    userdata,
)

In [27]:
# Verbosity of diagnostic output used as the default by the helpers below
# (0: no output, 1: some output, 2: all output)
VERBOSE = 0

# Maximum line width used when wrapping printed text; lower it for smaller screens
WRAP = 100


def printw(x):
    """Print the text ``x`` wrapped so that no line is wider than ``WRAP`` characters."""
    wrapped = textwrap.fill(x, WRAP)
    print(wrapped)

In [28]:
# Read each API key once from the Colab user secrets, then validate both the same way.
# A truthiness check also rejects an empty string, which `is not None` would let through.
OPENROUTER_API_KEY = userdata.get("OPENROUTER_API_KEY")
SERP_API_KEY = userdata.get("SERP_API_KEY")

assert (
    OPENROUTER_API_KEY
), "Please set your OPENROUTER_API_KEY key in user secrets and allow access to it."

assert SERP_API_KEY, "Please set your SERP_API_KEY in user secrets."

In [29]:
def fetch_documents_with_serpapi(query, num_results=10, verbose=VERBOSE):
    """
    Fetch documents from SerpApi using the Google Light API.
    Args:
        query (str): The search query.
        num_results (int): The number of results to retrieve.
        verbose (int): Verbosity level; > 0 prints the result count, > 1 also
            prints the organic results as JSON.
    Returns:
        pd.DataFrame: A DataFrame indexed by 'position' with the columns
            'title', 'link' and 'snippet'. It is empty when the response
            contains no organic results; a column that no result provides
            is filled with NaN.
    """
    result_columns = ['title', 'link', 'snippet']

    # Define the search parameters, more info at https://serpapi.com/google-light-api#api-parameters
    params = {
        "engine": "google_light",
        "q": query,  # Search query
        "num": num_results,  # Max number of results to retrieve
        "google_domain": "google.com",  # Google domain to use for the search
        "hl": "en",  # Language code
        "gl": "us",  # Country code
        "api_key": SERP_API_KEY  # Your SerpApi API key
    }

    serp = serpapi.search(params)  # Perform the search using the SerpApi client

    # Fall back to an empty list (not a dict) so that both the count below and
    # the DataFrame construction behave sensibly when there are no results.
    organic_results = serp.get("organic_results") or []

    if verbose > 0:
        printw(f"SerpApi returned {len(organic_results)} results for query: {query}")
    if verbose > 1:
        print(f"SerpApi results: {json.dumps(organic_results, indent=2)}")

    if not organic_results:
        # Same shape as a normal result, just without rows
        return pd.DataFrame(columns=result_columns, index=pd.Index([], name='position'))

    # reindex (rather than [[...]]) tolerates result sets that lack one of the columns
    return pd.DataFrame(organic_results).set_index('position').reindex(columns=result_columns)

In [30]:
# Example usage: run a search for "Python" and keep the results for the cells below
query = "Python"
documents_df = fetch_documents_with_serpapi(query=query)

# Leave the DataFrame as the last expression so the notebook renders it
documents_df

Unnamed: 0_level_0,title,link,snippet
position,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,Welcome to Python.org,https://www.python.org/,Python is a programming language that lets you...
2,Python Tutorial - W3Schools,https://www.w3schools.com/python/,Learn Python. Python is a popular programming ...
3,"Online Python - IDE, Editor, Compiler, Interpr...",https://www.online-python.com/,Build and Run your Python code instantly. Onli...
4,python/cpython: The Python programming languag...,https://github.com/python/cpython,"Documentation for Python 3.14 is online, updat..."
5,Python - Visual Studio Marketplace,https://marketplace.visualstudio.com/items?ite...,The Python extension provides pluggable access...
6,Python - Wikipedia,https://en.wikipedia.org/wiki/Python,"Computing · Python (programming language), a w..."
7,Python (programming language) - Wikipedia,https://en.wikipedia.org/wiki/Python_(programm...,"Python is a high-level, general-purpose progra..."


In [31]:
# Core Selenium package for browser automation
from selenium import webdriver
# Chrome-specific options for configuring the browser session
from selenium.webdriver.chrome.options import Options
# Locator strategies for finding elements on the page
from selenium.webdriver.common.by import By
# Import WebDriverWait and expected conditions
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
# Import time module for sleep functionality
import time


def _extract_main_text(driver):
    """Remove navigation-style elements from the loaded page and return the main content text."""
    # List of CSS selectors for elements to remove (navigation, menus, etc.)
    ignore_elements = [
        "nav",  # Navigation bars
        "header",  # Site headers
        "footer",  # Site footers
        "menu",  # Menu elements
        '[role="navigation"]',  # ARIA navigation roles
        '[role="banner"]',  # ARIA banner roles (headers)
        '[role="complementary"]',  # ARIA sidebars/complementary content
        ".sidebar",  # Sidebar classes
        "#navigation",  # Navigation IDs
        ".menu",  # Menu classes
        ".nav",  # Nav classes
    ]

    # Remove all non-content elements from the page
    for selector in ignore_elements:
        for element in driver.find_elements(By.CSS_SELECTOR, selector):
            try:
                driver.execute_script("arguments[0].remove()", element)
            except Exception:
                # Best-effort cleanup: a failed removal (presumably an element that
                # is already gone from the DOM) must not abort the extraction.
                continue

    # Prioritized list of selectors for main content areas
    content_selectors = [
        "article",  # Standard article tag
        '[role="main"]',  # ARIA main content role
        ".post-content",  # Common content class
        ".article-content",  # Common article class
        "main",  # HTML5 main tag
        "#content",  # Common content ID
    ]

    # The first selector that matches any element wins
    content = None
    for selector in content_selectors:
        elements = driver.find_elements(By.CSS_SELECTOR, selector)
        if elements:
            content = elements[0].text
            break

    # If no content found with specific selectors, get all body text
    if not content:
        content = driver.find_element(By.TAG_NAME, "body").text
    return content


def _clean_extracted_text(content):
    """Strip every line and drop lines of 20 characters or fewer (likely UI elements)."""
    stripped_lines = (line.strip() for line in content.split("\n"))
    return "\n".join(line for line in stripped_lines if len(line) > 20)


def fetch_and_parse_webpage(url, max_retries=2, timeout=30):
    """
    Fetch a webpage with headless Chrome and return its cleaned main text,
    retrying on failure.

    Args:
        url (str): The webpage URL to fetch
        max_retries (int): Number of attempts made before giving up
        timeout (int): Page load, script and element-wait timeout in seconds

    Returns:
        str: The extracted text, or one of the messages
            "Failed to fetch the webpage due to timeout." /
            "Failed to fetch the webpage." when every attempt failed.
    """
    # Selenium reports timeouts with its own exception type; imported here so the
    # function does not depend on an import living in another cell.
    from selenium.common.exceptions import TimeoutException

    chrome_options = Options()
    chrome_options.add_argument("--headless")
    chrome_options.add_argument("--no-sandbox")
    chrome_options.add_argument("--disable-dev-shm-usage")
    # Add performance options
    chrome_options.add_argument("--disable-gpu")
    chrome_options.add_argument("--disable-extensions")
    # 'eager' hands control back once the DOM is ready instead of waiting for
    # every sub-resource (images, stylesheets) to finish loading.
    chrome_options.page_load_strategy = 'eager'

    for attempt in range(max_retries):
        is_last_attempt = attempt == max_retries - 1
        driver = None  # stays None if the browser itself fails to start
        try:
            driver = webdriver.Chrome(options=chrome_options)
            driver.set_page_load_timeout(timeout)
            driver.set_script_timeout(timeout)

            # Load page with explicit wait
            driver.get(url)
            WebDriverWait(driver, timeout).until(
                EC.presence_of_element_located((By.TAG_NAME, "body"))
            )

            # Clean and format the extracted text
            return _clean_extracted_text(_extract_main_text(driver))

        except TimeoutException:
            print(f"Timeout occurred for the URL: {url}")
            if is_last_attempt:
                return "Failed to fetch the webpage due to timeout."
        except Exception as e:
            print(f"Attempt {attempt + 1}/{max_retries} failed for the URL: {url}")
            print("Error:", str(e))
            if is_last_attempt:
                return "Failed to fetch the webpage."
        finally:
            # Always release the browser, but only if one was actually started
            if driver is not None:
                driver.quit()

        time.sleep(3)  # Wait before retrying
        print(f"Retrying... ({attempt + 1}/{max_retries})")

    return "Failed to fetch the webpage."


# Scrape every result link and store the extracted page text next to it
documents_df['content'] = documents_df['link'].map(fetch_and_parse_webpage)
documents_df

Unnamed: 0_level_0,title,link,snippet,content
position,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,Welcome to Python.org,https://www.python.org/,Python is a programming language that lets you...,"Join us in Pittsburgh, PA starting May 14, 202..."
2,Python Tutorial - W3Schools,https://www.w3schools.com/python/,Learn Python. Python is a popular programming ...,HTML CSS JAVASCRIPT SQL PYTHON JAVA PHP HOW TO...
3,"Online Python - IDE, Editor, Compiler, Interpr...",https://www.online-python.com/,Build and Run your Python code instantly. Onli...,🚀 Upgrade your coding experience! Try our new ...
4,python/cpython: The Python programming languag...,https://github.com/python/cpython,"Documentation for Python 3.14 is online, updat...",This is Python version 3.14.0 alpha 6\nCopyrig...
5,Python - Visual Studio Marketplace,https://marketplace.visualstudio.com/items?ite...,The Python extension provides pluggable access...,Visual Studio Code>Programming Languages>Pytho...
6,Python - Wikipedia,https://en.wikipedia.org/wiki/Python,"Computing · Python (programming language), a w...","From Wikipedia, the free encyclopedia\nLook up..."
7,Python (programming language) - Wikipedia,https://en.wikipedia.org/wiki/Python_(programm...,"Python is a high-level, general-purpose progra...","From Wikipedia, the free encyclopedia\nParadig..."


In [32]:
# Preview the first five documents, each cut to at most 1000 characters
for text in documents_df["content"].head(5):
    preview = text[:1000]
    printw(preview)
    print("-" * 80)

Join us in Pittsburgh, PA starting May 14, 2025. Grab your ticket today before we sell out!
REGISTER FOR PYCON US! Whether you're new to programming or an experienced developer, it's easy to
learn and use Python. Start with our Beginner’s Guide Python source code and installers are
available for download for all versions! Latest: Python 3.13.2 Documentation for Python's standard
library, along with tutorials and guides, are available online. Looking for work or have a Python
related position that you're trying to hire for? Our relaunched community-run job board is the place
to go. Thanks to the flexibility of Python and the powerful ecosystem of packages, the Azure CLI
supports features such as autocompletion (in shells that support it), persistent credentials,
JMESPath result parsing, lazy initialization, network-less unit tests, and more. Building an open-
source and cross-platform Azure CLI with Python by Dan Taylor >>> Python Software Foundation The
mission of the Python Software
-

In [33]:
def get_response(
        prompt: str, model: str, verbose: int = VERBOSE, **model_kwargs
) -> dict:
    """
    Get a response from the OpenRouter API using the given prompt and model.
    The API key is read from the Colab user secret OPENROUTER_API_KEY.
    OpenRouter normalizes requests and responses across providers. That is,
    you can use the same code to call different models from different providers.
    Args:
        prompt (str): The prompt to send to the model (sent as a single user message).
        model (str): The model to use.
        verbose (int): Verbosity level for debugging.
        **model_kwargs: Additional keyword arguments for the model.
            - top_p: Top-p sampling parameter.
            - temperature: Temperature parameter for sampling.
            - frequency_penalty: Frequency penalty parameter.
            - presence_penalty: Presence penalty parameter.
            - repetition_penalty: Repetition penalty parameter.
            - top_k: Top-k sampling parameter.
            - max_tokens: Maximum number of tokens to generate.
    Note: The model_kwargs parameters are optional and will be set to default values if not provided.
    Keyword arguments other than the ones listed above are ignored.
    Returns:
        dict: The parsed JSON response. If the response body is not valid JSON,
            a dict of the form {"error": {"code": <status>, "message": <body>}}
            is returned instead, so callers can always use ``.get(...)``.
    """
    # Check if model parameter is provided, if not, set a default value.
    # More information about the parameters can be found in the OpenRouter API documentation.
    # https://openrouter.ai/docs/api-reference/parameters
    payload = {
        "model": model,
        "messages": [{"role": "user", "content": prompt}],
        "top_p": model_kwargs.get("top_p", 1),
        "temperature": model_kwargs.get("temperature", 0.9),
        "frequency_penalty": model_kwargs.get("frequency_penalty", 0),
        "presence_penalty": model_kwargs.get("presence_penalty", 0),
        "repetition_penalty": model_kwargs.get("repetition_penalty", 1),
        "top_k": model_kwargs.get("top_k", 0),
        "max_tokens": model_kwargs.get("max_tokens", 1000),
    }

    # `json=` lets requests serialize the payload and set the JSON Content-Type header
    response = requests.post(
        url="https://openrouter.ai/api/v1/chat/completions",
        headers={"Authorization": f"Bearer {userdata.get('OPENROUTER_API_KEY')}"},
        json=payload,
    )
    if verbose > 0:
        print(f"Response status code: {response.status_code}")

    try:
        response_json = response.json()
    except ValueError:
        # The body is not JSON (for example an HTML error page): report it in an
        # error dict instead of raising, so the documented return type holds.
        response_json = {
            "error": {"code": response.status_code, "message": response.text}
        }

    # let's print how many tokens we used, it can be useful for cost estimation
    if verbose > 0:
        print(f"Response usage: {response_json.get('usage')}")
    return response_json

In [None]:
# TODO: Continue from here
def generate_documents_summary(
        documents_df: pd.DataFrame,
        prompt: str,
        model: str,
        verbose: int = VERBOSE,
) -> tuple:
    """
    Generate one model response per document using the OpenRouter API.
    Args:
        documents_df (DataFrame): One row per document. Must contain a
            'content' column; a 'title' column is used when present.
        prompt (str): The prompt template, filled in with ``str.format``.
            It may use the placeholders ``{title}`` and ``{document}``.
        model (str): The model to use.
        verbose (int): Verbosity level for debugging.
    Returns:
        dict: The response text, keyed by the document's index label.
        dict: The reasoning returned by the model (if any), keyed the same way.
        list: The raw responses from the model.
    """
    results = {}
    reasoning_results = {}
    raw_responses = []  # for debugging, logging and backup

    for doc_key, row in documents_df.iterrows():
        # adding the document title and text to the prompt
        prompt_text = prompt.format(
            title=row.get("title", ""), document=row["content"]
        )
        if verbose > 0:
            print(f"Running for {doc_key}")
            if verbose > 1:
                print(f"Prompt for {doc_key}:")
                printw(prompt_text)
        response_json = get_response(prompt=prompt_text, model=model, verbose=verbose)
        raw_responses.append(response_json)

        # A missing or empty "choices" list means the model returned no message
        choices = response_json.get("choices") or []
        response_message = choices[0].get("message") if choices else None
        if response_message is None:
            print(f"No response message for {doc_key}")
            print(response_json.get("error"))
            continue
        if verbose > 1:
            print(f"Response for {doc_key}:")
            printw(f"Response message: {response_message}")
            print("\n", "-=" * 5, " End of response ", "=-" * 5, "\n")
        reasoning_results[doc_key] = response_message.get("reasoning", None)
        results[doc_key] = response_message.get("content", None)
    return results, reasoning_results, raw_responses

In [35]:
def summarize_document(document_text, model="gpt-4o-mini", max_tokens=200, temperature=0.7):
    """
    Summarizes the document using the OpenRouter API.

    The API key is read from the Colab user secret OPENROUTER_API_KEY.

    Args:
        document_text (str): The text of the document to be summarized.
        model (str): The model used to write the summary.
        max_tokens (int): Upper limit on the length of the generated summary;
            raise it if summaries come back cut off.
        temperature (float): Sampling temperature (controls randomness).

    Returns:
        str: The summary of the document (an empty string if the response
            carries no message content), or None if an error occurs.
    """

    # Define the API endpoint for OpenRouter
    openrouter_endpoint = "https://openrouter.ai/api/v1/chat/completions"

    # Define request headers with API authorization
    headers = {
        "Authorization": f"Bearer {userdata.get('OPENROUTER_API_KEY')}",
        "Content-Type": "application/json",
    }

    # Define the request payload
    payload = {
        "model": model,  # Model selection
        "messages": [
            {
                "role": "system",
                "content": "You are an AI assistant that summarizes documents.",
            },
            {
                "role": "user",
                "content": f"Summarize the following text:\n\n{document_text}",
            },
        ],
        "max_tokens": max_tokens,  # Limit response length
        "temperature": temperature,  # Control randomness
    }

    try:
        # Send the request to OpenRouter API
        response = requests.post(openrouter_endpoint, headers=headers, json=payload)

        if response.status_code != 200:
            print(
                f"Error summarizing document: {response.status_code} - {response.text}"
            )
            return None

        # Parse defensively: "choices" may be missing or empty, and "message" or
        # "content" may be null, so fall back to empty values at every step.
        choices = response.json().get("choices") or [{}]
        message = choices[0].get("message") or {}
        content = message.get("content") or ""
        return content.strip()

    except (requests.exceptions.RequestException, ValueError) as e:
        # ValueError covers a 200 response whose body is not valid JSON
        print(f"Request failed: {e}")
        return None

In [40]:
# Summarize every document's scraped content, one API call per row
documents_df["summary"] = documents_df["content"].apply(summarize_document)

# Show the documents next to their summaries
documents_df.loc[:, ["title", "link", "content", "summary"]].head()

Unnamed: 0_level_0,summary
position,Unnamed: 1_level_1
1,"Join us in Pittsburgh, PA for PyCon US startin..."
2,The text provides an overview of resources ava...
3,"The text promotes the OnlineIDE Pro, an enhanc..."
4,The text outlines the release information and ...
5,The Python extension for Visual Studio Code pr...
6,The text is a disambiguation page listing vari...
7,"Python is a high-level, general-purpose progra..."


In [64]:
# Build one labelled text block per document (title followed by its scraped content)
all_documents = (
    'Document title: ' + documents_df['title'] + '\n'
    + 'Document Content: ' + documents_df['content'] + '\n\n'
)

# Series.str.cat() joins all blocks into one string
combined_summary = summarize_document(all_documents.str.cat())

# summarize_document returns None on failure, which textwrap cannot wrap
printw(
    combined_summary
    if combined_summary is not None
    else "No summary could be generated."
)

The provided documents cover various aspects of Python, a popular high-level programming language
designed for readability and ease of use. The key points are summarized as follows:  1. **Python.org
and PyCon**: The official Python website encourages participation in PyCon US, scheduled for May 14,
2025, in Pittsburgh, PA. It highlights Python's accessibility for beginners and its robust
ecosystem, including the latest version (3.13.2) and various resources for documentation, job
opportunities, and community support.  2. **W3Schools Python Tutorial**: This tutorial promotes
Python as a server-side language for web applications, featuring interactive coding exercises. It
covers file handling, database integration (MySQL and MongoDB), and provides a platform for users to
practice and assess their skills.  3. **Online Python IDE**: This tool allows users to write, run,
and share Python code online without local setup. It emphasizes Python's ease of use for various
applications, including 

# Display results in a Google-like way

In [49]:
def create_google_serp_like_page(documents_df):
    """
    Display a Google-like search results page from the fetched documents.

    All values are HTML-escaped before being inserted into the page, because
    titles, snippets and summaries originate from scraped web content.

    Args:
        documents_df (pd.DataFrame): DataFrame containing the document titles, links, and snippets.
            A 'summary' column is shown when present; missing columns and
            missing values (None/NaN) are rendered as empty text.
    """
    # Imported here so the function also works on a fresh kernel, where no
    # earlier cell has brought HTML/display into scope.
    from html import escape

    from IPython.display import HTML, display

    def _as_html_text(value):
        """Return the cell value escaped for HTML; missing values become ''."""
        if value is None or (pd.api.types.is_scalar(value) and pd.isna(value)):
            return ""
        return escape(str(value), quote=True)

    # Create an HTML structure for displaying the search results in a Google-like layout
    page_parts = ["""
    <html>
    <head>
        <title>Google Search Results</title>
        <style>
            body {
                font-family: Arial, sans-serif;
                margin: 20px;
                background-color: #f9f9f9;
            }
            .search-results {
                max-width: 800px;
                margin: auto;
                background-color: white;
                padding: 20px;
                box-shadow: 0px 4px 6px rgba(0, 0, 0, 0.1);
                border-radius: 8px;
            }
            .result-item {
                margin-bottom: 20px;
            }
            .result-title {
                font-size: 20px;
                color: #1a0dab;
                text-decoration: none;
            }
            .result-title:hover {
                text-decoration: underline;
            }
            .result-snippet {
                color: #4d5156;
                font-size: 14px;
                margin-top: 5px;
            }
            .result-summary {
                color: #4d5156;
                font-size: 14px;
                margin-top: 5px;
            }
            .result-link {
                color: #006621;
                font-size: 14px;
            }
            .result-link:hover {
                text-decoration: underline;
            }
            .search-bar {
                background-color: #f8f9fa;
                padding: 10px;
                margin-bottom: 20px;
                border-radius: 8px;
                box-shadow: 0px 2px 4px rgba(0, 0, 0, 0.1);
            }
            .search-bar input {
                width: 100%;
                padding: 10px;
                font-size: 16px;
                border-radius: 4px;
                border: 1px solid #ddd;
            }
        </style>
    </head>
    <body>
        <div class="search-results">

    """]

    # Loop through each document and create a search result item
    for _, row in documents_df.iterrows():
        title = _as_html_text(row.get("title"))
        link = _as_html_text(row.get("link"))
        snippet = _as_html_text(row.get("snippet"))
        summary = _as_html_text(row.get("summary"))

        page_parts.append(f"""
            <div class="result-item">
                <a class="result-title" href="{link}" target="_blank">{title}</a>
                <div class="result-snippet">{snippet}</div>
                <a class="result-link" href="{link}" target="_blank">{link}</a>
                <div class="result-summary">{summary}</div>
            </div>
        """)

    # Close the HTML tags
    page_parts.append("""
        </div>
    </body>
    </html>
    """)

    # Display the HTML content in the notebook
    display(HTML("".join(page_parts)))


# Example usage: render the fetched documents (title, link, snippet and summary)
# as a Google-like results page
create_google_serp_like_page(documents_df)