In [17]:
%pip install requests beautifulsoup4 google-generativeai python-dotenv

import os
import requests
from dotenv import load_dotenv
from bs4 import BeautifulSoup
from IPython.display import Markdown, display
import google.generativeai as genai
from urllib.parse import urljoin, urlparse

Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 24.2 -> 25.1.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [20]:
# Load environment variables through python-dotenv, then pass the Gemini key
# to the SDK. The key is read from the environment so it never appears in the
# notebook itself.
load_dotenv()
genai.configure(api_key=os.environ.get("GEMINI_API_KEY"))

In [21]:
class Website:
    """Fetch a web page and expose its title, visible text, and links.

    Attributes set by the constructor:
        url: The URL that was passed in.
        title: Stripped text of the ``<title>`` tag, or ``'No title'`` when the
            tag is missing or empty.
        text: Newline-separated text of the page body with the tags listed in
            ``_IGNORED_TAGS`` removed.
        domain: Network location of the URL the response actually came from
            (i.e. after redirects).
        internal_links: De-duplicated absolute http(s) URLs, in document order,
            whose host matches ``domain``.
        external_links: Same, for every other host.
    """

    # Tags stripped from the page before its text is extracted.
    _IGNORED_TAGS = ("script", "style", "img", "input")

    def __init__(self, url: str, timeout: float = 10):
        """Download and parse ``url``.

        Args:
            url: Address of the page to fetch.
            timeout: Seconds passed to ``requests.get`` as its timeout, so a
                stalled server cannot hang the notebook indefinitely.

        Raises:
            requests.HTTPError: From ``raise_for_status`` on an error status.
            Other exceptions raised by ``requests.get`` propagate unchanged.
        """
        self.url = url
        resp = requests.get(url, timeout=timeout)
        resp.raise_for_status()  # Raise an exception for failed HTTP requests
        soup = BeautifulSoup(resp.content, 'html.parser')

        # `.string` is None when <title> is empty or has nested markup, so fall
        # back to the placeholder instead of storing None.
        raw_title = soup.title.string if soup.title else None
        cleaned_title = raw_title.strip() if raw_title else ''
        self.title = cleaned_title or 'No title'

        # Some documents have no <body>; use the whole document in that case
        # rather than failing on None.
        root = soup.body if soup.body is not None else soup
        for tag in root(list(self._IGNORED_TAGS)):
            tag.decompose()
        self.text = root.get_text(separator='\n', strip=True)

        # Resolve and classify links against the final URL: after a redirect
        # (e.g. apex domain -> www) the requested URL has the wrong host and
        # every link would otherwise be reported as external.
        final_url = resp.url or url
        self.domain = urlparse(final_url).netloc
        hrefs = [a_tag['href'] for a_tag in soup.find_all('a', href=True)]
        self.internal_links, self.external_links = self._partition_links(final_url, hrefs)

    @staticmethod
    def _partition_links(base_url, hrefs):
        """Resolve ``hrefs`` against ``base_url`` and split them by host.

        Only absolute http/https URLs with a host are kept. Duplicates are
        dropped while preserving first-seen order. Hosts are compared
        case-insensitively.

        Returns:
            A ``(internal, external)`` tuple of lists of absolute URLs.
        """
        base_host = urlparse(base_url).netloc.lower()
        # dicts are used as ordered sets: keys keep insertion order.
        internal, external = {}, {}
        for href in hrefs:
            try:
                absolute_url = urljoin(base_url, href)
                parsed = urlparse(absolute_url)
            except ValueError:
                # A malformed href (e.g. an unbalanced "[" in the host) must
                # not abort the whole scrape; skip just that link.
                continue
            if parsed.scheme not in ('http', 'https') or not parsed.netloc:
                continue
            bucket = internal if parsed.netloc.lower() == base_host else external
            bucket[absolute_url] = None
        return list(internal), list(external)

In [22]:
# Instructions given to the model: what to summarize, what to ignore, and the
# Markdown layout expected for the example link lists.
system_prompt = (
    # Task description.
    "You are an assistant summarizing a website. "
    "Provide a short Markdown summary, ignoring navigation text, "
    "and include any news or announcements. "
    "Also list a few example internal and external links from the website, "
    "if available, in the format:"
    "\n\n"
    # Output template: one bullet list per link category.
    + "\n\n".join([
        "**Internal Links:**\n- [Link Text](URL)",
        "**External Links:**\n- [Link Text](URL)",
    ])
)

In [23]:
def user_prompt_for(site: "Website", max_links: int = 5) -> str:
    """Build the user-facing part of the prompt from a scraped page.

    Args:
        site: Object providing ``title``, ``text``, ``internal_links`` and
            ``external_links`` attributes.
        max_links: Maximum number of links listed per category, to avoid
            overwhelming the prompt. Negative values are treated as 0.

    Returns:
        A Markdown-formatted string with the page title, its text content, and
        bullet lists of internal and external links (``- None`` when a
        category has nothing to show).
    """
    limit = max(max_links, 0)

    def as_bullets(links) -> str:
        # The URL doubles as the link text because only the URL is stored.
        shown = links[:limit]
        if not shown:
            return "- None"
        return "\n".join(f"- [{link}]({link})" for link in shown)

    return (
        f"Website title: **{site.title}**\n\n"
        "Content:\n\n"
        f"{site.text}\n\n"
        "Internal Links:\n"
        f"{as_bullets(site.internal_links)}\n\n"
        "External Links:\n"
        f"{as_bullets(site.external_links)}"
    )

In [24]:
def summarize_with_gemini(url: str, model: str = "gemini-1.5-flash") -> str:
    """Scrape ``url`` and return a Gemini-generated Markdown summary of it.

    Args:
        url: Address of the page to fetch and summarize.
        model: Name of the Gemini model handed to ``genai.GenerativeModel``.

    Returns:
        The ``text`` of the model's response.

    Raises:
        Exceptions from constructing ``Website`` (fetch failures) and from the
        Gemini SDK propagate unchanged.
        NOTE(review): ``response.text`` is understood to raise when the
        response carries no usable candidate (e.g. blocked) - confirm against
        the SDK docs.
    """
    site = Website(url)
    # Use a separate name for the client so `model` stays the string the
    # signature promises, and pass the standing instructions through the SDK's
    # system_instruction instead of pasting them into the user content.
    gemini = genai.GenerativeModel(model, system_instruction=system_prompt)
    response = gemini.generate_content(user_prompt_for(site))
    return response.text

In [25]:
def display_and_save_summary(url: str, output_file: str = "summary.md"):
    """Summarize ``url``, render the result in the notebook, and save it.

    Args:
        url: Address of the page to summarize.
        output_file: Path of the Markdown file to write (overwritten if it
            already exists).
    """
    markdown_summary = summarize_with_gemini(url)

    # Show the rendered Markdown as the cell output.
    display(Markdown(markdown_summary))

    # Persist the same text as UTF-8.
    with open(output_file, mode='w', encoding='utf-8') as summary_file:
        summary_file.write(markdown_summary)

In [26]:
# Summarize the GeeksforGeeks home page; the result is rendered below and
# written to the default output file.
display_and_save_summary(url="https://www.geeksforgeeks.org/")

# GeeksforGeeks Summary

GeeksforGeeks is an all-in-one learning portal offering a wide range of courses, tutorials, and practice problems covering various computer science topics.  These include data structures and algorithms (DSA), programming languages (Java, Python, C++, JavaScript, etc.), web development, data science, machine learning, DevOps, and more.  The site also features interview preparation resources, job postings, and a placement training program.  Several courses are highlighted, including DSA focused courses, full-stack development courses, and backend development courses, all with student ratings and enrollment numbers.  The site also provides learning resources for school students (classes 9-12).


**Internal Links:**
- [Learn DSA](https://www.geeksforgeeks.org/learn-data-structures-and-algorithms-dsa-tutorial/)
- [Explore](https://www.geeksforgeeks.org/explore)
- [C Programming](https://www.geeksforgeeks.org/c-programming-language/)
- [C++ Programming](https://www.geeksforgeeks.org/c-plus-plus/)
- [Home](https://www.geeksforgeeks.org/)


**External Links:**
- [DSA Self-Paced Course](https://practice.geeksforgeeks.org/courses/dsa-self-paced)
- [Facebook](https://www.facebook.com/geeksforgeeks.org/)
- [Instagram](https://www.instagram.com/geeks_for_geeks/)
- [LinkedIn](https://in.linkedin.com/company/geeksforgeeks)
- [Twitter](https://twitter.com/geeksforgeeks)

There are no explicit news or announcements on the provided website text.
