In [3]:
# Import Libraries
import os
import requests
from openai import OpenAI

from bs4 import BeautifulSoup
from langdetect import detect, LangDetectException
from dotenv import load_dotenv

from IPython.display import Markdown, display

# Read the .env file into the process environment (override=True is passed to load_dotenv)
load_dotenv(override=True)
openai_api_key = os.environ.get("OPENAI_API_KEY")

# Abort early when the key is missing or empty
if openai_api_key is None or openai_api_key == "":
    raise ValueError("⚠️ OPENAI_API_KEY not found in .env file.")

# Client object used by the summarizer for its GPT requests
openai = OpenAI()

# Class to work with text extraction, processing and summarizing from a given url
class WebPageSummarizer():
    """
        Fetch a web page, extract the text of its <p> elements with BeautifulSoup,
        detect the text language and ask a GPT model for a markdown summary.

        Typical use: ``WebPageSummarizer(url).summarize()``.
    """

    # Upper bound on the number of words of page text sent to the model
    MAX_WORDS = 7000
    # Seconds to wait for the web server before giving up, so a stalled host cannot hang the call
    REQUEST_TIMEOUT = 30

    def __init__(self, url: str, summary_detail: str = "high", show_summary: bool = True, language_of_reference: str = "English") -> None:
        """
            Store the summarizer settings and validate them before any network access.

            Args:
                url: Address of the page to summarize.
                summary_detail: Level of detail requested from the model, "high" or "low".
                show_summary: When True, a preview of the extracted text and its word
                    count are printed during summarize().
                language_of_reference: Name of the language the summary should be written
                    in; must be a key of ``language_code_map`` (case-insensitive).

            Raises:
                ValueError: If ``summary_detail`` or ``language_of_reference`` is not supported.
        """

        # Initial summarizer settings
        self.url = url
        self.show_summary = show_summary
        self.summary_detail = summary_detail
        self.language_of_reference = language_of_reference
        self.language_code_map = {
            "english": "en",
            "portuguese": "pt",
            "spanish": "es",
            "french": "fr",
            "german": "de",
            "italian": "it",
            "japanese": "ja",
            "chinese": "zh",
            "korean": "ko",
        }
        self.headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
                          "(KHTML, like Gecko) Chrome/123.0.0.0 Safari/537.36"
        }

        if self.summary_detail not in ("high", "low"):
            raise ValueError("""Please select summary detail as either "high" or "low".""")

        # Fail fast on an unsupported target language instead of after the page download
        self.target_language_code = self._resolve_target_language_code()

    def _resolve_target_language_code(self) -> str:
        """
            Return the language code mapped to ``language_of_reference``.

            Raises:
                ValueError: If the language name is not a key of ``language_code_map``.
        """
        target_language_name = self.language_of_reference.lower().strip()
        language_code = self.language_code_map.get(target_language_name)

        if not language_code:
            raise ValueError(f"❌ Unsupported language: {self.language_of_reference}. Please use one of: {list(self.language_code_map.keys())}")

        return language_code

    def __extract_text(self):
        """
            Download the page and store the joined paragraph text in ``self.text``.

            The text is truncated to at most ``MAX_WORDS`` words.

            Raises:
                Exception: If the server answers with a status code other than 200.
        """
        response = requests.get(self.url, headers=self.headers, timeout=self.REQUEST_TIMEOUT)
        if response.status_code != 200:
            raise Exception(f"Failed to fetch page. Status code: {response.status_code}")

        soup = BeautifulSoup(response.text, "html.parser")

        # Keep only paragraphs that contain something other than whitespace
        paragraph_texts = (paragraph.get_text() for paragraph in soup.find_all("p"))
        self.text = "\n".join(text for text in paragraph_texts if text.strip())

        # Guarantee limit of text to summary
        words = self.text.split()
        if len(words) > self.MAX_WORDS:
            self.text = " ".join(words[:self.MAX_WORDS])

    def __detect_language(self):
        """
            Detect the language of ``self.text`` and print the detection result.

            Sets ``self.language_url`` (the detected code, or "unknown" when detection
            fails) and refreshes ``self.target_language_code``. When ``show_summary`` is
            True, also prints the first 500 characters of the text and its word count.

            Raises:
                ValueError: If ``language_of_reference`` is not supported.
        """
        # Detect language
        try:
            self.language_url = detect(self.text)
        except LangDetectException:
            self.language_url = "unknown"

        # Re-resolve here so a language_of_reference changed after construction is honoured
        self.target_language_code = self._resolve_target_language_code()

        print(f"🌍 Detected language: {self.language_url.upper()}")

        if self.show_summary:
            print("🔗 Preview of extracted text:\n")
            print(self.text[:500] + "\n...\n")
            print(f"Amount of words: {len(self.text.split())}\n\n")

    def summarize(self)-> str:
        """
            Extract the page text, detect its language and return the model's summary.

            Returns:
                The content of the first choice returned by the chat completion request.

            Raises:
                Exception: If the page cannot be fetched with status code 200.
                ValueError: If ``language_of_reference`` is not supported.
        """
        self.__extract_text()
        self.__detect_language()

        # Prompt for system definition
        self.system_prompt = f""" 
        You are an assistant that analyzes the contents of a website and provides a summary. 
        Please notice that providing a {self.summary_detail} summary detail is IMPORTANT.
        If you find text that might be navigation related or ad related please ignore. Respond in markdown. Also, can you please start your summary 
        with the title "📝 Summary"?
        
        Please show some excited behavior during your summary, making comments with extra knowledge if possible during or at the end of the sentence. 
        """

        self.content = f"""The text to summarize is as follows: {self.text}"""

        # langdetect reports region-tagged codes for some languages (e.g. "zh-cn"),
        # so compare only the base code against the target code
        detected_base_code = self.language_url.split("-")[0]
        if detected_base_code != self.target_language_code:
            self.system_prompt = f"""The website content is in {self.language_url.upper()}. Please first translate it to {self.language_of_reference}. 
            {self.system_prompt.strip()}
            """

        messages = [
            {"role": "system", "content": self.system_prompt},
            {"role": "user", "content": self.content},
        ]
        response = openai.chat.completions.create(model="gpt-4o-mini", messages=messages)
        return response.choices[0].message.content


# Demo run: request a low-detail summary of an ITA news page and print the result
web_page_summarizer = WebPageSummarizer(
    "http://www.ita.br/noticias/revisodeprojetodonovomicrossatlitedoitaaprovada",
    summary_detail="low",
)
print(web_page_summarizer.summarize())

🌍 Detected language: PT
🔗 Preview of extracted text:

ITASAT2 irá atuar para aplicações científicas e de defesa
Publicado em 14/04/2025 - 14h15
O Instituto Tecnológico de Aeronáutica (ITA) realizou, entre os dias 17 e 19 de março, a Revisão Preliminar de Projeto (PDR) do ITASAT 2, novo microssatélite em desenvolvimento por pesquisadores do Centro Espacial ITA (CEI). A atividade representa uma etapa importante dos estudos e contou com a presença de instituições parceiras, tanto do Brasil quanto do exterior.
Participaram do encontro representantes do
...

Amount of words: 526


# 📝 Summary

Exciting developments are happening at the Instituto Tecnológico de Aeronáutica (ITA) with the ITASAT 2 project, a new microsatellite being designed for scientific and defense applications! 🚀 Between March 17-19, 2025, the Preliminary Design Review (PDR) was conducted, marking a significant milestone in the project. The review involved various partners, including prominent institutions like NASA and t