In [5]:
# Import Libraries
import os
import requests
from bs4 import BeautifulSoup
from langdetect import detect, LangDetectException

from IPython.display import Markdown, display

# Settings for the local Ollama chat endpoint (hard-coded here; nothing is read from a .env file)
ollama_api = "http://localhost:11434/api/chat"  # URL the summarizer POSTs chat requests to
headers = {"Content-Type": "application/json"}  # headers sent with the chat request
model = "llama3"  # model name placed in the chat request payload

# Class to work with text extraction, processing and summarizing from a given url
class WebPageSummarizer():
    """
    Fetch a web page, extract its paragraph text and ask a chat model to summarize it.

    The page is downloaded with ``requests``, the text of every non-empty ``<p>``
    element is extracted with BeautifulSoup, the language of that text is guessed
    with ``langdetect`` and the text is then sent to the chat endpoint stored in
    the module-level ``ollama_api``.

    Args:
        url: Address of the page to summarize.
        summary_detail: Either ``"high"`` or ``"low"``; interpolated into the system prompt.
        show_summary: When true, a preview of the extracted text and its word count is printed.
        language_of_reference: Name of the language the summary should be written in.
            Must be one of the keys of ``language_code_map`` (case-insensitive).
        model: Stored on the instance as ``self.model``.

    Raises:
        ValueError: If ``summary_detail`` is neither ``"high"`` nor ``"low"``.
    """

    # Upper bound on the number of words forwarded to the model.
    MAX_WORDS = 7000

    def __init__(self, url: str, summary_detail: str = "high", show_summary: bool = True, language_of_reference = "English", model: str = "gpt-4o-mini") -> None:
        # Reject a bad detail level before storing anything else.
        # ValueError is still an Exception, so existing `except Exception` callers keep working.
        if summary_detail not in ("high", "low"):
            raise ValueError("""Please select summary detail as either "high" or "low".""")

        # Initial summarizer settings
        self.url = url
        self.model = model
        self.show_summary = show_summary
        self.summary_detail = summary_detail
        self.language_of_reference = language_of_reference

        # Supported target languages: lower-case language name -> language code
        self.language_code_map = {
            "english": "en",
            "portuguese": "pt",
            "spanish": "es",
            "french": "fr",
            "german": "de",
            "italian": "it",
            "japanese": "ja",
            "chinese": "zh",
            "korean": "ko",
        }

        # Browser-like User-Agent sent with the page request
        self.headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
                          "(KHTML, like Gecko) Chrome/123.0.0.0 Safari/537.36"
        }

    def _resolve_target_language_code(self) -> str:
        """
        Return the language code for ``self.language_of_reference``.

        Raises:
            ValueError: If the language name is not a key of ``self.language_code_map``.
        """
        target_language_name = self.language_of_reference.lower().strip()
        code = self.language_code_map.get(target_language_name)
        if not code:
            # The supported names live on the instance; there is no module-level map.
            raise ValueError(f"❌ Unsupported language: {self.language_of_reference}. Please use one of: {list(self.language_code_map.keys())}")
        return code

    @staticmethod
    def _matches_language(detected_code: str, target_code: str) -> bool:
        """
        Tell whether a detected language code denotes the target language.

        Only the part before the first ``-`` of the detected code is compared, so a
        regional variant such as ``zh-cn`` matches the plain target code ``zh``.
        """
        return detected_code.lower().split("-")[0] == target_code

    def __extract_text(self):
        """
        Download ``self.url`` and store the text of its paragraphs in ``self.text``.

        Raises:
            Exception: If the page request does not answer with status code 200.
        """
        # A timeout keeps an unresponsive server from blocking this call forever.
        response = requests.get(self.url, headers=self.headers, timeout=30)
        if response.status_code != 200:
            raise Exception(f"Failed to fetch page. Status code: {response.status_code}")

        soup = BeautifulSoup(response.text, "html.parser")

        # Keep the text of every <p> element that is not blank
        paragraph_texts = (p.get_text() for p in soup.find_all("p"))
        self.text = "\n".join(text for text in paragraph_texts if text.strip())

        # Cap the amount of text sent to the model
        words = self.text.split()
        if len(words) > self.MAX_WORDS:
            self.text = " ".join(words[:self.MAX_WORDS])

    def __detect_language(self):
        """
        Detect the language of ``self.text`` and resolve the target language code.

        Sets ``self.language_url`` (``"unknown"`` when detection fails) and
        ``self.target_language_code``, then prints the detected language and,
        when ``self.show_summary`` is true, a preview of the extracted text.

        Raises:
            ValueError: If ``self.language_of_reference`` is not supported.
        """
        # Detect language
        try:
            self.language_url = detect(self.text)
        except LangDetectException:
            self.language_url = "unknown"

        # Normalize and resolve target language code
        self.target_language_code = self._resolve_target_language_code()

        print(f"🌍 Detected language: {self.language_url.upper()}")

        if self.show_summary:
            print("🔗 Preview of extracted text:\n")
            print(self.text[:500] + "\n...\n")
            print(f"Amount of words: {len(self.text.split())}\n")

    def summarize(self)-> str:
        """
        Extract the page text, build the prompts and return the model's summary.

        Returns:
            The ``message.content`` field of the chat endpoint's JSON response.

        Raises:
            Exception: If the page or the chat endpoint does not answer with status code 200.
            ValueError: If ``self.language_of_reference`` is not supported.
        """
        self.__extract_text()
        self.__detect_language()

        # Prompt for system definition
        self.system_prompt = f""" 
        You are an assistant that analyzes the contents of a website and provides a summary. 
        Please notice that providing a {self.summary_detail} summary detail is IMPORTANT.
        If you find text that might be navigation related or ad related please ignore. Respond in markdown. 
        Also, can you please start your summary with the tile "📝 Summary"?
        
        Please show some excited behavior during your summary, making comments with extra knowledge if possible during or at the end of the sentence. 
        """

        self.content = f"""The text to summarize is as follows: {self.text}"""

        # Ask for a translation first when the page is not already in the target language
        if not self._matches_language(self.language_url, self.target_language_code):
            self.system_prompt = f"""The website content is in {self.language_url.upper()}. Please first translate it to {self.language_of_reference}. 
            {self.system_prompt.strip()}
            """

        messages=[{"role":"system", "content":self.system_prompt}, 
                  {"role": "user",  "content":self.content}]

        # NOTE(review): the request uses the module-level `model`, not `self.model`;
        # confirm which one is intended before switching.
        # No timeout here on purpose: with "stream": False the reply only arrives
        # once the whole answer has been generated.
        response = requests.post(ollama_api, json = {"model": model, "messages": messages, "stream": False}, headers = headers)
        if response.status_code != 200:
            # Same error style as the page fetch, instead of an opaque KeyError below
            raise Exception(f"Failed to get summary. Status code: {response.status_code}. Response: {response.text[:200]}")

        return response.json()["message"]["content"]


# Summarize a sample news page with the low-detail setting and render the result as Markdown
web_page_summarizer = WebPageSummarizer(
    url="http://www.ita.br/noticias/revisodeprojetodonovomicrossatlitedoitaaprovada",
    summary_detail="low",
)
display(
    Markdown(web_page_summarizer.summarize())
)

🌍 Detected language: PT
🔗 Preview of extracted text:

ITASAT2 irá atuar para aplicações científicas e de defesa
Publicado em 14/04/2025 - 14h15
O Instituto Tecnológico de Aeronáutica (ITA) realizou, entre os dias 17 e 19 de março, a Revisão Preliminar de Projeto (PDR) do ITASAT 2, novo microssatélite em desenvolvimento por pesquisadores do Centro Espacial ITA (CEI). A atividade representa uma etapa importante dos estudos e contou com a presença de instituições parceiras, tanto do Brasil quanto do exterior.
Participaram do encontro representantes do
...

Amount of words: 526



📝 Summary
Oh boy, get ready for some exciting space-related news! 🚀 The Instituto Tecnológico de Aeronáutica (ITA) has just concluded the Preliminary Design Review (PDR) of their new microsatellite project, ITASAT 2! 🎉 This innovative satellite is expected to launch in the coming years and will be a constellation of three cubesats responsible for monitoring the Earth's ionosphere. 🔭 Not only that, but it'll also enable geolocation of radiofrequency sources on land and sea, as well as optical identification of non-collaborative vessels! 🚢

The PDR was attended by representatives from various institutions, including ITA, CENSIPAM, DGDNTM, AEB, INPE, FINEP, NASA, Comando Sul dos Estados Unidos, NRL, and Utah State University. 🌎 This event marks an important milestone in the project's development, showcasing the team's understanding of the objectives and their ability to propose solutions and identify risks! 💡

As you might know, cubesats are small satellites structured in cubic units, each 10 cm on a side. In this case, ITASAT 2 is composed of 16 such units, making it a total of 16U. 📊 The ITA has a rich history of developing and launching satellites for research and student training, including the ITASAT 1 and SPORT projects! 🚀

That's not all, folks! 😄 The CEI is also working on the SelenITA project, a CubeSat that will be part of NASA's Artemis mission to study the Moon! 🌕 Stay tuned for more updates on this exciting development in space technology! 🚀