In [6]:
# Import Libraries
import os
import requests
import ollama
from bs4 import BeautifulSoup
from langdetect import detect, LangDetectException

from IPython.display import Markdown, display

# Settings for the local Ollama server (hard-coded here; nothing is read from a .env file)
model = "llama3"
headers = {"Content-Type": "application/json"}
ollama_api = "http://localhost:11434/api/chat"

# Class to extract the paragraph text of a web page and summarize it with an Ollama chat model
class WebPageSummarizer():
    """
    Fetch a web page, extract its paragraph text and summarize it with an Ollama chat model.

    The page is downloaded with ``requests``, the text of every non-empty ``<p>``
    element is collected with BeautifulSoup, the language is detected with
    ``langdetect`` and the (possibly truncated) text is sent to ``ollama.chat``.
    """

    # Upper bound on the number of words of page text sent to the model.
    MAX_WORDS = 7000

    def __init__(self, url: str, summary_detail: str = "high", show_summary: bool = True, language_of_reference = "English", model: str = "llama3", request_timeout: float = 30.0) -> None:
        """
        Store the summarizer settings and validate them.

        Args:
            url: Address of the page to summarize.
            summary_detail: Either "high" or "low"; interpolated into the system prompt.
            show_summary: When True, a preview of the extracted text and its word
                count are printed during ``summarize``.
            language_of_reference: Name of the language the summary should be
                written in; must be a key of ``language_code_map`` (case and
                surrounding whitespace are ignored).
            model: Name of the Ollama model passed to ``ollama.chat``.
            request_timeout: Timeout in seconds passed to ``requests.get``.

        Raises:
            ValueError: If ``summary_detail`` or ``language_of_reference`` is not supported.
        """
        if summary_detail not in ("high", "low"):
            raise ValueError("""Please select summary detail as either "high" or "low".""")

        # Initial summarizer settings
        self.url = url
        self.model = model
        self.show_summary = show_summary
        self.summary_detail = summary_detail
        self.language_of_reference = language_of_reference
        self.request_timeout = request_timeout
        self.language_code_map = {
            "english": "en",
            "portuguese": "pt",
            "spanish": "es",
            "french": "fr",
            "german": "de",
            "italian": "it",
            "japanese": "ja",
            "chinese": "zh",
            "korean": "ko",
        }

        # Browser-like User-Agent sent with the page request
        self.headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
                          "(KHTML, like Gecko) Chrome/123.0.0.0 Safari/537.36"
        }

        # Fail fast: reject an unsupported target language before any network request is made
        self.target_language_code = self._resolve_target_language_code()

    def _resolve_target_language_code(self) -> str:
        """
        Map ``language_of_reference`` to its code in ``language_code_map``.

        Returns:
            The language code (e.g. "en") for the configured target language.

        Raises:
            ValueError: If the language name is not a key of ``language_code_map``.
        """
        target_language_name = self.language_of_reference.lower().strip()
        target_language_code = self.language_code_map.get(target_language_name)

        if not target_language_code:
            raise ValueError(f"❌ Unsupported language: {self.language_of_reference}. Please use one of: {list(self.language_code_map)}")

        return target_language_code

    def __extract_text(self):
        """
        Download ``self.url`` and store its paragraph text in ``self.text``.

        Raises:
            RuntimeError: If the response status code is not 200.
        """
        response = requests.get(self.url, headers=self.headers, timeout=self.request_timeout)
        if response.status_code != 200:
            raise RuntimeError(f"Failed to fetch page. Status code: {response.status_code}")

        soup = BeautifulSoup(response.text, "html.parser")

        # Join the text of every <p> element that is not blank
        paragraph_texts = (p.get_text() for p in soup.find_all("p"))
        self.text = "\n".join(text for text in paragraph_texts if text.strip())

        # Guarantee limit of text to summarize
        words = self.text.split()
        if len(words) > self.MAX_WORDS:
            self.text = " ".join(words[:self.MAX_WORDS])

    def __detect_language(self):
        """
        Detect the language of ``self.text`` and print a short report.

        Sets ``self.language_url`` to the code returned by ``langdetect.detect``,
        or to "unknown" when detection raises ``LangDetectException``.

        Raises:
            ValueError: If ``language_of_reference`` is not supported.
        """
        # Re-resolve in case language_of_reference was changed after construction
        self.target_language_code = self._resolve_target_language_code()

        try:
            self.language_url = detect(self.text)
        except LangDetectException:
            self.language_url = "unknown"

        print(f"🌍 Detected language: {self.language_url.upper()}")

        if self.show_summary:
            print("🔗 Preview of extracted text:\n")
            print(self.text[:500] + "\n...\n")
            print(f"Amount of words: {len(self.text.split())}\n")

    def _build_system_prompt(self) -> str:
        """
        Build the system prompt from the summary detail and the language settings.

        When the detected page language differs from the target language
        (including the "unknown" case), a translation instruction is prepended.

        Returns:
            The system prompt text.
        """
        system_prompt = f""" 
        You are an assistant that analyzes the contents of a website and provides a summary. 
        Please notice that providing a {self.summary_detail} summary detail is IMPORTANT.
        If you find text that might be navigation related or ad related please ignore. Respond in markdown. 
        Also, can you please start your summary with the title "📝 Summary"?
        
        Please show some excited behavior during your summary, making comments with extra knowledge if possible during or at the end of the sentence. 
        """

        # langdetect reports regional variants such as "zh-cn"/"zh-tw", while the
        # code map holds bare codes ("zh"); compare on the primary subtag only.
        detected_base_code = self.language_url.split("-")[0].lower()

        if detected_base_code != self.target_language_code:
            system_prompt = f"""The website content is in {self.language_url.upper()}. Please first translate it to {self.language_of_reference}. 
            {system_prompt.strip()}
            """

        return system_prompt

    def summarize(self)-> str:
        """
        Fetch the page, detect its language and return the model's summary.

        Returns:
            The content of the message returned by ``ollama.chat``.

        Raises:
            RuntimeError: If the page cannot be fetched (non-200 status code).
            ValueError: If ``language_of_reference`` is not supported.
        """
        self.__extract_text()
        self.__detect_language()

        # Prompt for system definition
        self.system_prompt = self._build_system_prompt()

        self.content = f"""The text to summarize is as follows: {self.text}"""

        messages = [{"role": "system", "content": self.system_prompt},
                    {"role": "user", "content": self.content}]

        # Use the model chosen for this instance rather than a module-level global
        response = ollama.chat(model=self.model, messages=messages)

        return response["message"]["content"]


# Demo: summarize an ITA news article with low detail and render the result as Markdown
web_page_summarizer = WebPageSummarizer(
    url="http://www.ita.br/noticias/revisodeprojetodonovomicrossatlitedoitaaprovada",
    summary_detail="low",
)
display(Markdown(web_page_summarizer.summarize()))

🌍 Detected language: PT
🔗 Preview of extracted text:

ITASAT2 irá atuar para aplicações científicas e de defesa
Publicado em 14/04/2025 - 14h15
O Instituto Tecnológico de Aeronáutica (ITA) realizou, entre os dias 17 e 19 de março, a Revisão Preliminar de Projeto (PDR) do ITASAT 2, novo microssatélite em desenvolvimento por pesquisadores do Centro Espacial ITA (CEI). A atividade representa uma etapa importante dos estudos e contou com a presença de instituições parceiras, tanto do Brasil quanto do exterior.
Participaram do encontro representantes do
...

Amount of words: 526



📝 Summary
Oh boy, are you ready for some exciting news?! The Instituto Tecnológico de Aeronáutica (ITA) just had a crucial meeting to review the project of their new microsatellite, ITASAT2! This tiny but mighty satellite will be used for scientific applications and defense purposes. Let me give you the lowdown:

ITASAT2 is a constellation of three cubesats that will monitor the Earth's ionosphere, tracking plasma bubbles and identifying non-collaborative vessels. It's scheduled to launch in the next few years! The meeting was attended by representatives from various organizations, including the Brazilian government, NASA, and more.

The ITASAT2 project is a huge deal, requiring expertise from multiple engineering fields, such as orbital mechanics, control systems, propulsion, structures, and thermal management. After a thorough review, the project received unanimous approval to move forward!

One of the team members, Débora Santos, described the meeting as "extremely productive" for both the reviewers and the team. She praised the well-structured presentations and the team's commitment and competence.

Now, let me tell you about cubesats! They're tiny satellites that come in units called 1U, which measures 10 cm on each side. ITASAT2 is a 16U cube sat, composed of 16 of these units. The ITA has a rich history of developing satellites, including the ITASAT1, launched in 2018, and the SPORT, launched in 2022.

The CEI is also working on another exciting project called SelenITA, which will be part of NASA's Artemis mission to study the Moon!

That's it for now!