In [98]:
# Import Libraries
import os
import requests
from openai import OpenAI

from bs4 import BeautifulSoup
from dotenv import load_dotenv

from IPython.display import Markdown, display

In [99]:
# Read the local .env file into the process environment; override=True lets
# values from the file replace variables that are already set.
load_dotenv(override=True)
openai_api_key = os.environ.get('OPENAI_API_KEY')

# Abort right away when the key is absent or empty.
if not openai_api_key:
    raise ValueError("⚠️ OPENAI_API_KEY not found in .env file.")

In [100]:
# Create the client object used for every chat-completion request in this notebook.
# No key is passed explicitly here; presumably the client picks up OPENAI_API_KEY
# from the environment filled in by load_dotenv - confirm against the SDK docs.
# NOTE: the name `openai` is bound to the client instance, not to the `openai` package.
openai = OpenAI()

In [101]:
# Smoke test: send a single short user message and print the model's reply
message = "Hello, GPT! This is a test message to make sure my API works."
response = openai.chat.completions.create(
    model="gpt-4o-mini",
    messages=[{"role": "user", "content": message}],
)
print(response.choices[0].message.content)

Hello! It looks like your API is working just fine. How can I assist you further?


In [102]:
# Class to work with text extraction, processing and summarizing from a given url
class WebPageSummarizer():
    """Download a web page, keep the text of its ``<p>`` elements and summarize it with GPT.

    Attributes:
        url: The page address; stays ``None`` unless the page was fetched with status 200.
        text: Newline-joined text of every non-blank ``<p>`` element of the page.
        system_prompt: System message sent to the model (set by ``summarize``).
        content: User message sent to the model (set by ``summarize``).
    """

    def __init__(self, url: str, show_summary: bool = True, timeout: float = 30):
        """
        Class to work with text extraction, processing and summarizing from a given url using the BeautifulSoup library.

        Args:
            url: Address of the page to download.
            show_summary: When True, print the first 500 characters of the extracted
                text followed by its word count.
            timeout: Seconds passed to ``requests.get`` as its ``timeout`` argument, so an
                unresponsive server cannot block the notebook forever.

        Raises:
            Exception: If the server answers with a status code other than 200.
        """
        # Browser-like User-Agent sent with the page request.
        headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
                          "(KHTML, like Gecko) Chrome/123.0.0.0 Safari/537.36"
        }

        self.url = None
        # requests has no default timeout; without one this call may never return.
        response = requests.get(url, headers=headers, timeout=timeout)
        if response.status_code != 200:
            raise Exception(f"Failed to fetch page. Status code: {response.status_code}")
        self.url = url

        soup = BeautifulSoup(response.text, "html.parser")

        # Only <p> elements are kept; each paragraph's text is extracted once and
        # whitespace-only paragraphs are dropped before joining.
        paragraph_texts = (p.get_text() for p in soup.find_all("p"))
        self.text = "\n".join(chunk for chunk in paragraph_texts if chunk.strip() != "")

        if show_summary:
            print("🔗 Preview of extracted text:\n")
            print(self.text[:500] + "\n...\n")
            print(f"Amount of words: {len(self.text.split())}")

    def summarize(self):
        """Send the extracted text to ``gpt-4o-mini`` and print the model's reply.

        Uses the notebook-level ``openai`` client. The system prompt and the user
        message are stored on the instance as ``system_prompt`` and ``content``.
        Nothing is returned; the reply is written to standard output.
        """
        
        # Prompt for system definition
        self.system_prompt = """ You are an assistant that analyzes the contents of a website and provides a summary. 
        The detail of the summary may depend on the content you may find. Expect to find both complex but also simple contents.
        If you find text that might be navigation related or ad related please ignore. Respond in markdown. Also, can you please start your summary 
        with the tile "📝 Summary"?
        
        If possible, please show cheerful behavior during your summary, something like during summary: "Furthermore, it builds Symmetric Trees (or Oblivious Trees) 
        to handle splits uniformly at each tree level, enhancing performance and interpretability." and then you commne something like: "Isn't that interesting?". 
        Thanks for your help!"""

        self.content = f"""The text to summarize is as follows: {self.text}"""

        response = openai.chat.completions.create(
            model="gpt-4o-mini",
            messages=[
                {"role": "system", "content": self.system_prompt},
                {"role": "user", "content": self.content},
            ],
        )
        print(response.choices[0].message.content)

In [103]:
# Fetch the CatBoost article; the constructor prints a preview of the extracted text
web_page_summarizer = WebPageSummarizer(
    url="https://deep-and-shallow.com/2020/02/29/the-gradient-boosters-v-catboost/",
)

🔗 Preview of extracted text:

Deep & Shallow
While XGBoost and LightGBM reigned the ensembles in Kaggle competitions, another contender took its birth in Yandex, the Google from Russia. It decided to take the path less tread, and took a different approach to Gradient Boosting. They sought to fix a key problem, as they see it, in all the other GBMs in the world.
Let’s take a look at what made it different:
Let’s take a look at the innovation which gave the algorithm it’s name – CatBoost. Unlike XGBoost, CatBoost deals with Ca
...

Amount of words: 1615


In [104]:
# Ask the model for a summary of the extracted text. This version of
# summarize() prints the reply itself and returns nothing.
web_page_summarizer.summarize()

# 📝 Summary

CatBoost, developed by Yandex, takes a fresh and innovative approach to Gradient Boosting, particularly by handling categorical variables natively, which differentiates it from competitors like XGBoost and LightGBM. Isn't that fascinating? Unlike traditional methods that often struggle with high cardinality categorical features, CatBoost embraces the concept of Ordered Target Statistics. This helps mitigate issues of target leakage, ensuring that the model doesn't overfit to the training data by treating target statistics sequentially in a creatively constructed "artificial time".

The authors highlight the problems of traditional gradient boosting, such as the tendency to reuse the same dataset for each iteration, leading to target leakage. CatBoost introduces a practical solution through Ordered Boosting — generating multiple permutations of the dataset to create a robust model that reduces variance in predictions. 

Moreover, CatBoost builds Symmetric Trees (or Obliviou

In [123]:
# Import Libraries
import os
import requests
from openai import OpenAI

from bs4 import BeautifulSoup
from langdetect import detect, LangDetectException
from dotenv import load_dotenv

from IPython.display import Markdown, display

# Read the local .env file into the process environment; override=True lets
# values from the file replace variables that are already set.
load_dotenv(override=True)
openai_api_key = os.environ.get('OPENAI_API_KEY')

# Abort right away when the key is absent or empty.
if not openai_api_key:
    raise ValueError("⚠️ OPENAI_API_KEY not found in .env file.")

# Client object used for all GPT requests below
openai = OpenAI()

# Class to work with text extraction, processing and summarizing from a given url
class WebPageSummarizer():
    """Download a web page, extract its paragraph text and summarize it with GPT.

    The page language is detected with ``langdetect``; when it differs from the
    requested language of reference, the model is additionally asked to translate.

    Attributes:
        url: The page address; stays ``None`` unless the page was fetched with status 200.
        summary_detail: ``"high"`` or ``"low"``, inserted into the system prompt.
        language_of_reference: Language name the summary should be written in.
        target_language_code: Two-letter code resolved from ``language_of_reference``.
        text: Extracted paragraph text, capped at ``max_words`` words.
        language_url: Code returned by ``langdetect.detect`` for ``text``, or
            ``"unknown"`` when detection raised ``LangDetectException``.
        system_prompt: System message sent to the model (set by ``summarize``).
        content: User message sent to the model (set by ``summarize``).
    """

    def __init__(self, url: str, summary_detail: str = "high", show_summary: bool = True, language_of_reference = "English",
                 timeout: float = 30, max_words: int = 7000) -> None:
        """
        Class to work with text extraction, processing and summarizing from a given url using the BeautifulSoup library.

        Args:
            url: Address of the page to download.
            summary_detail: Either ``"high"`` or ``"low"``.
            show_summary: When True, print the first 500 characters of the extracted
                text followed by its word count.
            language_of_reference: Name of the target language (case-insensitive); must
                be one of the keys of the language map defined below.
            timeout: Seconds passed to ``requests.get`` as its ``timeout`` argument, so an
                unresponsive server cannot block the notebook forever.
            max_words: Maximum number of whitespace-separated words kept from the page.

        Raises:
            Exception: If ``summary_detail`` is invalid or the server answers with a
                status code other than 200.
            ValueError: If ``language_of_reference`` is not a supported language name.
        """

        LANGUAGE_CODE_MAP = {
            "english": "en",
            "portuguese": "pt",
            "spanish": "es",
            "french": "fr",
            "german": "de",
            "italian": "it",
            "japanese": "ja",
            "chinese": "zh",
            "korean": "ko",
        }

        self.summary_detail = summary_detail
        self.url = None
        self.language_of_reference = language_of_reference

        # Reject bad arguments before spending a network round-trip on the page.
        if self.summary_detail not in ["high", "low"]:
            raise Exception("""Please select summary detail as either "high" or "low". """)

        # Normalize and resolve target language code
        target_language_name = self.language_of_reference.lower().strip()
        self.target_language_code = LANGUAGE_CODE_MAP.get(target_language_name)
        if not self.target_language_code:
            raise ValueError(f"❌ Unsupported language: {self.language_of_reference}. Please use one of: {list(LANGUAGE_CODE_MAP.keys())}")

        # Browser-like User-Agent sent with the page request.
        headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
                          "(KHTML, like Gecko) Chrome/123.0.0.0 Safari/537.36"
        }

        # requests has no default timeout; without one this call may never return.
        response = requests.get(url, headers=headers, timeout=timeout)
        if response.status_code != 200:
            raise Exception(f"Failed to fetch page. Status code: {response.status_code}")
        self.url = url

        soup = BeautifulSoup(response.text, "html.parser")

        # Only <p> elements are kept; whitespace-only paragraphs are dropped.
        paragraph_texts = (p.get_text() for p in soup.find_all("p"))
        self.text = "\n".join(chunk for chunk in paragraph_texts if chunk.strip() != "")

        # Guarantee limit of text to summary (truncated text is re-joined with single spaces)
        words = self.text.split()
        if len(words) > max_words:
            words = words[:max_words]
            self.text = " ".join(words)

        try:
            self.language_url = detect(self.text)
        except LangDetectException:
            self.language_url = "unknown"

        print(f"🌍 Detected language: {self.language_url.upper()}")

        if show_summary:
            print("🔗 Preview of extracted text:\n")
            print(self.text[:500] + "\n...\n")
            print(f"Amount of words: {len(words)}")

    def summarize(self)-> str:
        """Send the extracted text to ``gpt-4o-mini`` and return the model's reply.

        Uses the notebook-level ``openai`` client. A translation instruction is put in
        front of the system prompt when the detected page language differs from the
        target language.

        Returns:
            The content of the first choice of the chat completion.
        """
        
        # Prompt for system definition
        self.system_prompt = f""" 
        You are an assistant that analyzes the contents of a website and provides a summary. 
        Please notice that providing a {self.summary_detail} summary detail is IMPORTANT.
        If you find text that might be navigation related or ad related please ignore. Respond in markdown. Also, can you please start your summary 
        with the tile "📝 Summary"?
        
        Please show some excited behavior during your summary, making comments with extra knowledge if possible during or at the end of the sentence. 
        """

        self.content = f"""The text to summarize is as follows: {self.text}"""

        # langdetect reports regional variants such as "zh-cn"/"zh-tw", while the target
        # codes are bare two-letter codes, so compare on the primary subtag only.
        # NOTE: a failed detection ("unknown") still takes the translation branch.
        detected_base_code = self.language_url.split("-")[0]
        if detected_base_code != self.target_language_code:
            self.system_prompt = f"""The website content is in {self.language_url.upper()}. Please first translate it to {self.language_of_reference}. 
            {self.system_prompt.strip()}
            """

        response = openai.chat.completions.create(
            model="gpt-4o-mini",
            messages=[
                {"role": "system", "content": self.system_prompt},
                {"role": "user", "content": self.content},
            ],
        )
        return response.choices[0].message.content


# Summarize the ITA news article with a short ("low" detail) summary.
web_page_summarizer = WebPageSummarizer("http://www.ita.br/noticias/revisodeprojetodonovomicrossatlitedoitaaprovada", summary_detail = "low")

# The model is asked to answer in Markdown, so render the reply with the
# already-imported IPython helpers instead of printing the raw markup.
display(Markdown(web_page_summarizer.summarize()))

🌍 Detected language: PT
🔗 Preview of extracted text:

ITASAT2 irá atuar para aplicações científicas e de defesa
Publicado em 14/04/2025 - 14h15
O Instituto Tecnológico de Aeronáutica (ITA) realizou, entre os dias 17 e 19 de março, a Revisão Preliminar de Projeto (PDR) do ITASAT 2, novo microssatélite em desenvolvimento por pesquisadores do Centro Espacial ITA (CEI). A atividade representa uma etapa importante dos estudos e contou com a presença de instituições parceiras, tanto do Brasil quanto do exterior.
Participaram do encontro representantes do
...

Amount of words: 526
# 📝 Summary

Exciting news from the Instituto Tecnológico de Aeronáutica (ITA)! They recently conducted a Preliminary Design Review (PDR) for their new microsatellite, ITASAT 2, which is set to focus on scientific and defense applications! 🎉 The PDR took place from March 17 to 19 and involved a collaboration of international partners, including NASA and the U.S. Naval Research Laboratory, showcasing the project's br

In [128]:
# Import Libraries
import os
import requests
from openai import OpenAI

from bs4 import BeautifulSoup
from langdetect import detect, LangDetectException
from dotenv import load_dotenv

from IPython.display import Markdown, display

# Read the local .env file into the process environment; override=True lets
# values from the file replace variables that are already set.
load_dotenv(override=True)
openai_api_key = os.environ.get('OPENAI_API_KEY')

# Abort right away when the key is absent or empty.
if not openai_api_key:
    raise ValueError("⚠️ OPENAI_API_KEY not found in .env file.")

# Client object used for all GPT requests below
openai = OpenAI()

# Class to work with text extraction, processing and summarizing from a given url
class WebPageSummarizer():
    """
    Class to work with text extraction, processing and summarizing from a given url using the BeautifulSoup library.

    Construction only stores and validates settings; the page is downloaded, its
    language detected and the model called each time ``summarize`` runs.

    Attributes:
        url: Address of the page to summarize.
        show_summary: Whether a preview of the extracted text is printed.
        summary_detail: ``"high"`` or ``"low"``, inserted into the system prompt.
        language_of_reference: Language name the summary should be written in.
        language_code_map: Supported language names mapped to two-letter codes.
        headers: HTTP headers sent with the page request.
        timeout: Seconds passed to ``requests.get`` as its ``timeout`` argument.
        max_words: Maximum number of whitespace-separated words kept from the page.
        text, language_url, target_language_code, system_prompt, content:
            Set while ``summarize`` runs.
    """
    def __init__(self, url: str, summary_detail: str = "high", show_summary: bool = True, language_of_reference = "English",
                 timeout: float = 30, max_words: int = 7000) -> None:
        """Store the summarizer settings.

        Args:
            url: Address of the page to download.
            summary_detail: Either ``"high"`` or ``"low"``.
            show_summary: When True, ``summarize`` prints the first 500 characters of
                the extracted text followed by its word count.
            language_of_reference: Name of the target language (case-insensitive).
            timeout: Seconds to wait for the web server, so an unresponsive server
                cannot block the notebook forever.
            max_words: Maximum number of words of page text sent to the model.

        Raises:
            Exception: If ``summary_detail`` is neither ``"high"`` nor ``"low"``.
        """

        # Initial summarizer settings
        self.url = url
        self.show_summary = show_summary
        self.summary_detail = summary_detail
        self.language_of_reference = language_of_reference
        self.timeout = timeout
        self.max_words = max_words
        self.language_code_map = {
            "english": "en",
            "portuguese": "pt",
            "spanish": "es",
            "french": "fr",
            "german": "de",
            "italian": "it",
            "japanese": "ja",
            "chinese": "zh",
            "korean": "ko",
        }
        # Browser-like User-Agent sent with the page request.
        self.headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
                          "(KHTML, like Gecko) Chrome/123.0.0.0 Safari/537.36"
        }

        if self.summary_detail not in ["high", "low"]:
            raise Exception("""Please select summary detail as either "high" or "low".""")

    def __extract_text(self):
        """Download ``self.url`` and store its paragraph text in ``self.text``.

        Raises:
            Exception: If the server answers with a status code other than 200.
        """
        # requests has no default timeout; without one this call may never return.
        response = requests.get(self.url, headers=self.headers, timeout=self.timeout)
        if response.status_code != 200:
            raise Exception(f"Failed to fetch page. Status code: {response.status_code}")

        soup = BeautifulSoup(response.text, "html.parser")

        # Only <p> elements are kept; whitespace-only paragraphs are dropped.
        paragraph_texts = (p.get_text() for p in soup.find_all("p"))
        self.text = "\n".join(chunk for chunk in paragraph_texts if chunk.strip() != "")

        # Guarantee limit of text to summary (truncated text is re-joined with single spaces)
        words = self.text.split()
        if len(words) > self.max_words:
            self.text = " ".join(words[:self.max_words])

    def __detect_language(self):
        """Detect the language of ``self.text`` and resolve the target language code.

        Sets ``self.language_url`` (``"unknown"`` when detection raises
        ``LangDetectException``) and ``self.target_language_code``, prints the detected
        language and, when ``show_summary`` is True, a preview of the text.

        Raises:
            ValueError: If ``language_of_reference`` is not a key of ``language_code_map``.
        """
        # Detect language
        try:
            self.language_url = detect(self.text)
        except LangDetectException:
            self.language_url = "unknown"

        # Normalize and resolve target language code
        target_language_name = self.language_of_reference.lower().strip()
        self.target_language_code = self.language_code_map.get(target_language_name)

        if not self.target_language_code:
            # The supported names live on the instance; there is no module-level map.
            raise ValueError(f"❌ Unsupported language: {self.language_of_reference}. Please use one of: {list(self.language_code_map.keys())}")

        print(f"🌍 Detected language: {self.language_url.upper()}")

        if self.show_summary:
            print("🔗 Preview of extracted text:\n")
            print(self.text[:500] + "\n...\n")
            print(f"Amount of words: {len(self.text.split())}\n\n")

    def summarize(self)-> str:
        """Fetch the page, detect its language and return the model's summary.

        Uses the notebook-level ``openai`` client. A translation instruction is put in
        front of the system prompt when the detected page language differs from the
        target language.

        Returns:
            The content of the first choice of the chat completion.

        Raises:
            Exception: If the page cannot be fetched with status 200.
            ValueError: If ``language_of_reference`` is not supported.
        """
        self.__extract_text()
        self.__detect_language()
        
        # Prompt for system definition
        self.system_prompt = f""" 
        You are an assistant that analyzes the contents of a website and provides a summary. 
        Please notice that providing a {self.summary_detail} summary detail is IMPORTANT.
        If you find text that might be navigation related or ad related please ignore. Respond in markdown. Also, can you please start your summary 
        with the tile "📝 Summary"?
        
        Please show some excited behavior during your summary, making comments with extra knowledge if possible during or at the end of the sentence. 
        """

        self.content = f"""The text to summarize is as follows: {self.text}"""

        # langdetect reports regional variants such as "zh-cn"/"zh-tw", while the target
        # codes are bare two-letter codes, so compare on the primary subtag only.
        # NOTE: a failed detection ("unknown") still takes the translation branch.
        detected_base_code = self.language_url.split("-")[0]
        if detected_base_code != self.target_language_code:
            self.system_prompt = f"""The website content is in {self.language_url.upper()}. Please first translate it to {self.language_of_reference}. 
            {self.system_prompt.strip()}
            """

        response = openai.chat.completions.create(
            model="gpt-4o-mini",
            messages=[
                {"role": "system", "content": self.system_prompt},
                {"role": "user", "content": self.content},
            ],
        )
        return response.choices[0].message.content


# Summarize the ITA news article with a short ("low" detail) summary.
web_page_summarizer = WebPageSummarizer("http://www.ita.br/noticias/revisodeprojetodonovomicrossatlitedoitaaprovada", summary_detail = "low")

# The model is asked to answer in Markdown, so render the reply with the
# already-imported IPython helpers instead of printing the raw markup.
display(Markdown(web_page_summarizer.summarize()))

🌍 Detected language: PT
🔗 Preview of extracted text:

ITASAT2 irá atuar para aplicações científicas e de defesa
Publicado em 14/04/2025 - 14h15
O Instituto Tecnológico de Aeronáutica (ITA) realizou, entre os dias 17 e 19 de março, a Revisão Preliminar de Projeto (PDR) do ITASAT 2, novo microssatélite em desenvolvimento por pesquisadores do Centro Espacial ITA (CEI). A atividade representa uma etapa importante dos estudos e contou com a presença de instituições parceiras, tanto do Brasil quanto do exterior.
Participaram do encontro representantes do
...

Amount of words: 526


# 📝 Summary

The ITASAT2 project, undertaken by the Aeronautical Institute of Technology (ITA), focuses on scientific and defense applications! Between March 17-19, 2025, a Preliminary Design Review (PDR) was conducted, a crucial step involving both Brazilian and international partners like NASA and the U.S. Southern Command. Excitingly, ITASAT 2 will be a constellation of three CubeSats aimed at monitoring the Ea