<a href="https://colab.research.google.com/github/shravanisaraf/two-minute-reads-summarizer/blob/main/testing_all_models.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
'''1  minute 40 seconds'''
import requests
from bs4 import BeautifulSoup
import urllib.parse
import logging
import time
from urllib.robotparser import RobotFileParser
from transformers import BartTokenizer, BartForConditionalGeneration
import torch

logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

def is_valid_url(url):
    parsed_url = urllib.parse.urlparse(url)
    return bool(parsed_url.scheme and parsed_url.netloc)

def fetch_url_content(url):
    try:
        if not is_valid_url(url):
            logging.error("Invalid URL format. Please include a valid scheme (http or https).")
            return "Invalid URL format. Please include a valid scheme (http or https)."

        if not is_allowed_by_robots(url):
            logging.warning("URL is disallowed by robots.txt.")
            return "Access to this URL is disallowed by robots.txt."

        headers = {'User-Agent': 'Mozilla/5.0 (compatible; WebCrawler/1.0)'}
        response = requests.get(url, headers=headers)

        if response.status_code == 200:
            soup = BeautifulSoup(response.text, 'html.parser')

            title = soup.find('title')
            page_title = title.text.strip() if title else "No title found"

            paragraphs = soup.find_all('p')
            content = "\n".join([p.get_text(strip=True) for p in paragraphs])

            return f"**Page Title:** {page_title}\n\n**Content:**\n{content}"
        else:
            logging.error(f"Failed to retrieve URL. HTTP Status Code: {response.status_code}")
            return f"Failed to retrieve URL. HTTP Status Code: {response.status_code}"
    except requests.exceptions.RequestException as e:
        logging.error(f"An error occurred: {e}")
        return f"An error occurred: {e}"

def is_allowed_by_robots(url):
    parsed_url = urllib.parse.urlparse(url)
    robots_url = f"{parsed_url.scheme}://{parsed_url.netloc}/robots.txt"
    rp = RobotFileParser()
    rp.set_url(robots_url)
    rp.read()
    return rp.can_fetch('*', url)

def summarize_text(text, model, tokenizer):
    max_length = 1024  # Max length for BART
    inputs = tokenizer.batch_encode_plus([text], return_tensors='pt', max_length=max_length, truncation=True)
    summary_ids = model.generate(inputs['input_ids'], max_length=150, min_length=40, length_penalty=2.0, num_beams=4, early_stopping=True)
    summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)
    return summary

def chunk_text(text, chunk_size=1000):
    words = text.split()
    for i in range(0, len(words), chunk_size):
        yield ' '.join(words[i:i + chunk_size])

def main():
    # Load the BART model and tokenizer
    model_name = 'facebook/bart-large-cnn'
    tokenizer = BartTokenizer.from_pretrained(model_name)
    model = BartForConditionalGeneration.from_pretrained(model_name)

    while True:
        url = input("Enter the URL (or type 'exit' to quit): ")
        if url.lower() == 'exit':
            logging.info("Exiting the URL fetcher.")
            break
        logging.info(f"Fetching content from URL: {url}")
        content = fetch_url_content(url)

        if "Content" in content:
            # Extract the text content only
            text_content = content.split("**Content:**\n", 1)[1]

            # Summarize in chunks if text is too long
            summaries = []
            for chunk in chunk_text(text_content):
                logging.info("Generating summary for a chunk of text...")
                summary = summarize_text(chunk, model, tokenizer)
                summaries.append(summary)

            final_summary = ' '.join(summaries)
            logging.info("Summary generated.")
            print(f"Summary:\n{final_summary}\n")
        else:
            print(content)

        time.sleep(1)


if __name__ == "__main__":
    main()


Enter the URL (or type 'exit' to quit): https://www.discoverdatascience.org/articles/cybersecurity-analyst-vs-engineer/
Summary:
DiscoverDataScience.org created byaasif.faizal It’s important to determine what type of data science career you’d like at the outset of your educational journey. One excellent area of specialization within the big data industry iscybersecurity. There are two primary types of cybersecurity specialists:security analysts and security engineers. Cybersecurity analysts and engineers draw from similar bodies of knowledge. Both require the ability to think ahead and imagine what the objectives and tactics of cyberattackers might be. There are terrific career options in the world of cybersecurity for candidates holding either degree.

Enter the URL (or type 'exit' to quit): exit


In [None]:
'''1 min 6 seconds'''
import requests
from bs4 import BeautifulSoup
import urllib.parse
import logging
import time
from urllib.robotparser import RobotFileParser
from transformers import BartTokenizer, BartForConditionalGeneration
import torch

logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

def is_valid_url(url):
    parsed_url = urllib.parse.urlparse(url)
    return bool(parsed_url.scheme and parsed_url.netloc)

def fetch_url_content(url):
    try:
        if not is_valid_url(url):
            logging.error("Invalid URL format. Please include a valid scheme (http or https).")
            return "Invalid URL format. Please include a valid scheme (http or https)."

        if not is_allowed_by_robots(url):
            logging.warning("URL is disallowed by robots.txt.")
            return "Access to this URL is disallowed by robots.txt."

        headers = {'User-Agent': 'Mozilla/5.0 (compatible; WebCrawler/1.0)'}
        response = requests.get(url, headers=headers)

        if response.status_code == 200:
            soup = BeautifulSoup(response.text, 'html.parser')

            title = soup.find('title')
            page_title = title.text.strip() if title else "No title found"

            paragraphs = soup.find_all('p')
            content = "\n".join([p.get_text(strip=True) for p in paragraphs])

            return page_title, content
        else:
            logging.error(f"Failed to retrieve URL. HTTP Status Code: {response.status_code}")
            return None, f"Failed to retrieve URL. HTTP Status Code: {response.status_code}"
    except requests.exceptions.RequestException as e:
        logging.error(f"An error occurred: {e}")
        return None, f"An error occurred: {e}"

def is_allowed_by_robots(url):
    parsed_url = urllib.parse.urlparse(url)
    robots_url = f"{parsed_url.scheme}://{parsed_url.netloc}/robots.txt"
    rp = RobotFileParser()
    rp.set_url(robots_url)
    rp.read()
    return rp.can_fetch('*', url)

def summarize_text(text, model, tokenizer):
    max_length = 1024  # Max length for BART
    inputs = tokenizer.batch_encode_plus([text], return_tensors='pt', max_length=max_length, truncation=True)
    summary_ids = model.generate(inputs['input_ids'], max_length=150, min_length=40, length_penalty=2.0, num_beams=4, early_stopping=True)
    summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)
    return summary

def chunk_text(text, chunk_size=1000):
    words = text.split()
    for i in range(0, len(words), chunk_size):
        yield ' '.join(words[i:i + chunk_size])

def main():
    # Load the BART model and tokenizer
    model_name = 'facebook/bart-large-cnn'
    tokenizer = BartTokenizer.from_pretrained(model_name)
    model = BartForConditionalGeneration.from_pretrained(model_name)

    while True:
        url = input("Enter the URL (or type 'exit' to quit): ")
        if url.lower() == 'exit':
            logging.info("Exiting the URL fetcher.")
            break
        logging.info(f"Fetching content from URL: {url}")
        page_title, content = fetch_url_content(url)

        if content and "Failed to retrieve URL" not in content and "Invalid URL format" not in content:
            # Summarize in chunks if text is too long
            summaries = []
            for chunk in chunk_text(content):
                logging.info("Generating summary for a chunk of text...")
                summary = summarize_text(chunk, model, tokenizer)
                summaries.append(summary)

            final_summary = ' '.join(summaries)
            logging.info("Summary generated.")
            print(f"\n{'='*40}")
            print(f"Title: {page_title}\n")
            print(f"Content:\n{content[:1000]}{'...' if len(content) > 1000 else ''}\n")
            print(f"Summary:\n{final_summary}\n")
            print(f"{'='*40}\n")
        else:
            print(content)

        time.sleep(1)

if __name__ == "__main__":
    main()


Enter the URL (or type 'exit' to quit):  https://www.discoverdatascience.org/articles/cybersecurity-analyst-vs-engineer/

Title: Cyber Security Analyst vs Cyber Security Engineer | Salary

Content:
DiscoverDataScience.org
Created byaasif.faizal
It’s important to determine what type of data science career you’d like at the outset of your educational journey, recognizing that you’re committing to an area of focus that you’ll be enthusiastic to pursue in years to come.

Included in this Article:
What is Cybersecurity?
Cyber Security Analyst and Cyber Security Engineer
Cybersecurity Analyst vs. Engineer – Which One is Right for You?
One excellent area of specialization within the big data industry iscybersecurity. For obvious reasons, everyone is concerned with protecting their digital archives, documents, and correspondences, from government agencies and medical facilities to companies large and small to even private individuals. If you are looking to cultivate a skill set that will never

In [None]:
'''pegasus --> 1 min 30 seconds'''
import requests
from bs4 import BeautifulSoup
import urllib.parse
import logging
import time
from urllib.robotparser import RobotFileParser
from transformers import PegasusForConditionalGeneration, PegasusTokenizer

# Configure logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

# Summarization function using Pegasus
def summarize_text(text):
    model_name = "google/pegasus-xsum"
    tokenizer = PegasusTokenizer.from_pretrained(model_name)
    model = PegasusForConditionalGeneration.from_pretrained(model_name)

    tokens = tokenizer(text, truncation=True, padding="longest", return_tensors="pt")
    summary_ids = model.generate(tokens["input_ids"], max_length=60, num_beams=4, length_penalty=2.0, early_stopping=True)
    summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)
    return summary

# URL validation
def is_valid_url(url):
    parsed_url = urllib.parse.urlparse(url)
    return bool(parsed_url.scheme and parsed_url.netloc)

# Fetch URL content
def fetch_url_content(url):
    try:
        if not is_valid_url(url):
            logging.error("Invalid URL format. Please include a valid scheme (http or https).")
            return "Invalid URL format. Please include a valid scheme (http or https)."

        if not is_allowed_by_robots(url):
            logging.warning("URL is disallowed by robots.txt.")
            return "Access to this URL is disallowed by robots.txt."

        headers = {'User-Agent': 'Mozilla/5.0 (compatible; WebCrawler/1.0)'}
        response = requests.get(url, headers=headers)

        if response.status_code == 200:
            soup = BeautifulSoup(response.text, 'html.parser')

            title = soup.find('title')
            page_title = title.text.strip() if title else "No title found"

            paragraphs = soup.find_all('p')
            content = "\n".join([p.get_text(strip=True) for p in paragraphs])

            return f"**Page Title:** {page_title}\n\n**Content:**\n{content}"
        else:
            logging.error(f"Failed to retrieve URL. HTTP Status Code: {response.status_code}")
            return f"Failed to retrieve URL. HTTP Status Code: {response.status_code}"
    except requests.exceptions.RequestException as e:
        logging.error(f"An error occurred: {e}")
        return f"An error occurred: {e}"

# Check robots.txt permissions
def is_allowed_by_robots(url):
    parsed_url = urllib.parse.urlparse(url)
    robots_url = f"{parsed_url.scheme}://{parsed_url.netloc}/robots.txt"
    rp = RobotFileParser()
    rp.set_url(robots_url)
    rp.read()
    return rp.can_fetch('*', url)

def main():
    while True:
        url = input("Enter the URL (or type 'exit' to quit): ")
        if url.lower() == 'exit':
            logging.info("Exiting the URL fetcher.")
            break
        logging.info(f"Fetching content from URL: {url}")
        content = fetch_url_content(url)

        if "Content" in content:
            start_idx = content.find("**Content:**") + len("**Content:**\n")
            text_content = content[start_idx:]
            summary = summarize_text(text_content)
            print(f"Summary:\n{summary}")
        else:
            print(content)

        time.sleep(1)

if __name__ == "__main__":
    main()


Enter the URL (or type 'exit' to quit): https://www.discoverdatascience.org/articles/cybersecurity-analyst-vs-engineer/


Some weights of PegasusForConditionalGeneration were not initialized from the model checkpoint at google/pegasus-xsum and are newly initialized: ['model.decoder.embed_positions.weight', 'model.encoder.embed_positions.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Summary:
If you’re considering a career in the data science field, there are a number of excellent areas of study that you could consider.
Enter the URL (or type 'exit' to quit): exit


In [None]:
'''51 seconds'''
import requests
from bs4 import BeautifulSoup
import urllib.parse
import logging
import time
from urllib.robotparser import RobotFileParser
from transformers import BartTokenizer, BartForConditionalGeneration

logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

# Initialize the BART tokenizer and model
tokenizer = BartTokenizer.from_pretrained('facebook/bart-large-cnn')
model = BartForConditionalGeneration.from_pretrained('facebook/bart-large-cnn')

def is_valid_url(url):
    parsed_url = urllib.parse.urlparse(url)
    return bool(parsed_url.scheme and parsed_url.netloc)

def fetch_url_content(url):
    try:
        if not is_valid_url(url):
            logging.error("Invalid URL format. Please include a valid scheme (http or https).")
            return "Invalid URL format. Please include a valid scheme (http or https)."

        if not is_allowed_by_robots(url):
            logging.warning("URL is disallowed by robots.txt.")
            return "Access to this URL is disallowed by robots.txt."

        headers = {'User-Agent': 'Mozilla/5.0 (compatible; WebCrawler/1.0)'}
        response = requests.get(url, headers=headers)

        if response.status_code == 200:
            soup = BeautifulSoup(response.text, 'html.parser')

            title = soup.find('title')
            page_title = title.text.strip() if title else "No title found"

            paragraphs = soup.find_all('p')
            content = "\n".join([p.get_text(strip=True) for p in paragraphs])

            # Summarize the content
            summarized_content = summarize_content(content)

            return f"**Page Title:** {page_title}\n\n**Original Content:**\n{content}\n\n**Summarized Content:**\n{summarized_content}"
        else:
            logging.error(f"Failed to retrieve URL. HTTP Status Code: {response.status_code}")
            return f"Failed to retrieve URL. HTTP Status Code: {response.status_code}"
    except requests.exceptions.RequestException as e:
        logging.error(f"An error occurred: {e}")
        return f"An error occurred: {e}"

def is_allowed_by_robots(url):
    parsed_url = urllib.parse.urlparse(url)
    robots_url = f"{parsed_url.scheme}://{parsed_url.netloc}/robots.txt"
    rp = RobotFileParser()
    rp.set_url(robots_url)
    rp.read()
    return rp.can_fetch('*', url)

def summarize_content(content):
    try:
        inputs = tokenizer.encode("summarize: " + content, return_tensors="pt", max_length=1024, truncation=True)
        summary_ids = model.generate(inputs, max_length=150, min_length=30, length_penalty=2.0, num_beams=4, early_stopping=True)
        summarized = tokenizer.decode(summary_ids[0], skip_special_tokens=True)
        return summarized
    except Exception as e:
        logging.error(f"An error occurred during summarization: {e}")
        return "An error occurred during summarization."

def main():
    while True:
        url = input("Enter the URL (or type 'exit' to quit): ")
        if url.lower() == 'exit':
            logging.info("Exiting the URL fetcher.")
            break
        logging.info(f"Fetching content from URL: {url}")
        content = fetch_url_content(url)

        print(content)
        time.sleep(1)

if __name__ == "__main__":
    main()


Enter the URL (or type 'exit' to quit): https://www.discoverdatascience.org/articles/cybersecurity-analyst-vs-engineer/
**Page Title:** Cyber Security Analyst vs Cyber Security Engineer | Salary

**Original Content:**
DiscoverDataScience.org
Created byaasif.faizal
It’s important to determine what type of data science career you’d like at the outset of your educational journey, recognizing that you’re committing to an area of focus that you’ll be enthusiastic to pursue in years to come.

Included in this Article:
What is Cybersecurity?
Cyber Security Analyst and Cyber Security Engineer
Cybersecurity Analyst vs. Engineer – Which One is Right for You?
One excellent area of specialization within the big data industry iscybersecurity. For obvious reasons, everyone is concerned with protecting their digital archives, documents, and correspondences, from government agencies and medical facilities to companies large and small to even private individuals. If you are looking to cultivate a skill

In [1]:
import requests
from bs4 import BeautifulSoup
import urllib.parse
import logging
import time
from urllib.robotparser import RobotFileParser
from transformers import BartTokenizer, BartForConditionalGeneration

logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

# Initialize the BART tokenizer and model
tokenizer = BartTokenizer.from_pretrained('facebook/bart-large-cnn')
model = BartForConditionalGeneration.from_pretrained('facebook/bart-large-cnn')

def is_valid_url(url):
    parsed_url = urllib.parse.urlparse(url)
    return bool(parsed_url.scheme and parsed_url.netloc)

def fetch_url_content(url):
    try:
        if not is_valid_url(url):
            logging.error("Invalid URL format. Please include a valid scheme (http or https).")
            return "Invalid URL format. Please include a valid scheme (http or https)."

        if not is_allowed_by_robots(url):
            logging.warning("URL is disallowed by robots.txt.")
            return "Access to this URL is disallowed by robots.txt."

        headers = {'User-Agent': 'Mozilla/5.0 (compatible; WebCrawler/1.0)'}
        response = requests.get(url, headers=headers)

        if response.status_code == 200:
            soup = BeautifulSoup(response.text, 'html.parser')

            title = soup.find('title')
            page_title = title.text.strip() if title else "No title found"

            paragraphs = soup.find_all('p')
            content = "\n".join([p.get_text(strip=True) for p in paragraphs])

            # Summarize the content
            summarized_content = summarize_content(content)

            return f"**Page Title:** {page_title}\n\n**Original Content:**\n{content}\n\n**Summarized Content:**\n{summarized_content}"
        else:
            logging.error(f"Failed to retrieve URL. HTTP Status Code: {response.status_code}")
            return f"Failed to retrieve URL. HTTP Status Code: {response.status_code}"
    except requests.exceptions.RequestException as e:
        logging.error(f"An error occurred: {e}")
        return f"An error occurred: {e}"

def is_allowed_by_robots(url):
    parsed_url = urllib.parse.urlparse(url)
    robots_url = f"{parsed_url.scheme}://{parsed_url.netloc}/robots.txt"
    rp = RobotFileParser()
    rp.set_url(robots_url)
    rp.read()
    return rp.can_fetch('*', url)

def summarize_content(content):
    try:
        # Adjusted max_length for BART
        inputs = tokenizer.encode("summarize: " + content, return_tensors="pt", max_length=1024, truncation=True)
        summary_ids = model.generate(inputs, max_length=150, min_length=30, length_penalty=2.0, num_beams=4, early_stopping=True)
        summarized = tokenizer.decode(summary_ids[0], skip_special_tokens=True)
        return summarized
    except Exception as e:
        logging.error(f"An error occurred during summarization: {e}")
        return "An error occurred during summarization."

def main():
    while True:
        url = input("Enter the URL (or type 'exit' to quit): ")
        if url.lower() == 'exit':
            logging.info("Exiting the URL fetcher.")
            break
        logging.info(f"Fetching content from URL: {url}")
        content = fetch_url_content(url)
        print(content)
        time.sleep(1)

if __name__ == "__main__":
    main()


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]



config.json:   0%|          | 0.00/1.58k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.63G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

Enter the URL (or type 'exit' to quit): https://www.discoverdatascience.org/articles/cybersecurity-analyst-vs-engineer/
**Page Title:** Cyber Security Analyst vs Cyber Security Engineer | Salary

**Original Content:**
DiscoverDataScience.org
Created byaasif.faizal
It’s important to determine what type of data science career you’d like at the outset of your educational journey, recognizing that you’re committing to an area of focus that you’ll be enthusiastic to pursue in years to come.

Included in this Article:
What is Cybersecurity?
Cyber Security Analyst and Cyber Security Engineer
Cybersecurity Analyst vs. Engineer – Which One is Right for You?
One excellent area of specialization within the big data industry iscybersecurity. For obvious reasons, everyone is concerned with protecting their digital archives, documents, and correspondences, from government agencies and medical facilities to companies large and small to even private individuals. If you are looking to cultivate a skill

In [None]:
'''1 min 24 seconds --> 34 seconds'''
import requests
from bs4 import BeautifulSoup
import urllib.parse
import logging
import time
from urllib.robotparser import RobotFileParser
import transformers
from transformers import T5Tokenizer, T5ForConditionalGeneration

logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

def is_valid_url(url):
    parsed_url = urllib.parse.urlparse(url)
    return bool(parsed_url.scheme and parsed_url.netloc)

def fetch_url_content(url):
    try:
        if not is_valid_url(url):
            logging.error("Invalid URL format. Please include a valid scheme (http or https).")
            return "Invalid URL format. Please include a valid scheme (http or https)."

        if not is_allowed_by_robots(url):
            logging.warning("URL is disallowed by robots.txt.")
            return "Access to this URL is disallowed by robots.txt."

        headers = {'User-Agent': 'Mozilla/5.0 (compatible; WebCrawler/1.0)'}
        response = requests.get(url, headers=headers)

        if response.status_code == 200:
            soup = BeautifulSoup(response.text, 'html.parser')

            title = soup.find('title')
            page_title = title.text.strip() if title else "No title found"

            paragraphs = soup.find_all('p')

            content = "\n".join([p.get_text(strip=True) for p in paragraphs])

            return f"**Page Title:** {page_title}\n\n**Content:**\n{content}"
        else:
            logging.error(f"Failed to retrieve URL. HTTP Status Code: {response.status_code}")
            return f"Failed to retrieve URL. HTTP Status Code: {response.status_code}"
    except requests.exceptions.RequestException as e:
        logging.error(f"An error occurred: {e}")
        return f"An error occurred: {e}"

def is_allowed_by_robots(url):
    parsed_url = urllib.parse.urlparse(url)
    robots_url = f"{parsed_url.scheme}://{parsed_url.netloc}/robots.txt"
    rp = RobotFileParser()
    rp.set_url(robots_url)
    rp.read()
    return rp.can_fetch('*', url)

# Load pre-trained T5 model for summarization
model_name = "t5-base"  # Adjust for different model sizes (e.g., "t5-small", "t5-large")
tokenizer = T5Tokenizer.from_pretrained(model_name)
model = T5ForConditionalGeneration.from_pretrained(model_name)

def summarize_article(article_text, max_length=150, num_beams=5, no_repeat_ngram_size=3):
    """
    Summarizes an article using abstractive summarization with T5.

    Args:
        article_text (str): The text of the article to summarize.
        max_length (int): The maximum length of the generated summary (default: 150 words).
        num_beams (int): Number of beams for beam search (default: 5).
        no_repeat_ngram_size (int): Size of ngrams that should not be repeated in the summary (default: 3).

    Returns:
        str: The generated summary of the article.
    """
    try:
        # Preprocess text
        article_text = article_text.strip()  # Remove leading/trailing whitespace

        # Encode the article for T5
        inputs = tokenizer(article_text, return_tensors="pt", max_length=512, truncation=True)

        # Generate summary
        summary_ids = model.generate(
            **inputs, max_length=max_length, num_beams=num_beams, no_repeat_ngram_size=no_repeat_ngram_size, early_stopping=True
        )

        # Decode summary tokens back to text
        summary_text = tokenizer.decode(summary_ids[0], skip_special_tokens=True)

        return summary_text

    except Exception as e:
        logging.error(f"An error occurred during summarization: {e}")
        return None

def main():
    while True:
        url = input("Enter the URL (or type 'exit' to quit): ")
        if url.lower() == 'exit':
            logging.info("Exiting the URL fetcher.")
            break
        logging.info(f"Fetching content from URL: {url}")
        content = fetch_url_content(url)

        if "Content:" in content:
            # Extract the actual content part for summarization
            content_text = content.split("**Content:**\n", 1)[-1]
            summary = summarize_article(content_text)
            print(f"Summary:\n{summary}")
        else:
            print(content)

        time.sleep(1)

if __name__ == "__main__":
    main()


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Enter the URL (or type 'exit' to quit): https://www.discoverdatascience.org/articles/cybersecurity-analyst-vs-engineer/
Summary:
there are many different types of threats to data, including the following: These are just a few of the types of cybersecurity threats that analysts and engineers must understand so that they can keep businesses’ private information out of the wrong hands. the BLS reports a median annual wage of $102,600 – a six-figure income likely to spark anyone’s attention.
Enter the URL (or type 'exit' to quit): exit
