In [26]:
import feedparser
import logging
import time
import requests
from requests.exceptions import HTTPError
from newspaper import Article
from datetime import datetime
import pandas as pd

# Set up the logger.
# `logging.getLogger(__name__)` hands back the same logger object every time this
# cell is executed, so an already-attached file handler is reused instead of adding
# a new one on each run (every extra handler would write each record once more).
logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)
formatter = logging.Formatter("%(asctime)s - %(levelname)s - %(message)s")
handler = next(
    (h for h in logger.handlers if isinstance(h, logging.FileHandler)), None
)
if handler is None:
    handler = logging.FileHandler("app.log")
    logger.addHandler(handler)
handler.setLevel(logging.INFO)
handler.setFormatter(formatter)


def get_final_url(url):
    """Follow redirects starting at ``url`` and return the final response URL.

    The request is made in streaming mode and the response is closed right away,
    so the body is not downloaded just to learn where the redirects end.

    Args:
        url: Address to resolve.

    Returns:
        The URL of the final response, or ``None`` when the request raises a
        ``requests.RequestException`` (the error is logged).
    """
    try:
        # stream=True defers the body download; the `with` block closes the
        # connection so an unread body does not keep it open.
        with requests.get(
            url, timeout=10, allow_redirects=True, stream=True
        ) as response:
            return response.url
    except requests.RequestException as e:
        logger.error(f"Error resolving redirect URL {url}: {e}")
        return None


def download_article(url):
    """Download ``url`` with a browser-like User-Agent, retrying on failure.

    Up to three attempts are made. A failed attempt is logged and followed by a
    wait (5 s, then 10 s) before the next one; there is no wait after the last
    attempt because nothing is left to retry.

    Args:
        url: Address of the page to fetch.

    Returns:
        The response body as text, or ``None`` if every attempt failed.
    """
    retries = 3
    delay = 5
    user_agent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
    headers = {"User-Agent": user_agent}

    for attempt in range(1, retries + 1):
        try:
            response = requests.get(url, headers=headers, timeout=10)
            response.raise_for_status()
            return response.text
        except HTTPError as http_err:
            logger.error(f"HTTP error occurred for {url}: {http_err}")
        except Exception as err:
            logger.error(f"An error occurred for {url}: {err}")
        # Back off only when another attempt will follow.
        if attempt < retries:
            time.sleep(delay)
            delay *= 2
    return None


def _entry_publish_date(entry):
    """Return ``(datetime, source_label)`` for a feed entry's own publication date.

    The label is "approximated" when the entry carries ``published_parsed`` and
    "current_time" when the current time had to be used instead.
    """
    published = entry.get("published_parsed")
    if published is not None:
        try:
            # Build the datetime directly from the time-tuple fields. This needs
            # no `mktime` (which was never imported here) and keeps the feed's own
            # clock fields; feedparser documents these tuples as UTC.
            return datetime(*published[:6]), "approximated"
        except (TypeError, ValueError):
            pass
    return datetime.now(), "current_time"


def _entry_source(entry):
    """Return ``(href, title)`` of the entry's source, ``(None, None)`` if absent."""
    source = entry.get("source") or {}
    return source.get("href"), source.get("title")


def _failure_record(entry, exception_class=None, exception_text=None):
    """Build the record stored in ``failed_parses`` for an entry that was not scraped."""
    publish_date, publish_date_source = _entry_publish_date(entry)
    media_link, media_title = _entry_source(entry)
    return {
        "title": entry.get("title"),
        "text": None,
        "publish_date": publish_date.strftime("%Y-%m-%d %H:%M:%S"),
        "publish_date_source": publish_date_source,
        "authors": None,
        "canonical_link": None,
        "feed_link": entry.get("link"),
        "media_link": media_link,
        "media_title": media_title,
        "exception_class": exception_class,
        "exception_text": exception_text,
    }


def scrape_news_from_feed(feed_url, limit=20):
    """Scrape entries of an RSS feed into article records.

    For each entry the feed link is resolved with ``get_final_url``, the page is
    fetched with ``download_article`` (falling back to ``Article.download``) and
    parsed with ``Article.parse``.

    Args:
        feed_url: Feed address handed to ``feedparser.parse``.
        limit: Entry index at which iteration stops.

    Returns:
        A tuple ``(articles, failed_parses)`` of two lists of dicts. ``articles``
        holds the parsed articles. ``failed_parses`` holds one record per entry
        whose URL could not be resolved, whose download failed or whose parsing
        raised; all of these records share the same keys.
    """
    articles = []
    failed_parses = []
    feed = feedparser.parse(feed_url)

    for i, entry in enumerate(feed.entries):
        # Stop before reporting progress so the limit index itself is not printed.
        if i == limit:
            break
        print(i)

        google_news_url = entry.link
        media_link, media_title = _entry_source(entry)

        # Exclude YouTube links. The feed link is a Google News URL, so the
        # publisher address from the entry's source is checked as well.
        if "youtube.com" in google_news_url or "youtube.com" in (media_link or ""):
            continue

        article_url = get_final_url(google_news_url)
        if not article_url:
            failed_parses.append(_failure_record(entry))
            continue

        downloaded_article = download_article(article_url)
        article = Article(article_url)

        # Combine download methods
        if downloaded_article:
            article.set_html(downloaded_article)
        else:
            try:
                article.download()
            except Exception as e:
                logger.error(f"Failed to download article {article_url}: {str(e)}")
                failed_parses.append(
                    _failure_record(entry, type(e).__name__, str(e))
                )
                continue

        try:
            article.parse()
            publish_date = article.publish_date
            publish_date_source = "parsed"
            if publish_date is None:
                publish_date, publish_date_source = _entry_publish_date(entry)
            publish_date_str = publish_date.strftime("%Y-%m-%d %H:%M:%S")
        except Exception as err:
            logger.error(f"An unexpected error occurred for {article_url}: {err}")
            # Keep a trace of the entry instead of dropping it silently.
            failed_parses.append(
                _failure_record(entry, type(err).__name__, str(err))
            )
            continue

        articles.append(
            {
                "title": article.title,
                "text": article.text,
                "publish_date": publish_date_str,
                "publish_date_source": publish_date_source,
                "authors": article.authors,
                "canonical_link": article.canonical_link,
                "feed_link": google_news_url,  # Original Google News link
                "media_link": media_link,
                "media_title": media_title,
            }
        )

    return articles, failed_parses


# Run the scraper on the Google News topic feed
feed_url = "https://news.google.com/rss/topics/CAAqJggKIiBDQkFTRWdvSUwyMHZNRGx6TVdZU0FtVnVHZ0pWVXlnQVAB?hl=en-US&gl=US&ceid=US%3Aen"
limit = 20

articles, failed_parses = scrape_news_from_feed(feed_url, limit=limit)

# One DataFrame per result list, so both can be inspected and saved
articles_df = pd.DataFrame(data=articles)
failed_parses_df = pd.DataFrame(data=failed_parses)

# Write each frame to its own Excel workbook, without the index column
articles_df.to_excel(excel_writer="new_articles_df.xlsx", index=False)
failed_parses_df.to_excel(excel_writer="new_failed_parses_df.xlsx", index=False)

0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20


In [27]:
articles_df

Unnamed: 0,title,text,publish_date,publish_date_source,authors,canonical_link,feed_link,media_link,media_title
0,The $8 billion Sam Bankman-Fried criminal tria...,"watch now\n\nA year ago, Sam Bankman-Fried was...",2023-10-03 00:00:00,parsed,[Mackenzie Sigalos],https://www.cnbc.com/2023/10/03/sam-bankman-fr...,https://news.google.com/rss/articles/CBMiZ2h0d...,https://www.cnbc.com,CNBC
1,The ‘wild bunch’ have taken control of the bon...,The bond market is front and center for invest...,2023-10-03 10:37:00,approximated,[Barbara Kollmeyer],https://www.marketwatch.com/story/the-wild-bun...,https://news.google.com/rss/articles/CBMihwFod...,https://www.marketwatch.com,MarketWatch
2,Rouble recovers after slide past 100 vs dollar...,A view shows a Russian one rouble coin in fron...,2023-10-03 00:00:00,parsed,"[Alexander Marrow, Thomson Reuters, Moscow-Bas...",https://www.reuters.com/markets/currencies/rus...,https://news.google.com/rss/articles/CBMiYGh0d...,https://www.reuters.com,Reuters
3,Chipotle tests robotic line to make your burri...,"More robots are coming to Chipotle (CMG), and ...",2023-10-03 12:05:30,approximated,[],https://finance.yahoo.com/news/chipotle-tests-...,https://news.google.com/rss/articles/CBMieGh0d...,https://finance.yahoo.com,Yahoo Finance
4,Powerball drawing: $1.04 billion jackpot winni...,CNN —\n\nGet your tickets ready: An estimated ...,2023-10-02 00:00:00,parsed,[Kelly Mccleary],https://www.cnn.com/2023/10/02/business/powerb...,https://news.google.com/rss/articles/CBMiaGh0d...,https://www.cnn.com,CNN
5,"Good news, homebuyers: This fall could finally...","US homebuyers might see a ""sweet spot"" in the ...",2023-10-03 00:00:00,parsed,[Huileng Tan],https://www.businessinsider.com/us-housing-mar...,https://news.google.com/rss/articles/CBMiYGh0d...,https://www.businessinsider.com,Business Insider
6,Welcome to the Great Internet Splintering,I've become a social-media ghost over the past...,2023-10-03 00:00:00,parsed,[Shubham Agarwal],https://www.businessinsider.com/social-media-s...,https://news.google.com/rss/articles/CBMiaGh0d...,https://www.businessinsider.com,Business Insider
7,Are you a robot?,Why did this happen?\n\nPlease make sure your ...,2023-10-03 02:48:45,approximated,[],,https://news.google.com/rss/articles/CBMib2h0d...,https://www.bloomberg.com,Bloomberg
8,GM lays off 164 more workers due to UAW strike,General Motors has laid off more employees as ...,2023-10-02 18:43:17,approximated,[Breck Dumas],https://www.foxbusiness.com/economy/gm-lays-of...,https://news.google.com/rss/articles/CBMiRmh0d...,https://www.foxbusiness.com,Fox Business
9,Dow futures fall 100 points as Treasury yields...,Traders on the floor of the New York Stock Exc...,2023-10-02 00:00:00,parsed,"[Hakyung Kim Brian Evans, Hakyung Kim, Brian E...",https://www.cnbc.com/2023/10/02/stock-market-t...,https://news.google.com/rss/articles/CBMiRGh0d...,https://www.cnbc.com,CNBC


In [29]:
articles_df.loc[7]['title']

'Are you a robot?'

In [30]:
articles_df.loc[7]['text']

'Why did this happen?\n\nPlease make sure your browser supports JavaScript and cookies and that you are not blocking them from loading. For more information you can review our Terms of Service and Cookie Policy.'

In [31]:
import feedparser
import logging
import time
import requests
from requests.exceptions import HTTPError
from newspaper import Article
from datetime import datetime
import pandas as pd

# Set up the logger.
# Re-running this cell must not stack handlers: `logging.getLogger(__name__)`
# returns the logger created by earlier runs, so its existing file handler is
# reused and only created when none is attached yet.
logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)
formatter = logging.Formatter("%(asctime)s - %(levelname)s - %(message)s")
handler = next(
    (h for h in logger.handlers if isinstance(h, logging.FileHandler)), None
)
if handler is None:
    handler = logging.FileHandler("app.log")
    logger.addHandler(handler)
handler.setLevel(logging.INFO)
handler.setFormatter(formatter)


def get_final_url(url):
    """Resolve ``url`` through its redirects and return where it ends up.

    Only the response headers are needed for that, so the request is streamed
    and the response closed without reading the body.

    Args:
        url: Address to resolve.

    Returns:
        The final response URL, or ``None`` if the request raised a
        ``requests.RequestException`` (which is logged).
    """
    try:
        # With stream=True the body is not fetched up front; leaving the `with`
        # block closes the response and its connection.
        with requests.get(
            url, timeout=10, allow_redirects=True, stream=True
        ) as response:
            return response.url
    except requests.RequestException as e:
        logger.error(f"Error resolving redirect URL {url}: {e}")
        return None


def download_article(url):
    """Fetch the page at ``url`` as text, with retries and exponential back-off.

    Three attempts are made with a browser-like User-Agent header. Failures are
    logged; the pause between attempts starts at 5 s and doubles. No pause is
    taken after the final attempt.

    Args:
        url: Address of the page to fetch.

    Returns:
        The response text on success, otherwise ``None``.
    """
    retries = 3
    delay = 5
    user_agent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
    headers = {"User-Agent": user_agent}

    for attempt in range(1, retries + 1):
        try:
            response = requests.get(url, headers=headers, timeout=10)
            response.raise_for_status()
            return response.text
        except HTTPError as http_err:
            logger.error(f"HTTP error occurred for {url}: {http_err}")
        except Exception as err:
            logger.error(f"An error occurred for {url}: {err}")
        # Sleeping after the last attempt would only delay returning None.
        if attempt < retries:
            time.sleep(delay)
            delay *= 2
    return None


def _entry_publish_date(entry):
    """Return ``(datetime, source_label)`` taken from the feed entry itself.

    The label is "approximated" when ``published_parsed`` is available and
    "current_time" when the current time is substituted.
    """
    published = entry.get("published_parsed")
    if published is not None:
        try:
            # Constructed from the tuple fields directly: no dependency on the
            # never-imported `mktime`, and the feed's clock fields are kept as
            # given (feedparser documents them as UTC).
            return datetime(*published[:6]), "approximated"
        except (TypeError, ValueError):
            pass
    return datetime.now(), "current_time"


def _entry_source(entry):
    """Return ``(href, title)`` of the entry's source, ``(None, None)`` if absent."""
    source = entry.get("source") or {}
    return source.get("href"), source.get("title")


def _failure_record(entry, exception_class, exception_text):
    """Build the ``failed_parses`` record for an entry, with the failure details."""
    publish_date, publish_date_source = _entry_publish_date(entry)
    media_link, media_title = _entry_source(entry)
    return {
        "title": entry.get("title"),
        "text": None,
        "publish_date": publish_date.strftime("%Y-%m-%d %H:%M:%S"),
        "publish_date_source": publish_date_source,
        "authors": None,
        "canonical_link": None,
        "feed_link": entry.get("link"),
        "media_link": media_link,
        "media_title": media_title,
        "exception_class": exception_class,
        "exception_text": exception_text,
    }


def scrape_news_from_feed(feed_url, limit=20):
    """Scrape entries of an RSS feed, separating parsed articles from failures.

    Each entry's link is resolved with ``get_final_url``; the page is fetched with
    ``download_article`` (falling back to ``Article.download``) and parsed with
    ``Article.parse``. A parse whose title, text or canonical link is empty is
    treated as a failure.

    Args:
        feed_url: Feed address handed to ``feedparser.parse``.
        limit: Entry index at which iteration stops.

    Returns:
        A tuple ``(articles, failed_parses)`` of two lists of dicts. Every failure
        record has the same keys, including ``exception_class`` and
        ``exception_text``.
    """
    articles = []
    failed_parses = []
    feed = feedparser.parse(feed_url)

    for i, entry in enumerate(feed.entries):
        # Stop before reporting progress so the limit index itself is not printed.
        if i == limit:
            break
        print(i)

        google_news_url = entry.link
        article_url = get_final_url(google_news_url)

        if not article_url:
            logger.error(f"Failed to resolve final URL for: {google_news_url}")
            failed_parses.append(
                _failure_record(
                    entry, "URLResolutionError", "Failed to resolve final URL"
                )
            )
            continue

        downloaded_article = download_article(article_url)
        article = Article(article_url)

        # Combine download methods
        if downloaded_article:
            article.set_html(downloaded_article)
        else:
            try:
                article.download()
            except Exception as e:
                logger.error(f"Failed to download article {article_url}: {str(e)}")
                failed_parses.append(
                    _failure_record(entry, type(e).__name__, str(e))
                )
                continue

        try:
            article.parse()

            # Validate essential fields for meaningful information
            if not all([article.title, article.text, article.canonical_link]):
                raise ValueError(
                    "Essential fields are empty, possibly due to bot protection or bad parse"
                )

            publish_date = article.publish_date
            publish_date_source = "parsed"
            if publish_date is None:
                publish_date, publish_date_source = _entry_publish_date(entry)
            publish_date_str = publish_date.strftime("%Y-%m-%d %H:%M:%S")
        except Exception as err:
            logger.error(f"An unexpected error occurred for {article_url}: {err}")
            # Building the record can no longer raise NameError from `mktime`,
            # so one bad article does not abort the whole scrape.
            failed_parses.append(
                _failure_record(entry, type(err).__name__, str(err))
            )
            continue

        media_link, media_title = _entry_source(entry)
        articles.append(
            {
                "title": article.title,
                "text": article.text,
                "publish_date": publish_date_str,
                "publish_date_source": publish_date_source,
                "authors": article.authors,
                "canonical_link": article.canonical_link,
                "feed_link": google_news_url,  # Original Google News link
                "media_link": media_link,
                "media_title": media_title,
            }
        )

    return articles, failed_parses


# Run the scraper on the Google News topic feed
feed_url = "https://news.google.com/rss/topics/CAAqJggKIiBDQkFTRWdvSUwyMHZNRGx6TVdZU0FtVnVHZ0pWVXlnQVAB?hl=en-US&gl=US&ceid=US%3Aen"
limit = 20

articles, failed_parses = scrape_news_from_feed(feed_url, limit=limit)

# Successes and failures each get their own DataFrame
articles_df = pd.DataFrame(data=articles)
failed_parses_df = pd.DataFrame(data=failed_parses)

# Store both frames as Excel workbooks, leaving out the index column
articles_df.to_excel(excel_writer="new_articles_df.xlsx", index=False)
failed_parses_df.to_excel(excel_writer="new_failed_parses_df.xlsx", index=False)

0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20


In [32]:
articles_df

Unnamed: 0,title,text,publish_date,publish_date_source,authors,canonical_link,feed_link,media_link,media_title
0,Sam Bankman-Fried heads for trial on charges o...,Companies Champion Trust Llc Follow\n\nNEW YOR...,2023-10-03 00:00:00,parsed,"[Jody Godoy Luc Cohen, Jody Godoy, Luc Cohen, ...",https://www.reuters.com/legal/sam-bankman-frie...,https://news.google.com/rss/articles/CBMia2h0d...,https://www.reuters.com,Reuters
1,The 6 wildest details about Sam Bankman-Fried ...,Michael Lewis gave a peek into details from hi...,2023-10-03 00:00:00,parsed,[Grace Kay],https://www.businessinsider.com/sam-bankman-fr...,https://news.google.com/rss/articles/CBMiYGh0d...,https://www.businessinsider.com,Business Insider
2,Hong Kong stocks have worst day in three month...,Hong Kong CNN —\n\nStocks in Hong Kong suffere...,2023-10-03 00:00:00,parsed,[Laura He],https://www.cnn.com/2023/10/03/investing/globa...,https://news.google.com/rss/articles/CBMiS2h0d...,https://www.cnn.com,CNN
3,"Winning Powerball numbers for Monday, Oct 2, 2...",The Powerball lottery jackpot continues to cli...,2023-10-02 00:00:00,parsed,[],https://www.indystar.com/story/news/2023/10/02...,https://news.google.com/rss/articles/CBMilQFod...,https://www.indystar.com,IndyStar
4,"Good news, homebuyers: This fall could finally...","US homebuyers might see a ""sweet spot"" in the ...",2023-10-03 00:00:00,parsed,[Huileng Tan],https://www.businessinsider.com/us-housing-mar...,https://news.google.com/rss/articles/CBMiYGh0d...,https://www.businessinsider.com,Business Insider
5,Great news — social media is falling apart,I've become a social-media ghost over the past...,2023-10-03 00:00:00,parsed,[Shubham Agarwal],https://www.businessinsider.com/social-media-s...,https://news.google.com/rss/articles/CBMiaGh0d...,https://www.businessinsider.com,Business Insider
6,Chipotle tests automation for burrito bowls an...,Chipotle Mexican Grill is testing whether auto...,2023-10-03 00:00:00,parsed,[Amelia Lucas],https://www.cnbc.com/2023/10/03/chipotle-tests...,https://news.google.com/rss/articles/CBMiXGh0d...,https://www.cnbc.com,CNBC
7,Rouble recovers after slide past 100 vs dollar...,A view shows a Russian one rouble coin in fron...,2023-10-03 00:00:00,parsed,"[Alexander Marrow, Thomson Reuters, Moscow-Bas...",https://www.reuters.com/markets/currencies/rus...,https://news.google.com/rss/articles/CBMiYGh0d...,https://www.reuters.com,Reuters
8,"Strong U.S. dollar, rising Treasury yields kee...",(Kitco News) - Gold and silver prices are lowe...,2023-10-03 00:00:00,parsed,"[Http, Www.Facebook.Com Kitconews]",https://www.kitco.com/news/2023-10-03/Strong-U...,https://news.google.com/rss/articles/CBMia2h0d...,https://www.kitco.com,Kitco NEWS
9,"Tesla reported 435,059 deliveries for the thir...",Tesla vehicles waiting to be loaded on board a...,2023-10-02 00:00:00,parsed,[Lora Kolodny],https://www.cnbc.com/2023/10/02/tesla-tsla-q3-...,https://news.google.com/rss/articles/CBMiX2h0d...,https://www.cnbc.com,CNBC


In [33]:
failed_parses_df

Unnamed: 0,title,text,publish_date,publish_date_source,authors,canonical_link,feed_link,media_link,media_title,exception_class,exception_text
0,"Stock Market Today: Dow, S&P Live Updates for ...",,2023-10-02 22:10:28,approximated,,,https://news.google.com/rss/articles/CBMicWh0d...,https://www.bloomberg.com,Bloomberg,ValueError,"Essential fields are empty, possibly due to bo..."
1,Tesla Sales Drop Allows BYD to Close In - Bloo...,,2023-10-03 03:24:05,approximated,,,https://news.google.com/rss/articles/CCAiC3ZUV...,https://www.youtube.com,Bloomberg Television,ValueError,"Essential fields are empty, possibly due to bo..."
2,Abercrombie & Fitch launches investigation int...,,2023-10-03 10:50:16,approximated,,,https://news.google.com/rss/articles/CBMiJ2h0d...,https://www.bbc.com,BBC,ArticleException,Article `download()` failed with HTTPSConnecti...


# Final script

In [12]:
import feedparser
import logging
import time
import requests
from requests.exceptions import HTTPError
from newspaper import Article
from datetime import datetime
import pandas as pd

# Set up the logger.
# The named logger survives between cell executions, so look for a file handler
# that is already attached and create one only if there is none; otherwise each
# re-run would add a handler and repeat every record in the log file.
logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)
formatter = logging.Formatter("%(asctime)s - %(levelname)s - %(message)s")
handler = next(
    (h for h in logger.handlers if isinstance(h, logging.FileHandler)), None
)
if handler is None:
    handler = logging.FileHandler("app.log")
    logger.addHandler(handler)
handler.setLevel(logging.INFO)
handler.setFormatter(formatter)


def get_final_url(url):
    """Return the URL reached after following all redirects from ``url``.

    The response is requested in streaming mode and closed immediately, which
    avoids downloading a body that this function never reads.

    Args:
        url: Address to resolve.

    Returns:
        The final response URL, or ``None`` when a ``requests.RequestException``
        occurs (the error is logged).
    """
    try:
        # stream=True postpones the body transfer; the context manager closes
        # the response so the connection is released.
        with requests.get(
            url, timeout=10, allow_redirects=True, stream=True
        ) as response:
            return response.url
    except requests.RequestException as e:
        logger.error(f"Error resolving redirect URL {url}: {e}")
        return None


def download_article(url):
    """Return the text of the page at ``url``, trying up to three times.

    Requests carry a browser-like User-Agent header. Each failure is logged and,
    if attempts remain, followed by a pause that starts at 5 s and doubles.

    Args:
        url: Address of the page to fetch.

    Returns:
        The response text, or ``None`` once all attempts have failed.
    """
    retries = 3
    delay = 5
    user_agent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
    headers = {"User-Agent": user_agent}

    for attempt in range(1, retries + 1):
        try:
            response = requests.get(url, headers=headers, timeout=10)
            response.raise_for_status()
            return response.text
        except HTTPError as http_err:
            logger.error(f"HTTP error occurred for {url}: {http_err}")
        except Exception as err:
            logger.error(f"An error occurred for {url}: {err}")
        # No pause after the last attempt: there is nothing left to wait for.
        if attempt < retries:
            time.sleep(delay)
            delay *= 2
    return None


def scrape_news_from_feed(feed_url, limit=20):
    """Scrape entries of an RSS feed into one list of uniform records.

    Every processed entry yields exactly one dict. It starts out with the feed
    metadata, ``is_parsed=False`` and empty article fields. On a successful parse
    the article fields are filled in and ``is_parsed`` becomes ``True``; on any
    failure ``exception_class`` and ``exception_text`` describe what went wrong.

    Args:
        feed_url: Feed address handed to ``feedparser.parse``.
        limit: Entry index at which iteration stops.

    Returns:
        A list of dicts, one per processed entry, all with the same keys.
    """
    all_articles = []
    feed = feedparser.parse(feed_url)

    for i, entry in enumerate(feed.entries):
        # Stop before reporting progress so the limit index itself is not printed.
        if i == limit:
            break
        print(i)

        google_news_url = entry.link
        # Read optional feed fields with .get(): indexing a missing key raises
        # KeyError, which here would abort the scrape for all remaining entries.
        source = entry.get("source") or {}

        data = {
            "title": entry.get("title"),
            "text": None,
            "publish_date": None,
            "publish_date_source": None,
            "authors": None,
            "canonical_link": None,
            "feed_link": google_news_url,
            "media_link": source.get("href"),
            "media_title": source.get("title"),
            "is_parsed": False,
            "exception_class": None,
            "exception_text": None,
        }

        article_url = get_final_url(google_news_url)
        if not article_url:
            logger.error(f"Failed to resolve final URL for: {google_news_url}")
            data.update(
                {
                    "exception_class": "URLResolutionError",
                    "exception_text": "Failed to resolve final URL",
                }
            )
            all_articles.append(data)
            continue

        downloaded_article = download_article(article_url)
        article = Article(article_url)

        if downloaded_article:
            article.set_html(downloaded_article)
        else:
            try:
                article.download()
            except Exception as e:
                logger.error(f"Failed to download article {article_url}: {str(e)}")
                data.update(
                    {
                        "exception_class": type(e).__name__,
                        "exception_text": str(e),
                    }
                )
                all_articles.append(data)
                continue

        try:
            article.parse()

            if not all([article.title, article.text, article.canonical_link]):
                raise ValueError(
                    "Essential fields are empty, possibly due to bot protection or bad parse"
                )

            publish_date = article.publish_date
            publish_date_source = "parsed"
            if publish_date is None:
                feed_date = entry.get("published_parsed")
                if feed_date is not None:
                    # Use the tuple fields as they are: time.mktime would read
                    # them as local time, while feedparser documents them as UTC.
                    publish_date = datetime(*feed_date[:6])
                    publish_date_source = "approximated"
                else:
                    # A missing feed date no longer discards a good parse.
                    publish_date = datetime.now()
                    publish_date_source = "current_time"

            publish_date_str = publish_date.strftime("%Y-%m-%d %H:%M:%S")

            data.update(
                {
                    "title": article.title,
                    "text": article.text,
                    "publish_date": publish_date_str,
                    "publish_date_source": publish_date_source,
                    "authors": article.authors,
                    "canonical_link": article.canonical_link,
                    "is_parsed": True,
                }
            )

        except Exception as err:
            logger.error(f"An unexpected error occurred for {article_url}: {err}")
            data.update(
                {
                    "exception_class": type(err).__name__,
                    "exception_text": str(err),
                }
            )

        all_articles.append(data)

    return all_articles


# Run the scraper on the Google News topic feed
feed_url = "https://news.google.com/rss/topics/CAAqJggKIiBDQkFTRWdvSUwyMHZNRGx6TVdZU0FtVnVHZ0pWVXlnQVAB?hl=en-US&gl=US&ceid=US%3Aen"
limit = 20

articles = scrape_news_from_feed(feed_url, limit=limit)

# All records, parsed or not, go into a single DataFrame
articles_df = pd.DataFrame(data=articles)

# Store the frame as an Excel workbook, leaving out the index column
articles_df.to_excel(excel_writer="unified_articles_df.xlsx", index=False)

ModuleNotFoundError: No module named 'newspaper3k'

In [5]:
articles_df

Unnamed: 0,title,text,publish_date,publish_date_source,authors,canonical_link,feed_link,media_link,media_title,is_parsed,exception_class,exception_text
0,"Stock Market Today: Dow, S&P Live Updates for ...",,,,,,https://news.google.com/rss/articles/CBMiWmh0d...,https://www.bloomberg.com,Bloomberg,False,ValueError,"Essential fields are empty, possibly due to bo..."
1,Toyota recalls 1.8 million RAV4 SUVs over fire...,,,,,,https://news.google.com/rss/articles/CBMiT2h0d...,https://www.washingtonpost.com,The Washington Post,False,URLResolutionError,Failed to resolve final URL
2,Disney reaches $8.6 billion deal with Comcast ...,The Walt Disney Company on Wednesday announced...,2023-11-01 22:49:00,approximated,[],https://www.cbsnews.com/news/disney-8-6-billio...,https://news.google.com/rss/articles/CBMiXGh0d...,https://www.cbsnews.com,CBS News,True,,
3,Europe stocks up 1.3% as investors react posit...,Still life of Wegovy an injectable prescriptio...,2023-11-02 00:00:00,parsed,"[Jenni Reid Elliot Smith Holly Ellyatt, Jenni ...",https://www.cnbc.com/2023/11/02/european-marke...,https://news.google.com/rss/articles/CBMiZWh0d...,https://www.cnbc.com,CNBC,True,,
4,"Shell posts $6.2 billion third-quarter profit,...",People pump gas into their vehicles at a Shell...,2023-11-02 00:00:00,parsed,[Jenni Reid],https://www.cnbc.com/2023/11/02/shell-posts-6p...,https://news.google.com/rss/articles/CBMiaWh0d...,https://www.cnbc.com,CNBC,True,,
5,Dow Jones Futures: Market Rally Gains Steam On...,,,,,,https://news.google.com/rss/articles/CBMilAFod...,https://www.investors.com,Investor's Business Daily,False,ArticleException,Article `download()` failed with 403 Client Er...
6,Roku Inc (ROKU) Reports 20% YoY Growth in Tota...,Roku Inc (NASDAQ:ROKU) reports a 20% YoY incre...,2023-11-01 22:36:41,approximated,[Gurufocus Research],https://finance.yahoo.com/news/roku-inc-roku-r...,https://news.google.com/rss/articles/CBMiRmh0d...,https://finance.yahoo.com,Yahoo Finance,True,,
7,Why Wall Street Is So Worried About ‘Refunding’,Investors have fixated this week on a routine ...,2023-11-01 00:00:00,parsed,"[Joe Rennison, More About Joe Rennison]",https://www.nytimes.com/2023/11/01/business/tr...,https://news.google.com/rss/articles/CBMiQ2h0d...,https://www.nytimes.com,The New York Times,True,,
8,Opinion: Super Micro stock has surged this yea...,"Super Micro Computer Inc., a longtime Nvidia C...",2023-11-02 01:36:00,approximated,[Therese Poletti],https://www.marketwatch.com/story/this-nvidia-...,https://news.google.com/rss/articles/CBMijQFod...,https://www.marketwatch.com,MarketWatch,True,,
9,"A Year After Sam Bankman-Fried's Downfall, Sol...","On Nov. 2, 2022, CoinDesk published a now-awar...",2023-11-02 00:00:00,parsed,[],https://www.coindesk.com/business/2023/11/02/a...,https://news.google.com/rss/articles/CBMihAFod...,https://www.coindesk.com,CoinDesk,True,,


In [6]:
articles_df[articles_df['is_parsed'] == False]

Unnamed: 0,title,text,publish_date,publish_date_source,authors,canonical_link,feed_link,media_link,media_title,is_parsed,exception_class,exception_text
0,"Stock Market Today: Dow, S&P Live Updates for ...",,,,,,https://news.google.com/rss/articles/CBMiWmh0d...,https://www.bloomberg.com,Bloomberg,False,ValueError,"Essential fields are empty, possibly due to bo..."
1,Toyota recalls 1.8 million RAV4 SUVs over fire...,,,,,,https://news.google.com/rss/articles/CBMiT2h0d...,https://www.washingtonpost.com,The Washington Post,False,URLResolutionError,Failed to resolve final URL
5,Dow Jones Futures: Market Rally Gains Steam On...,,,,,,https://news.google.com/rss/articles/CBMilAFod...,https://www.investors.com,Investor's Business Daily,False,ArticleException,Article `download()` failed with 403 Client Er...
13,FDA warns consumers against using 27 different...,,,,,,https://news.google.com/rss/articles/CCAiC1lXV...,https://www.youtube.com,NBC News,False,ValueError,"Essential fields are empty, possibly due to bo..."


In [11]:
# Adjacent string literals inside the parentheses are joined into one string,
# so this multi-line form evaluates to the same single-line User-Agent value
# that the cell displays as its result.
(
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
    "AppleWebKit/537.36 (KHTML, like Gecko) "
    "Chrome/91.0.4472.124 Safari/537.36"
)

'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'

In [9]:
# Required libraries, data, and regular expression patterns

import re

# Sample exception strings
exceptions = [
    "403 Client Error: Forbidden for url: https://www.axios.com/2023/09/20/general-motors-stellantis-uaw-strike",
    "403 Client Error: Forbidden for url: https://www.sfgate.com/tech/article/instacart-ceo-ipo-stock-mehta-18378600.php",
    "403 Client Error: Forbidden for url: https://www.thestreet.com/automotive/general-motors-delivers-hard-nosed-message-to-uaw-workers",
    "403 Client Error: Forbidden for url: https://www.wsj.com/finance/stocks/klaviyo-shares-surge-in-trading-debut-6928fff2",
    "429 Client Error: Unknown Error for url: https://thehill.com/homenews/education/4214939-biden-administration-cancels-37-million-in-student-loans-for-former-university-of-phoenix-students/",
    "HTTPSConnectionPool(host='www.bbc.com', port=443): Read timed out. (read timeout=7)",
    "403 Client Error: Forbidden for url: https://www.wsj.com/business/elon-musk-spacex-sues-justice-department-hiring-discrimination-edd38f7e",
    "403 Client Error: Max restarts limit reached for url: https://www.forbes.com/sites/anafaguy/2023/09/20/uber-eats-will-accept-food-stamps-beginning-next-year/",
    "403 Client Error: Forbidden for url: https://www.thestreet.com/memestocks/amc/amc-stock-the-3-strongest-buy-signals",
    "403 Client Error: Forbidden for url: https://www.wsj.com/economy/central-banking/federal-reserve-powell-interest-rates-ba600bf0",
    "403 Client Error: Forbidden for url: https://www.wsj.com/business/earnings/general-mills-gis-q1-earnings-report-2024-194d129f",
    "('Connection aborted.', ConnectionResetError(104, 'Connection reset by peer'))"
]

# Regular expression patterns
text_without_url_pattern = r'^(.*)(?=https?:\/\/)'
url_pattern = r'https?:\/\/[^\s]+'


def _split_exception_text(message):
    """Split one exception message into labelled ("Text Without URL:" / "URL:") pairs."""
    found_url = re.search(url_pattern, message)
    if found_url is None:
        # No URL in the message: the whole (stripped) message is the text part.
        return [("Text Without URL:", message.strip())]

    pairs = []
    # Everything in front of the URL, matched by the look-ahead pattern.
    leading_text = re.search(text_without_url_pattern, message)
    if leading_text:
        pairs.append(("Text Without URL:", leading_text.group().strip()))
    pairs.append(("URL:", found_url.group()))
    return pairs


# Updated extraction logic: labelled pairs per message, each followed by a rule
separator = '-'*50
output = []
for exc in exceptions:
    output += _split_exception_text(exc)
    output.append(separator)

output


[('Text Without URL:', '403 Client Error: Forbidden for url:'),
 ('URL:',
  'https://www.axios.com/2023/09/20/general-motors-stellantis-uaw-strike'),
 '--------------------------------------------------',
 ('Text Without URL:', '403 Client Error: Forbidden for url:'),
 ('URL:',
  'https://www.sfgate.com/tech/article/instacart-ceo-ipo-stock-mehta-18378600.php'),
 '--------------------------------------------------',
 ('Text Without URL:', '403 Client Error: Forbidden for url:'),
 ('URL:',
  'https://www.thestreet.com/automotive/general-motors-delivers-hard-nosed-message-to-uaw-workers'),
 '--------------------------------------------------',
 ('Text Without URL:', '403 Client Error: Forbidden for url:'),
 ('URL:',
  'https://www.wsj.com/finance/stocks/klaviyo-shares-surge-in-trading-debut-6928fff2'),
 '--------------------------------------------------',
 ('Text Without URL:', '429 Client Error: Unknown Error for url:'),
 ('URL:',
  'https://thehill.com/homenews/education/4214939-biden-