In [None]:
import feedparser
import logging
import time
import requests
from requests.exceptions import HTTPError
from newspaper import Article
from datetime import datetime
import pandas as pd

# Set up the logger
logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)
handler = logging.FileHandler('app.log')
handler.setLevel(logging.INFO)
formatter = logging.Formatter('%(asctime)s - %(levelname)s - %(message)s')
handler.setFormatter(formatter)
logger.addHandler(handler)

def get_final_url(url):
    try:
        response = requests.get(url, timeout=10, allow_redirects=True)
        return response.url  
    except requests.RequestException as e:
        logger.error(f"Error resolving redirect URL {url}: {e}")
        return None

def download_article(url):
    retries = 3
    delay = 5
    user_agent = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
    headers = {'User-Agent': user_agent}
    
    for attempt in range(retries):
        try:
            response = requests.get(url, headers=headers, timeout=10)
            response.raise_for_status()
            return response.text  
        except HTTPError as http_err:
            logger.error(f'HTTP error occurred for {url}: {http_err}')
        except Exception as err:
            logger.error(f'An error occurred for {url}: {err}')
        time.sleep(delay)
        delay *= 2
    return None

def scrape_news_from_feed(feed_url, limit=20):
    articles = []
    failed_parses = []
    feed = feedparser.parse(feed_url)
    
    for i, entry in enumerate(feed.entries):
        print(i)
        if i == limit:
            break
            
        google_news_url = entry.link
        article_url = get_final_url(google_news_url)
        
        if article_url:
            downloaded_article = download_article(article_url)
            article = Article(article_url)
            article.set_html(downloaded_article)
            try:
                article.parse()
                publish_date = article.publish_date
                publish_date_source = 'parsed'
                if publish_date is None and entry['published_parsed'] is not None:
                    publish_date = datetime.fromtimestamp(mktime(entry['published_parsed']))
                    publish_date_source = 'approximated'
                elif publish_date is None:
                    publish_date = datetime.now()
                    publish_date_source = 'current_time'
                
                publish_date_str = publish_date.strftime("%Y-%m-%d %H:%M:%S")
                
                articles.append({
                    'title': article.title,
                    'text': article.text, 
                    'publish_date': publish_date_str,
                    'publish_date_source': publish_date_source,
                    'authors': article.authors,
                    'canonical_link': article.canonical_link, 
                    'feed_link': article_url, 
                    'media_link': entry['source']['href'], 
                    'media_title': entry['source']['title']
                })
            except Exception as err:
                logger.error(f"An unexpected error occurred for {article_url}: {err}")
        else:
            failed_parses.append({
                'title': entry.title,
                'text': None, 
                'publish_date': datetime.fromtimestamp(mktime(entry['published_parsed'])).strftime("%Y-%m-%d %H:%M:%S") if entry['published_parsed'] is not None else datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
                'publish_date_source': 'approximated' if entry['published_parsed'] is not None else 'current_time',
                'authors': None,
                'canonical_link': None,
                'feed_link': article_url, 
                'media_link': entry['source']['href'], 
                'media_title': entry['source']['title'],
                'exception_class': None,
                'exception_text': None
            })
            
    return articles, failed_parses


# Execute the scraper
feed_url = 'https://news.google.com/rss/topics/CAAqJggKIiBDQkFTRWdvSUwyMHZNRGx6TVdZU0FtVnVHZ0pWVXlnQVAB?hl=en-US&gl=US&ceid=US%3Aen'
limit = 20

articles, failed_parses = scrape_news_from_feed(feed_url, limit)

# Convert articles and failed_parses to DataFrames for easy analysis and saving
articles_df = pd.DataFrame(articles)
failed_parses_df = pd.DataFrame(failed_parses)

# Save to Excel for your analysis
articles_df.to_excel('new_articles_df.xlsx', index=False)
failed_parses_df.to_excel('new_failed_parses_df.xlsx', index=False)


0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20


In [None]:
# Convert articles and failed_parses to DataFrames for easy analysis and saving
articles_df = pd.DataFrame(articles)
failed_parses_df = pd.DataFrame(failed_parses)

In [None]:
articles_df

Unnamed: 0,title,text,publish_date,is_approx_date,authors,canonical_link,feed_link,media_link,media_title
0,The $8 billion Sam Bankman-Fried criminal tria...,"watch now\n\nA year ago, Sam Bankman-Fried was...",2023-10-03 00:00:00,False,[Mackenzie Sigalos],https://www.cnbc.com/2023/10/03/sam-bankman-fr...,https://news.google.com/rss/articles/CBMiZ2h0d...,https://www.cnbc.com,CNBC
1,China Stocks in Hong Kong Drop Most Since July...,(Bloomberg) -- Chinese stocks listed in Hong K...,2023-10-03 08:33:27,True,[Ishika Mookerjee],https://finance.yahoo.com/news/china-stocks-ho...,https://news.google.com/rss/articles/CBMiSWh0d...,https://finance.yahoo.com,Yahoo Finance
2,The ‘wild bunch’ have taken control of the bon...,The bond market is front and center for invest...,2023-10-03 10:37:00,True,[Barbara Kollmeyer],https://www.marketwatch.com/story/the-wild-bun...,https://news.google.com/rss/articles/CBMihwFod...,https://www.marketwatch.com,MarketWatch
3,Tesla reports lower than expected deliveries i...,Tesla announced Monday that it had delivered j...,2023-10-02 00:00:00,False,[],https://www.statesman.com/story/business/techn...,https://news.google.com/rss/articles/CBMinAFod...,https://www.statesman.com,Austin American-Statesman
4,Powerball jackpot climbs to $1.2 billion ahead...,It's now the third-largest purse in the Americ...,2023-10-03 09:04:22,True,[Abc News],https://abcnews.go.com/US/powerball-jackpot-1-...,https://news.google.com/rss/articles/CBMiVWh0d...,https://abcnews.go.com,ABC News
5,"Good news, homebuyers: This fall could finally...","US homebuyers might see a ""sweet spot"" in the ...",2023-10-03 00:00:00,False,[Huileng Tan],https://www.businessinsider.com/us-housing-mar...,https://news.google.com/rss/articles/CBMiYGh0d...,https://www.businessinsider.com,Business Insider
6,Welcome to the Great Internet Splintering,I've become a social-media ghost over the past...,2023-10-03 00:00:00,False,[Shubham Agarwal],https://www.businessinsider.com/social-media-s...,https://news.google.com/rss/articles/CBMiaGh0d...,https://www.businessinsider.com,Business Insider
7,World Bank turns pessimistic on Asia growth,What is included in my trial?\n\nDuring your t...,2023-10-02 17:15:49,True,[],,https://news.google.com/rss/articles/CBMiP2h0d...,https://www.ft.com,Financial Times
8,Are you a robot?,Why did this happen?\n\nPlease make sure your ...,2023-10-03 02:48:45,True,[],,https://news.google.com/rss/articles/CBMib2h0d...,https://www.bloomberg.com,Bloomberg
9,"Ford, GM Lay Off About 500 Factory Workers as ...","This copy is for your personal, non-commercial...",2023-10-03 07:29:00,True,[Mike Colias],https://www.wsj.com/business/autos/ford-gm-lay...,https://news.google.com/rss/articles/CBMidmh0d...,https://www.wsj.com,The Wall Street Journal


In [None]:
failed_parses_df.to_excel('new_failed_parses_df.xlsx', index=False)

In [None]:
articles_df

Unnamed: 0,title,text,publish_date,publish_date_source,authors,canonical_link,feed_link,media_link,media_title
0,The $8 billion Sam Bankman-Fried criminal tria...,"watch now\n\nA year ago, Sam Bankman-Fried was...",2023-10-03 00:00:00,parsed,[Mackenzie Sigalos],https://www.cnbc.com/2023/10/03/sam-bankman-fr...,https://news.google.com/rss/articles/CBMiZ2h0d...,https://www.cnbc.com,CNBC
1,The ‘wild bunch’ have taken control of the bon...,The bond market is front and center for invest...,2023-10-03 10:37:00,approximated,[Barbara Kollmeyer],https://www.marketwatch.com/story/the-wild-bun...,https://news.google.com/rss/articles/CBMihwFod...,https://www.marketwatch.com,MarketWatch
2,Powerball drawing: $1.04 billion jackpot winni...,CNN —\n\nGet your tickets ready: An estimated ...,2023-10-02 00:00:00,parsed,[Kelly Mccleary],https://www.cnn.com/2023/10/02/business/powerb...,https://news.google.com/rss/articles/CBMiaGh0d...,https://www.cnn.com,CNN
3,"Good news, homebuyers: This fall could finally...","US homebuyers might see a ""sweet spot"" in the ...",2023-10-03 00:00:00,parsed,[Huileng Tan],https://www.businessinsider.com/us-housing-mar...,https://news.google.com/rss/articles/CBMiYGh0d...,https://www.businessinsider.com,Business Insider
4,Welcome to the Great Internet Splintering,I've become a social-media ghost over the past...,2023-10-03 00:00:00,parsed,[Shubham Agarwal],https://www.businessinsider.com/social-media-s...,https://news.google.com/rss/articles/CBMiaGh0d...,https://www.businessinsider.com,Business Insider
5,Chipotle tests robotic line to make your burri...,"More robots are coming to Chipotle (CMG), and ...",2023-10-03 12:05:30,approximated,[],https://finance.yahoo.com/news/chipotle-tests-...,https://news.google.com/rss/articles/CBMieGh0d...,https://finance.yahoo.com,Yahoo Finance
6,Are you a robot?,Why did this happen?\n\nPlease make sure your ...,2023-10-03 02:48:45,approximated,[],,https://news.google.com/rss/articles/CBMib2h0d...,https://www.bloomberg.com,Bloomberg
7,GM lays off 164 more workers due to UAW strike,General Motors has laid off more employees as ...,2023-10-02 18:43:17,approximated,[Breck Dumas],https://www.foxbusiness.com/economy/gm-lays-of...,https://news.google.com/rss/articles/CBMiRmh0d...,https://www.foxbusiness.com,Fox Business
8,Dow futures fall 100 points as Treasury yields...,Traders on the floor of the New York Stock Exc...,2023-10-02 00:00:00,parsed,"[Hakyung Kim Brian Evans, Hakyung Kim, Brian E...",https://www.cnbc.com/2023/10/02/stock-market-t...,https://news.google.com/rss/articles/CBMiRGh0d...,https://www.cnbc.com,CNBC
9,Are you a robot?,Why did this happen?\n\nPlease make sure your ...,2023-10-03 03:35:41,approximated,[],,https://news.google.com/rss/articles/CBMicmh0d...,https://www.bloomberg.com,Bloomberg


In [None]:
import feedparser
import logging
import time
import requests
from requests.exceptions import HTTPError
from newspaper import Article
from datetime import datetime
import pandas as pd

# Set up the logger
logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)
handler = logging.FileHandler("app.log")
handler.setLevel(logging.INFO)
formatter = logging.Formatter("%(asctime)s - %(levelname)s - %(message)s")
handler.setFormatter(formatter)
logger.addHandler(handler)


def get_final_url(url):
    try:
        response = requests.get(url, timeout=10, allow_redirects=True)
        return response.url
    except requests.RequestException as e:
        logger.error(f"Error resolving redirect URL {url}: {e}")
        return None


def download_article(url):
    retries = 3
    delay = 5
    user_agent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
    headers = {"User-Agent": user_agent}

    for attempt in range(retries):
        try:
            response = requests.get(url, headers=headers, timeout=10)
            response.raise_for_status()
            return response.text
        except HTTPError as http_err:
            logger.error(f"HTTP error occurred for {url}: {http_err}")
        except Exception as err:
            logger.error(f"An error occurred for {url}: {err}")
        time.sleep(delay)
        delay *= 2
    return None


def scrape_news_from_feed(feed_url, limit=20):
    articles = []
    failed_parses = []
    feed = feedparser.parse(feed_url)

    for i, entry in enumerate(feed.entries):
        print(i)
        if i == limit:
            break

        # Exclude YouTube links
        if "youtube.com" in entry.link:
            continue

        google_news_url = entry.link
        article_url = get_final_url(google_news_url)

        if article_url:
            downloaded_article = download_article(article_url)
            article = Article(article_url)

            # Combine download methods
            if downloaded_article:
                article.set_html(downloaded_article)
            else:
                try:
                    article.download()
                except Exception as e:
                    logger.error(f"Failed to download article {article_url}: {str(e)}")
                    failed_parses.append(
                        {"title": entry.title, "feed_link": entry.link, "error": str(e)}
                    )
                    continue

            try:
                article.parse()
                publish_date = article.publish_date
                publish_date_source = "parsed"
                if publish_date is None and entry["published_parsed"] is not None:
                    publish_date = datetime.fromtimestamp(
                        mktime(entry["published_parsed"])
                    )
                    publish_date_source = "approximated"
                elif publish_date is None:
                    publish_date = datetime.now()
                    publish_date_source = "current_time"

                publish_date_str = publish_date.strftime("%Y-%m-%d %H:%M:%S")

                articles.append(
                    {
                        "title": article.title,
                        "text": article.text,
                        "publish_date": publish_date_str,
                        "publish_date_source": publish_date_source,
                        "authors": article.authors,
                        "canonical_link": article.canonical_link,
                        "feed_link": google_news_url,  # Original Google News link
                        "media_link": entry["source"]["href"],
                        "media_title": entry["source"]["title"],
                    }
                )
            except Exception as err:
                logger.error(f"An unexpected error occurred for {article_url}: {err}")
        else:
            failed_parses.append(
                {
                    "title": entry.title,
                    "text": None,
                    "publish_date": datetime.fromtimestamp(
                        mktime(entry["published_parsed"])
                    ).strftime("%Y-%m-%d %H:%M:%S")
                    if entry["published_parsed"] is not None
                    else datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
                    "publish_date_source": "approximated"
                    if entry["published_parsed"] is not None
                    else "current_time",
                    "authors": None,
                    "canonical_link": None,
                    "feed_link": google_news_url,
                    "media_link": entry["source"]["href"],
                    "media_title": entry["source"]["title"],
                    "exception_class": None,
                    "exception_text": None,
                }
            )

    return articles, failed_parses


# Execute the scraper
feed_url = "https://news.google.com/rss/topics/CAAqJggKIiBDQkFTRWdvSUwyMHZNRGx6TVdZU0FtVnVHZ0pWVXlnQVAB?hl=en-US&gl=US&ceid=US%3Aen"
limit = 20

articles, failed_parses = scrape_news_from_feed(feed_url, limit)

# Convert articles and failed_parses to DataFrames for easy analysis and saving
articles_df = pd.DataFrame(articles)
failed_parses_df = pd.DataFrame(failed_parses)

# Save to Excel for your analysis
articles_df.to_excel("new_articles_df.xlsx", index=False)
failed_parses_df.to_excel("new_failed_parses_df.xlsx", index=False)

0
1
2
3
4
5
6
7
8
9
10
11
12
13


In [None]:
import feedparser
import logging
import time
import requests
from requests.exceptions import HTTPError
from newspaper import Article
from datetime import datetime
from time import mktime

# Set up the logger
logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)
handler = logging.FileHandler('app.log')
handler.setLevel(logging.INFO)
formatter = logging.Formatter('%(asctime)s - %(levelname)s - %(message)s')
handler.setFormatter(formatter)
logger.addHandler(handler)

from urllib.parse import urlparse, parse_qs

def get_final_url(url):
    try:
        response = requests.get(url, timeout=10, allow_redirects=True)
        return response.url  # this will be the final URL after all the redirects
    except requests.RequestException as e:
        logger.error(f"Error resolving redirect URL {url}: {e}")
        return None


def download_article(url):
    retries = 3
    delay = 5
    user_agent = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
    headers = {'User-Agent': user_agent}
    
    for attempt in range(retries):
        try:
            response = requests.get(url, headers=headers, timeout=10)
            response.raise_for_status()
            return response.text  # Return the HTML content of the page
        except HTTPError as http_err:
            logger.error(f'HTTP error occurred for {url}: {http_err}')
        except Exception as err:
            logger.error(f'An error occurred for {url}: {err}')
        time.sleep(delay)
        delay *= 2
    return None


def scrape_news_from_feed(feed_url):
    articles = []
    failed_parses = []
    feed = feedparser.parse(feed_url)
    for i, entry in enumerate(feed.entries):
        print(i)
        if i == limit:
            break
        google_news_url = entry.link
        article_url = get_final_url(google_news_url)  # Resolve the Google News URL to the final article URL
        
        if article_url:
            downloaded_article = download_article(article_url)
            article = Article(article_url)
            article.set_html(downloaded_article)  # Set the HTML content manually
            try:
                article.parse()
                # Your existing logic to append to the articles list
                articles.append({
                    'title': article.title,
                    'text': article.text, 
                    'publish_date': article.publish_date,
                    'authors': article.authors,
                    'canonical_link': article.canonical_link, 
                    'feed_link': article_url, 
                    'media_link': entry['source']['href'], 
                    'media_title': entry['source']['title']
                })
            except Exception as err:
                logger.error(f"An unexpected error occurred for {article_url}: {err}")
        else:
            # Handle the case where the article could not be downloaded
            # Your existing logic to append to the failed_parses list
            failed_parses.append({
                'title': entry.title,
                'text': None, 
                'publish_date': datetime.fromtimestamp(mktime(entry['published_parsed'])),
                'authors': None,
                'canonical_link': None,
                'feed_link': article_url, 
                'media_link': entry['source']['href'], 
                'media_title': entry['source']['title'],
                'exception_class': None,
                'exception_text': None
            })
            
    return articles, failed_parses


# Execute the scraper
feed_url = 'https://news.google.com/rss/topics/CAAqJggKIiBDQkFTRWdvSUwyMHZNRGx6TVdZU0FtVnVHZ0pWVXlnQVAB?hl=en-US&gl=US&ceid=US%3Aen'
limit = 20

articles, failed_parses = scrape_news_from_feed(feed_url)

0


ERROR:__main__:HTTP error occurred for https://www.nytimes.com/2023/09/29/business/economy/uaw-strike.html: 403 Client Error: Forbidden for url: https://www.nytimes.com/2023/09/29/business/economy/uaw-strike.html
ERROR:__main__:HTTP error occurred for https://www.nytimes.com/2023/09/29/business/economy/uaw-strike.html: 403 Client Error: Forbidden for url: https://www.nytimes.com/2023/09/29/business/economy/uaw-strike.html
ERROR:__main__:HTTP error occurred for https://www.nytimes.com/2023/09/29/business/economy/uaw-strike.html: 403 Client Error: Forbidden for url: https://www.nytimes.com/2023/09/29/business/economy/uaw-strike.html
ERROR:__main__:An unexpected error occurred for https://www.nytimes.com/2023/09/29/business/economy/uaw-strike.html: You must `download()` an article first!


1
2
3
4
5
6
7
8
9
10
11
12
13


ERROR:__main__:HTTP error occurred for https://www.nytimes.com/2023/09/29/business/economy/pce-inflation-fed-august.html: 403 Client Error: Forbidden for url: https://www.nytimes.com/2023/09/29/business/economy/pce-inflation-fed-august.html
ERROR:__main__:HTTP error occurred for https://www.nytimes.com/2023/09/29/business/economy/pce-inflation-fed-august.html: 403 Client Error: Forbidden for url: https://www.nytimes.com/2023/09/29/business/economy/pce-inflation-fed-august.html
ERROR:__main__:HTTP error occurred for https://www.nytimes.com/2023/09/29/business/economy/pce-inflation-fed-august.html: 403 Client Error: Forbidden for url: https://www.nytimes.com/2023/09/29/business/economy/pce-inflation-fed-august.html
ERROR:__main__:An unexpected error occurred for https://www.nytimes.com/2023/09/29/business/economy/pce-inflation-fed-august.html: You must `download()` an article first!


14


ERROR:__main__:HTTP error occurred for https://www.nytimes.com/2023/09/29/business/interest-rates-energy-prices-stocks-economy.html: 403 Client Error: Forbidden for url: https://www.nytimes.com/2023/09/29/business/interest-rates-energy-prices-stocks-economy.html
ERROR:__main__:HTTP error occurred for https://www.nytimes.com/2023/09/29/business/interest-rates-energy-prices-stocks-economy.html: 403 Client Error: Forbidden for url: https://www.nytimes.com/2023/09/29/business/interest-rates-energy-prices-stocks-economy.html
ERROR:__main__:HTTP error occurred for https://www.nytimes.com/2023/09/29/business/interest-rates-energy-prices-stocks-economy.html: 403 Client Error: Forbidden for url: https://www.nytimes.com/2023/09/29/business/interest-rates-energy-prices-stocks-economy.html
ERROR:__main__:An unexpected error occurred for https://www.nytimes.com/2023/09/29/business/interest-rates-energy-prices-stocks-economy.html: You must `download()` an article first!


15
16
17
18
19
20


In [None]:
import feedparser
from datetime import datetime
from time import mktime

from bs4 import BeautifulSoup
from dateutil import parser

def extract_date_from_html(html):
    soup = BeautifulSoup(html, 'html.parser')
    # Try to extract date from meta tags
    for meta in soup.find_all('meta'):
        if 'property' in meta.attrs and meta.attrs['property'].lower() in ['article:published_time', 'og:published_time']:
            try:
                return parser.parse(meta.attrs['content'])
            except (ValueError, TypeError):
                continue  # if parsing fails, continue to the next meta tag
    # If no suitable meta tag is found, return None
    return None


def modified_scrape_news_from_feed(feed_url):
    articles = []
    failed_parses = []
    feed = feedparser.parse(feed_url)
    for i, entry in enumerate(feed.entries):
        if i == limit:
            break
        google_news_url = entry.link
        article_url = get_final_url(google_news_url)  # Resolve the Google News URL to the final article URL
        if article_url:
            downloaded_article = download_article(article_url)
            article = Article(article_url)
            article.set_html(downloaded_article)  # Set the HTML content manually
            try:
                article.parse()
                # Try to extract publication date from HTML if not found by newspaper
                if article.publish_date is None:
                    article.publish_date = extract_date_from_html(downloaded_article)
                # If still not found, fallback to feed date
                if article.publish_date is None:
                    article.publish_date = datetime.fromtimestamp(mktime(entry['published_parsed']))
                articles.append({
                    'title': article.title,
                    'text': article.text,
                    'publish_date': article.publish_date,
                    'authors': article.authors,
                    'canonical_link': article.canonical_link,
                    'feed_link': article_url,
                    'media_link': entry['source']['href'],
                    'media_title': entry['source']['title']
                })
            except Exception as err:
                logger.error(f"An unexpected error occurred for {article_url}: {err}")
        else:
            failed_parses.append({
                'title': entry.title,
                'text': None,
                'publish_date': datetime.fromtimestamp(mktime(entry['published_parsed'])),
                'authors': None,
                'canonical_link': None,
                'feed_link': article_url,
                'media_link': entry['source']['href'],
                'media_title': entry['source']['title'],
                'exception_class': None,
                'exception_text': None
            })
    return articles, failed_parses


# Execute the scraper with the modified function
feed_url = 'https://news.google.com/rss/topics/CAAqJggKIiBDQkFTRWdvSUwyMHZNRGx6TVdZU0FtVnVHZ0pWVXlnQVAB?hl=en-US&gl=US&ceid=US%3Aen'
limit = 20



# Execute the scraper with the modified function
modified_articles, modified_failed_parses = modified_scrape_news_from_feed(feed_url)

# Checking the proportion of None values in publish_date column after modification
none_dates_count = sum(1 for article in modified_articles if article['publish_date'] is None)
none_dates_proportion = none_dates_count / len(modified_articles) * 100 if modified_articles else 0
none_dates_proportion

NameError: name 'get_final_url' is not defined

In [None]:
articles_df = pd.DataFrame(articles)
articles_df

Unnamed: 0,title,text,publish_date,authors,canonical_link,feed_link,media_link,media_title
0,"National Coffee Day 2023: Dunkin', Krispy Krem...","Whether you call it joe, java, jitter juice, b...",2023-09-28 00:00:00,[],https://www.usatoday.com/story/money/food/2023...,https://www.usatoday.com/story/money/food/2023...,https://www.usatoday.com,USA TODAY
1,From Social Security to travel: Everything to ...,"Social Security payments will continue, but ot...",,[Abc News],https://abcnews.go.com/Politics/social-securit...,https://abcnews.go.com/Politics/social-securit...,https://abcnews.go.com,ABC News
2,"S&P 500 dips after US inflation data, ending w...",FILE PHOTO:Traders work on the floor of the Ne...,2023-09-29 00:00:00,[Lewis Krauskopf Shashwat Chauhan Shristi Acha...,https://www.reuters.com/markets/us/futures-cli...,https://www.reuters.com/markets/us/futures-cli...,https://www.reuters.com,Reuters
3,Blue Apron to be acquired by Wonder Group for ...,In this article APRN Follow your favorite stoc...,2023-09-29 00:00:00,[Drew Richardson],https://www.cnbc.com/2023/09/29/blue-apron-to-...,https://www.cnbc.com/2023/09/29/blue-apron-to-...,https://www.cnbc.com,CNBC
4,‘I see more fear than any time in my business ...,“‘What the world is missing today is hope. I s...,,"[Barbara Kollmeyer, Larry Fink]",https://www.marketwatch.com/story/i-see-more-f...,https://www.marketwatch.com/story/i-see-more-f...,https://www.marketwatch.com,MarketWatch
5,Ford accuses UAW of holding contract negotiati...,Ford CEO Jim Farley accused the United Auto Wo...,2023-09-29 00:00:00,[Andrew J. Hawkins],https://www.theverge.com/2023/9/29/23896139/fo...,https://www.theverge.com/2023/9/29/23896139/fo...,https://www.theverge.com,The Verge
6,United Airlines Pilots to Get Pay Raise of as ...,"This copy is for your personal, non-commercial...",,[Alison Sider],https://www.wsj.com/business/airlines/united-a...,https://www.wsj.com/business/airlines/united-a...,https://www.wsj.com,The Wall Street Journal
7,Mortgage rates hit 23-year high: Freddie Mac,The average interest rate for a 30-year fixed-...,,[Javier Simon],https://www.foxbusiness.com/personal-finance/m...,https://www.foxbusiness.com/personal-finance/m...,https://www.foxbusiness.com,Fox Business
8,Are you a robot?,Why did this happen?\n\nPlease make sure your ...,,[],,https://www.bloomberg.com/tosv2.html?vid=&uuid...,https://www.bloomberg.com,Bloomberg
9,"Dow sheds more than 100 points Friday, S&P 500...",The Dow Jones Industrial Average retreated on ...,2023-09-28 00:00:00,"[Alex Harring Sarah Min Pia Singh, Alex Harrin...",https://www.cnbc.com/2023/09/28/stock-market-t...,https://www.cnbc.com/2023/09/28/stock-market-t...,https://www.cnbc.com,CNBC


In [None]:
articles_df['publish_date'] = articles_df['publish_date'].astype(str)

In [None]:
articles_df.to_excel('articles_df.xlsx')

In [None]:
failed_parses_df = pd.DataFrame(failed_parses)
failed_parses_df

In [63]:
from bs4 import BeautifulSoup
from dateutil import parser

def extract_date_from_html(html):
    soup = BeautifulSoup(html, 'html.parser')
    # Try to extract date from meta tags
    for meta in soup.find_all('meta'):
        if 'property' in meta.attrs and meta.attrs['property'].lower() in ['article:published_time', 'og:published_time']:
            try:
                return parser.parse(meta.attrs['content'])
            except (ValueError, TypeError):
                continue  # if parsing fails, continue to the next meta tag
    # If no suitable meta tag is found, return None
    return None


def modified_scrape_news_from_feed(feed_url):
    articles = []
    failed_parses = []
    feed = feedparser.parse(feed_url)
    for i, entry in enumerate(feed.entries):
        if i == limit:
            break
        google_news_url = entry.link
        article_url = get_final_url(google_news_url)  # Resolve the Google News URL to the final article URL
        if article_url:
            downloaded_article = download_article(article_url)
            article = Article(article_url)
            article.set_html(downloaded_article)  # Set the HTML content manually
            try:
                article.parse()
                # Try to extract publication date from HTML if not found by newspaper
                if article.publish_date is None:
                    article.publish_date = extract_date_from_html(downloaded_article)
                # If still not found, fallback to feed date
                if article.publish_date is None:
                    article.publish_date = datetime.fromtimestamp(mktime(entry['published_parsed']))
                articles.append({
                    'title': article.title,
                    'text': article.text,
                    'publish_date': article.publish_date,
                    'authors': article.authors,
                    'canonical_link': article.canonical_link,
                    'feed_link': article_url,
                    'media_link': entry['source']['href'],
                    'media_title': entry['source']['title']
                })
            except Exception as err:
                logger.error(f"An unexpected error occurred for {article_url}: {err}")
        else:
            failed_parses.append({
                'title': entry.title,
                'text': None,
                'publish_date': datetime.fromtimestamp(mktime(entry['published_parsed'])),
                'authors': None,
                'canonical_link': None,
                'feed_link': article_url,
                'media_link': entry['source']['href'],
                'media_title': entry['source']['title'],
                'exception_class': None,
                'exception_text': None
            })
    return articles, failed_parses


# Execute the scraper with the modified function
feed_url = 'https://news.google.com/rss/topics/CAAqJggKIiBDQkFTRWdvSUwyMHZNRGx6TVdZU0FtVnVHZ0pWVXlnQVAB?hl=en-US&gl=US&ceid=US%3Aen'
limit = 20

modified_articles, modified_failed_parses = modified_scrape_news_from_feed(feed_url)

# Checking the proportion of None values in publish_date column after modification
none_dates_count = sum(1 for article in modified_articles if article['publish_date'] is None)
none_dates_proportion = none_dates_count / len(modified_articles) * 100
none_dates_proportion

ERROR:__main__:HTTP error occurred for https://www.nytimes.com/2023/09/29/business/economy/uaw-strike.html: 403 Client Error: Forbidden for url: https://www.nytimes.com/2023/09/29/business/economy/uaw-strike.html
ERROR:__main__:HTTP error occurred for https://www.nytimes.com/2023/09/29/business/economy/uaw-strike.html: 403 Client Error: Forbidden for url: https://www.nytimes.com/2023/09/29/business/economy/uaw-strike.html
ERROR:__main__:HTTP error occurred for https://www.nytimes.com/2023/09/29/business/economy/uaw-strike.html: 403 Client Error: Forbidden for url: https://www.nytimes.com/2023/09/29/business/economy/uaw-strike.html
ERROR:__main__:An unexpected error occurred for https://www.nytimes.com/2023/09/29/business/economy/uaw-strike.html: You must `download()` an article first!
ERROR:__main__:HTTP error occurred for https://www.nytimes.com/2023/09/29/business/interest-rates-energy-prices-stocks-economy.html: 403 Client Error: Forbidden for url: https://www.nytimes.com/2023/09/29

0.0

In [None]:
modified_articles

In [22]:
from newspaper import fulltext

feed = feedparser.parse(feed_url)
feed.entries[3].link

'https://news.google.com/rss/articles/CBMiX2h0dHBzOi8vd3d3LnRoZXZlcmdlLmNvbS8yMDIzLzkvMjkvMjM4OTYxMzkvZm9yZC11YXctc3RyaWtlLWNvbnRyYWN0LWhvc3RhZ2UtZXYtYmF0dGVyeS1mYWN0b3J50gEA?oc=5'

In [24]:
user_agent = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
headers = {'User-Agent': user_agent}

In [25]:
response = requests.get(feed.entries[3].link, headers=headers, timeout=10)

In [None]:
response.text

In [28]:
feed.entries[3]

{'title': 'Ford accuses UAW of holding contract negotiations “hostage” over EV battery plants - The Verge',
 'title_detail': {'type': 'text/plain',
  'language': None,
  'base': 'https://news.google.com/rss/topics/CAAqJggKIiBDQkFTRWdvSUwyMHZNRGx6TVdZU0FtVnVHZ0pWVXlnQVAB?hl=en-US&gl=US&ceid=US%3Aen',
  'value': 'Ford accuses UAW of holding contract negotiations “hostage” over EV battery plants - The Verge'},
 'links': [{'rel': 'alternate',
   'type': 'text/html',
   'href': 'https://news.google.com/rss/articles/CBMiX2h0dHBzOi8vd3d3LnRoZXZlcmdlLmNvbS8yMDIzLzkvMjkvMjM4OTYxMzkvZm9yZC11YXctc3RyaWtlLWNvbnRyYWN0LWhvc3RhZ2UtZXYtYmF0dGVyeS1mYWN0b3J50gEA?oc=5'}],
 'link': 'https://news.google.com/rss/articles/CBMiX2h0dHBzOi8vd3d3LnRoZXZlcmdlLmNvbS8yMDIzLzkvMjkvMjM4OTYxMzkvZm9yZC11YXctc3RyaWtlLWNvbnRyYWN0LWhvc3RhZ2UtZXYtYmF0dGVyeS1mYWN0b3J50gEA?oc=5',
 'id': '2464198489',
 'guidislink': False,
 'published': 'Fri, 29 Sep 2023 17:55:52 GMT',
 'published_parsed': time.struct_time(tm_year=2023, tm_mo

In [36]:
articles_df = pd.DataFrame(articles)
articles_df

Unnamed: 0,title,text,publish_date,authors,canonical_link,feed_link,media_link,media_title
0,"United Auto Workers strikes spread as 7,000 mo...",DETROIT (AP) — The United Auto Workers union e...,2023-09-29 12:29:44,[],https://apnews.com/article/autoworkers-detroit...,https://apnews.com/article/autoworkers-detroit...,https://apnews.com,The Associated Press
1,National Coffee Day 2023: Free Coffee Today Fr...,Americans drink about 517 million cups of coff...,,"[Katie Teague, Katie Is A Writer Covering All ...",https://www.cnet.com/culture/national-coffee-d...,https://www.cnet.com/culture/national-coffee-d...,https://www.cnet.com,CNET
2,"S&P 500 dips after US inflation data, ending w...",FILE PHOTO:Traders work on the floor of the Ne...,2023-09-29 00:00:00,[Lewis Krauskopf Shashwat Chauhan Shristi Acha...,https://www.reuters.com/markets/us/futures-cli...,https://www.reuters.com/markets/us/futures-cli...,https://www.reuters.com,Reuters
3,Ford accuses UAW of holding contract negotiati...,Ford CEO Jim Farley accused the United Auto Wo...,2023-09-29 00:00:00,[Andrew J. Hawkins],https://www.theverge.com/2023/9/29/23896139/fo...,https://www.theverge.com/2023/9/29/23896139/fo...,https://www.theverge.com,The Verge
4,‘I see more fear than any time in my business ...,“‘What the world is missing today is hope. I s...,,"[Barbara Kollmeyer, Larry Fink]",https://www.marketwatch.com/story/i-see-more-f...,https://www.marketwatch.com/story/i-see-more-f...,https://www.marketwatch.com,MarketWatch
5,United Airlines pilots approve new contract wi...,United Airlines pilots approved a new contract...,2023-09-29 00:00:00,[Leslie Josephs],https://www.cnbc.com/2023/09/29/united-airline...,https://www.cnbc.com/2023/09/29/united-airline...,https://www.cnbc.com,CNBC
6,Blue Apron shares surge on sale to Wonder Group,Shares of meal kit company Blue Apron skyrocke...,,[Joe Toppe],https://www.foxbusiness.com/markets/blue-apron...,https://www.foxbusiness.com/markets/blue-apron...,https://www.foxbusiness.com,Fox Business
7,Mortgage rates hit 23-year high: Freddie Mac,The average interest rate for a 30-year fixed-...,,[Javier Simon],https://www.foxbusiness.com/personal-finance/m...,https://www.foxbusiness.com/personal-finance/m...,https://www.foxbusiness.com,Fox Business
8,Are you a robot?,Why did this happen?\n\nPlease make sure your ...,,[],,https://www.bloomberg.com/tosv2.html?vid=&uuid...,https://www.bloomberg.com,Bloomberg
9,Shares of biotech startup Structure Therapeuti...,Shares of Structure Therapeutics rose more tha...,2023-09-29 00:00:00,[Annika Kim Constantino],https://www.cnbc.com/2023/09/29/obesity-pill-d...,https://www.cnbc.com/2023/09/29/obesity-pill-d...,https://www.cnbc.com,CNBC


In [35]:
failed_parses_df = pd.DataFrame(failed_parses)
failed_parses_df

Unnamed: 0,title,text,publish_date,authors,canonical_link,feed_link,media_link,media_title,exception_class,exception_text
0,Government shutdown live updates: No deal in s...,,2023-09-29 16:21:40,,,,https://www.washingtonpost.com,The Washington Post,,


In [6]:
for el in failed_parses_df['canonical_link'].values:
    print(el)
    print('=' * 10)

451 Client Error: Unavailable For Legal Reasons for url: https://ktla.com/news/local-news/gas-prices-creep-up-again-but-help-may-be-on-the-way/
403 Client Error: Forbidden for url: https://www.wsj.com/world/china/evergrandes-new-woes-signal-long-slog-for-chinas-economy-a5403b7a
403 Client Error: Forbidden for url: https://www.wsj.com/finance/regulation/secs-whatsapp-fines-spread-further-across-wall-street-f1f097ea
403 Client Error: Forbidden for url: https://www.wsj.com/real-estate/luxury-homes/barstoolsdave-portnoy-buys-nantucket-home-for-a-record-42-million-c1c217a7
403 Client Error: Forbidden for url: https://www.axios.com/2023/09/29/sp500-spy-index-today-stock-updates
451 Client Error: Unavailable For Legal Reasons for url: https://ktla.com/news/powerball-ticket-worth-nearly-800k-sold-in-california/
403 Client Error: Forbidden for url: https://www.investors.com/market-trend/stock-market-today/dow-jones-gains-as-kevin-mccarthy-makes-shutdown-pledge-this-warren-buffett-stock-nears-en

In [9]:
# Required libraries, data, and regular expression patterns

import re

# Sample exception strings
exceptions = [
    "403 Client Error: Forbidden for url: https://www.axios.com/2023/09/20/general-motors-stellantis-uaw-strike",
    "403 Client Error: Forbidden for url: https://www.sfgate.com/tech/article/instacart-ceo-ipo-stock-mehta-18378600.php",
    "403 Client Error: Forbidden for url: https://www.thestreet.com/automotive/general-motors-delivers-hard-nosed-message-to-uaw-workers",
    "403 Client Error: Forbidden for url: https://www.wsj.com/finance/stocks/klaviyo-shares-surge-in-trading-debut-6928fff2",
    "429 Client Error: Unknown Error for url: https://thehill.com/homenews/education/4214939-biden-administration-cancels-37-million-in-student-loans-for-former-university-of-phoenix-students/",
    "HTTPSConnectionPool(host='www.bbc.com', port=443): Read timed out. (read timeout=7)",
    "403 Client Error: Forbidden for url: https://www.wsj.com/business/elon-musk-spacex-sues-justice-department-hiring-discrimination-edd38f7e",
    "403 Client Error: Max restarts limit reached for url: https://www.forbes.com/sites/anafaguy/2023/09/20/uber-eats-will-accept-food-stamps-beginning-next-year/",
    "403 Client Error: Forbidden for url: https://www.thestreet.com/memestocks/amc/amc-stock-the-3-strongest-buy-signals",
    "403 Client Error: Forbidden for url: https://www.wsj.com/economy/central-banking/federal-reserve-powell-interest-rates-ba600bf0",
    "403 Client Error: Forbidden for url: https://www.wsj.com/business/earnings/general-mills-gis-q1-earnings-report-2024-194d129f",
    "('Connection aborted.', ConnectionResetError(104, 'Connection reset by peer'))"
]

# Regular expression patterns
text_without_url_pattern = r'^(.*)(?=https?:\/\/)'
url_pattern = r'https?:\/\/[^\s]+'

# Updated extraction logic
output = []

for exc in exceptions:
    # Extract URL
    url_match = re.search(url_pattern, exc)
    
    if url_match:
        # Extract text without a URL
        text_match = re.search(text_without_url_pattern, exc)
        if text_match:
            output.append(("Text Without URL:", text_match.group().strip()))
        output.append(("URL:", url_match.group()))
    else:
        output.append(("Text Without URL:", exc.strip()))
    
    output.append('-'*50)

output


[('Text Without URL:', '403 Client Error: Forbidden for url:'),
 ('URL:',
  'https://www.axios.com/2023/09/20/general-motors-stellantis-uaw-strike'),
 '--------------------------------------------------',
 ('Text Without URL:', '403 Client Error: Forbidden for url:'),
 ('URL:',
  'https://www.sfgate.com/tech/article/instacart-ceo-ipo-stock-mehta-18378600.php'),
 '--------------------------------------------------',
 ('Text Without URL:', '403 Client Error: Forbidden for url:'),
 ('URL:',
  'https://www.thestreet.com/automotive/general-motors-delivers-hard-nosed-message-to-uaw-workers'),
 '--------------------------------------------------',
 ('Text Without URL:', '403 Client Error: Forbidden for url:'),
 ('URL:',
  'https://www.wsj.com/finance/stocks/klaviyo-shares-surge-in-trading-debut-6928fff2'),
 '--------------------------------------------------',
 ('Text Without URL:', '429 Client Error: Unknown Error for url:'),
 ('URL:',
  'https://thehill.com/homenews/education/4214939-biden-

In [20]:
feed = feedparser.parse(feed_url)

In [6]:
for key, value in feed.entries[3].items():
    print(key)
    print(value)
    print('=' * 10)

title
Disney to Invest $60 Billion in Theme Parks, Cruises Over Next Decade - The Wall Street Journal
title_detail
{'type': 'text/plain', 'language': None, 'base': 'https://news.google.com/rss/topics/CAAqJggKIiBDQkFTRWdvSUwyMHZNRGx6TVdZU0FtVnVHZ0pWVXlnQVAB?hl=en-US&gl=US&ceid=US%3Aen', 'value': 'Disney to Invest $60 Billion in Theme Parks, Cruises Over Next Decade - The Wall Street Journal'}
links
[{'rel': 'alternate', 'type': 'text/html', 'href': 'https://news.google.com/rss/articles/CBMib2h0dHBzOi8vd3d3Lndzai5jb20vYnVzaW5lc3MvbWVkaWEvZGlzbmV5LXRvLWludmVzdC02MC1iaWxsaW9uLWluLXRoZW1lLXBhcmtzLWNydWlzZXMtb3Zlci1uZXh0LWRlY2FkZS02ZmM4NzQyNtIBAA?oc=5'}]
link
https://news.google.com/rss/articles/CBMib2h0dHBzOi8vd3d3Lndzai5jb20vYnVzaW5lc3MvbWVkaWEvZGlzbmV5LXRvLWludmVzdC02MC1iaWxsaW9uLWluLXRoZW1lLXBhcmtzLWNydWlzZXMtb3Zlci1uZXh0LWRlY2FkZS02ZmM4NzQyNtIBAA?oc=5
id
2447222822
guidislink
False
published
Tue, 19 Sep 2023 13:49:00 GMT
published_parsed
time.struct_time(tm_year=2023, tm_mon=9, tm_mday=

In [54]:
import sys

import feedparser
import newspaper

from datetime import datetime
from time import mktime

import pandas as pd

feed_url = 'https://news.google.com/rss/topics/CAAqJggKIiBDQkFTRWdvSUwyMHZNRGx6TVdZU0FtVnVHZ0pWVXlnQVAB?hl=en-US&gl=US&ceid=US%3Aen'

import newspaper
import feedparser

limit = 20

def scrape_news_from_feed(feed_url):
    articles = []
    failed_parses = []
    feed = feedparser.parse(feed_url)
    for i, entry in enumerate(feed.entries):
        print(i)
        if i == limit:
            break
        # create a newspaper article object
        article = newspaper.Article(entry.link)
        # download and parse the article
        article.download()
        try:
            article.parse()
            articles.append({
                'title': article.title,
                'text': article.text, 
                'publish_date': article.publish_date,
                'authors': article.authors,
                'canonical_link': article.canonical_link, 
                'feed_link': feed.entries[i].link, 
                'media_link': feed.entries[i]['source']['href'], 
                'media_title': feed.entries[i]['source']['title']
                })
        except newspaper.article.ArticleException:
            err = sys.exc_info()
            failed_parses.append({
                'title': feed.entries[i].title,
                'text': None, 
                'publish_date': datetime.fromtimestamp(mktime(feed.entries[i]['published_parsed'])),
                'authors': None,
                'canonical_link': article.download_exception_msg, # after first two :: regex 403 Client Error: Forbidden for url:
                'feed_link': feed.entries[i].link, 
                'media_link': feed.entries[i]['source']['href'], 
                'media_title': feed.entries[i]['source']['title'],
                'exceprion_class': None, # exception class newspaper.article.ArticleException or err
                'exceprion_text': article.download_exception_msg # first two :: regex 403 Client Error: Forbidden for url:
                })
            for el in err:
                print(el)
        except Exception as err:
            # catch any other exceprion
            pass
    return articles, failed_parses


feed_url = feed_url
articles, failed_parses = scrape_news_from_feed(feed_url)

# print the extracted articles
# for article in articles:
#     print('Title:', article['title'])
#     print('Author:', article['author'])
#     print('Publish Date:', article['publish_date'])
#     print('Content:', article['content'])
#     print()

0
1
2
3
4
5
6
7
<class 'newspaper.article.ArticleException'>
Article `download()` failed with 403 Client Error: Forbidden for url: https://www.wsj.com/business/airlines/united-airlines-pilots-to-get-pay-raise-of-as-much-as-40-c6e15913 on URL https://news.google.com/rss/articles/CBMiZ2h0dHBzOi8vd3d3Lndzai5jb20vYnVzaW5lc3MvYWlybGluZXMvdW5pdGVkLWFpcmxpbmVzLXBpbG90cy10by1nZXQtcGF5LXJhaXNlLW9mLWFzLW11Y2gtYXMtNDAtYzZlMTU5MTPSAQA?oc=5
<traceback object at 0x7f80da1f7240>
8
9
10
11
12
13
14
15
16
17
18
19
20


In [55]:
old_articles_df = pd.DataFrame(articles)
old_articles_df

Unnamed: 0,title,text,publish_date,authors,canonical_link,feed_link,media_link,media_title
0,U.A.W. Expands Strikes at Ford and G.M.,The United Automobile Workers union increased ...,2023-09-29 14:45:54+00:00,"[Neal E. Boudette, More About Neal E. Boudette]",https://www.nytimes.com/2023/09/29/business/ec...,https://news.google.com/rss/articles/CBMiQ2h0d...,https://www.nytimes.com,The New York Times
1,"National Coffee Day 2023: Dunkin', Krispy Krem...","Whether you call it joe, java, jitter juice, b...",,[],https://www.usatoday.com/story/money/food/2023...,https://news.google.com/rss/articles/CBMicWh0d...,https://www.usatoday.com,USA TODAY
2,Live updates House fails to pass short-term fu...,What to know about a possible government shutd...,2023-09-29 11:47:28.042000+00:00,[],https://www.washingtonpost.com/politics/2023/0...,https://news.google.com/rss/articles/CBMiT2h0d...,https://www.washingtonpost.com,The Washington Post
3,"S&P 500 dips after US inflation data, ending w...",FILE PHOTO:Traders work on the floor of the Ne...,2023-09-29 20:35:28+00:00,[Lewis Krauskopf Shashwat Chauhan Shristi Acha...,https://www.reuters.com/markets/us/futures-cli...,https://news.google.com/rss/articles/CBMiamh0d...,https://www.reuters.com,Reuters
4,Blue Apron to be acquired by Wonder Group for ...,In this article APRN Follow your favorite stoc...,2023-09-29 19:37:57+00:00,[Drew Richardson],https://www.cnbc.com/2023/09/29/blue-apron-to-...,https://news.google.com/rss/articles/CBMiXmh0d...,https://www.cnbc.com,CNBC
5,‘I see more fear than any time in my business ...,“‘What the world is missing today is hope. I s...,,"[Barbara Kollmeyer, Larry Fink]",https://www.marketwatch.com/story/i-see-more-f...,https://news.google.com/rss/articles/CBMieGh0d...,https://www.marketwatch.com,MarketWatch
6,Ford accuses UAW of holding contract negotiati...,Ford CEO Jim Farley accused the United Auto Wo...,2023-09-29 17:55:52.914000+00:00,[Andrew J. Hawkins],https://www.theverge.com/2023/9/29/23896139/fo...,https://news.google.com/rss/articles/CBMiX2h0d...,https://www.theverge.com,The Verge
7,Collection of 100 classic cars up for auction ...,An auction in Iowa this weekend is a car lover...,,[],https://www.usatoday.com/story/money/cars/2023...,https://news.google.com/rss/articles/CBMiYmh0d...,https://www.usatoday.com,USA TODAY
8,Mortgage rates hit 23-year high: Freddie Mac,The average interest rate for a 30-year fixed-...,,[Javier Simon],https://www.foxbusiness.com/personal-finance/m...,https://news.google.com/rss/articles/CBMiVGh0d...,https://www.foxbusiness.com,Fox Business
9,"Nasdaq rises, but stocks give up gains to clos...",Stocks retreated Friday to cap a brutal month ...,,[Hamza Shaban],https://finance.yahoo.com/news/nasdaq-rises-bu...,https://news.google.com/rss/articles/CBMigQFod...,https://finance.yahoo.com,Yahoo Finance


In [61]:
old_articles_df['publish_date'] = old_articles_df['publish_date'].astype(str)
old_articles_df.to_excel('old_articles_df.xlsx')

In [56]:
old_failed_parses_df = pd.DataFrame(failed_parses)
old_failed_parses_df

Unnamed: 0,title,text,publish_date,authors,canonical_link,feed_link,media_link,media_title,exceprion_class,exceprion_text
0,United Airlines Pilots to Get Pay Raise of as ...,,2023-09-29 16:19:00,,403 Client Error: Forbidden for url: https://w...,https://news.google.com/rss/articles/CBMiZ2h0d...,https://www.wsj.com,The Wall Street Journal,,403 Client Error: Forbidden for url: https://w...


In [62]:
old_failed_parses_df['publish_date'] = old_failed_parses_df['publish_date'].astype(str)
old_failed_parses_df.to_excel('old_failed_parses_df.xlsx')

In [50]:
articles_df

Unnamed: 0,title,text,publish_date,authors,canonical_link,feed_link,media_link,media_title
0,UAW announces new strikes at GM and Ford plant...,DETROIT – The United Auto Workers union will e...,2023-09-29 00:00:00,[Michael Wayland],https://www.cnbc.com/2023/09/29/uaw-announces-...,https://www.cnbc.com/2023/09/29/uaw-announces-...,https://www.cnbc.com,CNBC
1,"National Coffee Day 2023: Dunkin', Krispy Krem...","Whether you call it joe, java, jitter juice, b...",2023-09-28 00:00:00,[],https://www.usatoday.com/story/money/food/2023...,https://www.usatoday.com/story/money/food/2023...,https://www.usatoday.com,USA TODAY
2,Citi’s government shutdown playbook: which sto...,Early Friday futures skirmishing suggests Wall...,,[Jamie Chisholm],https://www.marketwatch.com/story/volatility-g...,https://www.marketwatch.com/story/volatility-g...,https://www.marketwatch.com,MarketWatch
3,"S&P 500 dips after US inflation data, ending w...",FILE PHOTO:Traders work on the floor of the Ne...,2023-09-29 00:00:00,[Lewis Krauskopf Shashwat Chauhan Shristi Acha...,https://www.reuters.com/markets/us/futures-cli...,https://www.reuters.com/markets/us/futures-cli...,https://www.reuters.com,Reuters
4,Blue Apron to be acquired by Wonder Group for ...,In this article APRN Follow your favorite stoc...,2023-09-29 00:00:00,[Drew Richardson],https://www.cnbc.com/2023/09/29/blue-apron-to-...,https://www.cnbc.com/2023/09/29/blue-apron-to-...,https://www.cnbc.com,CNBC
5,Ford accuses UAW of holding contract negotiati...,Ford CEO Jim Farley accused the United Auto Wo...,2023-09-29 00:00:00,[Andrew J. Hawkins],https://www.theverge.com/2023/9/29/23896139/fo...,https://www.theverge.com/2023/9/29/23896139/fo...,https://www.theverge.com,The Verge
6,Are you a robot?,Why did this happen?\n\nPlease make sure your ...,,[],,https://www.bloomberg.com/tosv2.html?vid=&uuid...,https://www.bloomberg.com,Bloomberg
7,United Airlines Pilots to Get Pay Raise of as ...,"This copy is for your personal, non-commercial...",,[Alison Sider],https://www.wsj.com/business/airlines/united-a...,https://www.wsj.com/business/airlines/united-a...,https://www.wsj.com,The Wall Street Journal
8,Mortgage rates hit 23-year high: Freddie Mac,The average interest rate for a 30-year fixed-...,,[Javier Simon],https://www.foxbusiness.com/personal-finance/m...,https://www.foxbusiness.com/personal-finance/m...,https://www.foxbusiness.com,Fox Business
9,"Citigroup CEO Jane Fraser on layoffs, major ov...",Citigroup CEO Jane Fraser said on Friday that ...,2023-09-29 00:00:00,[],https://nypost.com/2023/09/29/citigroup-ceo-ja...,https://nypost.com/2023/09/29/citigroup-ceo-ja...,https://nypost.com,New York Post
