In [1]:
import pandas as pd
import numpy as np
import requests
from bs4 import BeautifulSoup
import math
import os
import time
import re
import html

#conda activate my_generic_1

In [2]:
#ArticleReader
class Article_Reader():
    def __init__(self):
        self.article_data = pd.DataFrame()
        
    def show_headlines(self, num):
        if not self.article_data.empty:
            print(self.article_data[['heading','href']].head(num))
            return
        print("No articles loaded.")
        
    def read_article(self, link):
        try:
            response = requests.get(link['href'])
            soup = BeautifulSoup(response.content, "html.parser")
        except:
            print("Exceptional URL: "+link['href'])
            return 0
        article = soup.find(re.compile("article"))
        bodies_found = 0
        link['text'] = link['heading']
        if article:
            link['text'] += article.text
            return 1
        else:
            article_elems = soup.find("div",class_=re.compile("article"))
            if article_elems is None:
                print("No article found: " +link['href'])
                return 0
            for a in article_elems:
                txt = self.clean_text(a.text)
                # Para limit of 10 words
                para_condition = len(txt.split(' ')) > 10
                if para_condition:
                    link['text'] += txt+' '
            return 1
        print("No article element found.")
        return 0
        
    def check_article(self, heading, link, main_url):
        link['href'] = link['href'] if not link['href'].startswith('/') else main_url+link['href']
        # Heading limit of at least 5 words
        heading_condition = len(heading.split(' '))<5
        if heading_condition:
            return 0
        res = self.read_article(link)
        return res
    
    def clean_text(self, txt):
        _re_space = re.compile(r"\s+")
        txt = _re_space.sub(" ", txt).strip()
        txt = txt.replace('\n',' ')
        return txt
    
    def load_headlines(self, urls):
        for url in urls:
            response = requests.get(url)
            soup = BeautifulSoup(response.content, "html.parser")
            articles = soup.find_all("a", class_=re.compile('head'))
            print("Loading headline articles: ", url)
            print("Found a: "+str(len(articles)))
            # Article limit of at least 5 per home link
            if len(articles) < 20:
                found = soup.find_all('a', href=True)
                articles+=found
                print("Other links found: "+str(len(found)))
                        
            for article in articles:
                article_name = self.clean_text(article.text)
                info_dict={'href':article['href'], 'heading': article_name, 'text':''}
                read = self.check_article(article_name, info_dict, url)
                if read:
                    info_dict = {k:[v] for k,v in info_dict.items()}
                    self.article_data = pd.concat([self.article_data, pd.DataFrame(info_dict)], ignore_index=True)
        return self.article_data
    
    def save_corpus(self, corpus_name, path):
        self.article_data.to_csv(path+corpus_name+'_article_corpus.csv')
        
        
    

In [3]:
ar = Article_Reader()
output_path = 'output/'
corpus_name = 'financial'
financial_report_links = [
    "https://www.barrons.com",
    "https://www.bloomberg.com",
    "https://www.reuters.com",
    "https://www.ft.com",
    "https://www.marketwatch.com/markets",
    "https://finance.yahoo.com/",
    "https://www.investopedia.com/markets",
    "https://www.economist.com/",
    "https://www.economist.com/weeklyedition/2023-05-20",
    "https://www.theguardian.com/us/business",
    "https://www.theguardian.com/business/stock-markets",
    "https://www.morningstar.com",
    "https://www.fool.com"
]

found_headlines = ar.load_headlines(financial_report_links)
ar.save_corpus(corpus_name, output_path)
ar.show_headlines(5)

Loading headline articles:  https://www.barrons.com
Found a: 3
Other links found: 14
Loading headline articles:  https://www.bloomberg.com
Found a: 0
Other links found: 4
Loading headline articles:  https://www.reuters.com
Found a: 38
Loading headline articles:  https://www.ft.com
Found a: 226
Loading headline articles:  https://www.marketwatch.com/markets
Found a: 0
Other links found: 367
Loading headline articles:  https://finance.yahoo.com/
Found a: 21
Loading headline articles:  https://www.investopedia.com/markets
Found a: 1
Other links found: 215
Exceptional URL: #
Loading headline articles:  https://www.economist.com/
Found a: 11
Other links found: 224
No article found: https://www.economist.com//weeklyedition/2023-05-20
No article found: https://www.economist.com//economics-a-to-z/
No article found: https://www.economist.com//economics-a-to-z/
No article found: https://www.economist.com//weeklyedition/2023-05-20
No article found: https://www.economist.com//special-report/2023-0

No article found: https://www.economist.com/weeklyedition/2023-05-20/special-report/2023/05/16/video-insights-from-the-author
No article found: https://www.economist.com/weeklyedition/2023-05-20/business/2023/05/16/businesses-are-in-for-a-mighty-debt-hangover
No article found: https://www.economist.com/weeklyedition/2023-05-20/business/2023/05/14/the-aviation-industry-wants-to-be-net-zero-but-not-yet
No article found: https://www.economist.com/weeklyedition/2023-05-20/business/2023/05/18/the-wind-turbine-industry-should-be-booming-why-isnt-it
No article found: https://www.economist.com/weeklyedition/2023-05-20/business/2023/05/18/mukesh-ambani-returns-to-the-spotlight
No article found: https://www.economist.com/weeklyedition/2023-05-20/business/2023/05/18/americas-culture-wars-threaten-its-single-market
No article found: https://www.economist.com/weeklyedition/2023-05-20/finance-and-economics/2023/05/18/the-financial-system-is-slipping-into-state-control
No article found: https://www.e

In [4]:
print("Article Corpus Size: "+str(found_headlines.shape[0]))
found_headlines.head()

Article Corpus Size: 446


Unnamed: 0,href,heading,text
0,https://www.barrons.com/articles/tesla-stock-p...,Tesla Stock's Twitter Overhang Is Gone. Here's...,Tesla Stock's Twitter Overhang Is Gone. Here's...
1,https://www.barrons.com/articles/washington-de...,The FTC Is Coming Down Hard on Deal Making. Wh...,The FTC Is Coming Down Hard on Deal Making. Wh...
2,https://www.barrons.com/articles/berkshire-sto...,Berkshire Buys Still Get a Buffett Bounce—but ...,Berkshire Buys Still Get a Buffett Bounce—but ...
3,https://www.barrons.com/articles/tesla-stock-p...,Tesla Stock's Twitter Overhang Is Gone. Here's...,Tesla Stock's Twitter Overhang Is Gone. Here's...
4,https://www.barrons.com/articles/washington-de...,The FTC Is Coming Down Hard on Deal Making. Wh...,The FTC Is Coming Down Hard on Deal Making. Wh...
