# Investing.com News Scraper

## Import Libraries

In [None]:
import math
import sys
import requests
import datetime
from bs4 import BeautifulSoup as bs

## Define Classes

In [None]:
class NewsNode:
    """A single news item: title, link, author, publication date and content.

    All values are stored exactly as given. ``date`` is normally a
    ``datetime.datetime``; it may also be ``None``, which is what
    ``NewsScraper.parse_date`` returns when a listing carries no date text.
    """

    def __init__(self, title, link, author, date, content):
        self.title = title
        self.link = link
        self.author = author
        self.date = date
        self.content = content

    def __str__(self):
        """Return a multi-line, human-readable rendering of the item."""
        # A missing date must not make printing the item fail.
        if self.date is None:
            date_text = "unknown date"
        else:
            date_text = self.date.strftime('%Y-%m-%d')

        return (
            f"{self.title}\n- by {self.author} [{date_text}]\n"
            f"{self.link}\n"
            f"{self.content}"
        )

    def __repr__(self):
        """Return the same text as ``str(self)``."""
        return str(self)

In [None]:
class NewsCollection:
    """Ordered container of the NewsNode items gathered for one company."""

    def __init__(self, company):
        # Items are kept in insertion order; the list starts out empty.
        self.company, self.news = company, []

    def news_count(self):
        """Return the number of items currently held."""
        items = self.news
        return len(items)

    def add_news(self, news_node):
        """Append ``news_node`` to the collection.

        The argument is asserted to be a ``NewsNode`` instance.
        """
        assert isinstance(news_node, NewsNode)
        held = self.news
        held.append(news_node)

In [None]:
class NewsScraper:
    """Scrape a company's news listing on investing.com into a NewsCollection."""

    # Browser-like User-Agent sent with every HTTP request.
    headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.102 Safari/537.36'}

    # Results per listing page, used to turn the result count into a page count.
    _RESULTS_PER_PAGE = 10

    @staticmethod
    def parse_date(date_str, now=None):
        """Convert the date text of a news item into a midnight ``datetime``.

        Args:
            date_str: Raw date text. Its first three characters are discarded
                before parsing (presumably a separator such as " - " -- TODO
                confirm against the page markup). The remainder is either a
                relative age ("<N> <unit> ago") or an absolute date in
                ``"%b %d, %Y"`` form.
            now: Reference time for relative ages; defaults to the current
                local time. Mainly useful to make results reproducible.

        Returns:
            A ``datetime.datetime`` truncated to the day, or ``None`` when
            ``date_str`` is empty.

        Raises:
            ValueError: If the amount of a relative age is not an integer or
                an absolute date does not match the expected format.
        """
        if not date_str:
            return None

        date_str = date_str[3:]

        if "ago" in date_str:
            tokens = date_str.split()
            amount = int(tokens[0])
            unit = tokens[1].lower() if len(tokens) > 1 else ""

            # Honour the unit: treating "30 minutes ago" as 30 hours would
            # push the item onto the wrong day.
            if unit.startswith("sec"):
                elapsed = datetime.timedelta(seconds=amount)
            elif unit.startswith("min"):
                elapsed = datetime.timedelta(minutes=amount)
            elif unit.startswith("day"):
                elapsed = datetime.timedelta(days=amount)
            elif unit.startswith("week"):
                elapsed = datetime.timedelta(weeks=amount)
            else:
                # Hours, and any unrecognised unit, are counted as hours.
                elapsed = datetime.timedelta(hours=amount)

            if now is None:
                now = datetime.datetime.now()
            publish_time = now - elapsed

            return datetime.datetime(publish_time.year, publish_time.month, publish_time.day)

        dt = datetime.datetime.strptime(date_str.strip(), "%b %d, %Y")
        return datetime.datetime(dt.year, dt.month, dt.day)

    @staticmethod
    def _first_child(tag):
        """Return the first child of ``tag``, or "" when it has no children."""
        children = tag.contents
        return children[0] if children else ""

    @classmethod
    def _count_pages(cls, soup):
        """Return the number of listing pages advertised by the first page.

        The last whitespace-separated token of the second pagination link's
        ``title`` is read as the total result count. When that information is
        missing or not numeric, only the already-loaded first page is assumed.
        """
        page_links = soup.find_all("a", {"class": "pagination"})
        try:
            n_results = int(page_links[1]['title'].split()[-1])
        except (IndexError, KeyError, ValueError):
            return 1
        return math.ceil(n_results / cls._RESULTS_PER_PAGE)

    @classmethod
    def fetch(cls, company, stop_date=None, verbose=False):
        """Download the news listing of ``company`` page by page.

        Args:
            company: Company slug used in the URL
                (``https://www.investing.com/equities/<company>-news``).
            stop_date: Optional ``datetime.datetime``. Scraping stops at the
                first item dated before it. ``None`` means every page is read.
            verbose: When true, progress messages are printed.

        Returns:
            A ``NewsCollection`` with the items found, or ``None`` when the
            request for the first page does not return HTTP 200.

        Raises:
            TypeError: If ``stop_date`` is neither ``None`` nor a
                ``datetime.datetime``.
        """
        if stop_date is not None and not isinstance(stop_date, datetime.datetime):
            raise TypeError("stop_date must be a datetime.datetime or None")

        news_collection = NewsCollection(company)

        link = f"https://www.investing.com/equities/{company}-news"
        response = requests.get(link, headers=cls.headers)

        if response.status_code != 200:
            if verbose:
                print(f"Request Failed [{response.status_code}]: {link}")
            return None

        # NOTE(review): no parser is named, so BeautifulSoup picks the best
        # one installed; results may differ between environments.
        soup = bs(response.content)
        n_pages = cls._count_pages(soup)

        if verbose:
            if stop_date is None:
                print(f"Processing news for {company} [no stop date]")
            else:
                print(f"Processing news for {company} [since {stop_date.strftime('%Y-%m-%d')}]")

        stop_flag = False
        for pc in range(1, n_pages + 1):
            # Set once an item older than stop_date has been seen.
            if stop_flag:
                break

            link = f"https://www.investing.com/equities/{company}-news/{pc}"

            if verbose and (pc % 10 == 0 or pc == n_pages):
                print(f"  Processing page {pc}...")

            # Page 1 was already downloaded above; only later pages are fetched here.
            if pc > 1:
                response = requests.get(link, headers=cls.headers)

                if response.status_code != 200:
                    if verbose:
                        print(f"Request Failed [{response.status_code}]: {link}")
                    continue

                soup = bs(response.content)

            for news in soup.select("#leftColumn .textDiv"):
                news_a = news.find_all('a')[0]
                news_title = news_a['title']
                news_link = news_a['href']

                # Site-relative links are made absolute.
                if news_link[0] == '/':
                    news_link = "https://www.investing.com" + news_link

                news_div = news.select(".articleDetails")[0]
                news_author = cls._first_child(news_div.find_all('span')[0])
                news_author = news_author.replace("By ", "")

                news_date = cls.parse_date(cls._first_child(news_div.select(".date")[0]))

                # Items without a date never trigger the stop condition.
                if stop_date is not None and news_date is not None and news_date < stop_date:
                    stop_flag = True
                    break

                news_content = cls._first_child(news.find_all('p')[0])

                news_node = NewsNode(news_title, news_link, news_author, news_date, news_content)
                news_collection.add_news(news_node)

        if verbose:
            print(f"Done. {news_collection.news_count()} items retrieved.")
            print()

        return news_collection

## Fetch News for Companies

In [None]:
# Company slugs as they appear in the investing.com equities URL.
companies = ["pepsico", "disney", "american-airlines-group"]

In [None]:
# One fetch result per entry of `companies`, in the same order.
news_lib = []

# Items dated before 2011-01-01 end the scrape for a company.
dt = datetime.datetime(2011, 1, 1)
for company in companies:
    # NOTE(review): fetch() has no return statement on its non-200 path, so
    # the appended value can be None if the first request fails.
    news_collection = NewsScraper.fetch(company, stop_date=dt, verbose=True)
    news_lib.append(news_collection)

In [None]:
# Date of the last item stored for the first company
# (presumably the oldest one, if the site lists newest first -- verify).
news_lib[0].news[-1].date

In [None]:
# Print up to the first 20 items collected for the first company.
# Slicing (rather than indexing 0..19) avoids an IndexError when fewer
# than 20 items were retrieved.
for item in news_lib[0].news[:20]:
    print(item)
    print()