# Run this in google colab
# Things to set up
1. Add FINNHUB_API_KEY to Secrets
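2. Add the accepted stocks JSON file to the Google Drive folder that `file_location` points to in the last cell (note: the code currently loads `accepted_stocks2.json`, not `accepted_stocks.json`)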

# Go to bottom to run code

In [None]:
!pip install feedparser finnhub-python



In [None]:
from datetime import datetime, timezone, timedelta
from abc import ABC, abstractmethod
from google.colab import userdata, drive
from typing import Optional
from tqdm import tqdm
import yfinance as yf
import feedparser
import requests
import finnhub
import time
import json
import sys
import os



```
# This is formatted as code
```

# Collecting Data from Yahoo and Zach Research (Finnhub)

In [57]:
class NewsSource(ABC):
    """
    Abstract base class for fetching news for stock symbols.

    Holds the path to a JSON file listing the accepted stocks and the
    dictionary built from it. Concrete sources implement ``get_news``;
    inheriting from ``ABC`` makes ``@abstractmethod`` actually enforce that.
    """
    def __init__(self, accepted_stocks_location):
        """
        accepted_stocks_location: path to the accepted stocks JSON file.
        The file is not read here; see ``_load_stocks``.
        """
        self.accepted_stocks = {}
        self.accepted_stocks_path = accepted_stocks_location

    def _load_stocks(self) -> None:
        """
        Load the accepted stocks JSON file into ``self.accepted_stocks``.

        The file must contain a list of objects that each have a 'symbol'
        key. The result maps each symbol to the rest of its object.
        """
        with open(self.accepted_stocks_path) as f:
            stock_entries = json.load(f)

        # make key the symbol (pop removes it from the stored metadata)
        self.accepted_stocks = {item.pop('symbol'): item for item in stock_entries}

    def _load_news(self, symbol: str) -> list:
        """
        Validate that ``symbol`` is an accepted stock.

        The base implementation only validates and returns None; subclasses
        call it first and then fetch the actual news.

        Raises:
            KeyError: the symbol is missing from ``self.accepted_stocks`` or
                has no metadata. KeyError is what a missing symbol raised
                before, now with a descriptive message.
        """
        # .get() so an unknown symbol reaches the descriptive error below
        # instead of failing with a bare KeyError on the lookup itself
        if not self.accepted_stocks.get(symbol):
            raise KeyError(f"Stock symbol {symbol!r} not found in accepted_stocks.json")

    def get_accepted_stocks(self) -> dict:
        """
        Return the accepted_stocks dictionary ({symbol: metadata}).
        It is empty until ``_load_stocks`` has run.
        """
        return self.accepted_stocks

    @abstractmethod
    def get_news(self, *args, **kwargs) -> dict:
        """
        Subclasses should implement their logic to fetch news (possibly using a shared approach).
        Return format: {symbol: [news_items]}
        """
        pass


In [69]:


class FinnhubNews(NewsSource):
    """
    News source backed by the Finnhub company-news endpoint.
    The API key is read from the Colab secret 'FINNHUB_API_KEY'.
    """
    def __init__(self, accepted_stocks_location):
        """
        Store the accepted stocks path and build the Finnhub client.
        Raises ValueError when the secret lookup returns a falsy key.
        """
        super().__init__(accepted_stocks_location)

        finnhub_key = userdata.get('FINNHUB_API_KEY')
        if not finnhub_key:
            raise ValueError("No FINNHUB_API_KEY found in google colab sercrets.")

        # client used for every company_news request
        self.finnhub_client = finnhub.Client(api_key=finnhub_key)

    def _load_news(self, symbol: str, start_date: str, end_date: str):
        """
        Fetch the company news for one accepted symbol between two dates.
        Returns [{'title': headline, 'published': datetime}, ...]
        """
        # validates the symbol against accepted_stocks first
        super()._load_news(symbol)

        raw_items = self.finnhub_client.company_news(symbol, _from=start_date, to=end_date)

        headlines = []
        for raw in raw_items:
            headlines.append({'title': raw['headline'], 'published': raw['datetime']})
        return headlines

    def get_news(self, start_date: str, end_date: str, symbol: Optional[str] = None) -> dict:
        """
        Return {symbol: [news_items]} for the given date range.
        With a symbol only that stock is fetched; otherwise every stock
        from the accepted stocks file is fetched in turn.
        """
        # (re)read the accepted stocks file on every call
        super()._load_stocks()

        if symbol:
            return {symbol: self._load_news(symbol, start_date, end_date)}

        progress = tqdm(
            self.accepted_stocks.keys(),
            desc=f'Getting finnhub company news from {start_date} to {end_date}',
        )
        collected = {}
        for ticker in progress:
            collected[ticker] = self._load_news(ticker, start_date, end_date)
            time.sleep(.75) # api limits
        return collected


#print(FinnhubNews('accepted_stocks.json').get_news(symbol='AAPL',start_date='2024-06-01', end_date='2024-06-02'))

In [70]:


class YahooNews(NewsSource):
    """
    News source backed by the Yahoo Finance headline RSS feed.
    """
    def _load_news(self, symbol: str):
        """
        Return a list of news for a given stock symbol.

        Returns [{'title': str, 'published': int unix timestamp}, ...].
        Entries with a missing or unparseable publication date are skipped
        so one malformed feed item cannot abort the whole run.
        The symbol check in the parent class raises for an unaccepted symbol.
        """
        # make sure we accept the symbol (raises otherwise, so no second check is needed)
        super()._load_news(symbol)

        # get the security name
        security = self.accepted_stocks[symbol]['security'].lower().strip()
        rss_url = f"https://finance.yahoo.com/rss/2.0/headline?s={symbol}"

        feed = feedparser.parse(rss_url)

        news_list = []
        for entry in feed.entries:
            # NOTE(review): this skips entries whose summary mentions the security
            # name, which looks inverted - confirm before relying on the filter.
            # .get() tolerates feed items that carry no summary at all.
            if security in entry.get('summary', '').lower(): # Todo: play around with not in vs in
                continue

            published_raw = entry.get('published')
            if not published_raw:
                continue
            try:
                published_ts = int(datetime.strptime(published_raw, '%a, %d %b %Y %H:%M:%S %z').timestamp())  # unix timestamp
            except ValueError:
                # date not in the expected RFC-822 style format; skip this entry
                continue

            news_list.append({'title': entry.title, 'published': published_ts})

        return news_list # [{title: 'title', published: 'published'}]

    def get_news(self, symbol: Optional[str] = None) -> dict:
        """
        Get news for all data in accepted_stocks.json
        If you pass in a symbol it will only fetch news for that specific symbol
        Return format: {symbol: [news_items]}
        """
        # (re)read the accepted stocks file on every call
        self._load_stocks()

        if symbol:
            return {symbol: self._load_news(symbol)}

        news_data = {}
        # separate loop name so the 'symbol' argument is not overwritten
        for stock_symbol in tqdm(self.accepted_stocks.keys(), total=len(self.accepted_stocks.keys()), desc='Getting recent yahoo news'):
            news_data[stock_symbol] = self._load_news(stock_symbol)
        return news_data




In [60]:

class StockNewsManager:
    """
    Reads and writes per-symbol news lists as '<symbol>.json' files
    inside one directory.
    """
    def __init__(self, directory='news'):
        """Remember the target directory and create it if it is missing."""
        self.directory = directory
        os.makedirs(self.directory, exist_ok=True)

    def _news_path(self, symbol: str) -> str:
        """Path of the JSON file that stores the news for ``symbol``."""
        return os.path.join(self.directory, f"{symbol}.json")

    def load_news(self, symbol: str):
        """Return the parsed contents of the symbol's JSON file."""
        with open(self._news_path(symbol), 'r') as handle:
            return json.load(handle)

    def save_news(self, symbol: str, news_list: list):
        """
        Append ``news_list`` to whatever is already stored for ``symbol``
        and write the combined list back to disk.
        """
        target = self._news_path(symbol)

        # start from the stored entries when the file is already there
        merged = []
        if os.path.exists(target):
            with open(target, 'r') as handle:
                merged = json.load(handle)

        merged.extend(news_list)

        # overwrite the file with the combined list
        with open(target, 'w') as handle:
            json.dump(merged, handle, indent=2)

        print(f"News saved successfully for {symbol}!")

In [61]:
def remove_bad_entries(news: dict):
    """
    Remove, in place, news entries that do not have a 'posting_price' key.

    news: {symbol: [news_items]}; each list object is kept and only its
    contents are filtered, so callers holding a reference see the result.
    """
    for news_list in news.values():
        # Rebuild via slice assignment instead of calling remove() while
        # iterating, which skipped the entry following every removed one.
        news_list[:] = [entry for entry in news_list if 'posting_price' in entry]


In [62]:

def load_stock_data(news: dict, accepted_stocks: dict):
    """
    Attach price data, in place, to the news entries of each stock symbol.

    news: {symbol: [{'title': ..., 'published': unix timestamp}, ...]}
    accepted_stocks: {symbol: metadata}; only its keys are used.

    Every entry that can be priced gains 'posting_price' (open),
    'close_price' and 'percent_change' for its publication date (UTC).
    Entries whose price download or lookup raises are removed from
    ``news[symbol]``. Nothing is returned or written to disk.
    """

    day_forecast = 7
    rounding_precision = 8

    # Get the price data for each symbol that has news
    for symbol in tqdm(accepted_stocks.keys(), desc='Getting price data for stocks'):
        if symbol not in news:
            continue
        recorded_dates = {} # {date string: {'open':, 'close':}} already fetched for this symbol

        # Iterate over a snapshot: entries are removed from news[symbol] below, and
        # removing from the list being iterated skipped the entry after each removal.
        for news_source in list(news[symbol]): # {'title':, 'published':}
            # Get the dates for the stock data
            # We get current day and next because you cant input same day for both start and end
            news_obj = datetime.fromtimestamp(news_source['published'], tz=timezone.utc)
            start_date = news_obj.date().isoformat()
            end_date = (news_obj + timedelta(days=day_forecast)).date().isoformat() # offset by day_forecast the amount of days to store

            # Only download when this date has not been recorded yet
            if start_date not in recorded_dates:
                try:
                    yf_data = yf.download(tickers=symbol, start=start_date, end=end_date, progress=False)
                    for i in range(day_forecast):
                        # the next dates of the news
                        # NOTE(review): rows are read by position (iloc[i]) but stored under the
                        # calendar day news_obj + i; if the download omits non-trading days the
                        # key and the row may refer to different dates - confirm.
                        current_date = (news_obj + timedelta(days=i)).date().isoformat()

                        # Save the stock data for that date
                        # assumes each 'Open'/'Close' cell exposes .values (one value per ticker) - TODO confirm
                        recorded_dates[current_date] = {
                            'open': round(yf_data.iloc[i]['Open'].values[0], rounding_precision),
                            'close': round(yf_data.iloc[i]['Close'].values[-1], rounding_precision)
                            }

                        # if the next day is in the recorded_dates, break
                        # (once i == 1 has stored that key this is always true, so at most
                        # two rows are recorded per download)
                        if (news_obj + timedelta(days=1)).date().isoformat() in recorded_dates:
                            break

                except Exception as e: # index exception and api limit exception
                    #print(f"No stock data found for {symbol} on {start_date}, Error: ", e)
                    news[symbol].remove(news_source)
                    continue

            # Calculate the variables for that day
            posting_price = recorded_dates[start_date]['open']
            close_price = recorded_dates[start_date]['close']
            percent_change = round(((close_price - posting_price) / posting_price) * 100, rounding_precision)

            # Add the stock data to the news source
            news_source['posting_price'] = posting_price
            news_source['close_price'] = close_price
            news_source['percent_change'] = percent_change
            # NOTE(review): this pause runs for every priced entry, including ones served
            # from recorded_dates without a download - left as is to keep request pacing.
            time.sleep(.1) # api limits

In [76]:

def load_news(start_date: str, end_date: str, current_file_path: str,
              accepted_stocks_filename: str = 'accepted_stocks2.json'):
    """
    Fetch news from every source, attach price data and save it per symbol.

    start_date, end_date: date range passed to the Finnhub source
        (the Yahoo source takes no date range).
    current_file_path: folder that contains the accepted stocks file; the
        'news' output folder is created inside it.
    accepted_stocks_filename: name of the accepted stocks JSON file inside
        ``current_file_path``.
    """
    # os.path.join works with or without a trailing separator on the folder
    stocks_path = os.path.join(current_file_path, accepted_stocks_filename)

    # initialize news sources (names chosen so the finnhub module is not shadowed)
    yahoo_source = YahooNews(stocks_path)
    finnhub_source = FinnhubNews(stocks_path)

    # initialize file manager for news data
    manager = StockNewsManager(os.path.join(current_file_path, 'news'))

    # Get title and published from sources
    finnhub_news = finnhub_source.get_news(start_date=start_date, end_date=end_date)
    yahoo_news = yahoo_source.get_news() # {symbol: [news_items]}

    # Add more news sources here
    all_news = [yahoo_news, finnhub_news]

    # attach the stock prices, then drop entries that could not be priced
    for news in all_news:
        load_stock_data(news=news, accepted_stocks=yahoo_source.get_accepted_stocks())
        remove_bad_entries(news=news)

    # Save the news data to the file (only after every source was processed)
    for news_data in all_news:
        for symbol, news_list in tqdm(news_data.items(), desc='Witing price data to file'):
            manager.save_news(symbol, news_list)



In [77]:
# Folder prefix handed to load_news as current_file_path. Keep the trailing '/'
# so paths built from this prefix resolve inside the folder.
file_location='/content/drive/MyDrive/Colab Notebooks/' # news is written to a 'news' folder here; load_news also reads accepted_stocks2.json from this folder
# Date range forwarded to the Finnhub source; the Yahoo source is called without dates.
start_date = '2024-04-01'
end_date = '2024-04-30'
# Fetches news from both sources, attaches price data and saves one JSON file per symbol.
load_news(start_date=start_date, end_date=end_date, current_file_path=file_location)

Getting finnhub company news from 2024-04-01 to 2024-04-30: 100%|██████████| 9/9 [00:09<00:00,  1.06s/it]
Getting recent yahoo news: 100%|██████████| 9/9 [00:01<00:00,  6.90it/s]
Getting price data for stocks: 100%|██████████| 9/9 [00:10<00:00,  1.15s/it]
Getting price data for stocks: 100%|██████████| 9/9 [02:13<00:00, 14.79s/it]
Witing price data to file: 100%|██████████| 9/9 [00:00<00:00, 86.23it/s]


News saved successfully for MMM!
News saved successfully for AOS!
News saved successfully for ABT!
News saved successfully for ABBV!
News saved successfully for ACN!
News saved successfully for ADBE!
News saved successfully for AMD!
News saved successfully for AES!
News saved successfully for AFL!


Witing price data to file:   0%|          | 0/9 [00:00<?, ?it/s]

News saved successfully for MMM!
News saved successfully for AOS!
News saved successfully for ABT!
News saved successfully for ABBV!
News saved successfully for ACN!
News saved successfully for ADBE!
News saved successfully for AMD!
News saved successfully for AES!
News saved successfully for AFL!


Witing price data to file: 100%|██████████| 9/9 [00:00<00:00, 87.23it/s]
