Since the Yahoo Finance news API from API Dojo only returns 50 articles per call, we split the articles from each call into 7 groups that represent 7 different days. An article's group is its counter (which counts from 1 up to 50) modulo 7; the result is the number of days subtracted from the starting timestamp.

In [1]:
import boto3
import json
import requests

from datetime import datetime
from decimal import Decimal

In [2]:
# Load helper functions with IPython's %run magic so the names they define
# become available in this notebook:
# (1) Getting API Key from AWS Secrets Manager (presumably aws_secret_manager.py)
# (2) Getting timestamps for UTC and EST/EDT (presumably time_helper).
%run aws_secret_manager.py
%run time_helper

In [3]:
# Stand-in for testing only; the actual group should come from Airflow.
ts = get_time_stamp(timezone_str='US/Eastern')
# Label the analysis window as YYYY-MM-DD_Hour=HH.
analysis_group = format(datetime.fromtimestamp(ts), "%Y-%m-%d_Hour=%H")

In [4]:
def get_yahoo_stock_news(symbol: str, api_keys: dict, timeout: float = 30.0) -> dict:
    """
    Fetch the news feed for one stock symbol from the API Dojo Yahoo Finance
    endpoint hosted on RapidAPI.

    Args:
        symbol: Ticker symbol; sent as the ``category`` query parameter.
        api_keys: Mapping that must contain ``yahoo_api_host`` and
            ``yahoo_api_key``; they are sent as the ``x-rapidapi-host`` and
            ``x-rapidapi-key`` request headers.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` may block indefinitely on a stalled connection.

    Returns:
        The response body decoded from JSON.

    Raises:
        KeyError: If ``api_keys`` lacks one of the required entries.
        requests.exceptions.Timeout: If the server does not answer in time.
        requests.exceptions.HTTPError: If the server answers with a 4xx/5xx status.
        json.JSONDecodeError: If the response body is not valid JSON.
    """
    url = "https://apidojo-yahoo-finance-v1.p.rapidapi.com/stock/get-news"

    querystring = {"region": "US", "category": str(symbol)}

    headers = {
        'x-rapidapi-host': str(api_keys['yahoo_api_host']),
        'x-rapidapi-key': str(api_keys['yahoo_api_key']),
    }

    response = requests.get(url, headers=headers, params=querystring, timeout=timeout)

    # Fail loudly on an error status instead of parsing an error payload as news data.
    response.raise_for_status()

    return json.loads(response.text)

In [5]:
def put_news_data(news_data: dict, table_name: str='test_data') -> dict:
    """
    Store a single item in a DynamoDB table.

    Args:
        news_data: The item to write; passed unchanged as ``Item`` to ``put_item``.
        table_name: Name of the target DynamoDB table.

    Returns:
        The response returned by the table's ``put_item`` call.
    """
    target_table = boto3.resource('dynamodb').Table(table_name)
    return target_table.put_item(Item=news_data)

In [6]:
def test_data_7_days(starting_ts, stock_symbols=None, table_name='test_data'):
    """
    Splits the articles from a single API call of a given stock into 7 groups using modulo 7.
    The groups are identified by the analysis window string which has the format: YYYY-MM-DD_Hour=HH
    The adjustment to each group is just subtracting X number of days from the starting timestamp
    where X is item_counter modulo 7.

    Args:
        starting_ts: Timestamp handed to subtract_X_days as the reference point
            (the caller in this notebook obtains it from get_time_stamp).
        stock_symbols: Iterable of ticker symbols to fetch news for.
            Defaults to ['FB', 'AMZN', 'AAPL', 'NFLX', 'GOOG'].
        table_name: DynamoDB table the rows are written to.

    Returns:
        A list with the HTTP status code of every put_item call, in the order
        the rows were written.
    """

    if stock_symbols is None:
        stock_symbols = ['FB', 'AMZN', 'AAPL', 'NFLX', 'GOOG']
    api_keys = get_secrets(secret_name='news_api')
    current_api_source = 'YAHOO'

    responses = []
    for current_symbol in stock_symbols:
        r_dict = get_yahoo_stock_news(current_symbol, api_keys)
        articles = r_dict['items']['result']

        # Find out how many articles were fetched and count number of digits for padding.
        # Left pad the item counter with zeros: 01, 02, ..., 12, ..., 50
        # Should not really affect sentiment score but rows look nicer when sorting.
        digits = len(str(len(articles)))  # Somewhat lazy method

        # Time stamp for API call
        # TODO handle API call failure with Apache Airflow
        success_utc_ts = get_time_stamp(timezone_str='Universal')
        success_utc_str = get_time_string(timezone_str='Universal')
        success_e_str = get_time_string(timezone_str='US/Eastern')

        # The counter starts at 1, so the remainders cycle 1, 2, ..., 6, 0.
        for item_counter, news_data in enumerate(articles, start=1):

            # Calculate current analysis group string
            days_to_subtract = item_counter % 7
            new_datetime = subtract_X_days(starting_ts, days_to_subtract)
            new_datetime = convert_datetime_timezone(new_datetime, timezone_str='US/Eastern')
            analysis_group = new_datetime.strftime("%Y-%m-%d_Hour=%H")
            analysis_date = analysis_group[:10]  # Get YYYY-MM-DD which is 10 characters

            item_counter_str = str(item_counter).rjust(digits, '0')

            data_row = {
                'analysis_window': analysis_group,  # Partition Key
                'symb_counter_source': f'{current_symbol}_{item_counter_str}_api={current_api_source}',  # Sort Key
                'analysis_date': analysis_date,
                'source_api': current_api_source,
                't_symb': current_symbol,
                # Decimal(str(...)) because the boto3 DynamoDB resource rejects Python floats.
                'api_success_utc_ts': Decimal(str(success_utc_ts)),
                'api_success_utc_str': success_utc_str,
                'api_success_e_str': success_e_str,
                'news_link': news_data['link'],
                'news_timestamp': Decimal(str(news_data['published_at'])),
                'news_publisher': news_data['publisher'],
                'news_title': news_data['title'],
                'news_content': news_data['content'],
            }

            response = put_news_data(data_row, table_name)
            response_code = response['ResponseMetadata']['HTTPStatusCode']
            responses.append(response_code)

            # TODO replace with logger for production code
            print(f'Response code = {response_code}, for storing {current_symbol} news, item # = {item_counter}, with {current_api_source} API')

    # Previously collected but never returned; hand the status codes back to the caller.
    return responses
            

In [7]:
# Generate the test data, counting days back from a timestamp obtained for US/Eastern.
starting_ts = get_time_stamp(timezone_str='US/Eastern')
test_data_7_days(starting_ts)

Response code = 200, for storing FB news, item # = 1, with YAHOO API
Response code = 200, for storing FB news, item # = 2, with YAHOO API
Response code = 200, for storing FB news, item # = 3, with YAHOO API
Response code = 200, for storing FB news, item # = 4, with YAHOO API
Response code = 200, for storing FB news, item # = 5, with YAHOO API
Response code = 200, for storing FB news, item # = 6, with YAHOO API
Response code = 200, for storing FB news, item # = 7, with YAHOO API
Response code = 200, for storing FB news, item # = 8, with YAHOO API
Response code = 200, for storing FB news, item # = 9, with YAHOO API
Response code = 200, for storing FB news, item # = 10, with YAHOO API
Response code = 200, for storing FB news, item # = 11, with YAHOO API
Response code = 200, for storing FB news, item # = 12, with YAHOO API
Response code = 200, for storing FB news, item # = 13, with YAHOO API
Response code = 200, for storing FB news, item # = 14, with YAHOO API
Response code = 200, for stor