In [1]:
import boto3
import json
import requests

from datetime import datetime
from decimal import Decimal

In [2]:
# Load helper functions:
# (1) Getting API Key from AWS Secrets Manager
# (2) Getting timestamps for UTC and EST/EDT.
%run aws_secret_manager.py
%run time_helper

In [3]:
def get_yahoo_stock_news(symbol: str, api_keys: dict, timeout: float = 30.0) -> dict:
    """Fetch Yahoo Finance news for one ticker through the RapidAPI endpoint.

    Args:
        symbol: Ticker symbol; sent as the ``category`` query parameter.
        api_keys: Mapping that provides ``yahoo_api_host`` and
            ``yahoo_api_key``; both are sent as RapidAPI request headers.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` may wait on an unresponsive server forever.

    Returns:
        The JSON response body decoded into a Python dictionary.

    Raises:
        KeyError: If ``api_keys`` lacks one of the two required entries.
        requests.exceptions.HTTPError: If the API answers with a 4xx/5xx status.
        requests.exceptions.Timeout: If the server does not answer in time.
        json.JSONDecodeError: If the response body is not valid JSON.
    """
    url = "https://apidojo-yahoo-finance-v1.p.rapidapi.com/stock/get-news"

    querystring = {"region": "US", "category": str(symbol)}

    headers = {
        'x-rapidapi-host': str(api_keys['yahoo_api_host']),
        'x-rapidapi-key': str(api_keys['yahoo_api_key']),
    }

    response = requests.get(url, headers=headers, params=querystring, timeout=timeout)

    # Fail here with the HTTP status instead of letting an error payload
    # (bad key, quota exceeded, ...) reach callers that index into the result.
    response.raise_for_status()

    return json.loads(response.text)

## API helper functions usage examples:

This notebook contains examples for:

1. Loading and storing just one news article for a given stock.
2. A loop that repeats it for all stocks and for 50 news articles each.

### Get the API keys from AWS Secrets Manager

Make sure that the AWS access keys are configured, either through the AWS CLI or as environment variables.

In [4]:
api_keys = get_secrets(secret_name='news_api')

Getting yahoo news data for a given stock.

### (1) Fetch news related to Facebook (FB) with the api keys.

The `get_yahoo_stock_news` function should return the JSON string as a python dictionary.

In [5]:
stock_symbols = ['FB', 'AMZN', 'AAPL', 'NFLX', 'GOOG']

In [6]:
current_symbol = stock_symbols[0]
current_api_source = 'YAHOO'

In [7]:
r_dict = get_yahoo_stock_news(current_symbol, api_keys)

In [8]:
r_dict['items']['result'][0].keys()

dict_keys(['uuid', 'title', 'link', 'summary', 'publisher', 'author', 'type', 'entities', 'offnet', 'content', 'streams', 'ignore_main_image', 'published_at', 'main_image', 'is_magazine', 'reference_id'])

### Getting the current time in eastern timezone and utc.

In [9]:
# This is for testing, actual group should come from Airflow
# Builds the analysis-window label (e.g. "2020-07-15_Hour=16") from the
# timestamp returned by the get_time_stamp helper.
# NOTE(review): datetime.fromtimestamp() without a tz argument renders the value
# in the kernel's local timezone — confirm this yields the intended Eastern hour.
ts = get_time_stamp(timezone_str='US/Eastern')
analysis_group = datetime.fromtimestamp(ts).strftime("%Y-%m-%d_Hour=%H")

In [10]:
success_utc_ts = get_time_stamp(timezone_str='Universal')
success_utc_str = get_time_string(timezone_str='Universal')
success_e_ts = get_time_stamp(timezone_str='US/Eastern')

### Create data tuple/dictionary

Data to be loaded and their type:
* [string] Ticker Symbol (t_symb)
* [float/Decimal (DynamoDB only stores Decimal when using boto3)] API Success Timestamp [UTC] (api_success_utc_ts)
* [string] API Success Time string [UTC] (api_success_utc_str)
* [float/Decimal] API Success Timestamp [Eastern Time] (api_success_e_ts)
* [string] Analysis Window Group (analysis_window)
* [string] News Source API (source_api)
* [string] News Link (news_link)
* [float/Decimal] News Timestamp (news_timestamp)
* [string] News Provider (news_publisher)
* [string] News Title (news_title)
* [string] News Content (news_content)

In [None]:
# TODO Add nullable name to 3rd arg
# Draft schema for the row layout listed above; this cell has no execution count.
# NOTE(review): StructField, StructType, StringType and FloatType are not imported
# in the cells shown here — presumably pyspark.sql.types; confirm before running.
# NOTE(review): every entry except the second is named "t_symb", which looks like
# a copy-paste placeholder; the real column names still need to be filled in.
fields = [StructField("t_symb", StringType(), True), 
          StructField("api_success_time", StringType(), True),
          StructField("t_symb", FloatType(), True),
          StructField("t_symb", StringType(), True), 
          StructField("t_symb", StringType(), nullable=True), 
          StructField("t_symb", StringType(), nullable=True), 
          StructField("t_symb", StringType(), nullable=True), 
          StructField("t_symb", StringType(), nullable=True), 
          StructField("t_symb", StringType(), nullable=True), 
          StructField("t_symb", StringType(), nullable=True), 
          StructField("t_symb", StringType(), nullable=True), 
         ]
schema = StructType(fields)

Keys to be stored:
* (analysis_window) [string]: Analysis Window Group 
* (symb_id_source) [string]: Symbol_ID_Source, a sort key for DynamoDB. ID usually ranges from 1 to 50 and it indicates the i'th item in results
* (source_api) [string]: Current API Source for News (e.g. Yahoo Finance News)
* (t_symb) [string]: Ticker Symbol 
* API Success Timestamp [UTC] (api_success_utc_ts)
* API Success Timestamp [Eastern Time] (api_success_e_ts)
* API Success Time string [UTC] (api_success_utc_str)
* News Source API (api)
* News Link (news_link)
* News Timestamp (news_timestamp)
* News Provider (news_publisher)
* News Title (news_title)
* News Content (news_content)


        'analysis_window': analysis_group, # Partition Key
        'symb_id_source': f'{current_symbol}_{item_counter_str}_api={current_api_source}', # Sort Key
        'source_api': current_api_source,
        't_symb':current_symbol,
        'api_success_utc_ts': Decimal(str(success_utc_ts)),
        'api_success_e_ts': Decimal(str(success_e_ts)),
        'api_success_utc_str': success_utc_str,
        'news_link': yahoo_link,
        'news_timestamp': Decimal(str(news_timestamp)),
        'news_publisher': news_publisher,
        'news_title': news_title,
        'news_content': news_content

In [11]:
item_iterator = 2

In [12]:
r_dict['items']['result'][item_iterator]['link']

'https://finance.yahoo.com/news/stone-prosecutor-quit-case-protest-184754889.html'

In [13]:
# Pull the fields we store out of the selected article in a single unpacking;
# the order of the names matches the order of the keys below.
yahoo_link, news_timestamp, news_publisher, news_title, news_content = (
    r_dict['items']['result'][item_iterator][field]
    for field in ('link', 'published_at', 'publisher', 'title', 'content')
)

In [14]:
item_counter = item_iterator + 1

In [15]:
# The partition key and sort key together make up the primary key that uniquely identifies each row.
# DynamoDB uses the partition key to break the data into chunks for storage;
# since we usually grab all the data in the same analysis window, this should be more efficient.
# Numeric values are wrapped as Decimal(str(...)) because boto3 stores DynamoDB numbers as Decimal.
data = {
    'analysis_window': analysis_group, # Partition Key
    'symb_id_source': f'{current_symbol}_{item_counter}_api={current_api_source}', # Sort Key
    'source_api': current_api_source,
    't_symb':current_symbol,
    'api_success_utc_ts': Decimal(str(success_utc_ts)),
    'api_success_e_ts': Decimal(str(success_e_ts)),
    'api_success_utc_str': success_utc_str,
    'news_link': yahoo_link,
    'news_timestamp': Decimal(str(news_timestamp)),
    'news_publisher': news_publisher,
    'news_title': news_title,
    'news_content': news_content
}

In [19]:
data

{'analysis_window': '2020-07-15_Hour=16',
 'symb_id_source': 'FB_3_api=YAHOO',
 't_symb': 'FB',
 'api_success_utc_ts': Decimal('1594855675.832861'),
 'api_success_utc_str': '2020-07-15 23:27:55 UTC+0000',
 'news_link': 'https://finance.yahoo.com/news/stone-prosecutor-quit-case-protest-184754889.html',
 'news_timestamp': Decimal('1594838874'),
 'news_publisher': 'Bloomberg',
 'news_title': 'Roger Stone Prosecutor Who Quit Case in Protest Is Joining Facebook',
 'news_content': '<p>(Bloomberg) -- A prosecutor on the trial team that won Roger Stone’s conviction is leaving the Justice Department following Trump administration interventions that effectively negated Stone’s prison sentence, according to people familiar with the matter.</p>\n<p>Michael Marando, who delivered part of the closing argument in Stone’s trial, will join Facebook Inc., where he will set policy on the site’s content, the people said.</p>\n<p>A spokeswoman for the U.S. attorney’s office in Washington, where Marando has

In [17]:
def put_news_data(news_data: dict, table_name: str='test_data') -> dict:
    """Store a single item in a DynamoDB table.

    Args:
        news_data: The item to write; passed unchanged as ``Item`` to ``put_item``.
        table_name: Name of the target DynamoDB table.

    Returns:
        The response returned by the table's ``put_item`` call.
    """
    target_table = boto3.resource('dynamodb').Table(table_name)
    return target_table.put_item(Item=news_data)

In [20]:
response = put_news_data(data,'test_data')

### (2) Fetch 50 news articles each for all FAANG stocks and load them into DynamoDB

This is basically the code above in a loop; we'll put it in a function to use with AWS Lambda later on.

In [20]:
# This is for testing, actual group should come from Airflow
# Recomputes the analysis-window label (format "%Y-%m-%d_Hour=%H") that is
# passed to fetch_FAANG_yahoo_news further down.
# NOTE(review): datetime.fromtimestamp() without a tz argument renders the value
# in the kernel's local timezone — confirm this yields the intended Eastern hour.
ts = get_time_stamp(timezone_str='US/Eastern')
analysis_group = datetime.fromtimestamp(ts).strftime("%Y-%m-%d_Hour=%H")

In [21]:
def fetch_FAANG_yahoo_news(analysis_group: str, table_name: str = 'test_data') -> list:
    """Fetch Yahoo news for every FAANG ticker and store each article in DynamoDB.

    For each symbol the news API is called once, and every article in the
    response is written as its own row through ``put_news_data``.

    Args:
        analysis_group: Analysis-window label; stored as the partition key
            (``analysis_window``) of every row.
        table_name: Name of the DynamoDB table the rows are written to.

    Returns:
        The HTTP status codes of all ``put_item`` calls, in write order.

    Raises:
        KeyError: If an API response lacks ``items``/``result`` or an article
            lacks one of the stored fields.
    """
    stock_symbols = ['FB', 'AMZN', 'AAPL', 'NFLX', 'GOOG']
    current_api_source = 'YAHOO'
    api_keys = get_secrets(secret_name='news_api')

    responses = []
    for current_symbol in stock_symbols:
        r_dict = get_yahoo_stock_news(current_symbol, api_keys)
        articles = r_dict['items']['result']

        # Width used to left-pad the item counter with zeros (01, 02, ..., 50).
        # Should not really affect sentiment score but rows look nicer when sorting.
        digits = len(str(len(articles)))

        # Time stamp for API call
        # TODO handle API call failure with Apache Airflow
        success_utc_ts = get_time_stamp(timezone_str='Universal')
        success_utc_str = get_time_string(timezone_str='Universal')
        success_e_ts = get_time_stamp(timezone_str='US/Eastern')

        for item_counter, news_data in enumerate(articles, start=1):
            item_counter_str = str(item_counter).zfill(digits)

            data_row = {
                'analysis_window': analysis_group,  # Partition Key
                'symb_id_source': f'{current_symbol}_{item_counter_str}_api={current_api_source}',  # Sort Key
                'source_api': current_api_source,
                't_symb': current_symbol,
                # The UTC and Eastern timestamps need distinct keys: a key
                # repeated in a dict literal silently keeps only the last value.
                'api_success_utc_ts': Decimal(str(success_utc_ts)),
                'api_success_e_ts': Decimal(str(success_e_ts)),
                'api_success_utc_str': success_utc_str,
                'news_link': news_data['link'],
                'news_timestamp': Decimal(str(news_data['published_at'])),
                'news_publisher': news_data['publisher'],
                'news_title': news_data['title'],
                'news_content': news_data['content'],
            }

            response = put_news_data(data_row, table_name)
            response_code = response['ResponseMetadata']['HTTPStatusCode']
            responses.append(response_code)

            # TODO replace with logger for production code
            print(f'Response code = {response_code}, for storing {current_symbol} news, item # = {item_counter}, with {current_api_source} API')

    return responses
            

In [24]:
fetch_FAANG_yahoo_news(analysis_group)

Response code = 200, for storing FB news, item # = 1, with YAHOO API
Response code = 200, for storing FB news, item # = 2, with YAHOO API
Response code = 200, for storing FB news, item # = 3, with YAHOO API
Response code = 200, for storing FB news, item # = 4, with YAHOO API
Response code = 200, for storing FB news, item # = 5, with YAHOO API
Response code = 200, for storing FB news, item # = 6, with YAHOO API
Response code = 200, for storing FB news, item # = 7, with YAHOO API
Response code = 200, for storing FB news, item # = 8, with YAHOO API
Response code = 200, for storing FB news, item # = 9, with YAHOO API
Response code = 200, for storing FB news, item # = 10, with YAHOO API
Response code = 200, for storing FB news, item # = 11, with YAHOO API
Response code = 200, for storing FB news, item # = 12, with YAHOO API
Response code = 200, for storing FB news, item # = 13, with YAHOO API
Response code = 200, for storing FB news, item # = 14, with YAHOO API
Response code = 200, for stor