## Import packages

In [8]:
import requests
import pandas as pd
import numpy as np

# python mongo libraries
import pymongo

# Reading properties
from jproperties import Properties

from datetime import datetime

## Constants

In [9]:
# Alpha Vantage endpoint and the API function queried by this notebook
FUNCTION = 'INCOME_STATEMENT'
BASE_URL = 'https://www.alphavantage.co/query?'

# Income statement fields that arrive as strings and must be converted to numbers
FIELDS_TO_INT = [
    'grossProfit',
    'totalRevenue',
    'costOfRevenue',
    'costofGoodsAndServicesSold',
    'operatingIncome',
    'sellingGeneralAndAdministrative',
    'researchAndDevelopment',
    'operatingExpenses',
    'investmentIncomeNet',
    'netInterestIncome',
    'interestIncome',
    'interestExpense',
    'nonInterestIncome',
    'otherNonOperatingIncome',
    'depreciation',
    'depreciationAndAmortization',
    'incomeBeforeTax',
    'incomeTaxExpense',
    'interestAndDebtExpense',
    'netIncomeFromContinuingOperations',
    'comprehensiveIncomeNetOfTax',
    'ebit',
    'ebitda',
    'netIncome',
]

## Load properties

In [10]:
# Initialize settings from the property file
configs = Properties()

with open('config/insert_income_statements.properties', 'rb') as config_file:
    configs.load(config_file)

# TICKERS is a comma-separated list; strip surrounding whitespace and drop
# empty entries so values such as "AMD, INTC," still yield clean symbols.
TICKERS = [
    symbol.strip()
    for symbol in configs.get('TICKERS').data.split(',')
    if symbol.strip()
]
API_KEY = configs.get('ALPHAV_API_KEY').data
MONGO_URI = configs.get('MONGO_URI').data
DB = configs.get('DB').data
ANNUAL_COLLECTION = configs.get('ANNUAL_COLLECTION').data
QUARTERLY_COLLECTION = configs.get('QUARTERLY_COLLECTION').data

## Get income statements from Alpha Vantage

In [11]:
# Seconds to wait for the API before giving up, so a stalled connection
# cannot hang the notebook indefinitely
REQUEST_TIMEOUT_SECONDS = 30

# Holds income statements data for each symbol
statements = {}

# Get the income statements for each stock ticker
for ticker in TICKERS:
    # Passing the query through `params` lets requests URL-encode the values
    response = requests.get(
        BASE_URL,
        params={'function': FUNCTION, 'symbol': ticker, 'apikey': API_KEY},
        timeout=REQUEST_TIMEOUT_SECONDS,
    )
    # Surface HTTP-level failures here rather than as a JSON decoding error
    response.raise_for_status()
    payload = response.json()

    # Later cells read both report lists from the payload; fail early with the
    # payload content if either one is missing.
    missing_keys = [key for key in ('annualReports', 'quarterlyReports') if key not in payload]
    if missing_keys:
        raise ValueError(f'Response for {ticker} is missing {missing_keys}: {payload}')

    statements[ticker] = payload

## Utility method to create a DF
#### __Note:__ it uses the global variable _statements_

In [12]:
def create_ticker_df(ticker, report_type, statements_by_ticker=None, numeric_fields=None):
    """Build an income statement DataFrame for one ticker.

    Parameters
    ----------
    ticker : str
        Key into the statements mapping; also written to the ``ticker`` column.
    report_type : str
        ``'Q'`` for the ``quarterlyReports`` list or ``'A'`` for the
        ``annualReports`` list (case-insensitive).
    statements_by_ticker : dict, optional
        Mapping of ticker to the raw response dictionary. Defaults to the
        notebook-level ``statements`` variable.
    numeric_fields : list of str, optional
        Columns to convert to numbers, scale to thousands and compute a
        change ratio for. Defaults to the notebook-level ``FIELDS_TO_INT``.

    Returns
    -------
    pandas.DataFrame
        One row per report, sorted by ``fiscalDateEnding`` ascending, with a
        ``<field>_pctChange`` column added for every numeric field.

    Raises
    ------
    ValueError
        If ``report_type`` is neither ``'Q'`` nor ``'A'``.
    KeyError
        If the ticker, the report list or an expected column is missing.
    """
    report_keys = {'Q': 'quarterlyReports', 'A': 'annualReports'}
    report_key = report_keys.get(report_type.upper())
    if report_key is None:
        # ValueError subclasses Exception, so existing handlers keep working
        raise ValueError('Unknown report type, valid types are Q or A')

    # Fall back to the notebook globals so two-argument calls behave as before
    if statements_by_ticker is None:
        statements_by_ticker = statements
    if numeric_fields is None:
        numeric_fields = FIELDS_TO_INT
    numeric_fields = list(numeric_fields)

    df_ticker = pd.DataFrame(statements_by_ticker[ticker][report_key])

    # Add a column for the ticker
    df_ticker['ticker'] = ticker

    for field in numeric_fields:
        # Non-numeric values are converted to NaN
        df_ticker[field] = pd.to_numeric(df_ticker[field], errors='coerce')

    # Dates are strings in the raw format
    df_ticker['fiscalDateEnding'] = pd.to_datetime(df_ticker['fiscalDateEnding'])

    # Oldest date first, so each row's change is measured against the previous period
    df_ticker = df_ticker.sort_values('fiscalDateEnding')

    # Store as thousands (floor division)
    df_ticker[numeric_fields] = df_ticker[numeric_fields] // 1000

    # Add the change ratio against each numeric column
    for field in numeric_fields:
        # Divide by the absolute previous value rather than using pct_change, which
        # doesn't handle negative values correctly - https://github.com/pandas-dev/pandas/issues/40911
        previous_abs = df_ticker[field].shift().abs()
        df_ticker[field + '_pctChange'] = df_ticker[field].diff() / previous_abs

    return df_ticker

## Create DataFrame containing Annual and Quarterly income statements

In [13]:
# Collect the per-ticker frames first and concatenate once: calling pd.concat
# inside the loop re-copies the accumulated frame on every iteration.
annual_frames = []
quarterly_frames = []

# Loop through statements dictionary which holds the income statements for each ticker symbol
for ticker in statements:
    annual_frames.append(create_ticker_df(ticker, 'A'))
    quarterly_frames.append(create_ticker_df(ticker, 'Q'))

# pd.concat raises on an empty list, so fall back to an empty frame when no tickers were loaded
df_annual = pd.concat(annual_frames) if annual_frames else pd.DataFrame()
df_quarterly = pd.concat(quarterly_frames) if quarterly_frames else pd.DataFrame()

## Reset the index to ticker + fiscal date

In [14]:
# Index both frames by (fiscalDateEnding, ticker). drop=False keeps the two
# fields as regular columns as well, because they are needed in the collection.
df_annual = df_annual.set_index(['fiscalDateEnding', 'ticker'], drop=False)
df_quarterly = df_quarterly.set_index(['fiscalDateEnding', 'ticker'], drop=False)
df_quarterly.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,fiscalDateEnding,reportedCurrency,grossProfit,totalRevenue,costOfRevenue,costofGoodsAndServicesSold,operatingIncome,sellingGeneralAndAdministrative,researchAndDevelopment,operatingExpenses,...,depreciation_pctChange,depreciationAndAmortization_pctChange,incomeBeforeTax_pctChange,incomeTaxExpense_pctChange,interestAndDebtExpense_pctChange,netIncomeFromContinuingOperations_pctChange,comprehensiveIncomeNetOfTax_pctChange,ebit_pctChange,ebitda_pctChange,netIncome_pctChange
fiscalDateEnding,ticker,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
2018-09-30,INTC,2018-09-30,USD,12360000,18829000,8386000,6803000,7349000,1605000,3428000,5011000,...,,,,,,,,,,
2018-12-31,INTC,2018-12-31,USD,11227000,18495000,9000000,7430000,6224000,1720000,3433000,5003000,...,0.11465,0.0,-0.211005,-0.408602,0.651376,-0.188028,-0.16604,-0.204799,-0.203397,-0.188028
2019-03-31,INTC,2019-03-31,USD,9089000,15733000,8555000,6972000,4174000,1583000,3332000,4915000,...,0.061429,0.0,-0.193079,0.302273,-0.233333,-0.235034,-0.223328,-0.187478,-0.185867,-0.235034
2019-06-30,INTC,2019-06-30,USD,9878000,16337000,8450000,6627000,4617000,1639000,3438000,5261000,...,-0.035442,0.0,0.038927,-0.048866,-0.021739,0.051585,0.056832,0.03714,0.036748,0.051585
2019-09-30,INTC,2019-09-30,USD,11295000,19029000,9535000,7895000,6447000,1536000,3208000,4848000,...,0.054884,0.0,0.422312,0.337615,-0.207407,0.433357,0.347826,0.404816,0.400693,0.433357


## Insert Annual and Quarterly Income Statements to MongoDB

In [15]:
# Open the mongo client for the duration of the inserts; the with-block closes it afterwards
with pymongo.MongoClient(MONGO_URI) as client:
    # The database
    db = client[DB]

    # NOTE: insert_many appends documents, so re-running this cell inserts the
    # same records again unless the collection enforces uniqueness.
    for collection_name, frame in (
        (ANNUAL_COLLECTION, df_annual),
        (QUARTERLY_COLLECTION, df_quarterly),
    ):
        records = frame.to_dict('records')
        # insert_many rejects an empty document list, so skip empty frames
        if records:
            db[collection_name].insert_many(records)

## Sanity check #1
### The first record pctChange should be NaN

In [16]:
# Show the change ratio next to the value it is derived from
df_quarterly.loc[:, ['netIncome_pctChange', 'netIncome']].head()

Unnamed: 0_level_0,Unnamed: 1_level_0,netIncome_pctChange,netIncome
fiscalDateEnding,ticker,Unnamed: 2_level_1,Unnamed: 3_level_1
2018-09-30,INTC,,6398000
2018-12-31,INTC,-0.188028,5195000
2019-03-31,INTC,-0.235034,3974000
2019-06-30,INTC,0.051585,4179000
2019-09-30,INTC,0.433357,5990000


## Sanity check #2
### With another ticker, the first record for that ticker should have NaN in the pctChange column

In [17]:
# Select the AMD rows and the two columns of interest in a single .loc call
df_quarterly.loc[df_quarterly['ticker'] == 'AMD', ['netIncome_pctChange', 'netIncome']].head()

Unnamed: 0_level_0,Unnamed: 1_level_0,netIncome_pctChange,netIncome
fiscalDateEnding,ticker,Unnamed: 2_level_1,Unnamed: 3_level_1
2018-06-30,AMD,,116000
2018-09-30,AMD,-0.12069,102000
2018-12-31,AMD,-0.627451,38000
2019-03-31,AMD,-0.578947,16000
2019-06-30,AMD,1.1875,35000
