In [1]:
from datetime import timedelta
import numpy as np
import pandas as pd
import yfinance as yf
import json

In [2]:
# Read the ticker table and flatten every cell (row-major) into one flat Python list.
tickers_frame = pd.read_csv('Russell3000Tickers.csv')
ticker_list = tickers_frame.values.ravel().tolist()

In [3]:
# Map each ticker symbol to its yfinance Ticker handle, in ticker_list order.
stock_dict = {ticker: yf.Ticker(ticker) for ticker in ticker_list}

In [4]:
# aapl = stock_dict['AAPL']

In [5]:
# aapl.balance_sheet

In [6]:
# hist = aapl.history(period='16mo', interval='1d')

In [7]:
# info = aapl.info

In [8]:
# filter_names = ['sector', 'fullTimeEmployees', 'longBusinessSummary', 'state', 'country', 'industry', 'sharesOutstanding', 'sharesShort']
# filtered_info = { name: info[name] for name in filter_names}
# stock_info = { 'aapl' : filtered_info }
# a = pd.DataFrame.from_dict(stock_info, orient='index')

In [9]:
# qb = aapl.quarterly_balancesheet.T
# qf = aapl.quarterly_financials.T
# qc = aapl.quarterly_cashflow.T

In [10]:
# combined = pd.concat((qb, qc, qf), axis=1)

In [11]:
# historical_prices = {}
# for stock, obj in stock_dict.items():
#     historical_prices[stock] = obj.history(period='2y', interval='1d')

In [12]:
# quarter_data = {}
# for stock, obj in stock_dict.items():
#     qb = obj.quarterly_balancesheet.T
#     qf = obj.quarterly_financials.T
#     qc = obj.quarterly_cashflow.T
#     combined_df = pd.concat((qb, qc, qf), axis=1)
#     quarter_data[stock] = combined_df

In [13]:
# firm_description = {}
# filter_names = ['sector', 'longBusinessSummary', 'country', 'industry', 'sharesOutstanding', 'sharesShort']
# for stock, obj in stock_dict.items():
#     condensed_info = { name: obj.info[name] for name in filter_names }
#     dict_to_df = { stock : condensed_info }
#     firm_description[stock] = pd.DataFrame.from_dict(dict_to_df)

In [14]:
# Build three per-ticker lookups that the later cells combine into the dataset:
#   historical_prices : daily price/volume history (16 months)
#   quarter_data      : quarterly balance sheet, cash flow and financials,
#                       transposed and concatenated column-wise
#   firm_description  : one-row frame holding the fields listed in filter_names
historical_prices = {}
quarter_data = {}
firm_description = {}

filter_names = ['sector', 'longBusinessSummary', 'country', 'industry', 'sharesOutstanding', 'sharesShort']

for stock, obj in stock_dict.items():
    try:
        # share price & volume info
        price_history = obj.history(period='16mo', interval='1d')

        # quarterly info
        qb = obj.quarterly_balancesheet.T
        qf = obj.quarterly_financials.T
        qc = obj.quarterly_cashflow.T
        combined_df = pd.concat((qb, qc, qf), axis=1)

        # firm constant info; look `info` up a single time rather than once per field
        info = obj.info
        condensed_info = { name: info[name] for name in filter_names }
        description_df = pd.DataFrame.from_records(condensed_info, index=[0])
    except Exception as err:
        # `Exception` rather than a bare `except:` so that a kernel interrupt
        # (KeyboardInterrupt) still stops this long loop; the reason is printed
        # so that missing fields can be told apart from download failures.
        print('\n***problem loading stock : {}***\n'.format(stock))
        print('    reason: {!r}'.format(err))
        continue

    # Store results only after every step succeeded, so the three dictionaries
    # always end up with exactly the same set of keys.
    historical_prices[stock] = price_history
    quarter_data[stock] = combined_df
    firm_description[stock] = description_df
    print('stock {} loaded'.format(stock))

stock A loaded
stock AA loaded
stock AAL loaded
stock AAOI loaded
stock AAP loaded
stock AAPL loaded
stock AAT loaded
stock AAWW loaded
stock AAXN loaded
stock ABBV loaded
stock ABEO loaded
stock ABG loaded
stock ABM loaded
stock ABMD loaded
stock ABT loaded
stock ABTX loaded
stock AC loaded
stock ACA loaded
stock ACAD loaded
stock ACBI loaded
stock ACC loaded
stock ACCO loaded
stock ACHC loaded
stock ACIA loaded
stock ACIW loaded
stock ACM loaded
stock ACNB loaded
stock ACOR loaded
stock ACRE loaded
stock ACRS loaded
stock ACRX loaded
stock ACTG loaded
stock ADBE loaded
stock ADC loaded
stock ADI loaded
stock ADM loaded
stock ADMA loaded
stock ADMS loaded
stock ADNT loaded
stock ADP loaded
stock ADPT loaded
stock ADRO loaded
stock ADS loaded
stock ADSK loaded
stock ADSW loaded
stock ADT loaded
stock ADTN loaded
stock ADUS loaded
stock ADVM loaded
stock AEE loaded
stock AEGN loaded
stock AEIS loaded
stock AEL loaded
stock AEP loaded
stock AERI loaded
stock AES loaded
stock AFG loaded
s

In [26]:
# Keep only stocks that have a firm description: prune the price and the
# quarterly lookups of every key that firm_description lacks.
for partial_dict in (historical_prices, quarter_data):
    # Collect first, delete afterwards: a dict must not change size while it is iterated.
    remove_stocks = [name for name in partial_dict if name not in firm_description]
    for stock_name in remove_stocks:
        del partial_dict[stock_name]

In [15]:
# aapl_qtr = quarter_data['AAPL']
#
# quarter_dt = aapl_qtr.index.values
# quarter_dt.sort()
# prices_at_quarter = historical_prices['AAPL'].loc[quarter_dt]

In [40]:
# Quarter dates for price retrieval.
# For every stock, pick the price row dated on each quarterly report date, or on
# the closest earlier date present in the price history when there is no row
# for that exact date. Stocks whose history is too short or empty are listed in
# inadequate_stocks and get no entry in stock_quarter_dates.
MIN_PRICE_LEAD = np.timedelta64(90, 'D')  # history must start at least this long before the first quarter

stock_quarter_dates = {}
inadequate_stocks = []
for stock, quarter_df in quarter_data.items():
    # np.sort returns a sorted copy. Calling .sort() on `index.values` sorts in
    # place and may write through to quarter_df's index, relabelling its rows
    # without reordering them.
    quarter_dts = np.sort(quarter_df.index.values)

    # Chronological view of the prices; mergesort is stable, so rows sharing a
    # date keep their relative order.
    prices = historical_prices[stock].sort_index(kind='mergesort')
    price_dts = prices.index.values

    # delete stock if there is nothing to match, or if the oldest price date
    # doesn't go back far enough
    if (len(quarter_dts) == 0 or len(price_dts) == 0
            or price_dts[0] > quarter_dts[0] - MIN_PRICE_LEAD):
        inadequate_stocks.append(stock)
        continue

    # side='right' minus one is the position of the last price date <= each
    # quarter date (the last row when a date is duplicated). The guard above
    # ensures the first price date precedes every quarter date, so each
    # position is >= 0.
    positions = np.searchsorted(price_dts, quarter_dts, side='right') - 1

    # One positional selection replaces growing the frame a row at a time;
    # rows come out in ascending quarter-date order, indexed by the price date used.
    stock_quarter_dates[stock] = prices.iloc[positions]

In [28]:
# Drop every stock flagged as inadequate from all per-stock lookups.
# pop(..., None) instead of `del` keeps this cell safe to re-run and tolerant of
# a stock that is listed twice or already missing. stock_quarter_dates is pruned
# too, so the consolidation step never iterates a stock whose other data is gone.
for stock in inadequate_stocks:
    for per_stock_lookup in (quarter_data, historical_prices, firm_description, stock_quarter_dates):
        per_stock_lookup.pop(stock, None)

In [18]:
# column_list = np.concatenate((stock_quarter_dates['AAPL'].columns.values, quarter_data['AAPL'].columns.values, firm_description['AAPL'].columns.values))
# stock_df = pd.DataFrame(index=['Q1', 'Q2', 'Q3', 'Q4'], columns=column_list)
# quarter_price_df = stock_quarter_dates['AAPL']
# for i, index in enumerate(stock_df.index.values):
#         append_df = quarter_price_df.iloc[i]
#         append_df = append_df.append(quarter_data['AAPL'].iloc[i])
#         append_df = append_df.append(firm_description['AAPL'].iloc[0])
#         stock_df.loc[index] = append_df

In [77]:
# consolidate into singular df
# For each stock build a frame with one row per labelled quarter holding the
# price date, the price row, the quarterly statement row and the constant firm
# description, then store it as a nested dict keyed by quarter label.
QUARTER_LABELS = ['2019_Q2', '2019_Q3', '2019_Q4', '2020_Q1']

stock_consolidated_dict = {}

for stock, quarter_price_df in stock_quarter_dates.items():
    # Oldest quarter first, so row i of the statements lines up with row i of
    # quarter_price_df (which is built in ascending quarter-date order).
    quarter_rows = quarter_data[stock].sort_index()
    firm_row = firm_description[stock].iloc[0]

    # A stock with fewer rows than labels cannot fill the frame; skip it
    # instead of failing on an out-of-range positional lookup.
    if min(len(quarter_price_df), len(quarter_rows)) < len(QUARTER_LABELS):
        print('skipping {}: fewer than {} quarters available'.format(stock, len(QUARTER_LABELS)))
        continue

    # get columns to use
    column_list = np.concatenate((np.array(['Date']), quarter_price_df.columns.values, quarter_rows.columns.values, firm_description[stock].columns.values))
    stock_df = pd.DataFrame(index=QUARTER_LABELS, columns=column_list)
    for i, index in enumerate(stock_df.index.values):
        date_series = pd.Series(quarter_price_df.index[i], index=['Date']).astype('str')
        # pd.concat replaces the chained Series.append calls (removed in
        # pandas 2.0); the pieces are joined in the same order as column_list.
        stock_df.loc[index] = pd.concat([date_series, quarter_price_df.iloc[i], quarter_rows.iloc[i], firm_row])
    stock_consolidated_dict[stock] = stock_df.to_dict(orient='index')

  


In [70]:
# # sanity checks
# stock_check = ticker_list[-400]
# stock_consolidated_dict[stock_check]
#

In [78]:
class NpEncoder(json.JSONEncoder):
    """JSON encoder that converts NumPy values to built-in Python equivalents.

    Pass it as ``cls=NpEncoder`` to ``json.dump`` / ``json.dumps``.
    """

    def default(self, obj):
        """Return a JSON-serialisable stand-in for ``obj``.

        NumPy booleans, integers and floats become ``bool``, ``int`` and
        ``float``; ``np.datetime64`` becomes its string form; arrays become
        (nested) lists. Any other type is passed to the base class, which
        raises ``TypeError``.
        """
        # np.bool_ is neither np.integer nor np.floating and the json module
        # rejects it, so it needs its own branch.
        if isinstance(obj, np.bool_):
            return bool(obj)
        if isinstance(obj, np.integer):
            return int(obj)
        if isinstance(obj, np.floating):
            return float(obj)
        if isinstance(obj, np.datetime64):
            return str(obj)
        if isinstance(obj, np.ndarray):
            return obj.tolist()
        return super().default(obj)


In [79]:
# Serialise the consolidated per-stock records to JSON in the working directory;
# NpEncoder converts the NumPy values the json module cannot encode itself.
with open('stock_specific_info.json', 'w') as out_file:
    json.dump(stock_consolidated_dict, out_file, cls=NpEncoder)

In [80]:
# Turn each price history into a plain {date string: {column: value}} mapping.
historical_price_dict = {}
for stock, price_df in historical_prices.items():
    # Where a date occurs more than once, keep only its last row; the
    # de-duplicated frame is also written back to historical_prices.
    is_repeat = price_df.index.duplicated(keep='last')
    deduped_df = price_df.loc[~is_repeat]
    historical_prices[stock] = deduped_df

    # JSON object keys must be strings, so each index label is stringified.
    rows_by_date = deduped_df.to_dict(orient='index')
    historical_price_dict[stock] = { str(key): value for key, value in rows_by_date.items() }

In [81]:
# Write the per-stock price mappings to JSON in the working directory.
with open('stock_historic_prices.json', 'w') as out_file:
    json.dump(historical_price_dict, out_file, cls=NpEncoder)