In [2]:
import pandas as pd
import csv

df = pd.read_csv('TOTAL_US_STOCK_MARKET_CLEAN.csv')
df.head()

Unnamed: 0,index,ticker,name,finnhubIndustry,country,currency,exchange,ipo,marketCapitalization,marketCapClass,shareOutstanding
0,0,AACG,ATA Creativity Global,Diversified Consumer Services,CN,USD,NASDAQ NMS - GLOBAL MARKET,1/29/2008,27.29185,MICRO_CAP,56.626346
1,1,AAL,American Airlines Group Inc,Airlines,US,USD,NASDAQ NMS - GLOBAL MARKET,12/9/2013,7844.693,MID_CAP,422.894501
2,2,AAME,Atlantic American Corp,Insurance,US,USD,NASDAQ NMS - GLOBAL MARKET,1/16/1980,36.78906,MICRO_CAP,20.438366
3,3,AAOI,Applied Optoelectronics Inc,Communications,US,USD,NASDAQ NMS - GLOBAL MARKET,9/26/2013,197.1919,MICRO_CAP,20.339549
4,4,AAON,Aaon Inc,Building,US,USD,NASDAQ NMS - GLOBAL MARKET,1/3/1991,2980.886,MID_CAP,52.031532


# Segmentation
#### Creates a nested dictionary for each segment of the CSV
##### Market Cap -> Industry

In [8]:
# View the various categories
print('MARKET CAP: ', df.marketCapClass.unique())
print('INDUSTRIES: ', df.finnhubIndustry.unique())

MARKET CAP:  ['MICRO_CAP' 'MID_CAP' 'MEGA_CAP' 'SMALL_CAP' 'LARGE_CAP' 'marketCapClass']
INDUSTRIES:  ['Diversified Consumer Services' 'Airlines' 'Insurance' 'Communications'
 'Building' 'Technology' 'Logistics & Transportation'
 'Aerospace & Defense' 'Banking' 'Biotechnology' 'Health Care'
 'Pharmaceuticals' 'Semiconductors' 'Professional Services' 'Chemicals'
 'Life Sciences Tools & Services' 'Construction'
 'Hotels, Restaurants & Leisure' 'Electrical Equipment' 'Real Estate'
 'Financial Services' 'Machinery' 'Food Products' 'Telecommunication'
 'Energy ' 'Media' 'Retail' 'Leisure Products'
 'Textiles, Apparel & Luxury Goods' 'Commercial Services & Supplies'
 'Road & Rail' 'Utilities' 'Trading Companies & Distributors' 'Beverages'
 'Consumer products' 'Auto Components' 'Metals & Mining' 'Distributors'
 'Marine' 'Automobiles' 'Industrial Conglomerates' 'Paper & Forest'
 'Transportation Infrastructure' 'Packaging' 'finnhubIndustry' 'Tobacco']


In [71]:
d = {}
for marketCap in df.marketCapClass.unique():
    if marketCap == 'marketCapClass':
        continue
    d[marketCap] = {}
    for industry in df.finnhubIndustry.unique():
        if industry == 'finnhubIndustry':
            continue
        d[marketCap][industry] = list()

In [72]:
for ticker, data in df.iterrows():
    # Throw away headers
    if data['marketCapClass'] == 'marketCapClass':
        continue
    if data['finnhubIndustry'] == 'finnhubIndustry':
        continue
    # Create dictionary
    d[data['marketCapClass']][data['finnhubIndustry']].append(data['ticker'])
d

{'MICRO_CAP': {'Diversified Consumer Services': ['AACG',
   'ASPU',
   'CLCT',
   'DCAR',
   'HYRE',
   'LINC',
   'REDU',
   'SIC',
   'TEDU',
   'WAFU',
   'XSPA',
   'ZVO',
   'DAO',
   'DL',
   'FEDU',
   'LAIX',
   'RYB',
   'STG',
   'STON',
   'UTI',
   'AMBO'],
  'Airlines': ['MESA'],
  'Insurance': ['AAME',
   'AFH',
   'CNFR',
   'FNHC',
   'GLRE',
   'HALL',
   'ICCH',
   'ITIC',
   'KINS',
   'MHLD',
   'NSEC',
   'OXBR',
   'PIH',
   'PPHI',
   'PTVCB',
   'TIPT',
   'UNAM',
   'VERY',
   'AMBC',
   'KFS'],
  'Communications': ['AAOI',
   'AVNW',
   'BOSC',
   'CAMP',
   'CLFD',
   'CLRO',
   'CMBM',
   'CRNT',
   'DZSI',
   'EMKR',
   'EXF.TO',
   'JCS',
   'KVHI',
   'LTRX',
   'OCC',
   'PCTI',
   'RDCM',
   'RESN',
   'SILC',
   'TCCO',
   'TESS',
   'UTSI',
   'VISL',
   'WSTL',
   'BDR',
   'BKTI',
   'NTIP'],
  'Building': ['CCCL', 'DRT.TO', 'JCTCF', 'TGEN', 'AFI', 'APT'],
  'Technology': ['AEYE',
   'ALJJ',
   'ALLT',
   'ALOT',
   'ALYA.TO',
   'AMRH',
   'ANY',
 

# Simfin Integration
#### Uses a nested dictionary for each segment of the CSV to create dataframes from Simfin data
##### Market Cap -> Industry

In [74]:
import numpy as np
import simfin as sf
from simfin.names import *

In [75]:
API_KEY = 'MbOGeJgi6qQjgYbb58oBVQDaObxEZzXg'

# SimFin data-directory.
sf.set_data_dir('~/simfin_data/')
# SimFin load API key or use free data.
sf.load_api_key('MbOGeJgi6qQjgYbb58oBVQDaObxEZzXg')

market = 'us'

# Add this date-offset to the fundamental data such as
# Income Statements etc., because the REPORT_DATE is not
# when it was actually made available to the public,
# which can be 1, 2 or even 3 months after the Report Date.
offset = pd.DateOffset(days=60)

# Refresh the fundamental datasets (Income Statements etc.)
# every 30 days.
refresh_days = 30

# Refresh the dataset with shareprices every 10 days.
refresh_days_shareprices = 10

hub = sf.StockHub(market=market, offset=offset,
                  refresh_days=refresh_days,
                  refresh_days_shareprices=refresh_days_shareprices)

df_fin_signals = hub.fin_signals(variant='daily')
df_growth_signals = hub.growth_signals(variant='daily')
df_val_signals = hub.val_signals(variant='daily')

# Combine the DataFrames.
dfs = [df_fin_signals, df_growth_signals, df_val_signals]
df_signals = pd.concat(dfs, axis=1)

# Remove all rows with only NaN values.
df = df_signals.dropna(how='all').reset_index(drop=True)

# List of the columns before removing any.
columns_before = df_signals.columns

# Threshold for the number of rows that must be NaN for each column.
thresh = 0.75 * len(df_signals.dropna(how='all'))

# Remove all columns which don't have sufficient data.
df_signals = df_signals.dropna(axis='columns', thresh=thresh)

# List of the columns after the removal.
columns_after = df_signals.columns

# Show the columns that were removed.
columns_before.difference(columns_after)

# Name of the new column for the returns.
TOTAL_RETURN_1_3Y = 'Total Return 1-3 Years'

# Calculate the mean log-returns for all 1-3 year periods.
df_returns_1_3y = \
    hub.mean_log_returns(name=TOTAL_RETURN_1_3Y,
                         future=True, annualized=True,
                         min_years=1, max_years=3)

dfs = [df_signals, df_returns_1_3y]
df_sig_rets = pd.concat(dfs, axis=1)

# Clip the signals and returns at their 5% and 95% quantiles.
# We do not set them to NaN because it would remove too much data.
df_sig_rets = sf.winsorize(df_sig_rets)

# Remove all rows with missing values (NaN)
# because scikit-learn cannot handle that.
df_sig_rets = df_sig_rets.dropna(how='any')

# Remove all tickers which have less than 200 data-rows.
df_sig_rets = df_sig_rets.groupby(TICKER) \
                .filter(lambda df: len(df)>200)



# List of all unique stock-tickers in the dataset.
tickers = df_sig_rets.reset_index()[TICKER].unique()

Dataset "us-income-ttm" on disk (20 days old).
- Loading from disk ... Done!
Dataset "us-balance-ttm" on disk (20 days old).
- Loading from disk ... Done!
Dataset "us-cashflow-ttm" on disk (20 days old).
- Loading from disk ... Done!
Dataset "us-shareprices-daily" on disk (5 days old).
- Loading from disk ... Done!
Cache-file 'fin_signals-2a38bb7d.pickle' on disk (0 days old).
- Loading from disk ... Done!
Dataset "us-income-quarterly" on disk (11 days old).
- Loading from disk ... Done!
Dataset "us-balance-quarterly" on disk (11 days old).
- Loading from disk ... Done!
Dataset "us-cashflow-quarterly" on disk (11 days old).
- Loading from disk ... Done!
Cache-file 'growth_signals-2a38bb7d.pickle' on disk (0 days old).
- Loading from disk ... Done!
Cache-file 'val_signals-739b68a6.pickle' on disk (0 days old).
- Loading from disk ... Done!
Cache-file 'mean_log_change-5cec82bd.pickle' on disk (0 days old).
- Loading from disk ... Done!
