In [1]:
import csv
import os

import pandas as pd

# Load the cleaned US stock universe from CSV and preview the first rows.
df = pd.read_csv(filepath_or_buffer='TOTAL_US_STOCK_MARKET_CLEAN.csv')
df.head(n=5)

Unnamed: 0,index,ticker,name,finnhubIndustry,country,currency,exchange,ipo,marketCapitalization,marketCapClass,shareOutstanding
0,0,AACG,ATA Creativity Global,Diversified Consumer Services,CN,USD,NASDAQ NMS - GLOBAL MARKET,1/29/2008,27.29185,MICRO_CAP,56.626346
1,1,AAL,American Airlines Group Inc,Airlines,US,USD,NASDAQ NMS - GLOBAL MARKET,12/9/2013,7844.693,MID_CAP,422.894501
2,2,AAME,Atlantic American Corp,Insurance,US,USD,NASDAQ NMS - GLOBAL MARKET,1/16/1980,36.78906,MICRO_CAP,20.438366
3,3,AAOI,Applied Optoelectronics Inc,Communications,US,USD,NASDAQ NMS - GLOBAL MARKET,9/26/2013,197.1919,MICRO_CAP,20.339549
4,4,AAON,Aaon Inc,Building,US,USD,NASDAQ NMS - GLOBAL MARKET,1/3/1991,2980.886,MID_CAP,52.031532


# Segmentation
#### Creates a nested dictionary of ticker lists, one list per segment of the CSV
##### Market Cap -> Industry

In [None]:
# Print the distinct values of the two columns used for segmentation.
for label, column in (('MARKET CAP: ', 'marketCapClass'),
                      ('INDUSTRIES: ', 'finnhubIndustry')):
    print(label, df[column].unique())

In [2]:
# Build the empty skeleton {market-cap class: {industry: []}}.
# Values equal to the column names themselves are skipped (the original
# treats them as stray header rows).
industry_names = [
    name for name in df.finnhubIndustry.unique()
    if name != 'finnhubIndustry'
]
d = {
    cap_class: {name: [] for name in industry_names}
    for cap_class in df.marketCapClass.unique()
    if cap_class != 'marketCapClass'
}

In [3]:
# Walk the three relevant columns in parallel and file each ticker under
# its market-cap class and industry.
rows = zip(df['marketCapClass'], df['finnhubIndustry'], df['ticker'])
for cap_class, industry_name, symbol in rows:
    # Throw away rows that are really repeated header lines
    is_header_row = (cap_class == 'marketCapClass'
                     or industry_name == 'finnhubIndustry')
    if not is_header_row:
        d[cap_class][industry_name].append(symbol)
d

{'MICRO_CAP': {'Diversified Consumer Services': ['AACG',
   'ASPU',
   'CLCT',
   'DCAR',
   'HYRE',
   'LINC',
   'REDU',
   'SIC',
   'TEDU',
   'WAFU',
   'XSPA',
   'ZVO',
   'DAO',
   'DL',
   'FEDU',
   'LAIX',
   'RYB',
   'STG',
   'STON',
   'UTI',
   'AMBO'],
  'Airlines': ['MESA'],
  'Insurance': ['AAME',
   'AFH',
   'CNFR',
   'FNHC',
   'GLRE',
   'HALL',
   'ICCH',
   'ITIC',
   'KINS',
   'MHLD',
   'NSEC',
   'OXBR',
   'PIH',
   'PPHI',
   'PTVCB',
   'TIPT',
   'UNAM',
   'VERY',
   'AMBC',
   'KFS'],
  'Communications': ['AAOI',
   'AVNW',
   'BOSC',
   'CAMP',
   'CLFD',
   'CLRO',
   'CMBM',
   'CRNT',
   'DZSI',
   'EMKR',
   'EXF.TO',
   'JCS',
   'KVHI',
   'LTRX',
   'OCC',
   'PCTI',
   'RDCM',
   'RESN',
   'SILC',
   'TCCO',
   'TESS',
   'UTSI',
   'VISL',
   'WSTL',
   'BDR',
   'BKTI',
   'NTIP'],
  'Building': ['CCCL', 'DRT.TO', 'JCTCF', 'TGEN', 'AFI', 'APT'],
  'Technology': ['AEYE',
   'ALJJ',
   'ALLT',
   'ALOT',
   'ALYA.TO',
   'AMRH',
   'ANY',
 

# SimFin Integration
#### Uses the nested dictionary built from the CSV to split the SimFin data into per-segment dataframes
##### Market Cap -> Industry

In [4]:
import numpy as np
import simfin as sf
from simfin.names import *

In [5]:
# SimFin API key. It is read from the SIMFIN_API_KEY environment variable so
# that no credential is stored in the notebook; when the variable is not set
# the 'free' key for SimFin's free data is used instead.
# NOTE(review): the key that was previously hardcoded in this cell has been
# committed to the notebook and should be revoked/rotated.
API_KEY = os.environ.get('SIMFIN_API_KEY', 'free')

# SimFin data-directory.
sf.set_data_dir('~/simfin_data/')
# Register the key directly. sf.load_api_key() takes the *path* of a text
# file that contains the key, so it must not be handed the key itself.
sf.set_api_key(api_key=API_KEY)

market = 'us'

# Add this date-offset to the fundamental data such as
# Income Statements etc., because the REPORT_DATE is not
# when it was actually made available to the public,
# which can be 1, 2 or even 3 months after the Report Date.
offset = pd.DateOffset(days=60)

# Refresh the fundamental datasets (Income Statements etc.)
# every 30 days.
refresh_days = 30

# Refresh the dataset with shareprices every 10 days.
refresh_days_shareprices = 10

hub = sf.StockHub(market=market, offset=offset,
                  refresh_days=refresh_days,
                  refresh_days_shareprices=refresh_days_shareprices)

df_fin_signals = hub.fin_signals(variant='daily')
df_growth_signals = hub.growth_signals(variant='daily')
df_val_signals = hub.val_signals(variant='daily')

# Combine the DataFrames.
dfs = [df_fin_signals, df_growth_signals, df_val_signals]
df_signals = pd.concat(dfs, axis=1)

# Remove all rows with only NaN values.
# NOTE(review): this rebinds `df`, which earlier held the frame loaded from
# the CSV. The cells that build `d` from `df` therefore only work when they
# are run before this cell. The name is kept for backward compatibility.
df = df_signals.dropna(how='all').reset_index(drop=True)

# List of the columns before removing any.
columns_before = df_signals.columns

# Minimum number of non-NaN values a column must contain to be kept:
# 75% of the rows that are not entirely NaN. `df` has exactly those rows,
# so its length is reused instead of computing the dropna a second time.
thresh = 0.75 * len(df)

# Remove all columns which don't have sufficient data.
df_signals = df_signals.dropna(axis='columns', thresh=thresh)

# List of the columns after the removal.
columns_after = df_signals.columns

# The columns that were removed. Kept in a variable because an expression
# in the middle of a cell is not displayed.
columns_removed = columns_before.difference(columns_after)

# Name of the new column for the returns.
TOTAL_RETURN_1_3Y = 'Total Return 1-3 Years'

# Calculate the mean log-returns for all 1-3 year periods.
df_returns_1_3y = \
    hub.mean_log_returns(name=TOTAL_RETURN_1_3Y,
                         future=True, annualized=True,
                         min_years=1, max_years=3)

dfs = [df_signals, df_returns_1_3y]
df_sig_rets = pd.concat(dfs, axis=1)

# Clip the signals and returns at their 5% and 95% quantiles.
# We do not set them to NaN because it would remove too much data.
df_sig_rets = sf.winsorize(df_sig_rets)

# Remove all rows with missing values (NaN)
# because scikit-learn cannot handle that.
df_sig_rets = df_sig_rets.dropna(how='any')

# Remove all tickers which have less than 200 data-rows.
# (The lambda argument is named `group` so it does not shadow `df`.)
df_sig_rets = df_sig_rets.groupby(TICKER) \
                .filter(lambda group: len(group) > 200)

# List of all unique stock-tickers in the dataset.
tickers = df_sig_rets.reset_index()[TICKER].unique()

Dataset "us-income-ttm" on disk (12 days old).
- Loading from disk ... Done!
Dataset "us-balance-ttm" on disk (12 days old).
- Loading from disk ... Done!
Dataset "us-cashflow-ttm" on disk (12 days old).
- Loading from disk ... Done!
Dataset "us-shareprices-daily" on disk (1 days old).
- Loading from disk ... Done!
Cache-file 'fin_signals-2a38bb7d.pickle' on disk (1 days old).
- Loading from disk ... Done!
Dataset "us-income-quarterly" on disk (12 days old).
- Loading from disk ... Done!
Dataset "us-balance-quarterly" on disk (12 days old).
- Loading from disk ... Done!
Dataset "us-cashflow-quarterly" on disk (12 days old).
- Loading from disk ... Done!
Cache-file 'growth_signals-2a38bb7d.pickle' on disk (1 days old).
- Loading from disk ... Done!
Cache-file 'val_signals-739b68a6.pickle' on disk (1 days old).
- Loading from disk ... Done!
Cache-file 'mean_log_change-5cec82bd.pickle' on disk (1 days old).
- Loading from disk ... Done!


## Cap Segmented Simfin

In [6]:
# Collect the distinct tickers found in the first index level of the
# SimFin signals/returns frame.
ticker_level = df_sig_rets.index.get_level_values(0)
simfin_tickers = ticker_level.unique().tolist()
simfin_tickers

['A',
 'AAMC',
 'AAN',
 'AAOI',
 'AAON',
 'AAP',
 'AAWW',
 'ABAX',
 'ABBV',
 'ABC',
 'ABCD',
 'ABG',
 'ABM',
 'ABMC',
 'ABT',
 'ACAD',
 'ACAT',
 'ACCO',
 'ACET',
 'ACHC',
 'ACLS',
 'ACLZ',
 'ACN',
 'ACRX',
 'ADBE',
 'ADI',
 'ADM',
 'ADP',
 'ADS',
 'ADSK',
 'AE',
 'AEE',
 'AEHR',
 'AEP',
 'AEPI',
 'AES',
 'AGCO',
 'AGIO',
 'AGN_old',
 'AHPI',
 'AHS',
 'AIMC',
 'AIR',
 'AIRI',
 'AIT',
 'AJG',
 'AJRD',
 'AKAM',
 'AKRX',
 'AKS',
 'AL',
 'ALB',
 'ALE',
 'ALG',
 'ALGT',
 'ALJ',
 'ALK',
 'ALKS',
 'ALLE',
 'ALNY',
 'ALSK',
 'ALSN',
 'ALV',
 'ALXN',
 'AMAT',
 'AMCX',
 'AMD',
 'AME',
 'AMED',
 'AMG',
 'AMGN',
 'AMH',
 'AMKR',
 'AMOT',
 'AMP',
 'AMRK',
 'AMSC',
 'AMSWA',
 'AMT',
 'AMTD',
 'AMWD',
 'AMZN',
 'AN',
 'ANDE',
 'ANDV',
 'ANET',
 'ANF',
 'ANIK',
 'ANSS',
 'AOBC',
 'AOS',
 'AOSL',
 'APA',
 'APC',
 'APD',
 'APH',
 'APOG',
 'ARCB',
 'ARCW',
 'ARE',
 'AREX',
 'ARG',
 'ARQL',
 'ARRS',
 'ARRY',
 'ARW',
 'ASCMA',
 'ASNA',
 'ASPN',
 'ATHN',
 'ATI',
 'ATR',
 'ATRO',
 'ATVI',
 'ATW',
 'AVA',
 'AV

In [10]:
# Start each market-cap bucket as its own, independent empty ticker list.
simfin_mega, simfin_large, simfin_mid, simfin_small, simfin_micro = (
    [], [], [], [], []
)

In [11]:
# Assign every SimFin ticker to a cap bucket according to its market-cap on
# the earliest date present for that ticker in df_sig_rets.
# Each entry is (inclusive lower bound, destination list), ordered from the
# largest bound to the smallest so the first match is the right bucket.
cap_buckets = (
    (2.0e11, simfin_mega),
    (1.0e10, simfin_large),
    (2.0e9, simfin_mid),
    (3.0e08, simfin_small),
    (5.0e07, simfin_micro),
)

# Empty the buckets first: the lists are created in another cell, so
# re-running this cell would otherwise append every ticker a second time.
for lower_bound, bucket in cap_buckets:
    bucket.clear()

for ticker in simfin_tickers:
    init_date = df_sig_rets.loc[[ticker]].index.get_level_values(1).min()
    # Select row and column in a single .loc call rather than chaining.
    init_cap = df_sig_rets.loc[(ticker, init_date), 'Market-Cap']
    for lower_bound, bucket in cap_buckets:
        if init_cap >= lower_bound:
            bucket.append(ticker)
            break
    # Tickers whose initial market-cap is below the smallest bound (5.0e07)
    # match no bucket and are left out, as before.

In [20]:
# Create one DataFrame per market-cap bucket.
def _select_tickers(ticker_list):
    """Return the rows of df_sig_rets belonging to the given tickers.

    An empty ticker list yields an empty frame with the same columns, so
    every segment variable below is always defined. Previously an empty
    bucket left its variable undefined, or holding a stale value from an
    earlier run of the cell.
    """
    if ticker_list:
        return df_sig_rets.loc[ticker_list]
    return df_sig_rets.iloc[0:0]


df_sig_rets_mega = _select_tickers(simfin_mega)
df_sig_rets_large = _select_tickers(simfin_large)
df_sig_rets_mid = _select_tickers(simfin_mid)
df_sig_rets_small = _select_tickers(simfin_small)
df_sig_rets_micro = _select_tickers(simfin_micro)

In [24]:
# Display the micro-cap segment of the SimFin signals/returns frame.
df_sig_rets_micro

Unnamed: 0_level_0,Unnamed: 1_level_0,(Dividends + Share Buyback) / FCF,Asset Turnover,CapEx / (Depr + Amor),Current Ratio,Dividends / FCF,Gross Profit Margin,Interest Coverage,Log Revenue,Net Profit Margin,Quick Ratio,...,FCF Yield,Market-Cap,P/Book,P/Cash,P/E,P/FCF,P/NCAV,P/NetNet,P/Sales,Total Return 1-3 Years
Ticker,Date,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
AAOI,2014-12-01,2.188082,0.700560,3.347826,2.838472,0.0,0.328195,24.650350,8.071153,0.025976,1.769900,...,-0.186366,1.930558e+08,1.696135,4.290891,63.090117,-5.365790,3.410698,7.612834,1.638816,0.442427
AAOI,2014-12-02,2.188082,0.700560,3.347826,2.838472,0.0,0.328195,24.650350,8.071153,0.025976,1.769900,...,-0.191398,1.879803e+08,1.651543,4.178083,61.431467,-5.224722,3.321031,7.412691,1.595731,0.457723
AAOI,2014-12-03,2.188082,0.700560,3.347826,2.838472,0.0,0.328195,24.650350,8.071153,0.025976,1.769900,...,-0.188569,1.908000e+08,1.676316,4.240754,62.352939,-5.303093,3.370846,7.523882,1.619667,0.450279
AAOI,2014-12-04,2.188082,0.700560,3.347826,2.838472,0.0,0.328195,24.650350,8.071153,0.025976,1.769900,...,-0.191589,1.877923e+08,1.649892,4.173904,61.370036,-5.219498,3.317709,7.405279,1.594135,0.459732
AAOI,2014-12-05,2.188082,0.700560,3.347826,2.838472,0.0,0.328195,24.650350,8.071153,0.025976,1.769900,...,-0.188755,1.906120e+08,1.674665,4.236576,62.291508,-5.297869,3.367525,7.516469,1.618071,0.452311
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
ZGNX,2016-06-16,0.000000,0.122127,0.209838,7.868594,0.0,0.174262,-17.744338,7.502072,0.275968,6.495311,...,-0.244200,2.007367e+08,1.154595,1.809046,5.198278,-3.116642,3.990392,7.209463,6.317640,0.482756
ZGNX,2016-06-17,0.000000,0.122127,0.209838,7.868594,0.0,0.174262,-17.744338,7.502072,0.275968,6.495311,...,-0.244200,1.888615e+08,1.086291,1.809046,4.890757,-2.932267,3.754328,6.782965,5.943900,0.482756
ZGNX,2016-06-20,0.000000,0.122127,0.209838,7.868594,0.0,0.174262,-17.744338,7.502072,0.275968,6.495311,...,-0.244200,1.886331e+08,1.084978,1.809046,4.884843,-2.928722,3.749788,6.774763,5.936713,0.482756
ZGNX,2016-06-21,0.000000,0.122127,0.209838,7.868594,0.0,0.174262,-17.744338,7.502072,0.275968,6.495311,...,-0.244200,1.874913e+08,1.078410,1.809046,4.855274,-2.910993,3.727090,6.733753,5.900776,0.482756


## Sector Segmented Simfin

In [None]:
# Build a dictionary that maps each industry (from the Finnhub data in `d`)
# to the tickers that also exist in the SimFin data, merged across all
# market-cap classes.
simfin_ticker_set = set(simfin_tickers)  # constant-time membership tests
d_sf = {}
for cap_class, industries in d.items():
    for industry_name, industry_tickers in industries.items():
        # An industry with no tickers in this cap class does not create a
        # key here; it only appears if another cap class has tickers for it.
        if not industry_tickers:
            continue
        d_sf.setdefault(industry_name, []).extend(
            symbol for symbol in industry_tickers
            if symbol in simfin_ticker_set
        )
d_sf

In [None]:
# Map every sector to the slice of the SimFin frame holding its tickers.
d_simfin_sectors = {
    sector: df_sig_rets.loc[sector_tickers]
    for sector, sector_tickers in d_sf.items()
}

In [None]:
# Show the ticker list stored under the 'Tobacco' key of d_sf.
d_sf['Tobacco']

In [None]:
# Select the rows of df_sig_rets for the 'Tobacco' tickers directly.
df_sig_rets.loc[d_sf['Tobacco']]

In [None]:
# Show the frame stored under 'Tobacco' in the per-sector dictionary.
d_simfin_sectors['Tobacco']