In [3]:
import pandas as pd
import numpy as np
import simfin as sf
from simfin.names import *

from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.tree import export_text
from sklearn.metrics import confusion_matrix
from sklearn.metrics import r2_score, accuracy_score

### Data Pulling (from SimFin) --> api_calls.py

In [3]:
API_KEY = 'MbOGeJgi6qQjgYbb58oBVQDaObxEZzXg'

# SimFin data-directory.
sf.set_data_dir('~/simfin_data/')
# SimFin load API key or use free data.
sf.load_api_key('MbOGeJgi6qQjgYbb58oBVQDaObxEZzXg')

In [4]:
market = 'us'

# Add this date-offset to the fundamental data such as
# Income Statements etc., because the REPORT_DATE is not
# when it was actually made available to the public,
# which can be 1, 2 or even 3 months after the Report Date.
offset = pd.DateOffset(days=60)

# Refresh the fundamental datasets (Income Statements etc.)
# every 30 days.
refresh_days = 30

# Refresh the dataset with shareprices every 10 days.
refresh_days_shareprices = 10

In [5]:
hub = sf.StockHub(market=market, offset=offset,
                  refresh_days=refresh_days,
                  refresh_days_shareprices=refresh_days_shareprices)

df_fin_signals = hub.fin_signals(variant='daily')
df_growth_signals = hub.growth_signals(variant='daily')
df_val_signals = hub.val_signals(variant='daily')

Dataset "us-income-ttm" on disk (11 days old).
- Loading from disk ... Done!
Dataset "us-balance-ttm" on disk (11 days old).
- Loading from disk ... Done!
Dataset "us-cashflow-ttm" on disk (11 days old).
- Loading from disk ... Done!
Dataset "us-shareprices-daily" on disk (0 days old).
- Loading from disk ... Done!
Cache-file 'fin_signals-2a38bb7d.pickle' on disk (0 days old).
- Loading from disk ... Done!
Dataset "us-income-quarterly" on disk (11 days old).
- Loading from disk ... Done!
Dataset "us-balance-quarterly" on disk (11 days old).
- Loading from disk ... Done!
Dataset "us-cashflow-quarterly" on disk (11 days old).
- Loading from disk ... Done!
Cache-file 'growth_signals-2a38bb7d.pickle' on disk (0 days old).
- Loading from disk ... Done!
Cache-file 'val_signals-739b68a6.pickle' on disk (0 days old).
- Loading from disk ... Done!


### Data Cleaning/Manipulation

In [6]:
# Combine the DataFrames.
dfs = [df_fin_signals, df_growth_signals, df_val_signals]
df_signals = pd.concat(dfs, axis=1)

# Remove all rows with only NaN values.
df = df_signals.dropna(how='all').reset_index(drop=True)

# List of the columns before removing any.
columns_before = df_signals.columns

# Threshold for the number of rows that must be NaN for each column.
thresh = 0.75 * len(df_signals.dropna(how='all'))

# Remove all columns which don't have sufficient data.
df_signals = df_signals.dropna(axis='columns', thresh=thresh)

# List of the columns after the removal.
columns_after = df_signals.columns

# Show the columns that were removed.
columns_before.difference(columns_after)

# Name of the new column for the returns.
TOTAL_RETURN_1_3Y = 'Total Return 1-3 Years'

# Calculate the mean log-returns for all 1-3 year periods.
df_returns_1_3y = \
    hub.mean_log_returns(name=TOTAL_RETURN_1_3Y,
                         future=True, annualized=True,
                         min_years=1, max_years=3)

dfs = [df_signals, df_returns_1_3y]
df_sig_rets = pd.concat(dfs, axis=1)

# Clip the signals and returns at their 5% and 95% quantiles.
# We do not set them to NaN because it would remove too much data.
df_sig_rets = sf.winsorize(df_sig_rets)

# Remove all rows with missing values (NaN)
# because scikit-learn cannot handle that.
df_sig_rets = df_sig_rets.dropna(how='any')

# Remove all tickers which have less than 200 data-rows.
df_sig_rets = df_sig_rets.groupby(TICKER) \
                .filter(lambda df: len(df)>200)



# List of all unique stock-tickers in the dataset.
tickers = df_sig_rets.reset_index()[TICKER].unique()

Cache-file 'mean_log_change-5cec82bd.pickle' on disk (0 days old).
- Loading from disk ... Done!


In [7]:
df_sig_rets

Unnamed: 0_level_0,Unnamed: 1_level_0,(Dividends + Share Buyback) / FCF,Asset Turnover,CapEx / (Depr + Amor),Current Ratio,Dividends / FCF,Gross Profit Margin,Interest Coverage,Log Revenue,Net Profit Margin,Quick Ratio,...,FCF Yield,Market-Cap,P/Book,P/Cash,P/E,P/FCF,P/NCAV,P/NetNet,P/Sales,Total Return 1-3 Years
Ticker,Date,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
A,2015-04-01,0.439528,0.540763,0.500000,3.937500,0.489676,0.486719,4.659091,9.609167,0.102558,2.945043,...,0.024232,1.398982e+10,3.299486,6.605203,33.548729,41.267906,35.621788,-31.997432,3.440684,0.131031
A,2015-04-02,0.439528,0.540763,0.500000,3.937500,0.489676,0.486719,4.659091,9.609167,0.102558,2.945043,...,0.023852,1.421290e+10,3.352099,6.710529,34.083693,41.925959,35.621788,-31.997432,3.495548,0.122948
A,2015-04-06,0.439528,0.540763,0.500000,3.937500,0.489676,0.486719,4.659091,9.609167,0.102558,2.945043,...,0.023920,1.417234e+10,3.342533,6.691379,33.986427,41.806313,35.621788,-31.997432,3.485573,0.125254
A,2015-04-07,0.439528,0.540763,0.500000,3.937500,0.489676,0.486719,4.659091,9.609167,0.102558,2.945043,...,0.023632,1.434472e+10,3.383189,6.772767,34.399808,42.314808,35.621788,-31.997432,3.527969,0.119174
A,2015-04-08,0.439528,0.540763,0.500000,3.937500,0.489676,0.486719,4.659091,9.609167,0.102558,2.945043,...,0.023733,1.428388e+10,3.368840,6.744042,34.253909,42.135339,35.621788,-31.997432,3.513005,0.122120
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
low,2016-06-16,1.305434,1.889401,0.718336,1.006576,0.262623,0.348207,9.005435,10.441302,0.043098,0.152747,...,0.050179,5.166619e+10,9.487840,101.994284,28.523146,19.928631,-5.564319,-3.996474,1.229304,0.094778
low,2016-06-17,1.305434,1.889401,0.718336,1.006576,0.262623,0.348207,9.005435,10.441302,0.043098,0.152747,...,0.050019,5.166619e+10,9.518184,102.320478,28.614368,19.992366,-5.582115,-4.009255,1.233236,0.093223
low,2016-06-20,1.305434,1.889401,0.718336,1.006576,0.262623,0.348207,9.005435,10.441302,0.043098,0.152747,...,0.049911,5.166619e+10,9.538818,102.542289,28.676398,20.035705,-5.594216,-4.017947,1.235909,0.092387
low,2016-06-21,1.305434,1.889401,0.718336,1.006576,0.262623,0.348207,9.005435,10.441302,0.043098,0.152747,...,0.050077,5.166619e+10,9.507260,102.203048,28.581528,19.969421,-5.575708,-4.004654,1.231821,0.094529


In [4]:
finhub_data = pd.read_csv('..\TOTAL_US_STOCK_MARKET_CLEAN.csv')
finhub_tickers = finhub_data['ticker']
finhub_sectors = finhub_data['finnhubIndustry']
finhub_data.columns

Index(['index', 'ticker', 'name', 'finnhubIndustry', 'country', 'currency',
       'exchange', 'ipo', 'marketCapitalization', 'marketCapClass',
       'shareOutstanding'],
      dtype='object')

In [5]:
finhub_tickers

0       AACG
1        AAL
2       AAME
3       AAOI
4       AAON
        ... 
4359    XTNT
4360    XXII
4361    YCBD
4362    ZDGE
4363     ZOM
Name: ticker, Length: 4364, dtype: object

In [None]:
finhub_data

In [None]:
df_sig_rets['Sector'] = ''

In [13]:
temp = df_sig_rets.loc[['AAMC']]
temp.iloc[:, df_sig_rets.columns.get_loc('Sector')] = 'test'
temp

KeyError: 'Sector'

In [15]:
fin_ticker_list = finhub_tickers.tolist()

In [14]:
df_sig_rets.index.unique(level=0)

Index(['A', 'AAMC', 'AAN', 'AAOI', 'AAON', 'AAP', 'AAWW', 'ABAX', 'ABBV',
       'ABC',
       ...
       'XYL', 'YGYI', 'ZAYO', 'ZBH', 'ZEUS', 'ZGNX', 'ZNGA', 'ZTS', 'ZUMZ',
       'low'],
      dtype='object', name='Ticker', length=1045)

In [None]:
df_sig_rets = df_sig_rets[df_sig_rets.index.isin(fin_ticker_list, level=0)]

In [None]:
df_sig_rets.to_csv('simfin_total.csv')

In [None]:
 for i in df_sig_rets.index.unique(level=0):
        if i in fin_ticker_list:
            print(i)
            fintemp = finhub_data.loc[finhub_data['ticker'] == i]
            df_sig_rets.loc[[i]].iloc[:, df_sig_rets.columns.get_loc('Sector')] = fintemp['finnhubIndustry']   
df_sig_rets

In [None]:
'A' in fin_ticker_list

In [None]:
df_sig_rets[(df_sig_rets['Market-Cap'] >= 3.0e8) & (df_sig_rets['Market-Cap'] < 2.0e9)]

In [None]:
df_sig_rets_large = df_sig_rets[(df_sig_rets['Market-Cap'] >= 1.0e10) & (df_sig_rets['Market-Cap'] < 2.0e11)]
df_sig_rets_mid = df_sig_rets[(df_sig_rets['Market-Cap'] >= 2.0e9) & (df_sig_rets['Market-Cap'] < 1.0e10)]
df_sig_rets_small = df_sig_rets[(df_sig_rets['Market-Cap'] >= 3.0e8) & (df_sig_rets['Market-Cap'] < 2.0e9)]
df_sig_rets_micro = df_sig_rets[(df_sig_rets['Market-Cap'] >= 5.0e7) & (df_sig_rets['Market-Cap'] < 3.0e8)]

### Feature Engineering

In [None]:
# Split the tickers into training- and test-sets.
tickers_train, tickers_test = \
    train_test_split(tickers, train_size=0.8, random_state=1234)

df_train = df_sig_rets.loc[tickers_train]
df_test = df_sig_rets.loc[tickers_test]

# DataFrames with signals for training- and test-sets.
X_train = df_train.drop(columns=[TOTAL_RETURN_1_3Y])
X_test = df_test.drop(columns=[TOTAL_RETURN_1_3Y])

# DataFrames with stock-returns for training- and test-sets.
y_train = df_train[TOTAL_RETURN_1_3Y]
y_test = df_test[TOTAL_RETURN_1_3Y]

# List of signal names.
signal_names = X_train.columns.values

# List of signal names where spaces are replaced with _
signal_names_ = [s.replace(' ', '_') for s in signal_names]

# Column-name.
FEATURE_IMPORTANCE = 'Feature Importance'

def compare_feature_imp_corr(estimator):
    """
    Return a DataFrame which compares the signals' Feature
    Importance in the Machine Learning model, to the absolute
    correlation of the signals and stock-returns.

    :param estimator: Sklearn ensemble estimator.
    :return: Pandas DataFrame.
    """

    # Wrap the list of Feature Importance in a Pandas Series.
    df_feat_imp = pd.Series(estimator.feature_importances_,
                            index=signal_names,
                            name=FEATURE_IMPORTANCE)

    # Concatenate the DataFrames with Feature Importance
    # and Return Correlation.
    dfs = [df_feat_imp, df_corr_returns]
    df_compare = pd.concat(dfs, axis=1, sort=True)

    # Sort by Feature Importance.
    df_compare.sort_values(by=FEATURE_IMPORTANCE,
                           ascending=False, inplace=True)

    return df_compare

def print_tree(estimator, max_depth=6, **kwargs):
    """
    Print the first Decision Tree from a Random Forest.
    :param estimator: Sklearn ensemble estimator.
    """
    s = export_text(estimator.estimators_[0],
                    max_depth=max_depth,
                    feature_names=signal_names_,
                    **kwargs)
    print(s)

# Parameters for scikit-learn's Random Forest models.
model_args = \
{
    # Random Forest parameters to adjust between
    # over- and under-fitting.
    'n_estimators': 100,
    'max_depth': 15,
    'min_samples_split': 100,
    'min_samples_leaf': 10,

    # Use all available CPU cores.
    'n_jobs': -1,

    # Set random seed to make the experiments repeatable.
    'random_state': 1234,
}


### Training 🏋🏿🏋🏿🏋🏿🏋🏿🏋🏿

In [None]:
# Create the estimator, but don't do any computations yet.
regr = RandomForestRegressor(**model_args)

# Fit the estimator to the training-data.
# This may take several minutes on a 4-core CPU.
_ = regr.fit(X=X_train, y=y_train)

print_tree(regr)

### Testing 💯💯💯💯💯