In [1]:
import os

import pandas as pd
import numpy as np
import simfin as sf
from simfin.names import *

from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.tree import export_text
from sklearn.metrics import confusion_matrix
from sklearn.metrics import r2_score, accuracy_score

### Data Pulling (from SimFin) --> api_calls.py

In [4]:
# SimFin API key. It is read from the SIMFIN_API_KEY environment variable
# so the secret is never stored in the notebook; if the variable is not
# set we fall back to 'free', SimFin's key for the free datasets.
# NOTE: a key that was ever committed in plain text should be treated as
# leaked and be regenerated in the SimFin account settings.
API_KEY = os.environ.get('SIMFIN_API_KEY', 'free')

# SimFin data-directory.
sf.set_data_dir('~/simfin_data/')

# Register the API key with SimFin. We use set_api_key() because
# load_api_key() expects the *path of a text-file* holding the key,
# not the key itself.
sf.set_api_key(api_key=API_KEY)

In [5]:
# Market passed to the StockHub when loading the datasets.
market = 'us'

# The REPORT_DATE of fundamental data (Income Statements etc.) is not
# the date it became available to the public; that can be 1, 2 or even
# 3 months later. We therefore shift the fundamental data by 60 days.
offset = pd.DateOffset(days=60)

# Number of days before the datasets are refreshed:
# 30 days for the fundamental datasets (Income Statements etc.),
# 10 days for the dataset with shareprices.
refresh_days, refresh_days_shareprices = 30, 10

In [6]:
# Create the StockHub using the settings from the configuration cell.
hub = sf.StockHub(
    market=market,
    offset=offset,
    refresh_days=refresh_days,
    refresh_days_shareprices=refresh_days_shareprices,
)

# Request the financial, growth and valuation signals, in that order,
# all using the 'daily' variant.
df_fin_signals, df_growth_signals, df_val_signals = (
    load_signals(variant='daily')
    for load_signals in (hub.fin_signals,
                         hub.growth_signals,
                         hub.val_signals)
)

Dataset "us-income-ttm" on disk (9 days old).
- Loading from disk ... Done!
Dataset "us-balance-ttm" on disk (9 days old).
- Loading from disk ... Done!
Dataset "us-cashflow-ttm" on disk (9 days old).
- Loading from disk ... Done!
Dataset "us-shareprices-daily" on disk (9 days old).
- Loading from disk ... Done!
Cache-file 'fin_signals-2a38bb7d.pickle' on disk (9 days old).
- Loading from disk ... Done!
Dataset "us-income-quarterly" on disk (8 days old).
- Loading from disk ... Done!
Dataset "us-balance-quarterly" on disk (8 days old).
- Loading from disk ... Done!
Dataset "us-cashflow-quarterly" on disk (8 days old).
- Loading from disk ... Done!
Cache-file 'growth_signals-2a38bb7d.pickle' on disk (8 days old).
- Loading from disk ... Done!
Cache-file 'val_signals-739b68a6.pickle' on disk (8 days old).
- Loading from disk ... Done!


In [11]:
# Display the financial signals with the index moved into ordinary
# columns (the result is only shown, it is not stored).
df_fin_signals.reset_index()

Unnamed: 0,Ticker,Date,(Dividends + Share Buyback) / FCF,Asset Turnover,CapEx / (Depr + Amor),Current Ratio,Debt Ratio,Dividends / FCF,Gross Profit Margin,Interest Coverage,...,Log Revenue,Net Acquisitions / Total Assets,Net Profit Margin,Quick Ratio,R&D / Gross Profit,R&D / Revenue,Return on Assets,Return on Equity,Return on Research Capital,Share Buyback / FCF
0,A,2007-01-03,,,,,,,,,...,,,,,,,,,,
1,A,2007-01-04,,,,,,,,,...,,,,,,,,,,
2,A,2007-01-05,,,,,,,,,...,,,,,,,,,,
3,A,2007-01-08,,,,,,,,,...,,,,,,,,,,
4,A,2007-01-09,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5287537,low,2019-05-29,0.859274,2.066448,0.683261,0.981444,0.470123,0.285574,0.32125,6.439103,...,10.853144,0.003998,0.03245,0.050286,,,0.067057,0.635016,,0.5737
5287538,low,2019-05-30,0.859274,2.066448,0.683261,0.981444,0.470123,0.285574,0.32125,6.439103,...,10.853144,0.003998,0.03245,0.050286,,,0.067057,0.635016,,0.5737
5287539,low,2019-05-31,0.859274,2.066448,0.683261,0.981444,0.470123,0.285574,0.32125,6.439103,...,10.853144,0.003998,0.03245,0.050286,,,0.067057,0.635016,,0.5737
5287540,low,2019-06-03,0.859274,2.066448,0.683261,0.981444,0.470123,0.285574,0.32125,6.439103,...,10.853144,0.003998,0.03245,0.050286,,,0.067057,0.635016,,0.5737


In [12]:
# Display the growth signals with the index moved into ordinary
# columns (the result is only shown, it is not stored).
df_growth_signals.reset_index()

Unnamed: 0,Ticker,Date,Assets Growth,Assets Growth QOQ,Assets Growth YOY,Earnings Growth,Earnings Growth QOQ,Earnings Growth YOY,FCF Growth,FCF Growth QOQ,FCF Growth YOY,Sales Growth,Sales Growth QOQ,Sales Growth YOY
0,A,2007-01-03,,,,,,,,,,,,
1,A,2007-01-04,,,,,,,,,,,,
2,A,2007-01-05,,,,,,,,,,,,
3,A,2007-01-08,,,,,,,,,,,,
4,A,2007-01-09,,,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5287537,low,2019-05-29,-0.022187,-0.06006,-0.022187,-0.328692,-2.308426,-2.48556,0.277903,-2.245879,0.465267,0.039202,-0.101579,0.00981
5287538,low,2019-05-30,-0.022187,-0.06006,-0.022187,-0.328692,-2.308426,-2.48556,0.277903,-2.245879,0.465267,0.039202,-0.101579,0.00981
5287539,low,2019-05-31,-0.022187,-0.06006,-0.022187,-0.328692,-2.308426,-2.48556,0.277903,-2.245879,0.465267,0.039202,-0.101579,0.00981
5287540,low,2019-06-03,-0.022187,-0.06006,-0.022187,-0.328692,-2.308426,-2.48556,0.277903,-2.245879,0.465267,0.039202,-0.101579,0.00981


In [13]:
# Display the valuation signals with the index moved into ordinary
# columns (the result is only shown, it is not stored).
df_val_signals.reset_index()

Unnamed: 0,Ticker,Date,Dividend Yield,Earnings Yield,FCF Yield,Market-Cap,P/Book,P/Cash,P/E,P/FCF,P/NCAV,P/NetNet,P/Sales
0,A,2007-01-03,,,,,,,,,,,
1,A,2007-01-04,,,,,,,,,,,
2,A,2007-01-05,,,,,,,,,,,
3,A,2007-01-08,,,,,,,,,,,
4,A,2007-01-09,,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...
5287537,low,2019-05-29,0.019220,0.030567,0.067303,7.570276e+10,20.774632,103.844664,32.715108,14.858245,-4.550539,-3.173521,1.061616
5287538,low,2019-05-30,0.019087,0.030355,0.066837,7.623056e+10,20.919473,104.568669,32.943198,14.961837,-4.582265,-3.195647,1.069017
5287539,low,2019-05-31,0.019210,0.030551,0.067267,7.574336e+10,20.785774,103.900357,32.732653,14.866214,-4.552979,-3.175223,1.062185
5287540,low,2019-06-03,0.019095,0.030368,0.066865,7.619808e+10,20.910560,104.524115,32.929162,14.955462,-4.580313,-3.194285,1.068562


### Data Cleaning/Manipulation

In [14]:
# Combine the signal DataFrames column-wise into one DataFrame.
dfs = [df_fin_signals, df_growth_signals, df_val_signals]
df_signals = pd.concat(dfs, axis=1)

# Remove all rows with only NaN values.
# NOTE: `df` has a fresh integer index (the original index is dropped).
# It is only used below for its row-count; the name is kept in case
# other cells refer to it.
df = df_signals.dropna(how='all').reset_index(drop=True)

# List of the columns before removing any.
columns_before = df_signals.columns

# Minimum number of non-NaN values a column must have to be kept:
# 75% of the number of rows that contain any data at all.
# len(df) is that row-count, so the expensive dropna(how='all')
# does not have to be computed a second time.
thresh = 0.75 * len(df)

# Remove all columns which don't have sufficient data.
df_signals = df_signals.dropna(axis='columns', thresh=thresh)

# List of the columns after the removal.
columns_after = df_signals.columns

# Show the columns that were removed. This must be printed explicitly,
# because a bare expression in the middle of a cell is not displayed.
columns_removed = columns_before.difference(columns_after)
print('Removed columns:', list(columns_removed))

# Name of the new column for the returns.
TOTAL_RETURN_1_3Y = 'Total Return 1-3 Years'

# Calculate the mean log-returns for all 1-3 year periods.
df_returns_1_3y = \
    hub.mean_log_returns(name=TOTAL_RETURN_1_3Y,
                         future=True, annualized=True,
                         min_years=1, max_years=3)

# Combine the signals and the returns column-wise.
dfs = [df_signals, df_returns_1_3y]
df_sig_rets = pd.concat(dfs, axis=1)

# Clip the signals and returns at their 5% and 95% quantiles.
# We do not set them to NaN because it would remove too much data.
df_sig_rets = sf.winsorize(df_sig_rets)

# Remove all rows with missing values (NaN)
# because scikit-learn cannot handle that.
df_sig_rets = df_sig_rets.dropna(how='any')

# Keep only the tickers which have more than 200 data-rows.
df_sig_rets = df_sig_rets.groupby(TICKER) \
                .filter(lambda df_ticker: len(df_ticker) > 200)

# List of all unique stock-tickers in the dataset.
tickers = df_sig_rets.reset_index()[TICKER].unique()

Cache-file 'mean_log_change-5cec82bd.pickle' on disk (8 days old).
- Loading from disk ... Done!


In [17]:
# Display the cleaned DataFrame with the signals and the
# 'Total Return 1-3 Years' column.
df_sig_rets

Unnamed: 0_level_0,Unnamed: 1_level_0,(Dividends + Share Buyback) / FCF,Asset Turnover,CapEx / (Depr + Amor),Current Ratio,Dividends / FCF,Gross Profit Margin,Interest Coverage,Log Revenue,Net Profit Margin,Quick Ratio,...,FCF Yield,Market-Cap,P/Book,P/Cash,P/E,P/FCF,P/NCAV,P/NetNet,P/Sales,Total Return 1-3 Years
Ticker,Date,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
A,2015-04-01,0.439528,0.540763,0.500000,3.937500,0.489676,0.486719,4.659091,9.609167,0.102558,2.945043,...,0.024232,1.398982e+10,3.299486,6.605203,33.548729,41.267906,35.919087,-31.834869,3.440684,0.131031
A,2015-04-02,0.439528,0.540763,0.500000,3.937500,0.489676,0.486719,4.659091,9.609167,0.102558,2.945043,...,0.023852,1.421290e+10,3.352099,6.710529,34.083693,41.925959,35.919087,-31.834869,3.495548,0.122948
A,2015-04-06,0.439528,0.540763,0.500000,3.937500,0.489676,0.486719,4.659091,9.609167,0.102558,2.945043,...,0.023920,1.417234e+10,3.342533,6.691379,33.986427,41.806313,35.919087,-31.834869,3.485573,0.125254
A,2015-04-07,0.439528,0.540763,0.500000,3.937500,0.489676,0.486719,4.659091,9.609167,0.102558,2.945043,...,0.023632,1.434472e+10,3.383189,6.772767,34.399808,42.314808,35.919087,-31.834869,3.527969,0.119174
A,2015-04-08,0.439528,0.540763,0.500000,3.937500,0.489676,0.486719,4.659091,9.609167,0.102558,2.945043,...,0.023733,1.428388e+10,3.368840,6.744042,34.253909,42.135339,35.919087,-31.834869,3.513005,0.122120
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
low,2016-05-26,1.305434,1.889401,0.718336,1.006576,0.262623,0.348207,9.005435,10.472903,0.043098,0.143682,...,0.048933,5.609631e+10,9.729375,104.590787,29.249269,20.435960,-5.705972,-4.098214,1.260599,0.076869
low,2016-05-27,1.305434,1.889401,0.718336,1.006576,0.262623,0.348207,9.005435,10.472903,0.043098,0.143682,...,0.048818,5.609631e+10,9.752437,104.838694,29.318598,20.484399,-5.719497,-4.107927,1.263587,0.075875
low,2016-05-31,1.305434,1.889401,0.718336,1.006576,0.262623,0.348207,9.005435,10.472903,0.043098,0.143682,...,0.048952,5.609631e+10,9.725734,104.551643,29.238323,20.428312,-5.703836,-4.096680,1.260127,0.077629
low,2016-06-01,1.305434,1.889401,0.718336,1.006576,0.262623,0.348207,9.005435,10.472903,0.043098,0.143682,...,0.048866,5.609631e+10,9.742727,104.734312,29.289407,20.464004,-5.713802,-4.103837,1.262329,0.076905


### Feature Engineering

In [18]:
# Partition the tickers (not the individual rows) into training- and
# test-sets, so all rows of a ticker end up in the same set.
tickers_train, tickers_test = train_test_split(
    tickers, train_size=0.8, random_state=1234)

# Rows for the training-tickers and for the test-tickers.
df_train = df_sig_rets.loc[tickers_train]
df_test = df_sig_rets.loc[tickers_test]

# Targets: the stock-returns for the training- and test-sets.
y_train = df_train[TOTAL_RETURN_1_3Y]
y_test = df_test[TOTAL_RETURN_1_3Y]

# Features: all the signals, i.e. every column except the returns.
X_train = df_train.drop(columns=[TOTAL_RETURN_1_3Y])
X_test = df_test.drop(columns=[TOTAL_RETURN_1_3Y])

# Names of the signals, as an array ...
signal_names = X_train.columns.values

# ... and as a list where spaces are replaced with _
signal_names_ = [name.replace(' ', '_') for name in signal_names]

# Column-name used for the Feature Importance.
FEATURE_IMPORTANCE = 'Feature Importance'

def compare_feature_imp_corr(estimator):
    """
    Build a table that puts each signal's Feature Importance from
    the fitted Machine Learning model next to the contents of the
    module-level `df_corr_returns`, ordered so the most important
    signals come first.

    NOTE(review): `df_corr_returns` must be defined before this
    function is called; presumably it holds the absolute correlation
    between the signals and the stock-returns -- confirm.

    :param estimator: Sklearn ensemble estimator (already fitted).
    :return: Pandas DataFrame.
    """

    # Feature Importances labelled with the signal names.
    feat_imp = pd.Series(data=estimator.feature_importances_,
                         index=signal_names,
                         name=FEATURE_IMPORTANCE)

    # Put Feature Importance and Return Correlation side by side.
    combined = pd.concat([feat_imp, df_corr_returns],
                         axis=1, sort=True)

    # Highest Feature Importance first.
    return combined.sort_values(by=FEATURE_IMPORTANCE, ascending=False)

def print_tree(estimator, max_depth=6, **kwargs):
    """
    Print a text rendering of the first Decision Tree in a fitted
    Random Forest, labelling the features with `signal_names_`.

    :param estimator: Sklearn ensemble estimator.
    :param max_depth: Max depth passed on to `export_text`.
    :param kwargs: Extra keyword arguments passed to `export_text`.
    """
    # Only the first tree of the ensemble is shown.
    first_tree = estimator.estimators_[0]

    tree_text = export_text(first_tree,
                            max_depth=max_depth,
                            feature_names=signal_names_,
                            **kwargs)
    print(tree_text)

# Keyword arguments for scikit-learn's Random Forest models.
model_args = dict(
    # Random Forest parameters to adjust between
    # over- and under-fitting.
    n_estimators=100,
    max_depth=15,
    min_samples_split=100,
    min_samples_leaf=10,

    # Use all available CPU cores.
    n_jobs=-1,

    # Fixed random seed so the experiments are repeatable.
    random_state=1234,
)


### Training 🏋🏿🏋🏿🏋🏿🏋🏿🏋🏿

In [None]:
# Instantiate the Random Forest regressor with the shared model
# parameters. No computations are done at this point.
regr = RandomForestRegressor(**model_args)

# Train the regressor on the training-data.
# This may take several minutes on a 4-core CPU.
# The return-value of fit() is not needed, so it is assigned to `_`.
_ = regr.fit(X_train, y_train)

# Print the first Decision Tree of the fitted forest.
print_tree(estimator=regr)

### Testing 💯💯💯💯💯