In [3]:
import pandas as pd
import numpy as np
import simfin as sf
from simfin.names import *

from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.tree import export_text
from sklearn.metrics import confusion_matrix
from sklearn.metrics import r2_score, accuracy_score

### Data Pulling (from SimFin) --> api_calls.py

In [4]:
API_KEY = 'MbOGeJgi6qQjgYbb58oBVQDaObxEZzXg'

# SimFin data-directory.
sf.set_data_dir('~/simfin_data/')
# SimFin load API key or use free data.
sf.load_api_key('MbOGeJgi6qQjgYbb58oBVQDaObxEZzXg')

In [5]:
market = 'us'

# Add this date-offset to the fundamental data such as
# Income Statements etc., because the REPORT_DATE is not
# when it was actually made available to the public,
# which can be 1, 2 or even 3 months after the Report Date.
offset = pd.DateOffset(days=60)

# Refresh the fundamental datasets (Income Statements etc.)
# every 30 days.
refresh_days = 30

# Refresh the dataset with shareprices every 10 days.
refresh_days_shareprices = 10

In [6]:
hub = sf.StockHub(market=market, offset=offset,
                  refresh_days=refresh_days,
                  refresh_days_shareprices=refresh_days_shareprices)

df_fin_signals = hub.fin_signals(variant='daily')
df_growth_signals = hub.growth_signals(variant='daily')
df_val_signals = hub.val_signals(variant='daily')

Dataset "us-income-ttm" on disk (20 days old).
- Loading from disk ... Done!
Dataset "us-balance-ttm" on disk (20 days old).
- Loading from disk ... Done!
Dataset "us-cashflow-ttm" on disk (20 days old).
- Loading from disk ... Done!
Dataset "us-shareprices-daily" on disk (5 days old).
- Loading from disk ... Done!
Cache-file 'fin_signals-2a38bb7d.pickle' on disk (11 days old).
- Running function fin_signals() ... 

  result = getattr(ufunc, method)(*inputs, **kwargs)


Done!
- Saving cache-file to disk ... Done!
Dataset "us-income-quarterly" on disk (11 days old).
- Loading from disk ... Done!
Dataset "us-balance-quarterly" on disk (11 days old).
- Loading from disk ... Done!
Dataset "us-cashflow-quarterly" on disk (11 days old).
- Loading from disk ... Done!
Cache-file 'growth_signals-2a38bb7d.pickle' on disk (11 days old).
- Running function growth_signals() ... Done!
- Saving cache-file to disk ... Done!
Cache-file 'val_signals-739b68a6.pickle' on disk (11 days old).
- Running function val_signals() ... Done!
- Saving cache-file to disk ... Done!


### Data Cleaning/Manipulation

In [7]:
# Combine the DataFrames.
dfs = [df_fin_signals, df_growth_signals, df_val_signals]
df_signals = pd.concat(dfs, axis=1)

# Remove all rows with only NaN values.
df = df_signals.dropna(how='all').reset_index(drop=True)

# List of the columns before removing any.
columns_before = df_signals.columns

# Threshold for the number of rows that must be NaN for each column.
thresh = 0.75 * len(df_signals.dropna(how='all'))

# Remove all columns which don't have sufficient data.
df_signals = df_signals.dropna(axis='columns', thresh=thresh)

# List of the columns after the removal.
columns_after = df_signals.columns

# Show the columns that were removed.
columns_before.difference(columns_after)

# Name of the new column for the returns.
TOTAL_RETURN_1_3Y = 'Total Return 1-3 Years'

# Calculate the mean log-returns for all 1-3 year periods.
df_returns_1_3y = \
    hub.mean_log_returns(name=TOTAL_RETURN_1_3Y,
                         future=True, annualized=True,
                         min_years=1, max_years=3)

dfs = [df_signals, df_returns_1_3y]
df_sig_rets = pd.concat(dfs, axis=1)

# Clip the signals and returns at their 5% and 95% quantiles.
# We do not set them to NaN because it would remove too much data.
df_sig_rets = sf.winsorize(df_sig_rets)

# Remove all rows with missing values (NaN)
# because scikit-learn cannot handle that.
df_sig_rets = df_sig_rets.dropna(how='any')

# Remove all tickers which have less than 200 data-rows.
df_sig_rets = df_sig_rets.groupby(TICKER) \
                .filter(lambda df: len(df)>200)



# List of all unique stock-tickers in the dataset.
tickers = df_sig_rets.reset_index()[TICKER].unique()

Cache-file 'mean_log_change-5cec82bd.pickle' on disk (11 days old).
- Running function mean_log_change() ... Done!
- Saving cache-file to disk ... Done!


                   (Dividends + Share Buyback) / FCF  Asset Turnover  \
Ticker Date                                                            
MHH    2016-03-10                           0.066107        2.464567   

                   CapEx / (Depr + Amor)  Current Ratio  Dividends / FCF  \
Ticker Date                                                                
MHH    2016-03-10               0.254545       1.746146             -0.0   

                   Gross Profit Margin  Interest Coverage  Log Revenue  \
Ticker Date                                                              
MHH    2016-03-10             0.192751          15.979522     8.091561   

                   Net Profit Margin  Quick Ratio  ...  FCF Yield  Market-Cap  \
Ticker Date                                        ...                          
MHH    2016-03-10           0.022297     1.444538  ...   0.082114  34817440.0   

                     P/Book     P/Cash       P/E      P/FCF     P/NCAV  \
Ticker Date  

In [106]:
finhub_data = pd.read_csv('TOTAL_US_STOCK_MARKET.csv')
finhub_tickers = finhub_data['ticker']
finhub_sectors = finhub_data['finnhubIndustry']
finhub_data.columns

Index(['ticker', 'name', 'finnhubIndustry', 'country', 'currency', 'exchange',
       'ipo', 'marketCapitalization', 'marketCapClass', 'shareOutstanding'],
      dtype='object')

In [109]:
finhub_tickers

0       AACG
1        AAL
2       AAME
3       AAOI
4       AAON
        ... 
5480    XTNT
5481    XXII
5482    YCBD
5483    ZDGE
5484     ZOM
Name: ticker, Length: 5485, dtype: object

In [160]:
finhub_data

Unnamed: 0,ticker,name,finnhubIndustry,country,currency,exchange,ipo,marketCapitalization,marketCapClass,shareOutstanding
0,AACG,ATA Creativity Global,Diversified Consumer Services,CN,USD,NASDAQ NMS - GLOBAL MARKET,2008-01-29,27.29185,MICRO_CAP,56.626346
1,AAL,American Airlines Group Inc,Airlines,US,USD,NASDAQ NMS - GLOBAL MARKET,2013-12-09,7844.693,MID_CAP,422.894501
2,AAME,Atlantic American Corp,Insurance,US,USD,NASDAQ NMS - GLOBAL MARKET,1980-01-16,36.78906,MICRO_CAP,20.438366
3,AAOI,Applied Optoelectronics Inc,Communications,US,USD,NASDAQ NMS - GLOBAL MARKET,2013-09-26,197.1919,MICRO_CAP,20.339549
4,AAON,Aaon Inc,Building,US,USD,NASDAQ NMS - GLOBAL MARKET,1991-01-03,2980.886,MID_CAP,52.031532
...,...,...,...,...,...,...,...,...,...,...
5480,XTNT,Xtant Medical Holdings Inc,Health Care,US,USD,NYSE MKT LLC,2010-06-30,16.92616,MICRO_CAP,13.223565
5481,XXII,22nd Century Group Inc,Tobacco,US,USD,NYSE MKT LLC,2006-10-17,135.6744,MICRO_CAP,138.854193
5482,YCBD,cbdMD Inc,Pharmaceuticals,US,USD,NYSE MKT LLC,2017-11-17,84.70382,MICRO_CAP,51.335648
5483,ZDGE,Zedge Inc,Media,US,USD,NYSE MKT LLC,2016-05-26,13.54143,MICRO_CAP,12.199482


In [182]:
temp = df_sig_rets.loc[['AAMC']]
temp.iloc[:, df_sig_rets.columns.get_loc('Sector')] = 'test'
temp

Unnamed: 0_level_0,Unnamed: 1_level_0,(Dividends + Share Buyback) / FCF,Asset Turnover,CapEx / (Depr + Amor),Current Ratio,Dividends / FCF,Gross Profit Margin,Interest Coverage,Log Revenue,Net Profit Margin,Quick Ratio,...,Market-Cap,P/Book,P/Cash,P/E,P/FCF,P/NCAV,P/NetNet,P/Sales,Total Return 1-3 Years,Sector
Ticker,Date,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
AAMC,2015-03-02,0.114269,0.15333,3.342736,0.519758,0.0,0.987675,9.220243,8.626646,0.140986,0.149834,...,5.466123e+08,0.390794,4.680621,9.159206,-3.691331,-0.835698,-0.441923,1.291318,-0.517222,test
AAMC,2015-03-03,0.114269,0.15333,3.342736,0.519758,0.0,0.987675,9.220243,8.626646,0.140986,0.149834,...,5.749342e+08,0.411043,4.923140,9.633777,-3.882592,-0.878998,-0.464821,1.358226,-0.517222,test
AAMC,2015-03-04,0.114269,0.15333,3.342736,0.519758,0.0,0.987675,9.220243,8.626646,0.140986,0.149834,...,5.767751e+08,0.412359,4.938904,9.664624,-3.895024,-0.881813,-0.466309,1.362575,-0.517222,test
AAMC,2015-03-05,0.114269,0.15333,3.342736,0.519758,0.0,0.987675,9.220243,8.626646,0.140986,0.149834,...,5.759538e+08,0.411772,4.931871,9.650861,-3.889477,-0.880557,-0.465645,1.360634,-0.517222,test
AAMC,2015-03-06,0.114269,0.15333,3.342736,0.519758,0.0,0.987675,9.220243,8.626646,0.140986,0.149834,...,5.529847e+08,0.395350,4.735188,9.265985,-3.734365,-0.845440,-0.447075,1.306372,-0.517222,test
AAMC,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
AAMC,2016-06-07,-0.068644,2.40273,3.342736,7.879346,0.0,0.884735,2.693211,8.214075,-0.067913,6.467805,...,4.496712e+07,0.696646,1.798597,-3.970957,-0.271719,0.722607,0.781317,0.274675,0.484233,test
AAMC,2016-06-08,-0.068644,2.40273,3.342736,7.879346,0.0,0.884735,2.693211,8.214075,-0.067913,6.467805,...,4.480624e+07,0.694154,1.798597,-3.956750,-0.270747,0.720022,0.778521,0.273693,0.484233,test
AAMC,2016-06-09,-0.068644,2.40273,3.342736,7.879346,0.0,0.884735,2.693211,8.214075,-0.067913,6.467805,...,4.154833e+07,0.643681,1.798597,-3.669051,-0.251061,0.667668,0.721914,0.253792,0.484233,test
AAMC,2016-06-10,-0.068644,2.40273,3.342736,7.879346,0.0,0.884735,2.693211,8.214075,-0.067913,6.467805,...,3.806921e+07,0.589781,1.798597,-3.361817,-0.230038,0.611760,0.661464,0.238868,0.484233,test


In [189]:
fin_ticker_list = finhub_tickers.tolist()

In [151]:
df_sig_rets.index.unique(level=0)

Index(['A', 'AAMC', 'AAN', 'AAOI', 'AAON', 'AAP', 'AAWW', 'ABBV', 'ABC', 'ABG',
       ...
       'XPO', 'XRAY', 'XYL', 'YGYI', 'ZBH', 'ZEUS', 'ZGNX', 'ZNGA', 'ZTS',
       'ZUMZ'],
      dtype='object', name='Ticker', length=855)

In [61]:
df_sig_rets = df_sig_rets[df_sig_rets.index.isin(fin_ticker_list, level=0)]

In [62]:
df_sig_rets.to_csv('simfin_total.csv')

In [175]:
df_sig_rets['Sector'] = ''

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [191]:
 for i in df_sig_rets.index.unique(level=0):
        if i in fin_ticker_list:
            print(i)
            fintemp = finhub_data.loc[finhub_data['ticker'] == i]
            df_sig_rets.loc[[i]].iloc[:, df_sig_rets.columns.get_loc('Sector')] = fintemp['finnhubIndustry']   
df_sig_rets

A
AAMC
AAN
AAOI
AAON
AAP

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[item] = s



AAWW
ABBV
ABC
ABG
ABM
ABT
ACAD
ACCO
ACHC
ACLS
ACN
ACRX
ADBE
ADI
ADM
ADP
ADS
ADSK
AE
AEE
AEHR
AEP
AES
AGCO
AGIO
AHPI
AIMC
AIR
AIRI
AIT
AJG
AJRD
AKAM
AKRX
AL
ALB
ALE
ALG
ALGT
ALK
ALKS
ALLE
ALNY
ALSK
ALSN
ALV
ALXN
AMAT
AMCX
AMD
AME
AMED
AMG
AMGN
AMH
AMKR
AMOT
AMP
AMRK
AMSC
AMSWA
AMT
AMTD
AMWD
AMZN
AN
ANDE
ANF
ANIK
ANSS
AOBC
AOS
AOSL
APA
APD
APH
APOG
ARCB
ARE
ARW
ASNA
ATI
ATR
ATRO
ATVI
AVA
AVAV
AVB
AVEO
AVNW
AVY
AWK
AWRE
AWX
AXGN
AXL
AYI
AZO
AZZ
BA
BAH
BAX
BBBY
BBGI
BBY
BC
BCO
BCPC
BDX
BFAM
BG
BGG
BHE
BIG
BJRI
BKNG
BLDR
BLFS
BLK
BLKB
BLL
BMI
BMRN
BNFT
BOOM
BRC
BREW
BRO
BRX
BSTC
BURL
BWXT
BX
CACI
CAH
CAKE
CAL
CALM
CASY
CAT
CATS
CBB
CBRL
CCF
CCI
CCK
CCL
CCMP
CCXI
CDE
CDW
CE
CENT
CERN
CF
CGA
CHD
CHDN
CHE
CHEF
CHGG
CHK
CHRW
CHTR
CIDM
CIK
CL
CLAR
CLF
CLFD
CLH
CLNE
CLR
CLX
CMCO
CMD
CME
CMI
CMLS
CMP
CMPR
CMS
CMTL
CNA
CNC
CNP
CNSL
COG
COHR
COKE
COLM
COMM
CONN
COP
COR
CORT
COTY
CPRI
CPRT
CPS
CPST
CPT
CQP
CR
CRIS
CRL
CRM
CRWS
CRY
CSGP
CSGS
CSII
CSL
CSU
CSX
CTAS
CTIC
CTL
CTSH
CTXS
CUB
CUBE
CVCO
CVGI

Unnamed: 0_level_0,Unnamed: 1_level_0,(Dividends + Share Buyback) / FCF,Asset Turnover,CapEx / (Depr + Amor),Current Ratio,Dividends / FCF,Gross Profit Margin,Interest Coverage,Log Revenue,Net Profit Margin,Quick Ratio,...,Market-Cap,P/Book,P/Cash,P/E,P/FCF,P/NCAV,P/NetNet,P/Sales,Total Return 1-3 Years,Sector
Ticker,Date,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
A,2015-04-01,0.439528,0.540763,0.500000,3.937500,0.489676,0.486719,4.659091,9.609167,0.102558,2.945043,...,1.398982e+10,3.299486,6.605203,33.548729,41.267906,35.842805,-32.282307,3.440684,0.131031,
A,2015-04-02,0.439528,0.540763,0.500000,3.937500,0.489676,0.486719,4.659091,9.609167,0.102558,2.945043,...,1.421290e+10,3.352099,6.710529,34.083693,41.925959,35.842805,-32.282307,3.495548,0.122948,
A,2015-04-06,0.439528,0.540763,0.500000,3.937500,0.489676,0.486719,4.659091,9.609167,0.102558,2.945043,...,1.417234e+10,3.342533,6.691379,33.986427,41.806313,35.842805,-32.282307,3.485573,0.125254,
A,2015-04-07,0.439528,0.540763,0.500000,3.937500,0.489676,0.486719,4.659091,9.609167,0.102558,2.945043,...,1.434472e+10,3.383189,6.772767,34.399808,42.314808,35.842805,-32.282307,3.527969,0.119174,
A,2015-04-08,0.439528,0.540763,0.500000,3.937500,0.489676,0.486719,4.659091,9.609167,0.102558,2.945043,...,1.428388e+10,3.368840,6.744042,34.253909,42.135339,35.842805,-32.282307,3.513005,0.122120,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
ZUMZ,2016-06-07,2.455026,1.939216,1.145478,2.876645,-0.000000,0.334033,-87.268431,8.905355,0.035794,1.278441,...,4.440061e+08,1.495186,5.876672,15.424912,32.237427,5.470818,26.754607,0.552121,0.140357,
ZUMZ,2016-06-08,2.455026,1.939216,1.145478,2.876645,-0.000000,0.334033,-87.268431,8.905355,0.035794,1.278441,...,4.548426e+08,1.531678,6.020100,15.801377,33.024224,5.604340,27.407589,0.565596,0.127947,
ZUMZ,2016-06-09,2.455026,1.939216,1.145478,2.876645,-0.000000,0.334033,-87.268431,8.905355,0.035794,1.278441,...,4.530854e+08,1.525761,5.996841,15.740329,32.896635,5.582688,27.301700,0.563411,0.130825,
ZUMZ,2016-06-10,2.455026,1.939216,1.145478,2.876645,-0.000000,0.334033,-87.268431,8.905355,0.035794,1.278441,...,4.401986e+08,1.482365,5.826278,15.292640,31.960985,5.423904,26.525181,0.547386,0.147434,


In [192]:
'A' in fin_ticker_list

True

In [90]:
df_sig_rets[(df_sig_rets['Market-Cap'] >= 3.0e8) & (df_sig_rets['Market-Cap'] < 2.0e9)]

Unnamed: 0_level_0,Unnamed: 1_level_0,(Dividends + Share Buyback) / FCF,Asset Turnover,CapEx / (Depr + Amor),Current Ratio,Dividends / FCF,Gross Profit Margin,Interest Coverage,Log Revenue,Net Profit Margin,Quick Ratio,...,FCF Yield,Market-Cap,P/Book,P/Cash,P/E,P/FCF,P/NCAV,P/NetNet,P/Sales,Total Return 1-3 Years
Ticker,Date,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
AAMC,2015-03-02,0.114269,0.153330,3.342736,0.519758,0.0,0.987675,9.220243,8.626646,0.140986,0.149834,...,-0.245891,5.466123e+08,0.390794,4.680621,9.159206,-3.691331,-0.835698,-0.441923,1.291318,-0.517222
AAMC,2015-03-03,0.114269,0.153330,3.342736,0.519758,0.0,0.987675,9.220243,8.626646,0.140986,0.149834,...,-0.245891,5.749342e+08,0.411043,4.923140,9.633777,-3.882592,-0.878998,-0.464821,1.358226,-0.517222
AAMC,2015-03-04,0.114269,0.153330,3.342736,0.519758,0.0,0.987675,9.220243,8.626646,0.140986,0.149834,...,-0.245891,5.767751e+08,0.412359,4.938904,9.664624,-3.895024,-0.881813,-0.466309,1.362575,-0.517222
AAMC,2015-03-05,0.114269,0.153330,3.342736,0.519758,0.0,0.987675,9.220243,8.626646,0.140986,0.149834,...,-0.245891,5.759538e+08,0.411772,4.931871,9.650861,-3.889477,-0.880557,-0.465645,1.360634,-0.517222
AAMC,2015-03-06,0.114269,0.153330,3.342736,0.519758,0.0,0.987675,9.220243,8.626646,0.140986,0.149834,...,-0.245891,5.529847e+08,0.395350,4.735188,9.265985,-3.734365,-0.845440,-0.447075,1.306372,-0.517222
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
ZUMZ,2016-06-07,2.455026,1.939216,1.145478,2.876645,-0.0,0.334033,-87.268431,8.905355,0.035794,1.278441,...,0.031020,4.440061e+08,1.495186,5.876672,15.424912,32.237427,5.470818,26.754607,0.552121,0.140357
ZUMZ,2016-06-08,2.455026,1.939216,1.145478,2.876645,-0.0,0.334033,-87.268431,8.905355,0.035794,1.278441,...,0.030281,4.548426e+08,1.531678,6.020100,15.801377,33.024224,5.604340,27.407589,0.565596,0.127947
ZUMZ,2016-06-09,2.455026,1.939216,1.145478,2.876645,-0.0,0.334033,-87.268431,8.905355,0.035794,1.278441,...,0.030398,4.530854e+08,1.525761,5.996841,15.740329,32.896635,5.582688,27.301700,0.563411,0.130825
ZUMZ,2016-06-10,2.455026,1.939216,1.145478,2.876645,-0.0,0.334033,-87.268431,8.905355,0.035794,1.278441,...,0.031288,4.401986e+08,1.482365,5.826278,15.292640,31.960985,5.423904,26.525181,0.547386,0.147434


In [None]:
df_sig_rets_large = df_sig_rets[(df_sig_rets['Market-Cap'] >= 1.0e10) & (df_sig_rets['Market-Cap'] < 2.0e11)]
df_sig_rets_mid = df_sig_rets[(df_sig_rets['Market-Cap'] >= 2.0e9) & (df_sig_rets['Market-Cap'] < 1.0e10)]
df_sig_rets_small = df_sig_rets[(df_sig_rets['Market-Cap'] >= 3.0e8) & (df_sig_rets['Market-Cap'] < 2.0e9)]
df_sig_rets_micro = df_sig_rets[(df_sig_rets['Market-Cap'] >= 5.0e7) & (df_sig_rets['Market-Cap'] < 3.0e8)]

### Feature Engineering

In [None]:
# Split the tickers into training- and test-sets.
tickers_train, tickers_test = \
    train_test_split(tickers, train_size=0.8, random_state=1234)

df_train = df_sig_rets.loc[tickers_train]
df_test = df_sig_rets.loc[tickers_test]

# DataFrames with signals for training- and test-sets.
X_train = df_train.drop(columns=[TOTAL_RETURN_1_3Y])
X_test = df_test.drop(columns=[TOTAL_RETURN_1_3Y])

# DataFrames with stock-returns for training- and test-sets.
y_train = df_train[TOTAL_RETURN_1_3Y]
y_test = df_test[TOTAL_RETURN_1_3Y]

# List of signal names.
signal_names = X_train.columns.values

# List of signal names where spaces are replaced with _
signal_names_ = [s.replace(' ', '_') for s in signal_names]

# Column-name.
FEATURE_IMPORTANCE = 'Feature Importance'

def compare_feature_imp_corr(estimator):
    """
    Return a DataFrame which compares the signals' Feature
    Importance in the Machine Learning model, to the absolute
    correlation of the signals and stock-returns.

    :param estimator: Sklearn ensemble estimator.
    :return: Pandas DataFrame.
    """

    # Wrap the list of Feature Importance in a Pandas Series.
    df_feat_imp = pd.Series(estimator.feature_importances_,
                            index=signal_names,
                            name=FEATURE_IMPORTANCE)

    # Concatenate the DataFrames with Feature Importance
    # and Return Correlation.
    dfs = [df_feat_imp, df_corr_returns]
    df_compare = pd.concat(dfs, axis=1, sort=True)

    # Sort by Feature Importance.
    df_compare.sort_values(by=FEATURE_IMPORTANCE,
                           ascending=False, inplace=True)

    return df_compare

def print_tree(estimator, max_depth=6, **kwargs):
    """
    Print the first Decision Tree from a Random Forest.
    :param estimator: Sklearn ensemble estimator.
    """
    s = export_text(estimator.estimators_[0],
                    max_depth=max_depth,
                    feature_names=signal_names_,
                    **kwargs)
    print(s)

# Parameters for scikit-learn's Random Forest models.
model_args = \
{
    # Random Forest parameters to adjust between
    # over- and under-fitting.
    'n_estimators': 100,
    'max_depth': 15,
    'min_samples_split': 100,
    'min_samples_leaf': 10,

    # Use all available CPU cores.
    'n_jobs': -1,

    # Set random seed to make the experiments repeatable.
    'random_state': 1234,
}


### Training 🏋🏿🏋🏿🏋🏿🏋🏿🏋🏿

In [None]:
# Create the estimator, but don't do any computations yet.
regr = RandomForestRegressor(**model_args)

# Fit the estimator to the training-data.
# This may take several minutes on a 4-core CPU.
_ = regr.fit(X=X_train, y=y_train)

print_tree(regr)

### Testing 💯💯💯💯💯