**This notebook contains the complete process of:** 
- preparing data from F-F_Research_Data_Factors.CSV
- feature engineering
- functions to build models and a detailed illustration of how the models generate calculations of metrics (accuracy & sharpe ratio)
- one example of decision tree classifier model tuning
- more models will be tuned in separate notebooks (each team member is responsible for 1~2 models to prevent Anaconda from shutting down)

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from pandas.tseries.offsets import MonthEnd
import dateutil.relativedelta as dr
import itertools

In [2]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression

# Data Preparation

In [3]:
# Preparation
first_ym = 197001

ffdata = pd.read_csv('F-F_Research_Data_Factors.CSV', skiprows=4, names=['YM','RMRF','SMB','HML','RF'])
ffdata = ffdata.apply(pd.to_numeric, errors='coerce')
ffdata = ffdata.loc[ffdata['YM']>=first_ym]
ffdata['YM'] =  pd.to_datetime(ffdata['YM'], format='%Y%m')+MonthEnd(1)
ffdata.set_index(keys=['YM'], inplace=True)
ffdata = ffdata / 100

portdata = pd.read_csv('32_Portfolios_ME_BEME_OP_2x4x4.CSV', skiprows=17)
portdata.rename(index=str, columns={"Unnamed: 0": "YM"}, inplace=True)

In [4]:
portrets = portdata.iloc[0:665]
portrets = portrets.apply(pd.to_numeric, errors='coerce')
portrets = portrets.loc[portrets['YM']>=first_ym]
portrets['YM'] =  pd.to_datetime(portrets['YM'], format='%Y%m')+MonthEnd(1)
portrets.set_index(keys=['YM'], inplace=True)
portrets = portrets / 100 
portrets.head()

Unnamed: 0_level_0,SMALL LoBM LoOP,ME1 BM1 OP2,ME1 BM1 OP3,SMALL LoBM HiOP,ME1 BM2 OP1,ME1 BM2 OP2,ME1 BM2 OP3,ME1 BM2 OP4,ME1 BM3 OP1,ME1 BM3 OP2,...,ME2 BM2 OP3,ME2 BM2 OP4,ME2 BM3 OP1,ME2 BM3 OP2,ME2 BM3 OP3,ME2 BM3 OP4,BIG HiBM LoOP,ME2 BM4 OP2,ME2 BM4 OP3,BIG HiBM HiOP
YM,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1970-01-31,0.00525,-0.055267,-0.080378,-0.074667,-0.046597,-0.055237,-0.059101,-0.064194,-0.037632,-0.028414,...,-0.056665,-0.078469,-0.093642,-0.098962,-0.082321,-0.092406,-0.074885,-0.039723,-0.106174,0.0
1970-02-28,0.006626,0.026054,0.024249,0.03615,0.059013,0.044452,0.01941,0.026811,0.039799,0.050559,...,0.053307,0.071068,0.107644,0.051041,0.085532,0.024334,0.075317,0.087424,0.086895,-0.057692
1970-03-31,-0.078513,-0.048637,-0.046776,-0.052443,-0.032625,-0.043169,0.012225,-0.005835,-0.011449,0.000505,...,0.006217,0.024365,-0.006224,0.022494,0.015318,-0.013756,-0.001063,0.019179,0.041757,-0.122449
1970-04-30,-0.246584,-0.226923,-0.195043,-0.196252,-0.205095,-0.154062,-0.123937,-0.137835,-0.144881,-0.107708,...,-0.104506,-0.061944,-0.093943,-0.084881,-0.092346,-0.057525,-0.071338,-0.077387,-0.065935,-0.023256
1970-05-31,-0.085709,-0.13058,-0.127618,-0.111789,-0.146879,-0.117081,-0.093169,-0.085371,-0.112812,-0.090623,...,-0.063918,-0.042881,-0.017492,-0.013655,-0.044085,-0.007752,-0.038425,-0.041147,-0.054058,-0.190476


In [5]:
portsize = portdata.iloc[2115:2777]
portsize = portsize.apply(pd.to_numeric, errors='coerce')
portsize = portsize.loc[portsize['YM']>=first_ym]
portsize['YM'] =  pd.to_datetime(portsize['YM'], format='%Y%m')+MonthEnd(1)
portsize.set_index(keys=['YM'], inplace=True)
portsize = portsize / 100 
portsize.tail()

Unnamed: 0_level_0,SMALL LoBM LoOP,ME1 BM1 OP2,ME1 BM1 OP3,SMALL LoBM HiOP,ME1 BM2 OP1,ME1 BM2 OP2,ME1 BM2 OP3,ME1 BM2 OP4,ME1 BM3 OP1,ME1 BM3 OP2,...,ME2 BM2 OP3,ME2 BM2 OP4,ME2 BM3 OP1,ME2 BM3 OP2,ME2 BM3 OP3,ME2 BM3 OP4,BIG HiBM LoOP,ME2 BM4 OP2,ME2 BM4 OP3,BIG HiBM HiOP
YM,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2018-04-30,5.995,10.7808,13.3822,12.8487,4.5975,10.1506,10.9408,10.9514,4.4891,8.634,...,353.6253,155.7726,233.8114,170.0829,319.7713,144.4061,237.2593,269.009,187.5738,55.0411
2018-05-31,5.9957,10.8474,13.2617,12.8333,4.7342,10.2743,11.0032,11.1907,4.7213,8.7424,...,354.0368,155.5235,237.4304,170.3345,317.8839,142.2875,238.551,269.3983,189.2742,56.996
2018-06-30,6.7322,12.1288,14.0761,13.3089,5.2599,10.854,11.8106,11.7275,5.1071,9.3454,...,351.3442,155.1857,242.8102,173.3665,321.9208,139.3957,237.2285,269.2485,197.1856,57.5312
2018-07-31,6.6986,14.4102,16.7482,13.3051,4.063,11.3486,12.1582,10.6283,3.9503,9.7987,...,313.5799,302.4272,185.8579,192.7499,224.5428,171.7021,237.9312,308.7582,322.6795,191.1257
2018-08-31,6.6186,15.012,17.3295,13.653,4.1491,11.6882,12.4993,10.793,4.0063,10.0072,...,323.5484,316.5978,190.8331,201.1355,234.0549,179.6073,245.5646,327.4615,319.6688,188.5668


In [6]:
portbm = portdata.iloc[2785:3447]
portbm = portbm.apply(pd.to_numeric, errors='coerce')
portbm = portbm.loc[portbm['YM']>=first_ym]
portbm['YM'] =  pd.to_datetime(portbm['YM'], format='%Y%m')+MonthEnd(1)
portbm.set_index(keys=['YM'], inplace=True)
portbm = portbm / 100 
portbm.tail()

Unnamed: 0_level_0,SMALL LoBM LoOP,ME1 BM1 OP2,ME1 BM1 OP3,SMALL LoBM HiOP,ME1 BM2 OP1,ME1 BM2 OP2,ME1 BM2 OP3,ME1 BM2 OP4,ME1 BM3 OP1,ME1 BM3 OP2,...,ME2 BM2 OP3,ME2 BM2 OP4,ME2 BM3 OP1,ME2 BM3 OP2,ME2 BM3 OP3,ME2 BM3 OP4,BIG HiBM LoOP,ME2 BM4 OP2,ME2 BM4 OP3,BIG HiBM HiOP
YM,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2018-04-30,0.001655,0.002151,0.002287,0.001831,0.004023,0.004116,0.004062,0.003764,0.006261,0.006354,...,0.002674,0.00268,0.004589,0.004246,0.004261,0.004003,0.00853,0.006987,0.00754,0.008375
2018-05-31,0.001641,0.002152,0.00228,0.001834,0.004011,0.004105,0.004062,0.00376,0.00627,0.006357,...,0.002671,0.002681,0.004587,0.004249,0.004244,0.004013,0.008536,0.006982,0.00756,0.008635
2018-06-30,0.00164,0.002148,0.002245,0.001839,0.004003,0.004109,0.004057,0.00375,0.006279,0.006358,...,0.00266,0.002676,0.004586,0.004256,0.00424,0.004019,0.008511,0.006975,0.007636,0.008518
2018-07-31,0.001495,0.002208,0.002231,0.001784,0.004078,0.004211,0.004113,0.003866,0.006776,0.006598,...,0.002466,0.002431,0.004261,0.004194,0.004073,0.004009,0.007617,0.006669,0.006883,0.00719
2018-08-31,0.001498,0.002188,0.002233,0.00179,0.004076,0.004192,0.00411,0.00387,0.006784,0.006591,...,0.002466,0.002425,0.004256,0.004198,0.004057,0.004017,0.00763,0.006674,0.006862,0.007188


In [7]:
portop = portdata.iloc[4124:4786]
portop = portop.apply(pd.to_numeric, errors='coerce')
portop = portop.loc[portop['YM']>=first_ym]
portop['YM'] =  pd.to_datetime(portop['YM'], format='%Y%m')+MonthEnd(1)
portop.set_index(keys=['YM'], inplace=True)
portop = portop / 100 
portop.tail()

Unnamed: 0_level_0,SMALL LoBM LoOP,ME1 BM1 OP2,ME1 BM1 OP3,SMALL LoBM HiOP,ME1 BM2 OP1,ME1 BM2 OP2,ME1 BM2 OP3,ME1 BM2 OP4,ME1 BM3 OP1,ME1 BM3 OP2,...,ME2 BM2 OP3,ME2 BM2 OP4,ME2 BM3 OP1,ME2 BM3 OP2,ME2 BM3 OP3,ME2 BM3 OP4,BIG HiBM LoOP,ME2 BM4 OP2,ME2 BM4 OP3,BIG HiBM HiOP
YM,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2018-04-30,-0.011363,0.001352,0.002552,0.010128,-0.00246,0.001484,0.002436,0.004535,-0.001609,0.001494,...,0.003365,0.005003,0.001147,0.002287,0.003251,0.005126,0.00128,0.00233,0.003396,0.004375
2018-05-31,-0.011275,0.001351,0.002545,0.010282,-0.002495,0.001483,0.002436,0.004531,-0.001474,0.001498,...,0.003367,0.004992,0.001137,0.002287,0.003254,0.005137,0.001272,0.00233,0.003397,0.004367
2018-06-30,-0.011513,0.00134,0.002536,0.009665,-0.002571,0.001476,0.002437,0.004464,-0.001515,0.0015,...,0.003378,0.004988,0.001133,0.002286,0.003255,0.005165,0.001269,0.002331,0.003415,0.00437
2018-07-31,-0.013733,0.001568,0.002491,0.008133,-0.001494,0.00153,0.002373,0.004178,-0.000288,0.001553,...,0.003377,0.005603,0.001388,0.002253,0.003147,0.004938,0.001209,0.002257,0.00286,0.005251
2018-08-31,-0.013984,0.001555,0.00249,0.00806,-0.001465,0.001533,0.002366,0.004182,-0.000252,0.001551,...,0.003378,0.005596,0.001394,0.002251,0.003138,0.004956,0.001208,0.002257,0.002859,0.005239


In [8]:
portrets = portrets.stack()
portrets.index.names = ['YM','PORT']
portrets = portrets - ffdata['RF']
portrets.head()

YM          PORT           
1970-01-31  SMALL LoBM LoOP   -0.000750
            ME1 BM1 OP2       -0.061267
            ME1 BM1 OP3       -0.086378
            SMALL LoBM HiOP   -0.080667
            ME1 BM2 OP1       -0.052597
dtype: float64

In [9]:
portsize = portsize.stack()
portbm   = portbm.stack()
portop   = portop.stack()

portsize.index.names = ['YM','PORT']
portbm.index.names   = ['YM','PORT']
portop.index.names   = ['YM','PORT']

ports         = pd.DataFrame(index=portrets.index)
ports['R']    = portrets
ports['size'] = portsize
ports['BM']   = portbm
ports['OP']   = portop
ports.dropna(inplace=True)
ports.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,R,size,BM,OP
YM,PORT,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1970-01-31,SMALL LoBM LoOP,-0.00075,0.2748,0.00184,-0.000452
1970-01-31,ME1 BM1 OP2,-0.061267,0.5146,0.002098,0.00235
1970-01-31,ME1 BM1 OP3,-0.086378,0.5138,0.002228,0.003037
1970-01-31,SMALL LoBM HiOP,-0.080667,0.5343,0.001808,0.005664
1970-01-31,ME1 BM2 OP1,-0.052597,0.2587,0.004251,0.00114


**We did some feature engineering, but the ratios and polynomials turn out to be distractive in prediction accuracy/mean squared error, hence we decided to continue with only: past moving average  3 original variables and their logs.**

In [10]:
ports['R'].describe()

count    18624.000000
mean         0.007214
std          0.059463
min         -0.446160
25%         -0.024577
50%          0.009691
75%          0.041278
max          1.301202
Name: R, dtype: float64

# Feature Engineering 

In [11]:
# Calculate past moving average of returns
past_list = []

for num in [6,12,18,24,30,36,48]:
    ports[f'MA{num}'] = ports.groupby(level='PORT')['R'].apply(lambda x: x.rolling(window=num).mean()).shift()
    past_list.append(f'MA{num}')
    
# Calculate past moving std of returns
for num in [6,12,18,24,30,36,48]:
    ports[f'STD{num}'] = ports.groupby(level='PORT')['R'].apply(lambda x: x.rolling(window=num).std()).shift()
    past_list.append(f'STD{num}')
    
ports.isna().sum()

R           0
size        0
BM          0
OP          0
MA6       161
MA12      353
MA18      545
MA24      737
MA30      929
MA36     1121
MA48     1505
STD6      161
STD12     353
STD18     545
STD24     737
STD30     929
STD36    1121
STD48    1505
dtype: int64

In [12]:
ports.dropna(inplace = True)

In [13]:
cols = ports.drop(columns = ['R']+past_list).columns.tolist()
# basic_cols = cols.copy()

In [14]:
# # Add ratio variables
# non_zero_cols = ports.drop(columns = 'R').columns.tolist()
# for c in ports.drop(columns = 'R').columns:
#     if (ports[c].values == 0).any():
#         non_zero_cols.remove(c)

# for i in cols:
#     for j in cols:
#         if i < j:
#             tmp_name = f'{i}/{j}'
#             ports[tmp_name] = ports[i]/ports[j]
# ports.head(2)

In [15]:
# # Add polynomial features
# from sklearn import preprocessing
# def polynomial_features(dataframe, num_cols):
#     df = dataframe.loc[:, num_cols].copy()
#     normal_num = df.shape[1]
#     pf = preprocessing.PolynomialFeatures(
#         degree=2,
#         interaction_only=False,
#         include_bias=False
#     )
#     pf.fit(df)
#     poly_feats = pf.transform(df)
#     poly_feats = poly_feats[:, normal_num:]
#     poly_columns = pf.get_feature_names_out(df.columns)[normal_num:]
#     poly_columns = [col.replace(' ', '-*-') for col in poly_columns]
#     df_poly = pd.DataFrame(poly_feats, columns=poly_columns)
#     return df_poly

# poly_df = polynomial_features(ports, cols)
# for col in poly_df.columns:
#     ports[col] = poly_df[col].tolist()

# ports.head(2)

In [16]:
# Add log

for col in cols:
    ports.loc[:, f'{col}_log'] = ports[col].apply(lambda x: -1000 if pd.isna(np.log(1+x)) else np.log(1+x))

In [17]:
# Check for null values

for c in ports.columns:
    if (ports[c].values == float('inf')).any() or (ports[c].values == float('-inf')).any() or ports[c].isna().sum() >0:
        print(c)

In [18]:
# Define Cutoffs as 0, 0.005, 0.01 , 0.015, 0.02 
cutoff_list = []
for cut in np.linspace(0, 0.02, 5):
    ports[f'>={cut}?'] = ports['R'].apply(lambda x: 1 if x >= cut else 0)
    cutoff_list.append(f'>={cut}?')


In [20]:
ports.to_csv('ports.csv')

# Modeling

In [21]:
ports = pd.read_csv('ports.csv',parse_dates=['YM']).set_index(['YM','PORT'])
buy_all = ports.groupby('YM')['R'].mean()
ba_stat = buy_all.describe()
buy_all_sharpe = ba_stat.loc['mean']/ba_stat.loc['std']*np.sqrt(12)
buy_all_sharpe

0.5837475152173438

In [22]:
ports2 = ports.reset_index().drop(columns = ['PORT']).set_index('YM')
cutoff_list = []
for cut in np.linspace(0, 0.02, 5):
    cutoff_list.append(f'>={cut}?')
all_Xs = ports2.drop(columns = ['R']+cutoff_list).columns
ports2.head()


Unnamed: 0_level_0,R,size,BM,OP,MA6,MA12,MA18,MA24,MA30,MA36,...,STD36,STD48,size_log,BM_log,OP_log,>=0.0?,>=0.005?,>=0.01?,>=0.015?,>=0.02?
YM,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1973-12-31,-0.047954,0.2851,0.003868,0.001996,0.001452,-0.039267,-0.037339,-0.023937,-0.022972,-0.013901,...,0.077966,0.086654,0.250837,0.003861,0.001994,0,0,0,0,0
1973-12-31,-0.045188,0.3192,0.00389,0.002689,-0.011928,-0.038369,-0.0308,-0.019141,-0.016618,-0.007611,...,0.076138,0.086254,0.277025,0.003882,0.002685,0,0,0,0,0
1973-12-31,-0.026538,0.2427,0.002917,0.005025,-0.011243,-0.045983,-0.037123,-0.024922,-0.018502,-0.010345,...,0.073744,0.080394,0.217286,0.002913,0.005012,0,0,0,0,0
1973-12-31,-0.000509,0.2399,0.007598,0.000308,-0.013201,-0.048679,-0.03748,-0.024444,-0.01879,-0.009286,...,0.078114,0.083826,0.215031,0.007569,0.000308,0,0,0,0,0
1973-12-31,-0.047933,0.2182,0.007559,0.001997,0.01279,-0.028757,-0.021017,-0.012948,-0.0134,-0.005025,...,0.08439,0.090879,0.197374,0.007531,0.001995,0,0,0,0,0


In [23]:
def ports_predict(time, dataset, back_period, params, model, cutoff):
    # Limit the time window to {back_period} months before the month being predicted
    train_period = dataset[dataset.index < time].index.unique()
    train_data = dataset[(dataset.index <= train_period[-1])&(dataset.index >= train_period[-1*back_period])]
    
    X_train = train_data[all_Xs].values
    y_train = train_data[cutoff].values
    X_test = dataset.loc[time][all_Xs].values
    y_test = dataset.loc[time][cutoff].values
    
    model.set_params(**params)
    mod = model.fit(X_train, y_train)
    pred_result = mod.predict(X_test)
    pred_proba = mod.predict_proba(X_test)
    
    return pred_result, pred_proba[:,1]


In [24]:
def tuning_params(all_params):
    # params should be in the format of dictionary
    keys, values = zip(*all_params.items())
    permutations_dicts = [dict(zip(keys, v)) for v in itertools.product(*values)]
    
    return permutations_dicts

### Details of Model Calculations

In [25]:
# Example of decision tree hyperparameters
dt_params = {'criterion': ['gini'],
        'min_samples_split': [2,3],
        'max_depth':[None],
        'min_samples_leaf':[3,4],
        'max_features': ['auto']}
tuning_params(dt_params)

[{'criterion': 'gini',
  'min_samples_split': 2,
  'max_depth': None,
  'min_samples_leaf': 3,
  'max_features': 'auto'},
 {'criterion': 'gini',
  'min_samples_split': 2,
  'max_depth': None,
  'min_samples_leaf': 4,
  'max_features': 'auto'},
 {'criterion': 'gini',
  'min_samples_split': 3,
  'max_depth': None,
  'min_samples_leaf': 3,
  'max_features': 'auto'},
 {'criterion': 'gini',
  'min_samples_split': 3,
  'max_depth': None,
  'min_samples_leaf': 4,
  'max_features': 'auto'}]

In [26]:
%%time
# Illustration of how the model works
# Using the example training period = 60 months and cutoff = 0
trial1 = pd.DataFrame()

# Starting point of prediction will be the first month after the pre-determined number of months used to train model (backtest period)
trial1['R'] = ports2[ports2.index >= ports2.index[0]+dr.relativedelta(months = 60)]['R'] 
trial1['y'] = ports2[ports2.index >= ports2.index[0]+dr.relativedelta(months = 60)]['>=0.0?']
trial1['pred'] = np.nan
trial1['prob'] = np.nan
for time in trial1.index.unique():
    a,b = ports_predict(time, ports2, 
                              back_period = 60, 
                            params = tuning_params(dt_params)[0],
                              model = DecisionTreeClassifier(),
                              cutoff = '>=0.0?')
    trial1.loc[time,'pred'] = a
    trial1.loc[time,'prob'] = b
trial1


CPU times: user 3.75 s, sys: 17.5 ms, total: 3.77 s
Wall time: 3.77 s


Unnamed: 0_level_0,R,y,pred,prob
YM,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1978-12-31,0.005475,1,1.0,1.000000
1978-12-31,0.012230,1,1.0,1.000000
1978-12-31,0.040365,1,1.0,1.000000
1978-12-31,0.020241,1,0.0,0.000000
1978-12-31,0.010291,1,1.0,0.666667
...,...,...,...,...
2018-06-30,0.060138,1,0.0,0.000000
2018-06-30,-0.009813,0,1.0,0.600000
2018-06-30,-0.010214,0,0.0,0.000000
2018-06-30,-0.026346,0,1.0,1.000000


In [27]:
# Model accuracy
(trial1['y'] == trial1['pred']).sum()/len(trial1)

0.5511842105263158

In [28]:
# Annualized Sharpe
trial1['buy'] = trial1['R']*trial1['pred']
stats1 = trial1['buy'].groupby('YM').mean().describe()
stats1.loc['mean']/stats1.loc['std']*np.sqrt(12)

0.9635065625005116

In [29]:
buy_all_sharpe

0.5837475152173438

**For the purpose of presenting results more neatly, we will hide number details in the following and only show you the accuracy of models their sharpe ratios.**

Things to control over: 
- length of period used to train data, 
- cutoff points，
- models
- model hyperparameters

In [30]:
def All_summary(model, cutoffs, back_period_list, all_params):
    new_df = pd.DataFrame(columns = ['Model','Cutoff','Backtest period','Accuracy','Annualized Sharpe','Parameters'])
    
    m = f'{model}'
    c_list = []
    p_list = []
    pa_list = []
    accuracy_list = []
    sharpe_list = []
    count = 1
    
    # Codes below will run through all combinations of cutoffs, prediction periods, and hyperparameters
    # Every other part is the same as the detailed illustration above except that only summary statistics (accuracy & sharpe) are shown this time
    for c in cutoffs:
        for p in back_period_list:
            for pa in all_params:
                temp_df = pd.DataFrame()
                target_period = ports2[ports2.index >= ports2.index[0]+dr.relativedelta(months = p)]
                temp_df['R'] = target_period['R']
                temp_df['y'] = target_period[c]
                temp_df['pred'] = np.nan
                temp_df['prob'] = np.nan
                for time in temp_df.index.unique():
                    a,b = ports_predict(time, ports2, 
                                              back_period = p, 
                                            params = pa,
                                              model = model,
                                              cutoff = c)
                    temp_df.loc[time,'pred'] = a
                    # temp_df.loc[time,'prob'] = b
                accuracy = (temp_df['y'] == temp_df['pred']).sum()/len(temp_df)
                stats = (temp_df['R']*temp_df['pred']).groupby('YM').mean().describe()
                sharpe = stats.loc['mean']/stats.loc['std']*np.sqrt(12)
                
                c_list.append(c);p_list.append(p);pa_list.append(pa)
                accuracy_list.append(accuracy);sharpe_list.append(sharpe)
                
                # Keep track of computations
                print(f'Model No.{count} is done.')
                count += 1 
                
    new_df['Cutoff'] = c_list
    new_df['Backtest period'] = p_list
    new_df['Accuracy'] = accuracy_list
    new_df['Annualized Sharpe'] = sharpe_list
    new_df['Parameters'] = pa_list
    new_df['Model'] = m
    
    # Insert the baseline performance into the summary table
    new_df.loc[len(new_df.index)] = ['Buy all', 'N/A', 'N/A', 'N/A', buy_all_sharpe, 'N/A']
    
    return new_df
    

### Example of All_summary results with Decision Tree Classifier

In [31]:
# Example of decision tree hyperparameters
dt_params = {'criterion': ['gini'],
             'max_depth':[None],
        'min_samples_split': [3],
        'min_samples_leaf':range(1,2),
        'max_features': ['auto']}

dt_df = All_summary(model = DecisionTreeClassifier(), 
                    cutoffs = cutoff_list[:1], 
                    back_period_list = [60], 
                    all_params = tuning_params(dt_params))


Model No.1 is done.


In [44]:
dt_df

Unnamed: 0,Model,Cutoff,Backtest period,Accuracy,Annualized Sharpe,Parameters
0,DecisionTreeClassifier(),>=0.0?,60.0,0.536908,0.901145,"{'criterion': 'gini', 'max_depth': None, 'min_..."
1,Buy all,,,,0.583748,
