In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from pandas.tseries.offsets import MonthEnd
import dateutil.relativedelta as dr
import itertools
from sklearn.tree import DecisionTreeClassifier

In [2]:
# Load models

In [3]:
ports = pd.read_csv('ports.csv',parse_dates=['YM']).set_index(['YM','PORT'])
buy_all = ports.groupby('YM')['R'].mean()
ba_stat = buy_all.describe()
buy_all_sharpe = ba_stat.loc['mean']/ba_stat.loc['std']*np.sqrt(12)

ports2 = ports.reset_index().drop(columns = ['PORT']).set_index('YM')
cutoff_list = []
for cut in np.linspace(0, 0.02, 5):
    cutoff_list.append(f'>={cut}?')
all_Xs = ports2.drop(columns = ['R']+cutoff_list).columns
ports2.head()


Unnamed: 0_level_0,R,size,BM,OP,MA6,MA12,MA18,MA24,MA30,MA36,...,STD36,STD48,size_log,BM_log,OP_log,>=0.0?,>=0.005?,>=0.01?,>=0.015?,>=0.02?
YM,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1973-12-31,-0.047954,0.2851,0.003868,0.001996,0.001452,-0.039267,-0.037339,-0.023937,-0.022972,-0.013901,...,0.077966,0.086654,0.250837,0.003861,0.001994,0,0,0,0,0
1973-12-31,-0.045188,0.3192,0.00389,0.002689,-0.011928,-0.038369,-0.0308,-0.019141,-0.016618,-0.007611,...,0.076138,0.086254,0.277025,0.003882,0.002685,0,0,0,0,0
1973-12-31,-0.026538,0.2427,0.002917,0.005025,-0.011243,-0.045983,-0.037123,-0.024922,-0.018502,-0.010345,...,0.073744,0.080394,0.217286,0.002913,0.005012,0,0,0,0,0
1973-12-31,-0.000509,0.2399,0.007598,0.000308,-0.013201,-0.048679,-0.03748,-0.024444,-0.01879,-0.009286,...,0.078114,0.083826,0.215031,0.007569,0.000308,0,0,0,0,0
1973-12-31,-0.047933,0.2182,0.007559,0.001997,0.01279,-0.028757,-0.021017,-0.012948,-0.0134,-0.005025,...,0.08439,0.090879,0.197374,0.007531,0.001995,0,0,0,0,0


In [4]:
def ports_predict(time, dataset, back_period, params, model, cutoff):
    # Limit the time window to {back_period} months before the month being predicted
    train_period = dataset[dataset.index < time].index.unique()
    train_data = dataset[(dataset.index <= train_period[-1])&(dataset.index >= train_period[-1*back_period])]
    
    X_train = train_data[all_Xs].values
    y_train = train_data[cutoff].values
    X_test = dataset.loc[time][all_Xs].values
    y_test = dataset.loc[time][cutoff].values
    
    model.set_params(**params)
    mod = model.fit(X_train, y_train)
    pred_result = mod.predict(X_test)
    pred_proba = mod.predict_proba(X_test)
    
    return pred_result, pred_proba[:,1]


In [5]:
def tuning_params(all_params):
    # params should be in the format of dictionary
    keys, values = zip(*all_params.items())
    permutations_dicts = [dict(zip(keys, v)) for v in itertools.product(*values)]
    
    return permutations_dicts

In [6]:
def All_summary(model, cutoffs, back_period_list, all_params):
    new_df = pd.DataFrame(columns = ['Model','Cutoff','Backtest period','Accuracy','Annualized Sharpe','Parameters'])
    
    m = f'{model}'
    c_list = []
    p_list = []
    pa_list = []
    accuracy_list = []
    sharpe_list = []
    count = 1
    
    # Codes below will run through all combinations of cutoffs, prediction periods, and hyperparameters
    # Every other part is the same as the detailed illustration above except that only summary statistics (accuracy & sharpe) are shown this time
    for c in cutoffs:
        for p in back_period_list:
            for pa in all_params:
                temp_df = pd.DataFrame()
                target_period = ports2[ports2.index >= ports2.index[0]+dr.relativedelta(months = p)]
                temp_df['R'] = target_period['R']
                temp_df['y'] = target_period[c]
                temp_df['pred'] = np.nan
                temp_df['prob'] = np.nan
                for time in temp_df.index.unique():
                    a,b = ports_predict(time, ports2, 
                                              back_period = p, 
                                            params = pa,
                                              model = model,
                                              cutoff = c)
                    temp_df.loc[time,'pred'] = a
                    # temp_df.loc[time,'prob'] = b
                accuracy = (temp_df['y'] == temp_df['pred']).sum()/len(temp_df)
                stats = (temp_df['R']*temp_df['pred']).groupby('YM').mean().describe()
                sharpe = stats.loc['mean']/stats.loc['std']*np.sqrt(12)
                
                c_list.append(c);p_list.append(p);pa_list.append(pa)
                accuracy_list.append(accuracy);sharpe_list.append(sharpe)
                
                # Keep track of computations
                print(f'Model No.{count} is done.')
                count += 1 
                
    new_df['Cutoff'] = c_list
    new_df['Backtest period'] = p_list
    new_df['Accuracy'] = accuracy_list
    new_df['Annualized Sharpe'] = sharpe_list
    new_df['Parameters'] = pa_list
    new_df['Model'] = m
    
    # Insert the baseline performance into the summary table
    new_df.loc[len(new_df.index)] = ['Buy all', 'N/A', 'N/A', 'N/A', buy_all_sharpe, 'N/A']
    
    return new_df
    
                
    

In [7]:
# Example
# params format: dictionary, all values in one key are in a list
dt_params = {'criterion': ['gini'],
        'min_samples_split': [3,4],
        'max_depth':[None],
        'min_samples_leaf':range(1,3),
        'max_features': ['auto']}

dt_df = All_summary(model = DecisionTreeClassifier(), 
                    cutoffs = cutoff_list, 
                    back_period_list = [60], 
                    all_params = tuning_params(dt_params))


Model No.1 is done.
Model No.2 is done.
Model No.3 is done.
Model No.4 is done.
Model No.5 is done.
Model No.6 is done.
Model No.7 is done.
Model No.8 is done.
Model No.9 is done.
Model No.10 is done.
Model No.11 is done.
Model No.12 is done.
Model No.13 is done.
Model No.14 is done.
Model No.15 is done.
Model No.16 is done.
Model No.17 is done.
Model No.18 is done.
Model No.19 is done.
Model No.20 is done.


In [8]:
dt_df

Unnamed: 0,Model,Cutoff,Backtest period,Accuracy,Annualized Sharpe,Parameters
0,DecisionTreeClassifier(),>=0.0?,60.0,0.547368,0.978332,"{'criterion': 'gini', 'min_samples_split': 3, ..."
1,DecisionTreeClassifier(),>=0.0?,60.0,0.547368,1.051682,"{'criterion': 'gini', 'min_samples_split': 3, ..."
2,DecisionTreeClassifier(),>=0.0?,60.0,0.543092,0.919607,"{'criterion': 'gini', 'min_samples_split': 4, ..."
3,DecisionTreeClassifier(),>=0.0?,60.0,0.541776,0.996384,"{'criterion': 'gini', 'min_samples_split': 4, ..."
4,DecisionTreeClassifier(),>=0.005?,60.0,0.530132,0.890141,"{'criterion': 'gini', 'min_samples_split': 3, ..."
5,DecisionTreeClassifier(),>=0.005?,60.0,0.531711,0.886652,"{'criterion': 'gini', 'min_samples_split': 3, ..."
6,DecisionTreeClassifier(),>=0.005?,60.0,0.528158,0.868346,"{'criterion': 'gini', 'min_samples_split': 4, ..."
7,DecisionTreeClassifier(),>=0.005?,60.0,0.533882,1.006773,"{'criterion': 'gini', 'min_samples_split': 4, ..."
8,DecisionTreeClassifier(),>=0.01?,60.0,0.532895,0.898464,"{'criterion': 'gini', 'min_samples_split': 3, ..."
9,DecisionTreeClassifier(),>=0.01?,60.0,0.528816,0.872827,"{'criterion': 'gini', 'min_samples_split': 3, ..."


In [9]:
# Example
# params format: dictionary, all values in one key are in a list
dt_params = {'criterion': ['gini'],
        'min_samples_split': [3,4],
        'max_depth':[None],
        'min_samples_leaf':range(1,3),
        'max_features': ['auto']}

dt_df = All_summary(model = DecisionTreeClassifier(), 
                    cutoffs = cutoff_list, 
                    back_period_list = [48,72,84,96], 
                    all_params = tuning_params(dt_params))


Model No.1 is done.
Model No.2 is done.
Model No.3 is done.
Model No.4 is done.
Model No.5 is done.
Model No.6 is done.
Model No.7 is done.
Model No.8 is done.
Model No.9 is done.
Model No.10 is done.
Model No.11 is done.
Model No.12 is done.
Model No.13 is done.
Model No.14 is done.
Model No.15 is done.
Model No.16 is done.
Model No.17 is done.
Model No.18 is done.
Model No.19 is done.
Model No.20 is done.
Model No.21 is done.
Model No.22 is done.
Model No.23 is done.
Model No.24 is done.
Model No.25 is done.
Model No.26 is done.
Model No.27 is done.
Model No.28 is done.
Model No.29 is done.
Model No.30 is done.
Model No.31 is done.
Model No.32 is done.
Model No.33 is done.
Model No.34 is done.
Model No.35 is done.
Model No.36 is done.
Model No.37 is done.
Model No.38 is done.
Model No.39 is done.
Model No.40 is done.
Model No.41 is done.
Model No.42 is done.
Model No.43 is done.
Model No.44 is done.
Model No.45 is done.
Model No.46 is done.
Model No.47 is done.
Model No.48 is done.
M

In [11]:
dt_df.to_clipboard()