In [67]:
import optuna
from tqdm.notebook import tqdm
import multiprocessing
from joblib import Parallel, delayed
from production.ARGEN import *
from production.utility_function import *
import numpy as np

In [68]:
# configuration

target_component_number=50
lower_bound = 0.0041
upper_bound = 0.8
training_month=12
optuna_trial_number = 500
optuna_study_name = 'buy-and-hold-study-arls-vs-argen-up-0.6-n-50-s-500'




In [69]:

data = get_data()

X = data['X']
y = data['y']
dates = data['dates']

time_vec = np.array(list(range(1, X.shape[0] + 1))) - 1

_, p = X.shape

update_frequency = 'buy-and-hold'


training_val_index, testing_index = get_data_index(time_vec, update_frequency, training_month)

training_val_ind = training_val_index[0]
training_ind, val_ind = training_val_ind[0:-int(len(training_val_ind)*0.2)], training_val_ind[-int(len(training_val_ind)*0.2)::]
testing_ind = testing_index[0]

X_train = X[training_ind, :]
X_val = X[val_ind, :]
X_test = X[testing_ind, :]
y_train = y[training_ind]
y_val = y[val_ind]
y_test = y[testing_ind]

fs_clf = FeatureSelectionRegressor(p, target_component_number)

fs_clf.fit(X_train, y_train)
fs_clf.score(X_test, y_test)

selected_feature_ind = np.where(fs_clf.coef_ != 0.0)[0]

X_train_ = X_train[:, selected_feature_ind]
X_val_ = X_val[:, selected_feature_ind]
X_test_ = X_test[:, selected_feature_ind]

_, p_ = X_train_.shape

lowbo = np.ones(p_) * lower_bound
upbo = np.ones(p_) * upper_bound

# lowbo = np.ones(p_) * 0.0041
# upbo = np.ones(p_) * 0.8


#%%
arls_clf = ARGEN(p_, 0, 0, lowbo, upbo, 0, 0)

arls_clf.fit(X_train_, y_train)

coef_ = arls_clf.coef_

arls_coef = coef_/np.sum(coef_)

arls_val_score = arls_clf.score(X_val_, y_val)

arls_test_score = arls_clf.score(X_test_, y_test)

print('arls_val_score', arls_val_score, 'arls_test_score', arls_test_score)

arls_portfolio_return_test = np.matmul(X_test_, arls_coef.reshape((-1, 1)))


  def mydateparser(x): return pd.datetime.strptime(x, "%Y-%m-%d")


arls_val_score 4.6714385173885905e-06 arls_test_score 2.265148216421383e-05


In [70]:
print('MSE', arls_clf.score(X_test_, y_test))
print('AV', calculated_annual_volatility(arls_portfolio_return_test))
print('TE', calculate_daily_tracking_error(arls_portfolio_return_test, y_test))
print('CR', calculate_cumulative_return(arls_portfolio_return_test))


MSE 2.265148216421383e-05
AV 0.24295350743632715
TE 0.020075083485919975
CR 1.1285534755839084


In [71]:
# retrive validation hyperparameter tunning results

study_name = optuna_study_name  # Unique identifier of the study.

study = optuna.create_study(study_name=study_name,
                            storage='postgresql://argen:argen@db:5432/argen',
                            load_if_exists=True)

df = study.trials_dataframe()

valid_coef_df = df[df.value<arls_val_score]

def get_test_score(coef):
    arls_clf = ARGEN(p_, lam_1=coef['params_lam_1'], lam_2=coef['params_lam_2'], lowbo=lowbo, upbo=upbo, 
                     wvec_random_state=coef['params_wvec_random_state'], 
                     sigma_random_state=coef['params_sigma_random_state'])
    arls_clf.fit(X_train_, y_train)
    return arls_clf.score(X_test_, y_test)


param_list = [coef for ind, coef in valid_coef_df.iterrows()]

num_cores = multiprocessing.cpu_count()

inputs = tqdm(param_list)

valid_coef_df['test_score'] = Parallel(n_jobs=num_cores)(delayed(get_test_score)(coef) for coef in inputs)

print('min, mean, and max test scores')
print(valid_coef_df['test_score'].min(), valid_coef_df['test_score'].mean(),  valid_coef_df['test_score'].max())


[32m[I 2021-03-21 19:29:50,889][0m Using an existing study with name 'buy-and-hold-study-arls-vs-argen-up-0.6-n-50-s-500' instead of creating a new one.[0m


  0%|          | 0/118 [00:00<?, ?it/s]

min, mean, and max test scores
2.319831961749319e-05 3.0425751995199842e-05 4.899643551391322e-05


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  valid_coef_df['test_score'] = Parallel(n_jobs=num_cores)(delayed(get_test_score)(coef) for coef in inputs)


In [72]:

# best parameter from the val scores
min_val_score_coef = df[df.value == df.value.min()]

argen_clf = ARGEN(p_, lam_1=min_val_score_coef['params_lam_1'].values[0],
                 lam_2=min_val_score_coef['params_lam_2'].values[0],
                 lowbo=lowbo, upbo=upbo, 
                 wvec_random_state=min_val_score_coef['params_wvec_random_state'].values[0], 
                 sigma_random_state=min_val_score_coef['params_sigma_random_state'].values[0])

argen_clf.fit(X_train_, y_train)

argen_coef = argen_clf.coef_/sum(coef_)

argen_portfolio_return_test = np.matmul(X_test_, argen_coef.reshape((-1, 1)))



In [73]:
print('MSE', argen_clf.score(X_test_, y_test), arls_clf.score(X_test_, y_test))
print('AV', calculated_annual_volatility(argen_portfolio_return_test), calculated_annual_volatility(arls_portfolio_return_test), calculated_annual_volatility(y_test))
print('TE', calculate_daily_tracking_error(argen_portfolio_return_test, y_test), calculate_daily_tracking_error(arls_portfolio_return_test, y_test))
print('CR', calculate_cumulative_return(argen_portfolio_return_test), calculate_cumulative_return(arls_portfolio_return_test), calculate_cumulative_return(y_test))


MSE 2.7168860417203135e-05 2.265148216421383e-05
AV 0.22917847381703033 0.24295350743632715 0.206232527550782
TE 0.01942166073942521 0.020075083485919975
CR 1.0029043519800602 1.1285534755839084 0.6646974259514793
