In [1]:
## Inflation Forecast
## Same as version 3, but do partially linear model
import os
import time
import pickle
import sqlite3
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import Lasso
from sklearn.linear_model import Ridge
from sklearn.linear_model import LinearRegression
import sqlite3
import random
import re
import xgboost as xgb
import tensorflow as tf
# from functions import *

# Seed number used
seed = 42

# Transform:
price_var = ['WPSFD49207', 'WPSFD49502', 'WPSID61', 'WPSID62', 'OILPRICEx', 'PPICMM', 'CPIAUCSL', 
             'CPIAPPSL', 'CPITRNSL', 'CPIMEDSL', 'CUSR0000SAC', 'CUSR0000SAD', 'CUSR0000SAS', 
             'CPIULFSL', 'CUSR0000SA0L2', 'CUSR0000SA0L5', 'PCEPI', 'DDURRG3M086SBEA', 'DNDGRG3M086SBEA', 'DSERRG3M086SBEA']

# Transform:
# Transformation = 'No Transform'
Transformation = 'Transform' 
# :Transforms according to the recommendations given by McCracken and Ng (2015) for all but Group 7 (Prices),
#  which are transformed as year over year growth



# Make Database
con = sqlite3.connect(os.path.join('Data', 'database_predict_inflation.db'))
cur = con.cursor()

res = cur.execute("""SELECT name FROM sqlite_master WHERE type='table'""")
table_names = res.fetchall()
if ~np.isin('Results', table_names):
    print("CREATE NEW DATABASE")
    cur.execute("""CREATE TABLE IF NOT EXISTS Results(
                Date TEXT NOT NULL,
                Target TEXT NOT NULL,
                Value REAL NOT NULL,
                Prediction REAL NOT NULL,
                Model TEXT NOT NULL,
                Seed INTEGER NOT NULL,
                Parameter TEXT,
                Window_size INTEGER NOT NULL,
                Validation_size INTEGER NOT NULL,
                Transformation TEXT NOT NULL,
                PRIMARY KEY (Date, Target, Model, Seed, Window_size, Validation_size, Transformation))""")
    con.commit()
else:
    print("DATABASE ALREADY EXISTS")
    con.commit()

Data = pd.read_csv(os.path.join('Data', '2024-06.csv'))
p = Data.shape[1]

temp_list = []
for i in range(1,p):
    tcode = Data.iloc[0,i].copy()
    data = Data.iloc[1:,i].copy()
    if data.name == 'CPIAUCSL':
        data_transform = np.log(data).diff(periods=12)           # Year over year growth
    elif np.isin(data.name, list(set(price_var)-set(['CPIAUCSL']))):
        if Transformation == 'Transform':
            data_transform = np.log(data).diff(periods=12)       # Year over year growth
        else:
            data_transform = data
    else:
        if np.isin(Transformation, ['Transform']):
            if tcode == 1:
                data_transform = data
            elif tcode == 2:  # First difference
                data_transform = data.diff()
            elif tcode == 3: # Second difference
                data_transform = data.diff().diff()
            elif tcode == 4: # Log
                data_transform = np.log(data)
            elif tcode == 5: #First difference of log
                data_transform = np.log(data).diff()
            elif tcode == 6: #Second difference of natural log
                data_transform = np.log(data).diff().diff()
            elif tcode == 7: # First difference of percent change
                data_transform = data.pct_change().diff()
        elif Transformation == 'No Transform':
            data_transform = data
    temp_list.append(data_transform.copy())
Data_transform = pd.DataFrame(temp_list).T

Date = Data.iloc[1:,0]

Y = Data_transform['CPIAUCSL'] # Inflation

num_lags = 2
X = Data_transform
for p in range(1, num_lags):
    X['CPIAUCSL_lag%i' % p] = Y.shift(p)

h = 1 # One step ahead forecast
X = X.shift(h)

X_used = X.iloc[12+num_lags:,:].reset_index(drop=True)
V_label = ['CPIAUCSL']
for p in range(1,num_lags):
    V_label.append('CPIAUCSL_lag%i' % p)
V_used = X_used[V_label]

W_used = X_used.drop(V_label,axis=1)
# np.sum(np.sum(np.isnan(X_used))==0)
Y_used = Y.iloc[12+num_lags:].reset_index(drop=True)

Date_used = Date.iloc[12+num_lags:].reset_index(drop=True)
Date_used = pd.to_datetime(Date_used)

# plt.plot(np.sum(~np.isnan(X_used),axis=1))
# plt.show()
n = X_used.shape[0]

forecast_period = pd.to_datetime('2015-01-01')<=Date_used
forecast_idx = np.where(forecast_period)[0]
n_test = np.sum(forecast_period)

validation_period = (pd.to_datetime('2005-08-01')<=Date_used) & (pd.to_datetime('2015-01-01') > Date_used)
validation_idx = np.where(validation_period)[0]
n_val = np.sum(validation_period)

training_period = pd.to_datetime('2005-08-01')>Date_used
training_idx = np.where(training_period)[0]
n_train = np.sum(training_period)


V_train = V_used.loc[training_period]
W_train = W_used.loc[training_period,:]
Y_train = Y_used.loc[training_period]

V_val = V_used.loc[validation_period]
W_val = W_used.loc[validation_period,:]
Y_val = Y_used.loc[validation_period]

V_test = V_used.loc[forecast_period]
W_test = W_used.loc[forecast_period,:]
Y_test = Y_used.loc[forecast_period]

nnan_idx = np.sum(np.isnan(W_used),axis=0)==0
W_used_nnan = W_used.loc[:,nnan_idx]

W_train_nnan = W_used_nnan.loc[training_period,:]
W_val_nnan = W_used_nnan.loc[validation_period,:]
W_test_nnan = W_used_nnan.loc[forecast_period,:]

Validation_Err = {}
beta_tot = {}

DATABASE ALREADY EXISTS


In [2]:
#######################################################################################
#######################   Partially Linear Random Forest  #############################
#######################################################################################
max_depth_list = np.append(np.arange(1,50,3),None)
val_err = np.zeros((n_val, len(max_depth_list)))
RFmodel_dict = {}
beta_dict_RF = {}
t = time.time()
for cv_i, max_depth in enumerate(max_depth_list):
    model1 = RandomForestRegressor(n_estimators=100, criterion='squared_error',
                                    max_depth=max_depth,random_state=seed)
    model2 = RandomForestRegressor(n_estimators=100, criterion='squared_error',
                                    max_depth=max_depth,random_state=seed)
    RFmodel_dict[cv_i] = RandomForestRegressor(n_estimators=100, criterion='squared_error',
                                    max_depth=max_depth,random_state=seed)    
    model1.fit(W_train_nnan, V_train)
    
    resi_v = V_train.values - model1.predict(W_train_nnan)
    model2.fit(W_train_nnan, Y_train)
    resi_y = Y_train.values - model2.predict(W_train_nnan)
    beta = np.linalg.inv(resi_v.T @ resi_v) @ resi_v.T @ resi_y
    
    resi_train = Y_train.values - V_train.values @ beta
    
    RFmodel_dict[cv_i].fit(W_train_nnan, resi_train)
    resi_hat = RFmodel_dict[cv_i].predict(W_val_nnan)
    
    Y_hat = V_val.values @ beta + resi_hat
    beta_dict_RF[cv_i] = beta
    val_err[:, cv_i] = Y_val.values-Y_hat
elapsed = time.time() - t
print('elapsed time = %0.2f sec; %s'%(elapsed, datetime.datetime.now()))

min_idx = np.argmin(np.mean(np.array(val_err)**2, axis=0))
val_err_RF = np.mean(np.array(val_err)**2, axis=0)

Validation_Err['RF'] = pd.DataFrame()
Validation_Err['RF']['max_depth_list'] = max_depth_list
Validation_Err['RF']['val_err'] = val_err_RF
beta_tot['RF'] = beta_dict_RF[min_idx]

temp_grid = ['None' if x==None else x for x in max_depth_list]
plt.plot(temp_grid, val_err_RF)
plt.xlabel('max_depth')
plt.title('Validation Error, Random Forest, Minimum=%s'%str(temp_grid[min_idx]))
# plt.savefig("Figures/RF_validation_seed%i.png"%seed)
# plt.close()
plt.show()


Y_hat = V_test.values @ beta_dict_RF[min_idx] + RFmodel_dict[min_idx].predict(W_test_nnan)
test_err_RF = Y_test.values - Y_hat
RMSE_RF = np.sqrt(np.sum(test_err_RF**2)/len(test_err_RF))
print('The RMSE of PL-RF model is %f'%RMSE_RF)

RF_out = {'Date': Date_used[forecast_idx].dt.strftime("%m/%d/%Y").values,
        'Target': 'Inflation',
        'Value': Y_test.values,
        'Prediction': Y_hat,
        'Model': 'Random Forest-PL-%ilags' %num_lags,
        'Seed': seed,
        'Parameter': str(max_depth_list[min_idx]),
        'Window_size': n_train,
        'Validation_size': n_val,
        'Transformation': Transformation
        }
RF_out = pd.DataFrame.from_dict(RF_out)

NameError: name 'datetime' is not defined

In [None]:
#######################################################################################
###################################   XGBoost  ########################################
#######################################################################################
n_estimators_list = np.arange(1,50,3)
val_err = np.zeros((n_val,len(n_estimators_list)))
XGBmodel_dict = {}
beta_dict_XGB = {}
t = time.time()
for cv_i, n_estimators in enumerate(n_estimators_list):

    model1 = xgb.XGBRegressor(n_jobs=1, tree_method="exact", n_estimators=n_estimators, random_state=seed)
    model2 = xgb.XGBRegressor(n_jobs=1, tree_method="exact", n_estimators=n_estimators, random_state=seed)
    XGBmodel_dict[cv_i] = xgb.XGBRegressor(n_jobs=1, tree_method="exact", n_estimators=n_estimators, random_state=seed)

    model1.fit(W_train_nnan, V_train)
    resi_v = V_train.values - model1.predict(W_train_nnan)
    model2.fit(W_train_nnan, Y_train)
    resi_y = Y_train.values - model2.predict(W_train_nnan)
    
    beta = np.linalg.inv(resi_v.T @ resi_v) @ (resi_v.T @ resi_y)
    resi_train = Y_train.values - V_train.values @ beta
    
    XGBmodel_dict[cv_i].fit(W_train_nnan, resi_train)
    resi_hat = XGBmodel_dict[cv_i].predict(W_val_nnan)
    Y_hat = V_val.values @ beta + resi_hat
    
    beta_dict_XGB[cv_i] = beta
    val_err[:, cv_i] = Y_val.values-Y_hat
elapsed = time.time() - t
print('elapsed time = %0.2f sec; %s'%(elapsed, datetime.datetime.now()))

min_idx = np.argmin(np.mean(np.array(val_err)**2, axis=0))
val_err_XGB = np.mean(np.array(val_err)**2, axis=0)

Validation_Err['XGB'] = pd.DataFrame()
Validation_Err['XGB']['n_estimators_list'] = n_estimators_list
Validation_Err['XGB']['val_err'] = val_err_XGB
beta_tot['XGB'] = beta_dict_XGB[min_idx]

plt.plot(n_estimators_list,val_err_XGB)
plt.xlabel('n_estimators')
plt.title('Validation Error, XGBoost, Minimum=%i'%n_estimators_list[min_idx])
# plt.savefig("Figures/XGB_validation_seed%i.png"%seed)
# plt.close()
# plt.show()

Y_hat = V_test.values @ beta_dict_XGB[min_idx] + XGBmodel_dict[min_idx].predict(W_test_nnan)
test_err_XGB = Y_test.values - Y_hat
RMSE_XGB = np.sqrt(np.sum(test_err_XGB**2)/len(test_err_XGB))
print('The RMSE of PL-XGB model is %f'%RMSE_XGB)

XGB_out = {'Date': Date_used[forecast_idx].dt.strftime("%m/%d/%Y").values,
        'Target': 'Inflation',
        'Value': Y_test.values,
        'Prediction': Y_hat,
        'Model': 'XGBoost-PL-%ilags' %num_lags,
        'Seed': seed,
        'Parameter': str(n_estimators_list[min_idx]),
        'Window_size': n_train,
        'Validation_size': n_val,
        'Transformation': Transformation
        }
XGB_out = pd.DataFrame.from_dict(XGB_out)

In [None]:
#######################################################################################
##############################   XGBoost with subsampling #############################
#######################################################################################

num_parallel_tree = 100
subsample = np.sqrt(W_train_nnan.shape[0])/W_train_nnan.shape[0]
n_estimators_list = np.arange(1,50,3)
val_err = np.zeros((n_val,len(n_estimators_list)))
XGBmodel_dict = {}
beta_dict_XGBs = {}
t = time.time()
for cv_i, n_estimators in enumerate(n_estimators_list):

    model1 = xgb.XGBRegressor(n_jobs=1, tree_method="exact", n_estimators=n_estimators, random_state=seed,
                            num_parallel_tree = num_parallel_tree, subsample = subsample)
    model2 = xgb.XGBRegressor(n_jobs=1, tree_method="exact", n_estimators=n_estimators, random_state=seed,
                            num_parallel_tree = num_parallel_tree, subsample = subsample)
    XGBmodel_dict[cv_i] = xgb.XGBRegressor(n_jobs=1, tree_method="exact", n_estimators=n_estimators, random_state=seed,
                                            num_parallel_tree = num_parallel_tree, subsample = subsample)

    model1.fit(W_train_nnan, V_train)
    resi_v = V_train.values - model1.predict(W_train_nnan)
    model2.fit(W_train_nnan, Y_train)
    resi_y = Y_train.values - model2.predict(W_train_nnan)
    beta = np.linalg.inv(resi_v.T @ resi_v) @ (resi_v.T @ resi_y)
    resi_train = Y_train.values - V_train.values @ beta
    
    XGBmodel_dict[cv_i].fit(W_train_nnan, resi_train)
    resi_hat = XGBmodel_dict[cv_i].predict(W_val_nnan)
    Y_hat = V_val.values @ beta + resi_hat
    
    beta_dict_XGBs[cv_i] = beta
    val_err[:, cv_i] = Y_val.values-Y_hat
elapsed = time.time() - t
print('elapsed time = %0.2f sec; %s'%(elapsed, datetime.datetime.now()))

min_idx = np.argmin(np.mean(np.array(val_err)**2, axis=0))
val_err_XGBs = np.mean(np.array(val_err)**2, axis=0)

Validation_Err['XGBs'] = pd.DataFrame()
Validation_Err['XGBs']['n_estimators_list'] = n_estimators_list
Validation_Err['XGBs']['val_err'] = val_err_XGBs
beta_tot['XGBs'] = beta_dict_XGBs[min_idx]

plt.plot(n_estimators_list, val_err_XGBs)
plt.xlabel('n_estimators')
plt.title('Validation Error, XGBoost with subsample, Minimum=%i'%n_estimators_list[min_idx])
# plt.savefig("Figures/XGBs_validation_seed%i.png"%seed)
# plt.close()
# plt.show()

Y_hat = V_test.values @ beta_dict_XGBs[min_idx] + XGBmodel_dict[min_idx].predict(W_test_nnan)
test_err_XGBs = Y_test.values - Y_hat
RMSE_XGBs = np.sqrt(np.sum(test_err_XGBs**2)/len(test_err_XGBs))
print('The RMSE of PL-XGBs model is %f'%RMSE_XGBs)

XGBs_out = {'Date': Date_used[forecast_idx].dt.strftime("%m/%d/%Y").values,
        'Target': 'Inflation',
        'Value': Y_test.values,
        'Prediction': Y_hat,
        'Model': 'XGBoost-subsample-PL-%ilags' %num_lags,
        'Seed': seed,
        'Parameter': str(n_estimators_list[min_idx]),
        'Window_size': n_train,
        'Validation_size': n_val,
        'Transformation': Transformation
        }
XGBs_out = pd.DataFrame.from_dict(XGBs_out)

In [None]:
#######################################################################################
####################################   PCR    #########################################
#######################################################################################
W_train_stzd = (W_train_nnan - np.mean(W_train_nnan, axis=0))/np.std(W_train_nnan, axis = 0)
W_val_stzd = (W_val_nnan - np.mean(W_train_nnan, axis=0))/np.std(W_train_nnan, axis = 0)
W_test_stzd = (W_test_nnan - np.mean(W_train_nnan, axis=0))/np.std(W_train_nnan, axis = 0)    

Sigma_hat = W_train_stzd.T@W_train_stzd/n_train
eigval, eigvec = np.linalg.eig(Sigma_hat)
eigval = np.real(eigval)
eigvec = np.real(eigvec)
idx = eigval.argsort()[::-1]
eigval_sorted = eigval[idx]
eigvec_sorted = eigvec[:, idx]
F_train = W_train_stzd @ eigvec_sorted
F_val = W_val_stzd @ eigvec_sorted
F_val.columns = F_train.columns
F_test = W_test_stzd @ eigvec_sorted
F_test.columns = F_test.columns

# fig, ax = plt.subplots()
# ax.plot(W_train_stzd.columns,eigvec_sorted[:,0], label='First basis')
# ax.plot(W_train_stzd.columns,eigvec_sorted[:,1], label='Second basis')
# ax.plot(W_train_stzd.columns,eigvec_sorted[:,2], label='Third basis')
# plt.xticks(rotation=-45)
# plt.legend()
# plt.show()


nfactors_list = np.arange(1,20)
val_err = np.zeros((n_val, len(nfactors_list)))
nfactors = 2
PCR_dict = {}
beta_dict_PCR = {}
for cv_i, nfactors in enumerate(nfactors_list):

    PCR_dict[cv_i] = LinearRegression(fit_intercept=True)
    PCR_dict[cv_i].fit(np.concatenate((V_train.values,F_train.iloc[:,:nfactors]),axis=1), Y_train)
    
    
    Y_hat = PCR_dict[cv_i].predict(np.concatenate((V_val.values,F_val.iloc[:,:nfactors]),axis=1))
    
    beta_dict_PCR[cv_i] = PCR_dict[cv_i].coef_[:2]
    val_err[:, cv_i] = Y_val.values-Y_hat

min_idx = np.argmin(np.mean(np.array(val_err)**2, axis=0))
val_err_PCR = np.mean(np.array(val_err)**2, axis=0)

Validation_Err['PCR'] = pd.DataFrame()
Validation_Err['PCR']['nfactors_list'] = nfactors_list
Validation_Err['PCR']['val_err'] = val_err_PCR
beta_tot['PCR'] = beta_dict_PCR[min_idx]

plt.plot(nfactors_list, val_err_PCR)
plt.xlabel('Number of principal components')
plt.title('Validation Error, PCR,Minimum=%i'%nfactors_list[min_idx])
plt.show()
# plt.savefig("Figures/PCR_validation_seed%i.png"%seed)
# plt.close()
# # plt.show()


Y_hat = PCR_dict[min_idx].predict(np.concatenate((V_test.values,F_test.iloc[:,:nfactors_list[min_idx]]),axis=1))
test_err_PCR = Y_test.values - Y_hat
RMSE_PCR = np.sqrt(np.sum(test_err_PCR**2)/len(test_err_PCR))
print('The RMSE of factor augmented regression is %f'%RMSE_PCR)

PCR_out = {'Date': Date_used[forecast_idx].dt.strftime("%m/%d/%Y").values,
        'Target': 'Inflation',
        'Value': Y_test.values,
        'Prediction': Y_hat,
        'Model': 'PCR-PL-%ilags' %num_lags,
        'Seed': seed,
        'Parameter': str(nfactors_list[min_idx]),
        'Window_size': n_train,
        'Validation_size': n_val,
        'Transformation': Transformation
        }
PCR_out = pd.DataFrame.from_dict(PCR_out)

In [None]:
#######################################################################################
####################################   PCRp    #########################################
#######################################################################################
W_train_stzd2 = (W_train_nnan - np.mean(W_train_nnan, axis=0))/np.std(W_train_nnan, axis = 0)
W_val_stzd2 = (W_val_nnan - np.mean(W_train_nnan, axis=0))/np.std(W_train_nnan, axis = 0)
W_test_stzd2 = (W_test_nnan - np.mean(W_train_nnan, axis=0))/np.std(W_train_nnan, axis = 0)

W_train_stzd = W_train_stzd2[list(set(W_train_stzd2.columns).intersection(set(price_var)))]
W_val_stzd = W_val_stzd2[list(set(W_train_stzd2.columns).intersection(set(price_var)))]
W_test_stzd = W_test_stzd2[list(set(W_train_stzd2.columns).intersection(set(price_var)))]

Sigma_hat = W_train_stzd.T@W_train_stzd/n_train
eigval, eigvec = np.linalg.eig(Sigma_hat)
eigval = np.real(eigval)
eigvec = np.real(eigvec)
idx = eigval.argsort()[::-1]
eigval_sorted = eigval[idx]
eigvec_sorted = eigvec[:, idx]
F_train = W_train_stzd @ eigvec_sorted
F_val = W_val_stzd @ eigvec_sorted
F_val.columns = F_train.columns
F_test = W_test_stzd @ eigvec_sorted
F_test.columns = F_test.columns

# fig, ax = plt.subplots()
# ax.plot(W_train_stzd.columns,eigvec_sorted[:,0], label='First basis')
# ax.plot(W_train_stzd.columns,eigvec_sorted[:,1], label='Second basis')
# ax.plot(W_train_stzd.columns,eigvec_sorted[:,2], label='Third basis')
# plt.xticks(rotation=-45)
# plt.legend()
# plt.show()


nfactors_list = np.arange(1,20)
val_err = np.zeros((n_val, len(nfactors_list)))
nfactors = 2
PCRp_dict = {}
beta_dict_PCRp = {}
for cv_i, nfactors in enumerate(nfactors_list):

    PCRp_dict[cv_i] = LinearRegression(fit_intercept=True)
    PCRp_dict[cv_i].fit(np.concatenate((V_train.values,F_train.iloc[:,:nfactors]),axis=1), Y_train)
    
    Y_hat = PCRp_dict[cv_i].predict(np.concatenate((V_val.values,F_val.iloc[:,:nfactors]),axis=1))
    
    beta_dict_PCRp[cv_i] = PCRp_dict[cv_i].coef_[:2]
    val_err[:, cv_i] = Y_val.values-Y_hat

min_idx = np.argmin(np.mean(np.array(val_err)**2, axis=0))
val_err_PCRp = np.mean(np.array(val_err)**2, axis=0)

Validation_Err['PCRp'] = pd.DataFrame()
Validation_Err['PCRp']['nfactors_list'] = nfactors_list
Validation_Err['PCRp']['val_err'] = val_err_PCRp
beta_tot['PCRp'] = beta_dict_PCRp[min_idx]

plt.plot(nfactors_list, val_err_PCRp)
plt.xlabel('Number of principal components')
plt.title('Validation Error, PCRp,Minimum=%i'%nfactors_list[min_idx])
plt.show()
# plt.savefig("Figures/PCR_validation_seed%i.png"%seed)
# plt.close()
# # plt.show()

Y_hat = PCRp_dict[min_idx].predict(np.concatenate((V_test.values,F_test.iloc[:,:nfactors_list[min_idx]]),axis=1))
test_err_PCRp = Y_test.values - Y_hat
RMSE_PCRp = np.sqrt(np.sum(test_err_PCRp**2)/len(test_err_PCRp))
print('The RMSE of factor augmented regression (only price variables) is %f'%RMSE_PCRp)

PCRp_out = {'Date': Date_used[forecast_idx].dt.strftime("%m/%d/%Y").values,
        'Target': 'Inflation',
        'Value': Y_test.values,
        'Prediction': Y_hat,
        'Model': 'PCRp-PL-%ilags' %num_lags,
        'Seed': seed,
        'Parameter': str(nfactors_list[min_idx]),
        'Window_size': n_train,
        'Validation_size': n_val,
        'Transformation': Transformation
        }
PCRp_out = pd.DataFrame.from_dict(PCRp_out)

In [None]:
#######################################################################################
################################# RKHS with l2 penalty ###############################
#######################################################################################
W_train_stzd = (W_train_nnan - np.mean(W_train_nnan, axis=0))/np.std(W_train_nnan, axis = 0)
W_val_stzd = (W_val_nnan - np.mean(W_train_nnan, axis=0))/np.std(W_train_nnan, axis = 0)
W_test_stzd = (W_test_nnan - np.mean(W_train_nnan, axis=0))/np.std(W_train_nnan, axis = 0)

Sigma_hat = W_train_stzd.T@W_train_stzd/n_train
eigval, eigvec = np.linalg.eig(Sigma_hat)
eigval = np.real(eigval)
eigvec = np.real(eigvec)
idx = eigval.argsort()[::-1]
eigval_sorted = eigval[idx]
eigvec_sorted = eigvec[:, idx]

import numba
@numba.njit
def get_Gram(X,n,gamma):
    Gram_rbf = np.zeros((n,n))
    for t in range(n):
        Gram_rbf[t,:] = np.exp(-gamma*np.sum((X[t,:]-X)**2,1))
    return Gram_rbf

@numba.njit
def get_Gram_test(X_train,X_test,n_train,n_test,gamma):
    Gram_rbf = np.zeros((n_test,n_train))
    for t in range(n_test):
        Gram_rbf[t,:] = np.exp(-gamma*np.sum((X_test[t,:]-X_train)**2,1))
    return Gram_rbf

gamma = (1/W_test_stzd.shape[1])
Kernel_Gram = get_Gram(W_train_stzd.values, W_train_stzd.shape[0],gamma)
eigenvalues, eigenvectors = np.linalg.eigh(Kernel_Gram) 

idx = eigenvalues.argsort()[::-1]
eigval_sorted = eigenvalues[idx]
eigvec_sorted = eigenvectors[:, idx]

F_train = pd.DataFrame(eigvec_sorted*eigval_sorted)

K_val = get_Gram_test(W_train_stzd.values, W_val_stzd.values, n_train, n_val, gamma)
K_test = get_Gram_test(W_train_stzd.values, W_test_stzd.values, n_train, n_test, gamma)
F_val = pd.DataFrame(K_val@eigvec_sorted)
F_test = pd.DataFrame(K_test@eigvec_sorted)


nfactors_list = np.arange(1,20)
delta_hat_dict = {}
beta_hat_dict = {}
nfactors_price = 8

V_train_one = np.concatenate((np.ones((n_train,1)),V_train.values,),axis=1)
F_train_proj = (np.identity(n_train)-V_train_one@np.linalg.inv(V_train_one.T@V_train_one)@V_train_one.T)@F_train
lambda_list = np.linspace(0,50,100)
val_err = np.zeros((n_val, len(nfactors_list), len(lambda_list)))
for cv_i, nfactors in enumerate(nfactors_list):
    for cv_j, lam in enumerate(lambda_list):
        F_train_used = F_train.iloc[:,:nfactors]
        F_train_proj_used = F_train_proj.iloc[:,:nfactors]
        
        delta_hat = np.linalg.inv(F_train_used.T@F_train_proj_used + lam*np.diag(eigval_sorted[:nfactors]))@(F_train_proj_used.T@Y_train)
        # alpha_hat = eigvec_sorted[:,:nfactors]@delta_hat
        # beta_hat = np.linalg.inv(V_train.values.T@V_train.values)@V_train.values.T@(Y_train-Kernel_Gram@alpha_hat)
        beta_hat = np.linalg.inv(V_train_one.T@V_train_one)@V_train_one.T@(Y_train-(eigvec_sorted[:,:nfactors]*eigval_sorted[:nfactors])@delta_hat)
        
        Y_hat = np.concatenate((np.ones((n_val,1)),V_val.values,),axis=1)@beta_hat + F_val.iloc[:,:nfactors]@delta_hat
        delta_hat_dict[cv_i,cv_j] = delta_hat
        beta_hat_dict[cv_i,cv_j] = beta_hat
        val_err[:, cv_i,cv_j] = Y_val.values-Y_hat.values


min_idx = np.argmin(np.mean(np.array(val_err)**2, axis=0))
min_idx_i,min_idx_j = np.unravel_index(min_idx, np.mean(np.array(val_err)**2, axis=0).shape)
val_err_RKHS = np.mean(np.array(val_err)**2, axis=0)
val_err_RKHS[min_idx_i,min_idx_j]
np.min(val_err_RKHS)

Validation_Err['RKHS'] = {}
Validation_Err['RKHS']['nfactors_list'] = nfactors_list
Validation_Err['RKHS']['lambda_list'] = lambda_list
Validation_Err['RKHS']['val_err'] = val_err_RKHS
beta_tot['RKHS'] = beta_hat_dict[min_idx_i,min_idx_j][1:]



plt.plot(lambda_list, val_err_RKHS[min_idx_i,:])
plt.xlabel('alpha')
plt.title('Validation Error, RKHS, Dim=%i, argmin=%0.2f'%(nfactors_list[min_idx_i],lambda_list[min_idx_j]))
plt.show()

plt.plot(nfactors_list, val_err_RKHS[:,min_idx_j])
plt.xlabel('Number of principal components')
plt.title('Validation Error, RKHS, alpha=%0.2f, argmin=%i'%(lambda_list[min_idx_j],nfactors_list[min_idx_i]))
plt.show()

Y_hat = np.concatenate((np.ones((n_test,1)),V_test.values),axis=1)@beta_hat_dict[min_idx_i,min_idx_j] + F_test.iloc[:,:nfactors_list[min_idx_i]]@delta_hat_dict[min_idx_i,min_idx_j]
test_err_PCR = Y_test.values - Y_hat.values
RMSE_RKHS = np.sqrt(np.sum(test_err_PCR**2)/len(test_err_PCR))

RKHS_out = {'Date': Date_used[forecast_idx].dt.strftime("%m/%d/%Y").values,
        'Target': 'Inflation',
        'Value': Y_test.values,
        'Prediction': Y_hat,
        'Model': 'PCR-PL-%ilags' %num_lags,
        'Seed': seed,
        'Parameter': 'alpha%f_nf%i'%(lambda_list[min_idx_j],nfactors_list[min_idx_i]),
        'Window_size': n_train,
        'Validation_size': n_val,
        'Transformation': Transformation
        }
RKHS_out = pd.DataFrame.from_dict(RKHS_out)


In [None]:
#######################################################################################
###################################   Neural Net    ###################################
#######################################################################################
tf.random.set_seed(seed)
np.random.seed(seed)
random.seed(seed)

batch_size = W_train_nnan.shape[0]
epochs = 20
n_node_list = np.linspace(1,37,19).astype(int)
val_err = np.zeros((n_val, len(n_node_list)))
model_NN_dict = {}
beta_dict_NN = {}
t = time.time()
for cv_i, n_node in enumerate(n_node_list):

    model1 = tf.keras.Sequential(
        [
            tf.keras.Input(shape=(W_train_nnan.shape[1],)),
            tf.keras.layers.Dense(n_node, activation="relu"),
            # tf.keras.layers.Dropout(0.5),
            tf.keras.layers.Dense(num_lags)
        ]
    )
    
    model1.compile(loss=tf.keras.losses.MeanSquaredError(),
                optimizer=tf.keras.optimizers.Adam(learning_rate=1e-3))
    
    model2 = tf.keras.Sequential(
        [
            tf.keras.Input(shape=(W_train_nnan.shape[1],)),
            tf.keras.layers.Dense(n_node, activation="relu"),
            # tf.keras.layers.Dropout(0.5),
            tf.keras.layers.Dense(1)
        ]
    )
    
    model2.compile(loss=tf.keras.losses.MeanSquaredError(),
                optimizer=tf.keras.optimizers.Adam(learning_rate=1e-3))
    
    model_NN = tf.keras.Sequential(
        [
            tf.keras.Input(shape=(W_train_nnan.shape[1],)),
            tf.keras.layers.Dense(n_node, activation="relu"),
            # tf.keras.layers.Dropout(0.5),
            tf.keras.layers.Dense(1)
        ]
    )
    # model_NN.summary()
    
    model_NN.compile(loss=tf.keras.losses.MeanSquaredError(),
                optimizer=tf.keras.optimizers.Adam(learning_rate=1e-3))
    
    model1.fit(W_train_nnan, V_train, batch_size=batch_size,
            epochs=epochs)
    
    resi_v = V_train.values - model1.predict(W_train_nnan)
    model2.fit(W_train_nnan, Y_train, batch_size=batch_size,
            epochs=epochs)
    resi_y = Y_train.values - model2.predict(W_train_nnan).reshape(-1,)
    beta = np.linalg.inv(resi_v.T @ resi_v) @ (resi_v.T @ resi_y)
    resi_train = Y_train.values - V_train.values @ beta
    
    model_NN_dict[cv_i] = model_NN
    model_NN_dict[cv_i].fit(W_train_nnan, resi_train, batch_size=batch_size,
            epochs=epochs)
    resi_hat = model_NN_dict[cv_i].predict(W_val_nnan)
    Y_hat = V_val.values @ beta + resi_hat.reshape(-1,)
    
    beta_dict_NN[cv_i] = beta
    val_err[:, cv_i] = Y_val.values-Y_hat
elapsed = time.time() - t
print('elapsed time = %0.2f sec; %s'%(elapsed, datetime.datetime.now()))

min_idx = np.argmin(np.mean(np.array(val_err)**2, axis=0))
val_err_NN = np.mean(np.array(val_err)**2, axis=0)

Validation_Err['NN'] = pd.DataFrame()
Validation_Err['NN']['n_node_list'] = n_node_list
Validation_Err['NN']['val_err'] = val_err_NN
beta_tot['NN'] = beta_dict_NN[min_idx]

plt.plot(n_node_list, val_err_NN)
plt.xlabel('Number of Nodes')
plt.title('Validation Error, NN, Minimum=%i'%n_node_list[min_idx])
plt.show()
# plt.savefig("Figures/NN_validation_seed%i.png"%seed)
# plt.close()

Y_hat = V_test.values @ beta_dict_NN[min_idx] + model_NN_dict[min_idx].predict(W_test_nnan).reshape(-1,)
test_err_NN = Y_test.values - Y_hat
RMSE_NN = np.sqrt(np.sum(test_err_NN**2)/len(test_err_NN))
print('The RMSE of NN is %f'%RMSE_NN)

NN_out = {'Date': Date_used[forecast_idx].dt.strftime("%m/%d/%Y").values,
        'Target': 'Inflation',
        'Value': Y_test.values,
        'Prediction': Y_hat.reshape((-1,)),
        'Model': 'NN-PL-%ilags' %num_lags,
        'Seed': seed,
        'Parameter': str(n_node_list[min_idx]),
        'Window_size': n_train,
        'Validation_size': n_val,
        'Transformation': Transformation
        }
NN_out = pd.DataFrame.from_dict(NN_out)

In [None]:
# out = np.concatenate((RF_out.values, XGB_out.values, XGBs_out.values, PCR_out.values, NN_out.values), axis=0)

# query = ''' insert or replace into Results (Date,Target,Value,Prediction,Model,Seed,Parameter,Window_size,Validation_size,Transformation) values (?,?,?,?,?,?,?,?,?,?) '''
# cur.executemany(query, out)
# con.commit()
# con.close()

# file_name1 ='Validation_Err_PL_%ilags_seed%i_%s.pkl'%(num_lags,seed, Transformation.replace(' ','_'))
# file_name2 ='beta_tot_PL_%ilags_seed%i_%s.pkl'%(num_lags,seed,Transformation.replace(' ','_'))

# with open(os.path.join('Results', file_name1), 'wb') as outp:
#     pickle.dump(Validation_Err, outp)

# with open(os.path.join('Results', file_name2), 'wb') as outp:
#     pickle.dump(beta_tot, outp)