This script contains the CRNN (convolutional recurrent neural network) code for Seo, B. (2024), 'Econometric Forecasting Using Ubiquitous News Texts: Text-enhanced Factor Model', International Journal of Forecasting.

-Last Update: 2024-11-24 <br>
-Author: Beomseok Seo (bsseo@sookmyung.ac.kr)

### 0. <font color='purple'>Environment Setup</font>

In [1]:
import os, copy, pickle
from os import walk
from pathlib import Path
import pandas as pd
import numpy as np
import datetime as dt
from dateutil.relativedelta import relativedelta

In [2]:
import matplotlib.pyplot as plt
import seaborn as sns

In [3]:
import statsmodels.api as sm
import statsmodels as sm_

In [4]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers



In [5]:
# Lift pandas' display limits so DataFrames are printed in full
pd.options.display.max_columns = None
pd.options.display.max_rows = None

In [6]:
# Set criteria
def rmse(y_pred, y_true):
    """Return the root-mean-square error between predictions and targets.

    Both arguments only need to support element-wise subtraction
    (NumPy arrays, pandas objects, or plain numbers).
    """
    residual = y_pred - y_true
    mean_squared = np.mean(np.square(residual))
    return np.sqrt(mean_squared)
def mae(y_pred, y_true):
    """Return the mean absolute error between predictions and targets.

    Both arguments only need to support element-wise subtraction
    (NumPy arrays, pandas objects, or plain numbers).
    """
    abs_residual = np.abs(y_pred - y_true)
    return np.mean(abs_residual)

### 1. <font color='purple'>Load Dataset</font>

In [7]:
# Date tag for this run; looks like YYMMDD (TODO confirm). It is not referenced
# anywhere else in this section.
TODAY = '240101'

In [8]:
# Read the three input tables; the first CSV column becomes the index of each:
#   all_dat    - data panel (one column per series)
#   all_grp    - per-series group table
#   macro_feat - per-series metadata (its 'LAG' column is used later)
all_dat, all_grp, macro_feat = (
    pd.read_csv(csv_path, index_col=0)
    for csv_path in (
        './data/all_dat_YoY.csv',
        './data/all_grp.csv',
        './data/macro_feat.csv',
    )
)

In [9]:
# Re-index the panel with a monthly PeriodIndex so that rows can be sliced
# with 'YYYY-MM' labels (as done when the vintages are built).
all_dat.index = pd.PeriodIndex(all_dat.index, freq='M')

In [10]:
# Independent copy of the group table that will receive English labels
all_grp_eng = all_grp.copy(deep=True)

In [11]:
# English labels for the series, assigned positionally to the columns of
# all_dat and to the rows of all_grp, so the order here must match the CSVs.
# 83 labels in total: the first 14 carry a '(Q)' suffix, and the last 17 carry
# a '(T)' suffix (the same count as text_lag).
all_grp_eng_index = \
['GDP(SA)(Q)','GDP(NSA)(Q)',
 'Private consumption(SA)(Q)', 'Government consumption(SA)(Q)', 'Construction(SA)(Q)', 'Facility investment(SA)(Q)',
 'Exports of goods and services(SA)(Q)', 'Imports of goods and services(SA)(Q)',
 'Private consumption(NSA)(Q)', 'Government consumption(NSA)(Q)', 'Construction(NSA)(Q)', 'Facility investment(NSA)(Q)',
 'Exports of goods and services(NSA)(Q)', 'Imports of goods and services(NSA)(Q)',
 # positions 14 onward: series without the '(Q)' suffix
 'Unemployment rate', 'Employment to population ratio', 'Number of employed people',
 'Monthly goods exports','Monthly goods imports',
 'Export price index','Import price index','Producer price index','Consumer price index',
 'Price index excluding agricultural product & oil', 'Price index excluding food & energy',
 'Consumption & Retail sales index(SA)','Service industry production index(SA)',
 'Consumption & Retail sales index(NSA)','Service industry production index(NSA)',
 'Manufacturing industry production index(SA)', 'Manufacturing industry shipment index(SA)', 'Manufacturing inventory index(SA)',
 'Manufacturing industry production index(NSA)', 'Manufacturing industry shipment index(NSA)', 'Manufacturing inventory index(NSA)',
 'Facility investment index(SA)','Construction completed(SA)',
 'Facility investment index(NSA)','Construction completed(NSA)',
 'Manufacturing business performance BSI(SA)','Manufacturing business performance BSI(NSA)',
 'All industries performance BSI', 'Service industry performance BSI', 'All industries sales BSI',
 'Manufacturing export BSI', 'Manufacturing domestic demand sales BSI', 'Manufacturing new orders BSI', 'Manufacturing operation rate BSI',
 'Economic sentiment index', 'Current economic judgment CSI', 'Consumer sentiment index',
 'Consolidated fiscal balance',
 # next 14 labels: presumably the series covered by finan_lag (TODO confirm)
 'Housing sales price index(HSPI)-Seoul', 'Housing sales price index(HSPI)-National', 'Housing lease price index(HLPI)-Seoul', 'Housing lease price index(HLPI)-National',
 'Call rate','CD rate','KTB 3-year rate','KRW exchange rate', 'EUR exchange rate', 'KOSPI', 'KOSDAQ', 
 'Dubai crude oil', 'WTI futures', 'Gold futures',
 # last 17 labels: '(T)'-suffixed series
 'Production(T)','Shipbuilding(T)','Automotive(T)','Semiconductor(T)','Facility investment(T)','Construction(T)',
 'Unemployment(T)','Recruitment(T)','Job search(T)',
 'Wholesale & retail(T)', 'Government expenditure(T)','Price outlook(T)','Stock price outlook(T)','House price outlook(T)',
 'World trade(T)',
 'News sentiment index(T)','Economic policy uncertainty(T)']

In [12]:
# English names for the 13 columns of the group table, assigned positionally
all_grp_eng.columns = ['External','Production','Labor','Consumption','Facility Inv.','Construction','Government','Exports','Imports','Prices','Finance','Real estate','Sentiments']

In [13]:
# Label the group-table rows and the columns of a fresh copy of the data panel
# with the same English series names (both assignments are positional).
all_grp_eng.index = all_grp_eng_index
all_dat_eng = all_dat.copy(deep=True)
all_dat_eng.columns = all_grp_eng_index

In [14]:
# From here on `all_dat` / `all_grp` refer to the English-labelled copies
all_dat = all_dat_eng
all_grp = all_grp_eng

### 2. <font color='purple'>Generate vintage data</font>

In [15]:
# Per-series lag, in rows: _GenVintage blanks that many trailing rows of the
# corresponding column. A lag of 0 means the column is never blanked.
macro_lag = macro_feat['LAG'].values
finan_lag = np.zeros(14)  # 14 zero-lag entries following the macro_feat lags
text_lag = np.zeros(17)   # 17 zero-lag entries (same count as the '(T)' series)

# Concatenated in column order; entry j is applied to column j of the panel
all_lag = np.concatenate([macro_lag,finan_lag,text_lag])

In [16]:
# Evaluation dates: every index label from 2016-01 on whose month number is a
# multiple of 3, with the final such label dropped.
test_dates = [
    label
    for label in map(str, all_dat.index)
    if label >= '2016-01' and int(label[-2:]) % 3 == 0
][:-1]

In [17]:
def _GenVintage(all_dat):
    vintage_dict = dict()
    for i in test_dates:
        temp_dat = copy.deepcopy(all_dat)
        temp_dat = temp_dat.loc[:i]

        for j,l in enumerate(all_lag):
            if l==0:
                continue
            temp_dat.iloc[-int(l):,j] = np.nan

        vintage_dict[i] = temp_dat

    return vintage_dict

In [18]:
# Vintages of the full panel (all columns), keyed by test date
vintage_dict = _GenVintage(all_dat)

### 3. <font color='purple'>Experiments</font>

#### Convolutional recurrent neural network (CRNN)

In [19]:
# Group names keyed by their 1-based position
factornames = dict(enumerate(
    ['External', 'Production', 'Labor', 'Consumption', 'Facility Inv.',
     'Construction', 'Government', 'Exports', 'Imports', 'Prices',
     'Finance', 'Real estate', 'Sentiments'],
    start=1,
))

In [20]:
# Settings for a single 'Global' factor block. Presumably meant for a dynamic
# factor model (number of factors and lag order) — TODO confirm; neither dict
# is used in the CRNN experiment in this section.
factor_multiplicities = {'Global': 2}
factor_orders = {'Global': 4}

In [21]:
# Two separate, initially empty containers for models and results
model_dict, results_dict = {}, {}

In [22]:
# Quarter-end month labels from '2016-03' through '2023-09' (31 vintages)
vintage_dict_keys = [
    '{}-{:02d}'.format(year, month)
    for year in range(2016, 2024)
    for month in (3, 6, 9, 12)
][:-1]  # the slice removes '2023-12'

In [24]:
# CRNN experiment.
# For every forecast horizon (`fhor`) and input variant (`trans`) a fresh
# network is trained on each vintage and used to predict the GDP target at the
# vintage's last month.
#   * prefix 'OS' removes the '(T)' columns;
#   * prefix 'NE' keeps only the first 14 columns plus the '(T)' columns;
#   * suffix 'YoY' targets 'GDP(NSA)(Q)', suffix 'MoM' targets 'GDP(SA)(Q)'.
# NOTE(review): the inputs come from the same `all_dat` for both suffixes;
# only the target column changes.

EVALUATION_INTERVAL = 500   # training steps per epoch (the dataset repeats)
EPOCHS = 5

# Lag of each series keyed by column NAME, so the lags stay attached to the
# right series after columns have been removed. Columns without an entry are
# treated as zero-lag.
lag_by_column = dict(zip(all_dat.columns, all_lag))

for fhor in ['now','1qa']:
    for trans in ['MoM','OS_MoM','NE_MoM','YoY','OS_YoY','NE_YoY']:

        # ---- column selection for this variant ------------------------------
        if trans[:2] == 'NE':
            dropped_cols = [x for i, x in enumerate(all_dat.columns)
                            if x[-3:] != '(T)' and i >= 14]
        elif trans[:2] == 'OS':
            dropped_cols = [x for x in all_dat.columns if x[-3:] == '(T)']
        else:
            dropped_cols = []

        # `drop` returns new objects, so all_dat / all_grp stay untouched.
        all_dat_P = all_dat.drop(columns=dropped_cols)
        all_grp_P = all_grp.drop(index=dropped_cols)

        # ---- vintages of the SELECTED columns -------------------------------
        # These must be rebuilt from all_dat_P and bound to `vintage_dict`;
        # otherwise every variant would silently train on the full panel.
        lags_P = [int(lag_by_column.get(x, 0)) for x in all_dat_P.columns]
        vintage_dict = dict()
        for v in vintage_dict_keys:
            temp_dat = all_dat_P.loc[:v].copy()
            for j, l in enumerate(lags_P):
                if l > 0:
                    temp_dat.iloc[-l:, j] = np.nan
            vintage_dict[v] = temp_dat

        # Target column is determined by the suffix of `trans`.
        item = 'GDP(NSA)(Q)' if trans[-3:] == 'YoY' else 'GDP(SA)(Q)'

        pred_crnn = []
        err_crnn = []
        true_crnn = []
        #==========================================================================================
        for v in vintage_dict_keys:
            vintage_dat = vintage_dict[v]

            # Columns [:14] feed the 12-step branch, columns [14:] feed the
            # 36-step convolutional branch. For '1qa' every input window is
            # shifted 3 rows further into the past; the target is unchanged.
            if fhor == 'now':
                vd_train_x_m = vintage_dat.iloc[12:-3,14:].ffill().bfill()
                vd_train_x_q = vintage_dat.iloc[9:-6,:14].dropna()
                vd_train_y_q = vintage_dat.iloc[12:-3,:14][item].dropna()

                testing_x_m = np.expand_dims(vintage_dat.iloc[-36:,14:].ffill(),(0,3))
                testing_x_q = np.expand_dims(vintage_dat.iloc[-39:-3,:14].dropna(),0)
            elif fhor == '1qa':
                vd_train_x_m = vintage_dat.iloc[9:-6,14:].ffill().bfill()
                vd_train_x_q = vintage_dat.iloc[6:-9,:14].dropna()
                vd_train_y_q = vintage_dat.iloc[12:-3,:14][item].dropna()

                testing_x_m = np.expand_dims(vintage_dat.iloc[-39:-3,14:].ffill(),(0,3))
                testing_x_q = np.expand_dims(vintage_dat.iloc[-42:-6,:14].dropna(),0)

            # Realised value is read from the unmasked panel at the vintage's last row.
            testing_y_q = np.expand_dims(all_dat.loc[str(vintage_dat.index[-1]), item],0)

            # Sliding windows: 36 rows with stride 3 for the [14:] columns,
            # 12 rows with stride 1 for the [:14] columns, and one target per
            # window starting at position 11.
            ds_train_x_m = tf.keras.utils.timeseries_dataset_from_array(vd_train_x_m, targets=None, sequence_length=36, sequence_stride=3)
            ds_train_x_q = tf.keras.utils.timeseries_dataset_from_array(vd_train_x_q, None, sequence_length=12, sequence_stride=1)
            ds_train_y_q = tf.keras.utils.timeseries_dataset_from_array(vd_train_y_q, None, sequence_length=1, sequence_stride=1, start_index=11)

            # Materialise the windowed datasets as arrays; the stacked
            # dataset-batch axis is moved to the end (and squeezed away for
            # the 12-step input and the targets).
            train_x_m = np.transpose(np.stack(list(ds_train_x_m)),(1,2,3,0))
            train_x_q = np.squeeze(np.transpose(np.stack(list(ds_train_x_q)),(1,2,3,0)))
            train_y_q = np.squeeze(np.stack(list(ds_train_y_q)))

            print(np.sum(np.isnan(train_x_m)),np.sum(np.isnan(train_x_q)),np.sum(np.isnan(train_y_q)))
            print(train_x_m.shape, train_x_q.shape, train_y_q.shape)

            tf_train_x_mq = tf.data.Dataset.from_tensor_slices((train_x_m,train_x_q))
            tf_train_y_q = tf.data.Dataset.from_tensor_slices(train_y_q)

            # Repeats forever; the amount of training is set by steps_per_epoch.
            train_ds = tf.data.Dataset.zip((tf_train_x_mq, tf_train_y_q)).batch(32).repeat()

            tf_testing_x_mq = tf.data.Dataset.from_tensor_slices((testing_x_m,testing_x_q))
            tf_testing_y_q = tf.data.Dataset.from_tensor_slices(testing_y_q)

            testing_ds = tf.data.Dataset.zip((tf_testing_x_mq, tf_testing_y_q)).batch(32).repeat()

            #---------------------------------------------------------------------------------------------------
            inp_m = tf.keras.layers.Input(shape=train_x_m.shape[1:])
            inp_q = tf.keras.layers.Input(shape=train_x_q.shape[1:])
            #---------------------------------------------------------------------------------------------------
            # The kernel spans all [14:] columns and 3 rows with stride 3, so
            # the 36-row window collapses to 12 steps, the same length as the
            # 12-step input it is concatenated with.
            xm = tf.keras.layers.Conv2D(filters=64, kernel_size=(3,vd_train_x_m.shape[1]), strides=3, activation='relu', use_bias=True)(inp_m)
            xm = tf.keras.layers.Lambda(lambda x: x[:, :, 0, :])(xm)
            xmq = tf.keras.layers.Lambda(lambda x: tf.concat(x,axis=-1))([xm,inp_q])
            xx = tf.keras.layers.LSTM(32)(xmq)
            xx = tf.keras.layers.Dense(4, activation='relu', use_bias=True)(xx)
            out = tf.keras.layers.Dense(1, activation='linear', use_bias=True)(xx)

            #---------------------------------------------------------------------------------------------------
            model_lin = tf.keras.models.Model([inp_m,inp_q],out)
            #---------------------------------------------------------------------------------------------------

            model_lin.compile(loss=tf.losses.MeanAbsoluteError(),
                        optimizer=tf.optimizers.Adam(learning_rate = 0.002),
                        metrics=[tf.metrics.MeanSquaredError()])

            history = model_lin.fit(train_ds,
                                steps_per_epoch=EVALUATION_INTERVAL,
                                epochs=EPOCHS)#, callbacks=[early_stopping])

            # One saved model per (variant, horizon, vintage).
            Path("./outs/crnn_out_{}_{}".format(trans,fhor)).mkdir(parents=True, exist_ok=True)
            model_lin.save('./outs/crnn_out_{}_{}/crnn_{}.keras'.format(trans,fhor,v))

            pred_nc = model_lin.predict((testing_x_m,testing_x_q))[0][0]

            pred_crnn.append(pred_nc)
            err_crnn.append(testing_y_q[0]-pred_nc)
            true_crnn.append(testing_y_q[0])
            print(v, 'pred:',pred_nc, 'true:',testing_y_q[0], 'err:',testing_y_q[0]-pred_nc)

        # Per-vintage results for this (horizon, variant) pair.
        crnn_ = pd.DataFrame({'index':vintage_dict_keys, 'true':true_crnn, 'pred':pred_crnn,'err':err_crnn})
        crnn_.set_index('index', inplace=True)

        # Writing the collected results to disk is currently disabled:
#         Path("./outs/collect").mkdir(parents=True, exist_ok=True)
#         crnn_.to_csv('./outs/collect/crnn_{}_{}.csv'.format(trans,fhor))
#         print(fhor, trans, np.mean(np.abs(crnn_['err'])))            