In [None]:
import os
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
%matplotlib inline

import warnings
warnings.filterwarnings('ignore')

import lightgbm as lgbm
from lightgbm import *

In general, it is known that financial market data is prone to overfitting. Therefore, when applying machine learning to financial market prediction, it is necessary to focus on countermeasures against overfitting.

For example, in the Two Sigma Financial Modeling Challenge held by Kaggle in 2016, the top winners used Extra Trees (extremely randomized trees). Like Random Forest, this method combines the predictions of many decision trees, but it adds extra randomness when growing each tree: for the randomly selected candidate features, the split thresholds are drawn at random instead of being optimized. Because of this additional randomness, the method tends to be less prone to overfitting than Random Forest.

In this code, I will introduce the DAE (Denoising Autoencoder) as one of the methods to prevent overfitting. There are various types of DAE, but in this case I will add Gaussian noise, that is, noise that follows a normal distribution, to the input. A DAE extracts features by adding noise to the autoencoder's input, and unlike a conventional autoencoder it is expected to extract more robust features because of that noise.

There are various types of DAE, but in this article, we will introduce a DAE that adds Gaussian noise to the input, a noise that follows a normal distribution.


In [None]:
# Read the training table from the parquet file into a DataFrame.
df = pd.read_parquet('../input/ubiquant-parquet/train_low_mem.parquet')

In [None]:
def reduce_mem_usage(df):
    """Downcast the columns of ``df`` in place to narrower dtypes and report the saving.

    * NumPy integer columns are converted to the narrowest integer type of the
      same signedness whose range contains the column's min and max
      (bounds inclusive).
    * NumPy float columns are converted to float16, or else float32, when the
      column's min and max fit that type's range; otherwise they are left as
      they are.
    * Object and pandas string columns are converted to ``category``.
    * Every other dtype (bool, datetime, categorical, nullable extension
      types, ...) is left untouched.

    Note: the float check looks only at the value *range*, so narrowing a
    float column can reduce its precision.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to shrink. It is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, after conversion.
    """
    signed_types = (np.int8, np.int16, np.int32, np.int64)
    unsigned_types = (np.uint8, np.uint16, np.uint32, np.uint64)
    float_types = (np.float16, np.float32)

    start_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage of dataframe is {:.2f} MB'.format(start_mem))

    for col in df.columns:
        col_type = df[col].dtype

        if col_type == object or isinstance(col_type, pd.StringDtype):
            df[col] = df[col].astype('category')
            continue

        # Only plain NumPy integer/float dtypes are range-checked. Anything
        # else (bool, datetime, categorical, nullable extension dtypes) would
        # either be turned into floats or fail the numeric comparison below.
        kind = col_type.kind if isinstance(col_type, np.dtype) else None
        if kind not in ('i', 'u', 'f'):
            continue

        c_min = df[col].min()
        c_max = df[col].max()

        if kind == 'f':
            candidates = [(t, np.finfo(t)) for t in float_types]
        else:
            int_types = signed_types if kind == 'i' else unsigned_types
            candidates = [(t, np.iinfo(t)) for t in int_types]

        for new_type, limits in candidates:
            # Inclusive bounds: a value equal to the type's min/max still fits.
            # NaN min/max (empty or all-NaN column) fails every comparison,
            # so such a column keeps its current dtype.
            if limits.min <= c_min and c_max <= limits.max:
                df[col] = df[col].astype(new_type)
                break

    end_mem = df.memory_usage().sum() / 1024**2

    print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
    # Guard the ratio so a frame reporting zero bytes cannot divide by zero.
    saved_pct = 100 * (start_mem - end_mem) / start_mem if start_mem else 0.0
    print('Decreased by {:.1f}%'.format(saved_pct))

    return df

# Shrink the column dtypes (prints before/after memory usage) and keep the returned frame as df.
df = reduce_mem_usage(df)

The train data is divided, in time series order, into the feature data (X_0, X_1, ..., X_4) and the corresponding target data (Y_0, Y_1, ..., Y_4).

In [None]:
from sklearn.model_selection import KFold, train_test_split


# Feature columns f_0 ... f_299 and the name of the target column.
features = [f'f_{i}' for i in range(300)]
target = 'target'

# Unshuffled (row-order preserving) splits: first 60% / 40%, then each side
# is cut again, giving five consecutive blocks 0..4 of roughly 20% each.
X_012, X_34, Y_012, Y_34 = train_test_split(
    df[features], df[target], train_size=0.6, shuffle=False
)
X_3, X_4, Y_3, Y_4 = train_test_split(
    X_34, Y_34, train_size=0.5, shuffle=False
)
X_0, X_12, Y_0, Y_12 = train_test_split(
    X_012, Y_012, train_size=0.33, shuffle=False
)
X_1, X_2, Y_1, Y_2 = train_test_split(
    X_12, Y_12, train_size=0.5, shuffle=False
)

# Rebind df to a tiny placeholder so this name no longer references the full frame.
df = [[]]

# Give every block a fresh 0..n-1 index, discarding the original row labels.
X_0, X_1, X_2, X_3, X_4 = (
    block.reset_index(drop=True) for block in (X_0, X_1, X_2, X_3, X_4)
)

# The target blocks are Series; turn each into a one-column ('target') DataFrame.
Y_0, Y_1, Y_2, Y_3, Y_4 = (
    block.reset_index(drop=True).to_frame() for block in (Y_0, Y_1, Y_2, Y_3, Y_4)
)

First, I will train the DAE with X_0 and Y_0, and use X_1 and Y_1 as validation data.

In [None]:
import keras
from keras import layers
# GaussianNoise is imported from the public keras.layers namespace; the
# keras.layers.noise module path is not available in every Keras release.
from keras.layers import Input, Dense, BatchNormalization, GaussianNoise
from keras.models import Model, load_model
from keras.callbacks import EarlyStopping, TensorBoard, ModelCheckpoint

# EarlyStopping: stop once val_loss has not improved for 3 consecutive epochs
# and roll the model back to the best epoch, so the encoder reused afterwards
# is the best-validation one rather than the one from 3 epochs later.
early_stopping = EarlyStopping(
    monitor='val_loss',
    min_delta=0.0,
    patience=3,
    restore_best_weights=True,
)

input_dim = X_0.shape[1]
encoding_dim = 100

# Encoder: noisy input -> 100 tanh units -> batch norm -> 10 relu units.
# The 10-unit output tensor (`encoder`) is what later cells reuse as features.
input_layer = Input(shape=(input_dim, ))
encoder = GaussianNoise(0.1)(input_layer)
encoder = Dense(encoding_dim, activation="tanh")(encoder)
encoder = BatchNormalization()(encoder)
encoder = Dense(10, activation="relu")(encoder)

# Head: 10 tanh units -> 100 relu units -> a single tanh output that is
# fitted against the target (Y), not against a reconstruction of the input.
# NOTE(review): a tanh output is bounded to (-1, 1) -- confirm the target
# range is compatible with that.
decoder = Dense(10, activation='tanh')(encoder)
decoder = Dense(encoding_dim, activation='relu')(decoder)
output_layer = Dense(1, activation='tanh')(decoder)

autoencoder = Model(inputs=input_layer, outputs=output_layer)

autoencoder.compile(optimizer="adam", loss='mean_squared_error', metrics=["mse"])

# Checkpoint definition kept for optional use; it is NOT passed to fit()
# below, so no file is written. save_freq='epoch' makes it check val_loss
# once per epoch (val_loss is not available after individual batches).
modelCheckpoint = ModelCheckpoint(filepath = 'XXX.h5',
                                  monitor='val_loss',
                                  verbose=1,
                                  save_best_only=True,
                                  save_weights_only=False,
                                  mode='min',
                                  save_freq='epoch')


# epochs is only an upper bound; early stopping ends training in practice.
autoencoder.fit(X_0, Y_0, batch_size=1440, epochs=11890,
          validation_data=(X_1, Y_1),
          callbacks=[early_stopping] # CallBacks
            )

Then, when the training is finished, apply the trained DAE to X_2~X_4 to create new features.

In [None]:
# Blocks 0 and 1 (and df) are no longer needed: rebind the names to empty
# placeholders so they stop referencing the large objects.
X_0, X_1 = [[]], [[]]
Y_0, Y_1 = [], []
df = [[]]


# Sub-model that maps the input layer to the `encoder` tensor.
dae = Model(input_layer, encoder)

# Encode blocks 2..4 (in that order) ...
encoding_X_2, encoding_X_3, encoding_X_4 = (
    dae.predict(block) for block in (X_2, X_3, X_4)
)

# ... wrap each encoding in a DataFrame (default integer column labels) ...
dae_X_2, dae_X_3, dae_X_4 = (
    pd.DataFrame(codes) for codes in (encoding_X_2, encoding_X_3, encoding_X_4)
)

# ... and append the encoded columns to the right of the original features.
X_2, X_3, X_4 = (
    pd.concat([raw, coded], axis=1)
    for raw, coded in ((X_2, dae_X_2), (X_3, dae_X_3), (X_4, dae_X_4))
)

For example, X_2 looks like this after using DAE.

In [None]:
# Show the first four rows of X_2 as plain text.
print(X_2.head(4))

Next, I will use LightGBM.
I will use X_2 and Y_2 for train data, X_3 and Y_3 for val data, and X_4 and Y_4 for test data.

In [None]:
import warnings
import numpy as np
import lightgbm as lgb
from scipy.stats import pearsonr

warnings.simplefilter('ignore')

# Block 2 is the training set; block 3 is the validation set used for early stopping.
lgb_train = lgb.Dataset(X_2, Y_2)
lgb_eval = lgb.Dataset(X_3, Y_3, reference=lgb_train)

# NOTE(review): with max_depth=5 a tree can have at most 2**5 = 32 leaves,
# so num_leaves=64 is presumably never reached -- confirm this is intended.
params = {
    'seed': 1,
    'verbose': -1,
    'objective': "regression",
    'learning_rate': 0.02,
    'bagging_fraction': 0.2,
    'bagging_freq': 1,
    'feature_fraction': 0.3,
    'max_depth': 5,
    'min_child_samples': 50,
    'num_leaves': 64,
}

# Early stopping is configured through a callback: the `early_stopping_rounds`
# and `verbose_eval` keyword arguments of lgb.train were deprecated and later
# removed. verbose=False keeps the callback silent.
gbm = lgb.train(
    params,
    lgb_train,
    num_boost_round=1000,
    valid_sets=[lgb_eval],
    callbacks=[lgb.early_stopping(stopping_rounds=10, verbose=False)],
)

# Score on block 4, which took no part in training or early stopping,
# using the best iteration found on the validation block.
Y_pred = gbm.predict(X_4, num_iteration=gbm.best_iteration)
l_4 = Y_4['target'].values.tolist()
score_tuple = pearsonr(l_4, Y_pred)
score = score_tuple[0]  # first element is the correlation coefficient
# Labelled "Test" because X_4 / Y_4 is the held-out block, not the validation block.
print(f"Test Pearsonr score : {score:.4f}")

When the LightGBM training is finished, display the feature importance.

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report


# Gain-based importance, one value per column of the training frame.
feature = gbm.feature_importance(importance_type='gain')


# Importance table (position, value) and a copy sorted from most to least important.
f = pd.DataFrame(
    {
        'number': range(0, len(feature)),
        'feature': feature[:],
    }
)
f2 = f.sort_values('feature', ascending=False)

# Column labels of the training frame, in the same order as `feature`.
label = X_2.columns[0:]

# Positions ordered from highest to lowest importance.
indices = np.argsort(feature)[::-1]

# One line per feature: rank, column label, importance.
for rank, position in enumerate(indices, start=1):
    print(f"{rank}   {label[position]}   {feature[position]}")

These '0'~'9' are the new features added by DAE.
When we check the feature importance, we can see that the features added by DAE are indeed effective.