## LGB starter

In this notebook:
* I build simple features from book and trade datasets;
* I train a LightGBM model **with sample weights** and a custom metric (RMSPE), and obtain a CV score.

Credits to:
* https://www.kaggle.com/jiashenliu/introduction-to-financial-concepts-and-data
* https://www.kaggle.com/c/optiver-realized-volatility-prediction/discussion/250324
* https://www.kaggle.com/swimmy/optiver-lgb-with-optimized-params



In [None]:
from IPython.core.display import display, HTML

import pandas as pd
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import glob
import os
import statistics
import gc
import scipy.stats as scipy

from statsmodels.tsa.arima_model import ARIMA

from joblib import Parallel, delayed

from sklearn import preprocessing, model_selection
import lightgbm as lgb

from sklearn.metrics import r2_score

import matplotlib.pyplot as plt 
import seaborn as sns

# Location of the competition files; the data directory is the same folder.
path_root = '../input/optiver-realized-volatility-prediction'
path_data = path_root
path_submissions = '/'

# Name of the label column, and per-model lists of fold scores filled in during CV.
target_name = 'target'
scores_folds = dict()

In [None]:
#added feature engineering functions
#simple average
def simple_averaging(series):
    """Return the arithmetic mean of `series`, as computed by `np.mean`."""
    mean_value = np.mean(series)
    return mean_value

def variance(series):
    """Return the variance of `series`, as computed by `np.var` with its defaults."""
    var_value = np.var(series)
    return var_value

#standard deviation
def std(series):
    """Return the standard deviation of `series`, as computed by `np.std` with its defaults."""
    std_value = np.std(series)
    return std_value

def skew(series):
    """Return the sample skewness of `series` via `scipy.stats.skew` (imported here as `scipy`)."""
    skewness = scipy.skew(series)
    return skewness

def kurtosis(series):
    """Return the kurtosis of `series` via `scipy.stats.kurtosis` with its default settings."""
    kurt_value = scipy.kurtosis(series)
    return kurt_value

#0.25 quantile
def quantile_1(series):
    """Return the 0.25 quantile (first quartile) of `series`."""
    first_quartile = np.quantile(series, q=0.25)
    return first_quartile

#0.5 quantile
def quantile_2(series):
    """Return the 0.5 quantile (median) of `series`."""
    median_value = np.quantile(series, q=0.5)
    return median_value

#0.75 quantile
def quantile_3(series):
    """Return the 0.75 quantile (third quartile) of `series`."""
    third_quartile = np.quantile(series, q=0.75)
    return third_quartile

#ARIMA Model
def arima(series):
    """Fit an ARIMA model of order (1, 0, 0) to `series` and return its fitted values."""
    # NOTE(review): `ARIMA` comes from `statsmodels.tsa.arima_model`, which newer
    # statsmodels releases appear to have retired in favour of
    # `statsmodels.tsa.arima.model.ARIMA` - confirm against the installed version.
    fitted_model = ARIMA(series, order=(1,0,0)).fit()
    return fitted_model.fittedvalues

# imported from https://www.kaggle.com/tommy1028/lightgbm-starter-with-feature-engineering-idea
def count_unique(series):
    """Return how many distinct values `series` contains."""
    distinct_values = np.unique(series)
    return len(distinct_values)

#original ones
def log_return(list_stock_prices):
    """Return the element-wise difference of consecutive log prices.

    The input must expose `.diff()` after `np.log` (e.g. a pandas Series);
    the first element of the result is NaN because it has no predecessor.
    """
    log_prices = np.log(list_stock_prices)
    return log_prices.diff()

def realized_volatility(series_log_return):
    """Return the realized volatility: the square root of the sum of squared log returns."""
    squared_returns = series_log_return ** 2
    return np.sqrt(np.sum(squared_returns))

def rmspe(y_true, y_pred):
    """Return the root mean squared percentage error of `y_pred` relative to `y_true`.

    Each error is divided by the corresponding true value before squaring.
    """
    relative_error = (y_true - y_pred) / y_true
    return np.sqrt(np.mean(np.square(relative_error)))

def book_preprocessor(df_book, stock_id):
    """Aggregate every column of `df_book` per `time_id` over several time windows.

    Statistics are computed on all rows (window start 0) and on the rows whose
    `seconds_in_bucket` is at least 450, 300 and 150; the windowed columns carry
    a `_450` / `_300` / `_150` suffix. The result is keyed by a `row_id` column
    of the form '<stock_id>-<time_id>' and no longer contains the time id itself.
    """
    stat_funcs = [simple_averaging,variance,std,skew,kurtosis,quantile_1,quantile_2,quantile_3, count_unique]

    def get_stats_window(seconds_in_bucket, add_suffix = False):
        # Keep only the rows at or after the window start, then aggregate per time_id
        window_rows = df_book[df_book['seconds_in_bucket'] >= seconds_in_bucket]
        stats = window_rows.groupby(['time_id']).agg(stat_funcs).reset_index()
        # Collapse the (column, statistic) pairs into single names; time_id becomes 'time_id_'
        stats.columns = ['_'.join(col) for col in stats.columns]
        if not add_suffix:
            return stats
        # Tag the columns with the window start so the windows can be told apart
        return stats.add_suffix('_' + str(seconds_in_bucket))

    # Full-bucket statistics form the base; each later window is left-joined onto it
    merged = get_stats_window(seconds_in_bucket = 0, add_suffix = False)
    redundant_keys = []
    for window_start in (450, 300, 150):
        window_key = 'time_id__{}'.format(window_start)
        window_stats = get_stats_window(seconds_in_bucket = window_start, add_suffix = True)
        merged = merged.merge(window_stats, how = 'left', left_on = 'time_id_', right_on = window_key)
        redundant_keys.append(window_key)

    # Build the row_id used for merging, then discard every time id column
    merged['row_id'] = merged['time_id_'].apply(lambda x: f'{stock_id}-{x}')
    merged = merged.drop(columns = redundant_keys + ['time_id_'])
    return merged


def get_stock_stat(stock_id : int, dataType = 'train'):
    """Build the per-(stock_id, time_id) feature table for one stock.

    Reads the book and trade parquet partitions of `stock_id` below `path_data`,
    derives spread / volume / WAP columns and their log returns, and aggregates
    them per `time_id`.

    Parameters:
        stock_id: id of the stock whose partitions are read.
        dataType: 'train' or 'test'; selects the `book_*` / `trade_*` parquet folders.

    Returns:
        DataFrame with flat string column names holding `stock_id`, `time_id`,
        the realized volatility of both book log returns, the common statistics
        of the derived book columns, the windowed statistics produced by
        `book_preprocessor`, and the realized volatility of the trade log
        return. Values missing after the left merges are filled with -999.
    """
    key = ['stock_id', 'time_id', 'seconds_in_bucket']
    group_key = ['stock_id', 'time_id']
    common_stats = [simple_averaging,variance,std,skew,kurtosis,quantile_1,quantile_2,quantile_3, count_unique]

    #Book features
    df_book = pd.read_parquet(os.path.join(path_data, 'book_{}.parquet/stock_id={}/'.format(dataType, stock_id)))
    df_book['stock_id'] = stock_id
    # Put the key columns first, followed by every other column
    cols = key + [col for col in df_book.columns if col not in key]
    df_book = df_book[cols]

    ask_volume = df_book['ask_size1'] + df_book['ask_size2']
    bid_volume = df_book['bid_size1'] + df_book['bid_size2']
    df_book['price_spread'] = (df_book['ask_price1'] - df_book['bid_price1']) / ((df_book['ask_price1'] + df_book['bid_price1'])/2)
    df_book['bid_spread'] = df_book['bid_price1'] - df_book['bid_price2']
    df_book['ask_spread'] = df_book['ask_price1'] - df_book['ask_price2']
    df_book['total_volume'] = ask_volume + bid_volume
    df_book['volume_imbalance'] = abs(ask_volume - bid_volume)

    # Weighted average price of the best and of the second-best level
    df_book['wap1'] = (df_book['bid_price1'] * df_book['ask_size1'] +
                       df_book['ask_price1'] * df_book['bid_size1']) / (df_book['bid_size1'] + df_book['ask_size1'])
    df_book['wap2'] = (df_book['bid_price2'] * df_book['ask_size2'] +
                       df_book['ask_price2'] * df_book['bid_size2']) / (df_book['bid_size2'] + df_book['ask_size2'])
    df_book['wap_balance'] = abs(df_book['wap1'] - df_book['wap2'])

    # `transform` keeps the result aligned with df_book's own index; `apply` can
    # prepend the group key to the index, which breaks the column assignment.
    df_book['log_return1'] = df_book.groupby(by = ['time_id'])['wap1'].transform(log_return).fillna(0)
    df_book['log_return2'] = df_book.groupby(by = ['time_id'])['wap2'].transform(log_return).fillna(0)

    features_to_apply_realized_volatility = ['log_return'+str(i+1) for i in range(2)] #['log_return1','log_return2']
    stock_stat = df_book.groupby(by = group_key)[features_to_apply_realized_volatility]\
                        .agg(realized_volatility).reset_index()

    #other features to look at
    features_to_apply_common_stats=['price_spread','bid_spread','ask_spread','total_volume','volume_imbalance','wap1','wap2','wap_balance']
    more_stock_stat = df_book.groupby(by = group_key)[features_to_apply_common_stats]\
                             .agg(common_stats).reset_index()
    # Flatten the (column, statistic) pairs to '<column>_<statistic>' so both sides
    # of the merge have single-level columns; the key columns keep their plain names.
    more_stock_stat.columns = ['_'.join(part for part in col if part) for col in more_stock_stat.columns]

    helper_stock_stat = book_preprocessor(df_book, stock_id)

    # merging the new features with the old
    stock_stat = stock_stat.merge(more_stock_stat, on = group_key, how = 'left').fillna(-999)

    # book_preprocessor identifies its rows by 'row_id' ('<stock_id>-<time_id>') and
    # has no stock_id / time_id columns, so build the same key here for the join.
    stock_stat['row_id'] = stock_stat['time_id'].apply(lambda x: f'{stock_id}-{x}')
    # Its full-window statistics repeat columns that are already present; keep one copy.
    duplicated = [col for col in helper_stock_stat.columns
                  if col != 'row_id' and col in stock_stat.columns]
    helper_stock_stat = helper_stock_stat.drop(columns = duplicated)
    stock_stat = stock_stat.merge(helper_stock_stat, on = 'row_id', how = 'left')\
                           .drop(columns = 'row_id').fillna(-999)

    #Trade features
    trade_stat =  pd.read_parquet(os.path.join(path_data,'trade_{}.parquet/stock_id={}'.format(dataType, stock_id)))
    trade_stat = trade_stat.sort_values(by=['time_id', 'seconds_in_bucket']).reset_index(drop=True)
    trade_stat['stock_id'] = stock_id
    cols = key + [col for col in trade_stat.columns if col not in key]
    trade_stat = trade_stat[cols]
    trade_stat['trade_log_return1'] = trade_stat.groupby(by = ['time_id'])['price'].transform(log_return).fillna(0)
    trade_stat = trade_stat.groupby(by = group_key)[['trade_log_return1']]\
                           .agg(realized_volatility).reset_index()

    #Joining book and trade features
    stock_stat = stock_stat.merge(trade_stat, on = group_key, how = 'left').fillna(-999)

    return stock_stat

def get_dataSet(stock_ids : list, dataType = 'train'):
    """Compute `get_stock_stat` for every id in `stock_ids` and stack the results.

    The per-stock calls are dispatched through joblib's `Parallel` with
    `n_jobs=-1`; the resulting frames are concatenated with a fresh index.
    """
    jobs = (delayed(get_stock_stat)(stock_id, dataType) for stock_id in stock_ids)
    per_stock_frames = Parallel(n_jobs=-1)(jobs)
    return pd.concat(per_stock_frames, ignore_index = True)

def feval_RMSPE(preds, train_data):
    """Custom evaluation function passed to `lgb.train` as `feval`.

    Returns the tuple (metric name, RMSPE rounded to 5 decimals, False); the
    final False marks the metric as one where lower is better.
    """
    score = rmspe(y_true = train_data.get_label(), y_pred = preds)
    return 'RMSPE', round(score, 5), False

# LightGBM settings shared by every fold. 'metric' is the string 'None' because
# evaluation is done through the custom feval (RMSPE) instead of a built-in metric.
# NOTE(review): 'bagging_freq' is not set (a value of 5 was tried and left out);
# LightGBM presumably ignores 'bagging_fraction' in that case - confirm.
params_lgbm = dict(
    task='train',
    boosting_type='gbdt',
    learning_rate=0.01,
    objective='regression',
    metric='None',
    max_depth=-1,
    n_jobs=-1,
    feature_fraction=0.7,
    bagging_fraction=0.7,
    lambda_l2=1,
    verbose=-1,
)

## Train and test datasets

In [None]:
# Load the training labels (one row per stock_id / time_id).
train = pd.read_csv(os.path.join(path_data, 'train.csv'))


# With DEBUG on, only the first five stock ids are selected.
DEBUG=True
stock_ids = train['stock_id'].unique()
if DEBUG:
    stock_ids = stock_ids[:5]

In [None]:
# Build features only for the selected stock ids (the first five when DEBUG is on,
# all of them otherwise) and keep just their rows in `train`, so the later left
# merge does not leave rows without features. Set DEBUG = False for a full run.
train = train[train['stock_id'].isin(stock_ids)].reset_index(drop=True)
train_stock_stat_df = get_dataSet(stock_ids = stock_ids, dataType = 'train')


In [None]:
# Attach the engineered features to the training labels.
merge_keys = ['stock_id', 'time_id']
train = train.merge(train_stock_stat_df, on = merge_keys, how = 'left')
print(f'Train shape: {train.shape}')
display(train.head(2))

# Same for the test rows; rows without features get 0 in every missing column.
test = pd.read_csv(os.path.join(path_data, 'test.csv'))
test_stock_stat_df = get_dataSet(stock_ids = test['stock_id'].unique(), dataType = 'test')
test = test.merge(test_stock_stat_df, on = merge_keys, how = 'left').fillna(0)
print(f'Test shape: {test.shape}')
display(test.head(2))

## Training model and making predictions

In [None]:
# Column labels that are tuples (column, statistic) are joined with '_' into a
# single string; labels that are already plain are kept unchanged.
all_columns = ["_".join(col) if isinstance(col, tuple) else col
               for col in train.columns.tolist()]
all_columns_test = ["_".join(col) if isinstance(col, tuple) else col
                    for col in test.columns.tolist()]

train.columns = all_columns
test.columns = all_columns_test

In [None]:
# Cross-validated LightGBM training: out-of-fold predictions are stored in
# train[pred_name], test predictions are averaged over the folds, the submission
# file is written and the gain importances of the last fold's model are plotted.
cats = ['stock_id']
model_name = 'lgb1'
pred_name = 'pred_{}'.format(model_name)

# Feature names are '<book column>_<statistic>'; the order (all columns for one
# statistic, then the next statistic) is kept identical to the hand-written list.
stat_names = ['simple_averaging', 'variance', 'std', 'skew', 'kurtosis',
              'quantile_1', 'quantile_2', 'quantile_3', 'count_unique']
book_columns = ['wap1', 'wap2', 'wap_balance', 'price_spread',
                'bid_spread', 'ask_spread', 'total_volume', 'volume_imbalance']
features_to_consider = ['stock_id', 'log_return1', 'log_return2', 'trade_log_return1'] + [
    '{}_{}'.format(book_column, stat_name)
    for stat_name in stat_names
    for book_column in book_columns
]
print('We consider {} features'.format(len(features_to_consider)))

# Initialise as float: the columns receive float predictions below, and writing
# floats into an integer column through .loc is an incompatible-dtype assignment.
train[pred_name] = 0.0
test[target_name] = 0.0

n_folds = 4
n_rounds = 5000
# Shuffled K-fold over row positions, with a fixed seed so the split is reproducible
kf = model_selection.KFold(n_splits=n_folds, shuffle=True, random_state=2016)
scores_folds[model_name] = []
for counter, (dev_index, val_index) in enumerate(kf.split(range(len(train))), start=1):
    print('CV {}/{}'.format(counter, n_folds))
    X_train = train.loc[dev_index, features_to_consider]
    y_train = train.loc[dev_index, target_name].values
    X_val = train.loc[val_index, features_to_consider]
    y_val = train.loc[val_index, target_name].values

    #############################################################################################
    #LGB
    #############################################################################################
    # Each sample is weighted by 1 / target^2 in both the training and validation sets
    train_data = lgb.Dataset(X_train, label=y_train, categorical_feature=cats, weight=1/np.power(y_train,2))
    val_data = lgb.Dataset(X_val, label=y_val, categorical_feature=cats, weight=1/np.power(y_val,2))

    # NOTE(review): `verbose_eval` / `early_stopping_rounds` are presumably not accepted
    # by newer LightGBM releases (callbacks replace them) - confirm for the installed version.
    model = lgb.train(params_lgbm,
                      train_data,
                      n_rounds,
                      valid_sets=val_data,
                      feval=feval_RMSPE,
                      verbose_eval= 250,
                      early_stopping_rounds=500
                     )
    # Out-of-fold predictions for this fold's validation rows
    preds = model.predict(X_val)
    train.loc[val_index, pred_name] = preds
    score = round(rmspe(y_true = y_val, y_pred = preds),5)
    print('Fold {} {}: {}'.format(counter, model_name, score))
    scores_folds[model_name].append(score)
    # Accumulate the test predictions (clipped to be non-negative); averaged after the loop
    test[target_name] += model.predict(test[features_to_consider]).clip(0,1e10)
del train_data, val_data
test[target_name] = test[target_name]/n_folds

# Overall score on the out-of-fold predictions
score = round(rmspe(y_true = train[target_name].values, y_pred = train[pred_name].values),5)
print('RMSPE {}: {} - Folds: {}'.format(model_name, score, scores_folds[model_name]))

display(test[['row_id', target_name]].head(2))
test[['row_id', target_name]].to_csv('submission.csv',index = False)

# Gain-based importances of the model trained in the last fold; plot the top 50
importances = pd.DataFrame({'Feature': model.feature_name(),
                            'Importance': model.feature_importance(importance_type='gain')})
importances.sort_values(by = 'Importance', inplace=True)
importances2 = importances.nlargest(50,'Importance', keep='first').sort_values(by='Importance', ascending=True)
importances2[['Importance', 'Feature']].plot(kind = 'barh', x = 'Feature', figsize = (8,6), color = 'blue', fontsize=11)
plt.ylabel('Feature', fontsize=12)