In [None]:
import numpy as np 
import pandas as pd
import pyarrow.parquet as pq
import glob
import os
from sklearn.model_selection import cross_val_score, KFold
from sklearn.metrics import mean_squared_error as mse
from sklearn.svm import SVR  
import xgboost as xgb
import lightgbm as lgbm

In [None]:
# Load the training table and the sample submission from the competition input directory.
train = pd.read_csv('../input/optiver-realized-volatility-prediction/train.csv')
# NOTE(review): `sample` is not referenced again in the visible part of the notebook.
sample = pd.read_csv('../input/optiver-realized-volatility-prediction/sample_submission.csv')

In [None]:
# Load the test table; engineered book features are merged onto it further down.
test = pd.read_csv('../input/optiver-realized-volatility-prediction/test.csv')

In [None]:
# Collect every entry under the book/trade parquet directories (train and test).
# NOTE(review): glob.glob returns paths in arbitrary order, and these patterns use the
# absolute /kaggle/input prefix while the CSV reads above use the relative ../input prefix.
list_order_book_file_train = glob.glob('/kaggle/input/optiver-realized-volatility-prediction/book_train.parquet/*')
list_order_trade_file_train = glob.glob('/kaggle/input/optiver-realized-volatility-prediction/trade_train.parquet/*')
list_order_book_file_test = glob.glob('/kaggle/input/optiver-realized-volatility-prediction/book_test.parquet/*')
list_order_trade_file_test = glob.glob('/kaggle/input/optiver-realized-volatility-prediction/trade_test.parquet/*')

In [None]:
def log_return(list_stock_prices):
    """Return the element-to-element change of the natural log of a price series.

    Parameters
    ----------
    list_stock_prices : pandas object
        Prices; anything ``np.log`` maps to an object exposing ``.diff()``
        (the notebook passes per-``time_id`` pandas Series).

    Returns
    -------
    Object of the same kind as the input holding ``log(p[t]) - log(p[t-1])``;
    the first entry has no predecessor and is NaN.
    """
    log_prices = np.log(list_stock_prices)
    return log_prices.diff()

In [None]:
def realized_volatility(series_log_return):
    """Return the square root of the sum of squared values.

    Parameters
    ----------
    series_log_return : pandas Series or numpy array
        Values to aggregate (log returns in the notebook).

    Returns
    -------
    Scalar ``sqrt(sum(x ** 2))``. For a pandas Series ``np.sum`` skips NaN entries.
    """
    squared = series_log_return ** 2
    sum_of_squares = np.sum(squared)
    return np.sqrt(sum_of_squares)

In [None]:
def realized_volatility_per_time_id(file_path, prediction_column_name):
    """Build one row of order-book features per ``time_id`` for a single stock.

    Every derived column (four weighted average prices, their within-``time_id``
    log returns, price/level spreads and volume measures) is reduced per
    ``time_id`` with ``sqrt(sum(x ** 2))`` -- the same reduction as
    ``realized_volatility`` -- with NaN entries ignored.

    Parameters
    ----------
    file_path : str
        Parquet location readable by ``pd.read_parquet``. The text after the
        first ``'='`` is used as the stock id (e.g. ``'.../stock_id=0'`` -> ``'0'``).
    prediction_column_name : str
        Unused; kept so existing callers keep working.

    Returns
    -------
    pandas.DataFrame
        Columns ``time_id``, the 15 feature columns, and ``row_id``
        (``'<stock_id>-<time_id>'``).

    Raises
    ------
    IndexError
        If ``file_path`` contains no ``'='``.
    """
    df_book_data = pd.read_parquet(file_path)

    bid_price1, ask_price1 = df_book_data['bid_price1'], df_book_data['ask_price1']
    bid_price2, ask_price2 = df_book_data['bid_price2'], df_book_data['ask_price2']
    bid_size1, ask_size1 = df_book_data['bid_size1'], df_book_data['ask_size1']
    bid_size2, ask_size2 = df_book_data['bid_size2'], df_book_data['ask_size2']
    level1_size = bid_size1 + ask_size1
    level2_size = bid_size2 + ask_size2

    # wap/wap2 weight each price by the opposite side's size; wap3/wap4 by its own side's size.
    df_book_data['wap'] = (bid_price1 * ask_size1 + ask_price1 * bid_size1) / level1_size
    df_book_data['wap2'] = (bid_price2 * ask_size2 + ask_price2 * bid_size2) / level2_size
    df_book_data['wap3'] = (bid_price1 * bid_size1 + ask_price1 * ask_size1) / level1_size
    df_book_data['wap4'] = (bid_price2 * bid_size2 + ask_price2 * ask_size2) / level2_size

    # Log returns restricted to each time_id. GroupBy.diff keeps the original row
    # index, so the assignment is well defined on every pandas version (assigning
    # the result of groupby.apply is not, once group keys are added to the index).
    return_columns = (('wap', 'log_return'), ('wap2', 'log_return2'),
                      ('wap3', 'log_return3'), ('wap4', 'log_return4'))
    for wap_col, ret_col in return_columns:
        df_book_data[ret_col] = np.log(df_book_data[wap_col])
        df_book_data[ret_col] = df_book_data.groupby('time_id')[ret_col].diff()

    df_book_data['price_spread'] = (ask_price1 - bid_price1) / ((ask_price1 + bid_price1) / 2)
    df_book_data['price_spread2'] = (ask_price2 - bid_price2) / ((ask_price2 + bid_price2) / 2)
    df_book_data['bid_spread'] = bid_price1 - bid_price2
    df_book_data['ask_spread'] = ask_price1 - ask_price2
    df_book_data['bid_ask_spread'] = (df_book_data['bid_spread'] - df_book_data['ask_spread']).abs()
    ask_volume = ask_size1 + ask_size2
    bid_volume = bid_size1 + bid_size2
    df_book_data['total_volume'] = ask_volume + bid_volume
    df_book_data['volume_imbalance'] = (ask_volume - bid_volume).abs()

    # A list (not a bare tuple) of column names: tuple selection on a GroupBy was
    # removed in pandas 2.0.
    feature_cols = ['wap', 'wap2', 'wap3', 'wap4',
                    'log_return', 'log_return2', 'log_return3', 'log_return4',
                    'price_spread', 'price_spread2', 'bid_spread', 'ask_spread',
                    'bid_ask_spread', 'total_volume', 'volume_imbalance']

    # sqrt(sum(x^2)) per time_id. The float64 cast keeps the squares of integer
    # size columns from wrapping around; groupby sum skips NaN like Series.sum.
    squared = df_book_data[feature_cols].astype('float64') ** 2
    squared['time_id'] = df_book_data['time_id']
    df_realized_vol_per_stock = np.sqrt(squared.groupby('time_id').sum()).reset_index()

    stock_id = file_path.split('=')[1]
    df_realized_vol_per_stock['row_id'] = df_realized_vol_per_stock['time_id'].apply(lambda x: f'{stock_id}-{x}')
    return df_realized_vol_per_stock

In [None]:
def past_realized_volatility_per_stock(list_file,prediction_column_name):
    """Stack the per-``time_id`` book features of every file in ``list_file``.

    Parameters
    ----------
    list_file : iterable of str
        Paths handed one by one to ``realized_volatility_per_time_id``.
    prediction_column_name : str
        Forwarded unchanged to ``realized_volatility_per_time_id``.

    Returns
    -------
    pandas.DataFrame
        Row-wise concatenation of the per-file frames (each file keeps its own
        index values); an empty DataFrame when ``list_file`` is empty.
    """
    # Collect first and concatenate once: growing a frame inside the loop copies
    # all previous rows on every iteration.
    frames = [realized_volatility_per_time_id(file, prediction_column_name) for file in list_file]
    if not frames:
        # pd.concat rejects an empty list; keep returning an empty frame instead.
        return pd.DataFrame()
    return pd.concat(frames)
# Build the book features for every training stock file (one parquet read per file).
df_past_realized_train = past_realized_volatility_per_stock(list_file=list_order_book_file_train,prediction_column_name='pred')

In [None]:
df_past_realized_train

In [None]:
# Same feature extraction for the test book files.
df_past_realized_test = past_realized_volatility_per_stock(list_file=list_order_book_file_test,prediction_column_name='pred')

In [None]:
df_past_realized_test

In [None]:
# Names of the 15 per-(stock, time_id) feature columns produced by realized_volatility_per_time_id.
# NOTE(review): get_time_stock defines its own local copy of this list.
vol_cols = ['wap','wap2','wap3','wap4','log_return','log_return2','log_return3','log_return4','price_spread','price_spread2','bid_spread','ask_spread','bid_ask_spread','total_volume','volume_imbalance']

In [None]:
def get_time_stock(df):
    """Append per-``time_id`` summary statistics of the feature columns to ``df``.

    For each of the 15 feature columns the mean, std, max and min over all rows
    sharing a ``time_id`` are computed and joined back as
    ``<column>_<stat>_time``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``time_id`` and the 15 feature columns.

    Returns
    -------
    pandas.DataFrame
        New frame with the original columns followed by the 60 statistic
        columns; row order follows ``df`` and the index is renumbered by the merge.
    """
    feature_cols = ['wap','wap2','wap3','wap4','log_return','log_return2','log_return3','log_return4','price_spread','price_spread2','bid_spread','ask_spread','bid_ask_spread','total_volume','volume_imbalance']
    statistics = ['mean', 'std', 'max', 'min']

    per_time = df.groupby(['time_id'])[feature_cols].agg(statistics).reset_index()
    # Flatten the two-level header; the key column ('time_id', '') becomes 'time_id__time'.
    per_time.columns = ['_'.join(levels) + '_time' for levels in per_time.columns]

    merged = df.merge(per_time, how='left', left_on=['time_id'], right_on=['time_id__time'])
    # The renamed join key duplicates time_id, so it is removed again.
    return merged.drop(columns=['time_id__time'])

In [None]:
"""df_stock_id = df_past_realized_test.groupby(['stock_id'])[vol_cols].agg(['mean', 'std', 'max', 'min', ]).reset_index()
    # Rename columns joining suffix
df_stock_id.columns = ['_'.join(col) for col in df_stock_id.columns]
df_stock_id = df_stock_id.add_suffix('_' + 'stock')
df_stock_id"""

# Attach the per-time_id aggregate columns to the test features.
# NOTE(review): the result is bound to the same name as the input, so running this
# cell a second time would feed already-augmented columns back into get_time_stock.
df_past_realized_test = get_time_stock(df_past_realized_test)

In [None]:
df_past_realized_test['row_id']

In [None]:
# Attach the per-time_id aggregate columns to the training features.
# NOTE(review): rebinding the input name makes this cell unsafe to run twice.
df_past_realized_train = get_time_stock(df_past_realized_train)

In [None]:
df_past_realized_train

In [None]:
def realized_volatility_per_time_id2(file_path, prediction_column_name):
    """Build one trade-based volatility value per ``time_id`` for a single stock.

    A per-row quantity ``price * size / order_count`` is stored as ``wap``; its
    within-``time_id`` log returns are reduced with ``sqrt(sum(x ** 2))``.
    Rows without a log return (the first row of each ``time_id``) are dropped
    first, so a ``time_id`` with a single trade row does not appear in the result.

    Parameters
    ----------
    file_path : str
        Parquet location readable by ``pd.read_parquet``; the text after the
        first ``'='`` is used as the stock id.
    prediction_column_name : str
        Unused; kept so existing callers keep working.

    Returns
    -------
    pandas.DataFrame
        Columns ``time_id``, ``log_return3`` and ``row_id``
        (``'<stock_id>-<time_id>'``).

    Raises
    ------
    IndexError
        If ``file_path`` contains no ``'='``.
    """
    df_trade_data = pd.read_parquet(file_path)
    df_trade_data['wap'] = (df_trade_data['price'] * df_trade_data['size']) / df_trade_data['order_count']

    # GroupBy.diff keeps the original row index, so the column assignment is well
    # defined on every pandas version (assigning a groupby.apply result is not).
    df_trade_data['log_return3'] = np.log(df_trade_data['wap'])
    df_trade_data['log_return3'] = df_trade_data.groupby('time_id')['log_return3'].diff()
    df_trade_data = df_trade_data[df_trade_data['log_return3'].notnull()]

    # sqrt(sum(x^2)) per time_id, computed with vectorized group operations.
    sum_of_squares = (
        df_trade_data.assign(log_return3=df_trade_data['log_return3'] ** 2)
        .groupby('time_id')['log_return3']
        .sum()
    )
    df_realized_vol_per_stock = np.sqrt(sum_of_squares).reset_index()

    stock_id = file_path.split('=')[1]
    df_realized_vol_per_stock['row_id'] = df_realized_vol_per_stock['time_id'].apply(lambda x: f'{stock_id}-{x}')
    return df_realized_vol_per_stock

In [None]:
def past_realized_volatility_per_stock2(list_file,prediction_column_name):
    """Stack the per-``time_id`` trade features of every file in ``list_file``.

    Parameters
    ----------
    list_file : iterable of str
        Paths handed one by one to ``realized_volatility_per_time_id2``.
    prediction_column_name : str
        Forwarded unchanged to ``realized_volatility_per_time_id2``.

    Returns
    -------
    pandas.DataFrame
        Row-wise concatenation of the per-file frames; an empty DataFrame when
        ``list_file`` is empty.
    """
    # One concat at the end instead of re-copying the accumulated frame per file.
    frames = [realized_volatility_per_time_id2(file, prediction_column_name) for file in list_file]
    if not frames:
        # pd.concat rejects an empty list; keep returning an empty frame instead.
        return pd.DataFrame()
    return pd.concat(frames)
#df_past_realized_trade_train = past_realized_volatility_per_stock2(list_file=list_order_trade_file_train,prediction_column_name='pred')

In [None]:
#df_past_realized_trade_test = past_realized_volatility_per_stock2(list_file=list_order_trade_file_test,prediction_column_name='pred')

In [None]:
#df_past_realized_trade_train

In [None]:
# The merge with trade features is disabled; only the book features are carried forward.
#train2 = pd.merge(df_past_realized_train,df_past_realized_trade_train,on='row_id', how='outer')
train2 = df_past_realized_train

In [None]:
# The merge with trade features is disabled; only the book features are carried forward.
#test2 = pd.merge(df_past_realized_test,df_past_realized_trade_test,on='row_id', how='outer')
test2 = df_past_realized_test

In [None]:
test2

In [None]:
train2

In [None]:
train

In [None]:
# Build the '<stock_id>-<time_id>' join key with vectorized string operations.
# The previous per-row loop used label lookups (train['stock_id'][i]), which is slow
# and only works while the frame still has its default 0..n-1 index.
train['row_id'] = train['stock_id'].astype(str) + '-' + train['time_id'].astype(str)
train.head()

In [None]:
# Build the '<stock_id>-<time_id>' join key with vectorized string operations.
# The previous per-row loop used label lookups (test['stock_id'][i]), which is slow
# and only works while the frame still has its default 0..n-1 index.
test['row_id'] = test['stock_id'].astype(str) + '-' + test['time_id'].astype(str)
test.head()

In [None]:
# Left join: keep exactly the labelled rows of train.csv. An outer join would also
# add feature rows that have no target, which the later mean-imputation would turn
# into fabricated training labels.
# NOTE(review): both frames carry a time_id column, so the result has time_id_x / time_id_y.
train = pd.merge(train, train2,on='row_id', how='left')

In [None]:
# Left join: the submission must contain exactly the rows listed in test.csv. An
# outer join would append any book-data row_id that is absent from the test table.
# NOTE(review): both frames carry a time_id column, so the result has time_id_x / time_id_y.
test = pd.merge(test, test2,on='row_id', how='left')

In [None]:
test

In [None]:
# Impute missing test features with the TRAINING column means:
#  * numeric_only=True is required because the frame holds the string row_id column
#    (DataFrame.mean() raises on non-numeric columns in pandas >= 2.0);
#  * test-set means are unreliable -- a column that is NaN for every test row has a NaN
#    mean and would stay NaN, which the neural network later turns into NaN predictions.
# Columns that exist only in train (e.g. target) are ignored by fillna.
test = test.fillna(train.mean(numeric_only=True))

In [None]:
test

In [None]:
train

In [None]:
# Rows without a label cannot be used for training; drop them rather than imputing the target.
train = train.dropna(subset=['target']).reset_index(drop=True)
# Impute the remaining gaps with column means. numeric_only=True is required because the
# frame holds the string row_id column (DataFrame.mean() raises on it in pandas >= 2.0).
train = train.fillna(train.mean(numeric_only=True))

In [None]:
columns = train.columns

In [None]:
columns

In [None]:
# Model inputs: the 15 per-(stock, time_id) features ...
xtrain_labels = [
    'wap', 'wap2', 'wap3', 'wap4',
    'log_return', 'log_return2', 'log_return3', 'log_return4',
    'price_spread', 'price_spread2',
    'bid_spread', 'ask_spread', 'bid_ask_spread',
    'total_volume', 'volume_imbalance',
]
# ... followed by their four per-time_id statistics, feature by feature
# ('<feature>_<stat>_time'), 75 names in total.
xtrain_labels = xtrain_labels + [
    f'{feature}_{stat}_time'
    for feature in xtrain_labels
    for stat in ('mean', 'std', 'max', 'min')
]

In [None]:
len(xtrain_labels)

In [None]:
# Training feature matrix: the columns named in xtrain_labels, in that order.
xtrain=train[xtrain_labels]

In [None]:
xtrain.isna().any()

In [None]:
# Regression target taken from the merged training frame.
ytrain = train['target']

In [None]:
# Test feature matrix with the same columns and column order as xtrain.
xtest = test[xtrain_labels]

In [None]:
xtest

In [None]:
# Convert to plain numpy arrays; the fold loops below index them positionally.
# NOTE(review): the names are rebound from DataFrame/Series to ndarray here.
xtrain = np.array(xtrain)
ytrain = np.array(ytrain)
xtest  = np.array(xtest)

In [None]:
def rmse_score(y_true,y_pred):
    """Return the root of the mean squared error between ``y_true`` and ``y_pred``.

    Both arguments are passed straight to ``mse`` (sklearn's
    ``mean_squared_error``), so they must have compatible lengths.
    """
    mean_squared = mse(y_true, y_pred)
    return np.sqrt(mean_squared)

In [None]:
import optuna
from sklearn.model_selection import train_test_split

In [None]:
# 80/20 hold-out split of the training arrays with a fixed seed.
# NOTE(review): the next cell overwrites all four names with empty strings.
X_train, X_test, y_train, y_test = train_test_split(xtrain, ytrain, test_size=0.2, random_state=42)

In [None]:
# Rebind the hold-out split names to empty strings (this drops the references to the arrays).
# NOTE(review): objective() reads these four module-level names, so a study cannot be
# run until they hold real arrays again.
X_train = ""
X_test = ""
y_train = ""
y_test = ""

In [None]:
def objective(trial):
    """Optuna objective: train LightGBM with sampled hyper-parameters, return hold-out RMSE.

    Reads the module-level ``X_train``, ``y_train``, ``X_test`` and ``y_test``
    arrays; they must hold real data when the study runs.

    Parameters
    ----------
    trial : optuna trial object
        Supplies the hyper-parameter suggestions.

    Returns
    -------
    float
        ``rmse_score`` of the trained model on ``X_test`` / ``y_test``
        (lower is better).
    """
    # NOTE(review): several keys use XGBoost-style names; confirm against the LightGBM
    # parameter list that each one maps to the intended LightGBM parameter.
    # NOTE(review): `n_estimators` is also a number-of-iterations setting and may take
    # precedence over num_boost_round below -- confirm.
    param = {
        'metric' : 'rmse',
        'lambda' : trial.suggest_loguniform('lambda' , 1e-7 , 1.0),
        'alpha' : trial.suggest_loguniform('alpha' , 1e-5 , 1.0),
        'colsample_bytree' : trial.suggest_uniform('colsample_bytree' , 0 , 1.0),
        'subsample' : trial.suggest_uniform('subsample' , 0 , 1.0),
        'learning_rate' : trial.suggest_uniform('learning_rate' , 0 , 0.02),
        'n_estimators' : trial.suggest_int('n_estimators' , 1 , 9999),
        'max_depth' : trial.suggest_int('max_depth' , 1 , 20),
        'random_state' : trial.suggest_categorical('random_state' , [0,42,2021]),
        'min_child_weight' : trial.suggest_int('min_child_weight' , 1 , 300)
    }
    # Weighting every sample by 1 / y^2 makes the squared error relative to the target.
    train_weights = 1 / np.square(y_train)
    val_weights = 1 / np.square(y_test)
    train_dataset = lgbm.Dataset(X_train, y_train, weight = train_weights)
    val_dataset = lgbm.Dataset(X_test, y_test, weight = val_weights)
    # Train with the parameters sampled for THIS trial. Passing the unrelated
    # module-level `params` here made every trial train the same model.
    models = lgbm.train(params = param,
                          num_boost_round=1300,
                          train_set = train_dataset,
                          valid_sets = [train_dataset, val_dataset],
                          verbose_eval = 250,
                          early_stopping_rounds=50,feval = feval_rmspe
                      )

    preds = models.predict(X_test)
    acc = rmse_score(y_test, preds)
    return acc


#study = optuna.create_study(direction="minimize")
#study.optimize(objective, n_trials=50)

In [None]:
#params = study.best_trial.params
#params

In [None]:
def get_preds_svm(X,y,X_test,nfolds=5,C=10,kernel='rbf'):
    """Fit one SVR per cross-validation fold and average the test predictions.

    Parameters
    ----------
    X, y : numpy arrays
        Training features and targets; indexed positionally with the fold indices.
    X_test : numpy array
        Features to predict.
    nfolds : int
        Number of shuffled K-fold splits (seed 42).
    C, kernel :
        Passed to ``SVR`` (``gamma='auto'``).

    Returns
    -------
    numpy.ndarray
        Mean of the per-fold predictions for ``X_test``. The per-fold and mean
        validation RMSE are printed.
    """
    scores = []
    preds = np.zeros(X_test.shape[0])
    kf = KFold(n_splits=nfolds, shuffle=True, random_state=42)
    # Split the data that was passed in, not the module-level `train` frame.
    for k, (train_idx, valid_idx) in enumerate(kf.split(X)):
        model = SVR(C=C,kernel=kernel,gamma='auto')
        train_x, train_y = X[train_idx], y[train_idx]
        val_x, val_y = X[valid_idx], y[valid_idx]
        model.fit(train_x,train_y)
        prediction = model.predict(val_x)
        score = rmse_score(val_y, prediction)
        print(f'Fold {k} , rmse score: {score}')
        scores.append(score)
        preds += model.predict(X_test)

    print("mean rmse",np.mean(scores))
    return preds / nfolds

In [None]:
#svm_pred = get_preds_svm(xtrain,ytrain,xtest)

In [None]:
def get_preds_xgr(X,y,X_test,nfolds=10,C=10,kernel='rbf',model_params=None):
    """Fit one XGBRegressor per cross-validation fold and average the test predictions.

    Parameters
    ----------
    X, y : numpy arrays
        Training features and targets; indexed positionally with the fold indices.
    X_test : numpy array
        Features to predict.
    nfolds : int
        Number of shuffled K-fold splits (seed 42).
    C, kernel :
        Unused; kept so existing callers keep working.
    model_params : dict, optional
        Keyword arguments for ``xgb.XGBRegressor``. When omitted, the
        module-level ``params`` dict is used, as before.
        NOTE(review): the ``params`` dict defined in this notebook uses
        LightGBM key names; pass XGBoost parameters explicitly.

    Returns
    -------
    numpy.ndarray
        Mean of the per-fold predictions for ``X_test``. The per-fold and mean
        validation RMSE are printed.
    """
    if model_params is None:
        model_params = params
    scores = []
    preds = np.zeros(X_test.shape[0])
    kf = KFold(n_splits=nfolds, shuffle=True, random_state=42)
    # Split the data that was passed in, not the module-level `train` frame.
    for k, (train_idx, valid_idx) in enumerate(kf.split(X)):
        model = xgb.XGBRegressor(**model_params)
        train_x, train_y = X[train_idx], y[train_idx]
        val_x, val_y = X[valid_idx], y[valid_idx]
        model.fit(train_x,train_y)
        prediction = model.predict(val_x)
        score = rmse_score(val_y, prediction)
        print(f'Fold {k} , rmse score: {score}')
        scores.append(score)
        preds += model.predict(X_test)

    print("mean rmse",np.mean(scores))
    return preds / nfolds

In [None]:
#xgr_pred = get_preds_xgr(xtrain,ytrain,xtest)

In [None]:
# Single seed shared by every LightGBM randomness source below.
seed0=42
# LightGBM training parameters used by get_preds_lgbm.
# No categorical column is declared: the model input follows xtrain_labels, whose
# first column is the continuous 'wap' feature, so marking column 0 as categorical
# would make LightGBM treat a real-valued feature as category codes.
params = {
    'objective': 'rmse',
    'boosting_type': 'gbdt',
    'max_depth': -1,
    'max_bin':100,
    'min_data_in_leaf':500,
    'learning_rate': 0.05,
    'subsample': 0.72,
    'subsample_freq': 4,
    'feature_fraction': 0.5,
    'lambda_l1': 0.5,
    'lambda_l2': 1.0,
    'seed':seed0,
    'feature_fraction_seed': seed0,
    'bagging_seed': seed0,
    'drop_seed': seed0,
    'data_random_seed': seed0,
    'n_jobs':-1,
    'verbose': -1}

In [None]:
def rmspe(y_true, y_pred):
    """Return the root mean squared percentage error.

    Computes ``sqrt(mean(((y_true - y_pred) / y_true) ** 2))``; the error of
    each element is taken relative to its true value.
    """
    relative_error = (y_true - y_pred) / y_true
    mean_squared_relative = np.mean(np.square(relative_error))
    return np.sqrt(mean_squared_relative)

In [None]:
def feval_rmspe(y_pred, lgb_train):
    """Custom evaluation function in the form passed as ``feval`` to ``lgbm.train``.

    Parameters
    ----------
    y_pred : array
        Predictions for the evaluated dataset.
    lgb_train : object with ``get_label()``
        Dataset whose labels are the ground truth.

    Returns
    -------
    tuple
        ``('RMSPE', value, False)``; the trailing ``False`` is the
        "higher is better" flag.
    """
    score = rmspe(lgb_train.get_label(), y_pred)
    return 'RMSPE', score, False

In [None]:
def get_preds_lgbm(X,y,Xtest,nfolds=10):
    """Train one LightGBM model per cross-validation fold and average the test predictions.

    Uses the module-level ``params`` dict and the ``feval_rmspe`` evaluation
    function. Every sample is weighted by ``1 / y ** 2``, which makes the
    squared error relative to the target.

    Parameters
    ----------
    X, y : numpy arrays
        Training features and targets; indexed positionally with the fold indices.
    Xtest : numpy array
        Features to predict.
    nfolds : int
        Number of shuffled K-fold splits (seed 42).

    Returns
    -------
    numpy.ndarray
        Mean of the per-fold predictions for ``Xtest``. The per-fold and mean
        validation RMSE are printed, and the feature importance of the last
        fold's model is plotted.
    """
    scores = []
    preds = np.zeros(Xtest.shape[0])
    kf = KFold(n_splits=nfolds, shuffle=True, random_state=42)
    # Split the data that was passed in, not the module-level `train` frame.
    for k, (train_idx, valid_idx) in enumerate(kf.split(X)):
        X_trains, y_trains = X[train_idx], y[train_idx]
        X_tests, y_tests = X[valid_idx], y[valid_idx]
        train_weights = 1 / np.square(y_trains)
        val_weights = 1 / np.square(y_tests)
        train_dataset = lgbm.Dataset(X_trains, y_trains, weight = train_weights)
        val_dataset = lgbm.Dataset(X_tests, y_tests, weight = val_weights)
        # NOTE(review): verbose_eval / early_stopping_rounds are keyword arguments of
        # older LightGBM releases; confirm they exist in the installed version.
        model = lgbm.train(params = params,
                          num_boost_round=1300,
                          train_set = train_dataset,
                          valid_sets = [train_dataset, val_dataset],
                          verbose_eval = 250,
                          early_stopping_rounds=50,
                          feval = feval_rmspe)
        prediction = model.predict(X_tests)
        score = rmse_score(y_tests, prediction)
        print(f'Fold {k} , rmse score: {score}')
        scores.append(score)
        preds += model.predict(Xtest)
    # Only the model of the final fold is shown.
    lgbm.plot_importance(model,max_num_features=20)
    print("mean rmse",np.mean(scores))
    return preds / nfolds

In [None]:
# Cross-validated LightGBM predictions for the test rows (default 10 folds).
lgbm_pred = get_preds_lgbm(xtrain,ytrain,xtest)

In [None]:
from sklearn.model_selection import train_test_split 
from keras.utils import to_categorical
from keras.preprocessing.text import Tokenizer
from keras.models import Sequential
from keras.layers import Dense, Activation, Embedding, LSTM,Dropout,concatenate,Input
from sklearn.feature_extraction.text import CountVectorizer
import tensorflow as tf
from tensorflow.python.keras.layers import Dense, Activation, Embedding, LSTM,Dropout,Bidirectional,GRU
from keras.utils import plot_model
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Dense , Flatten ,Embedding,Input,Conv1D,GlobalAveragePooling1D,GlobalMaxPooling1D,Dropout,MaxPooling1D,Bidirectional,GRU,Concatenate
from keras.models import Sequential,Model
from keras.wrappers.scikit_learn import KerasRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline

In [None]:
import keras

In [None]:
def crt_model(n_features=75):
    """Build and compile the fully connected regression network.

    Parameters
    ----------
    n_features : int, default 75
        Width of the input vector; the default equals the number of feature
        columns selected for the model (``len(xtrain_labels)``).

    Returns
    -------
    Compiled Keras ``Model``: six ReLU dense layers
    (128-256-512-256-128-64 units) followed by a single linear output unit,
    trained with mean squared error and the Adam optimizer.
    """
    # The shape must be a tuple; `(75)` without a trailing comma is just the integer 75.
    inputs = Input(shape=(n_features,))
    hidden = inputs
    for units in (128, 256, 512, 256, 128, 64):
        hidden = Dense(units, kernel_initializer='normal', activation='relu')(hidden)
    output = Dense(1, kernel_initializer='normal')(hidden)
    model = Model(inputs=inputs, outputs=output)
    model.compile(loss='mean_squared_error', optimizer='adam',metrics=[keras.metrics.MeanSquaredError()])
    return model


In [None]:
# Build one instance of the network (get_res creates its own model per fold).
model=crt_model()
#keras.utils.plot_model(model)

In [None]:
model.summary()

In [None]:
def get_res(train_embedd,target,test_embedd):
    """Train the dense network on 10 cross-validation folds and average the test predictions.

    Parameters
    ----------
    train_embedd, target : numpy arrays
        Training features and targets; indexed positionally with the fold indices.
    test_embedd : numpy array
        Features to predict.

    Returns
    -------
    numpy.ndarray
        One-dimensional mean of the per-fold predictions for ``test_embedd``.
        The per-fold and mean validation RMSE are printed.
    """
    nfolds = 10
    scores = []
    preds = np.zeros(test_embedd.shape[0])
    kf = KFold(n_splits=nfolds, shuffle=True, random_state=42)
    # Split the data that was passed in, not the module-level `train` frame.
    for k, (train_idx, valid_idx) in enumerate(kf.split(train_embedd)):
        model = crt_model()
        train_x, train_y = train_embedd[train_idx], target[train_idx]
        val_x, val_y = train_embedd[valid_idx], target[valid_idx]
        # The model has a single input, so the arrays are passed directly.
        model.fit(train_x, train_y, epochs=100, validation_data=(val_x, val_y), batch_size=128)
        # predict() returns shape (n, 1); flatten before scoring and accumulating.
        val_pred = model.predict(val_x).reshape(-1)
        score = rmse_score(val_y, val_pred)
        scores.append(score)
        print(f'Fold {k} , rmse score: {score}')
        preds += model.predict(test_embedd).reshape(-1)

    print("mean rmse",np.mean(scores))
    return preds / nfolds

In [None]:
# Cross-validated predictions of the dense network for the test rows.
mccnn_preds = get_res(xtrain,ytrain,xtest)

In [None]:
# Weighted blend: 95% LightGBM, 5% neural network.
# NOTE(review): the weights are hard-coded; no validation of this blend is visible here.
pred = 0.95*lgbm_pred + 0.05 * mccnn_preds


In [None]:
# Flatten to one dimension before building the submission frame.
pred=pred.reshape(-1)
pred

In [None]:
# Pair each test row_id with its blended prediction and write the submission file.
# `pred` is positional, so it relies on `test` still being in the row order used for xtest.
submission = pd.DataFrame({'row_id':test.row_id,'target':pred})
submission.to_csv('submission.csv',index=False)

In [None]:
submission