In [None]:
# Predict target for each time_id
# At the end, take the sum of the target values

In [None]:
import numpy as np # linear algebra
import pandas as pd 
import glob
import re

In [None]:
# Load the labelled rows (train.csv) and the rows to predict (test.csv)
train, test = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../input/optiver-realized-volatility-prediction/train.csv',
        '../input/optiver-realized-volatility-prediction/test.csv',
    )
)

In [None]:
# Summary statistics of the train table
train.describe()

In [None]:
# First rows of the test table
test.head()

In [None]:
# List the per-stock entries of the book and trade parquet datasets
# (first 10 minutes book & trade data), for both train and test

train_detail, train_trade, test_detail, test_trade = (
    glob.glob(pattern)
    for pattern in (
        '/kaggle/input/optiver-realized-volatility-prediction/book_train.parquet/*',
        '/kaggle/input/optiver-realized-volatility-prediction/trade_train.parquet/*',
        '/kaggle/input/optiver-realized-volatility-prediction/book_test.parquet/*',
        '/kaggle/input/optiver-realized-volatility-prediction/trade_test.parquet/*',
    )
)

In [None]:
# Peek at the book data of a single stock (stock_id=6)
sample_book_path = '/kaggle/input/optiver-realized-volatility-prediction/book_train.parquet/stock_id=6'
booktp = pd.read_parquet(sample_book_path)
booktp.head()

In [None]:
# Summary statistics of the trade data of a single stock (stock_id=6)
sample_trade_path = '/kaggle/input/optiver-realized-volatility-prediction/trade_train.parquet/stock_id=6'
df = pd.read_parquet(sample_trade_path)
df.describe()

In [None]:
# Empty table that will hold one feature row (plus target) per stock_id / time_id
train_clean_columns = [
    'stock_id', 'time_id',
    'wap_mean', 'wap_std',
    'log_return_mean', 'log_return_std',
    'price_minmax', 'size_minmax', 'order_minmax',
    'target',
]
train_clean = pd.DataFrame(columns=train_clean_columns)

In [None]:
# Only the first two book paths are used to build the training features
ex = train_detail[:2]

In [None]:
# create feature value (wap,log_return,minmax of price etc...)
#
# Builds train_clean with one row per (stock_id, time_id) for the paths in `ex`:
#   book features  : mean/std of the weighted average price (wap) and of its
#                    log return, computed inside each time_id
#   trade features : (max - min) of price, size and order_count inside each
#                    time_id, divided by that stock's median over the whole file
#   target         : taken from `train`; a time_id with no matching row in
#                    `train` yields no output row
# The table is rebuilt from scratch, so re-running the cell does not duplicate rows.

feature_columns = ['stock_id','time_id','wap_mean','wap_std','log_return_mean','log_return_std',
                   'price_minmax','size_minmax','order_minmax','target']
# trade column -> name of the feature derived from it
trade_feature_names = {'price':'price_minmax','size':'size_minmax','order_count':'order_minmax'}
trade_cols = list(trade_feature_names)

stock_frames = []

for path in ex:
    # Anchor on the "stock_id=<n>" part of the path so the match cannot land
    # on some other run of digits in the path.
    m = re.search(r'stock_id=(\d+)', path)
    if m is None:
        raise ValueError('cannot find a stock id in path: ' + path)
    idtp = m.group(1) # stock id as text, reused to build the trade path
    stock_id = int(idtp)

    # read book data
    book = pd.read_parquet(path)

    # wap
    book['wap'] = (book['bid_price1'] * book['ask_size1'] +
                   book['ask_price1'] * book['bid_size1']) / (
                        book['bid_size1'] + book['ask_size1'])

    # log_return, differenced inside each time_id: the first row of a time_id
    # becomes NaN instead of being compared with the last row of the previous
    # time_id. mean()/std() below skip NaN, so no rows need to be dropped.
    book['log_return'] = np.log(book['wap']).groupby(book['time_id']).diff()

    # one row of book statistics per time_id
    book_features = book.groupby('time_id').agg(
        wap_mean=('wap', 'mean'),
        wap_std=('wap', 'std'),
        log_return_mean=('log_return', 'mean'),
        log_return_std=('log_return', 'std'),
    )

    # read trading data
    trade_data = pd.read_parquet('/kaggle/input/optiver-realized-volatility-prediction/trade_train.parquet/stock_id='+idtp)

    # range of each trade column per time_id, scaled by the stock-wide median
    trade_by_time = trade_data.groupby('time_id')[trade_cols]
    trade_features = ((trade_by_time.max() - trade_by_time.min())
                      / trade_data[trade_cols].median()).rename(columns=trade_feature_names)

    # a time_id without any trade keeps its book features and gets NaN trade features
    stock_features = book_features.join(trade_features, how='left').reset_index()
    stock_features.insert(0, 'stock_id', stock_id)

    # attach the target; the inner merge drops time_ids that `train` does not contain
    targets = train.loc[train['stock_id'] == stock_id, ['time_id', 'target']]
    stock_frames.append(stock_features.merge(targets, on='time_id', how='inner'))

if stock_frames:
    # concatenate once instead of growing the frame row by row
    train_clean = pd.concat(stock_frames, ignore_index=True)[feature_columns]
else:
    train_clean = pd.DataFrame(columns=feature_columns)

In [None]:
# First rows of the assembled training feature table
train_clean.head()

In [None]:
import seaborn as sns

In [None]:
# Pairwise correlation of the train_clean columns, drawn on a fixed
# -1..1 colour scale centred at 0
corr_matrix = train_clean.corr()
heatmap_options = dict(vmax=1, vmin=-1, center=0, cmap='Blues')
sns.heatmap(corr_matrix, **heatmap_options)

In [None]:
# Model inputs are every column except the identifiers, two left-out
# features and the label itself; the label becomes y
excluded_columns = ['stock_id','time_id','wap_std','size_minmax','target']
y = train_clean['target']
X = train_clean.drop(columns=excluded_columns)

In [None]:
# Keep the mean / standard deviation of the features and of the target
# so the same scaling can be applied (or inverted) later
Xmean = X.mean()
Xstd = X.std()
ymean = y.mean()
ystd = y.std()

In [None]:
# Standardization: centre each feature column on its mean, then scale by its std
X = X.sub(Xmean).div(Xstd)

In [None]:
# First rows of the standardized feature table
X.head()

In [None]:
# Pairwise plots of the standardized features; the returned grid is kept in pg
pg = sns.pairplot(X)

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
from sklearn.linear_model import ElasticNet
import lightgbm as lgb #LightGBM
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score 
from matplotlib import pyplot as plt

In [None]:
# Hold out part of the rows for evaluation. The fixed random_state makes the
# split, and every score computed from it, identical on each run.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

In [None]:
# Fit a LightGBM regressor (default settings) on the training split,
# then predict the held-out split
model = lgb.LGBMRegressor().fit(X_train, y_train)
y_pred = model.predict(X_test)

In [None]:
# Draw a scatter plot (true value vs. predicted value)
# The red line marks perfect predictions; the legend and axis labels let the
# figure be read on its own.
fig, ax = plt.subplots()
ax.plot(y_test, y_test, color = 'red', label = 'x=y')
ax.scatter(y_test, y_pred, label = 'prediction')
ax.set_xlabel('true target')
ax.set_ylabel('predicted target')
ax.legend();

In [None]:
# Scores on the held-out split
# rmse : root of the mean squared error
# r2   : coefficient of determination
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
r2 = r2_score(y_test,y_pred)

for score_label, score_value in (('RMSE :', rmse), ('R2 :', r2)):
    print(score_label, score_value)

In [None]:
# Make submission CSV
# create feature value (wap,log_return,minmax of price etc...)
#
# Builds test_clean with one row per (stock_id, time_id) found in the test
# book files:
#   book features  : mean/std of the weighted average price (wap) and of its
#                    log return, computed inside each time_id
#   trade features : (max - min) of price, size and order_count inside each
#                    time_id, divided by that stock's median over the whole file

test_feature_columns = ['stock_id','time_id','wap_mean','wap_std','log_return_mean','log_return_std',
                        'price_minmax','size_minmax','order_minmax']
# trade column -> name of the feature derived from it
test_trade_feature_names = {'price':'price_minmax','size':'size_minmax','order_count':'order_minmax'}
test_trade_cols = list(test_trade_feature_names)

test_frames = []

for path in test_detail:
    # Anchor on the "stock_id=<n>" part of the path so the match cannot land
    # on some other run of digits in the path.
    m = re.search(r'stock_id=(\d+)', path)
    if m is None:
        raise ValueError('cannot find a stock id in path: ' + path)
    idtp = m.group(1) # stock id as text, reused to build the trade path
    stock_id = int(idtp)

    # read test book data
    book = pd.read_parquet(path)

    # wap
    book['wap'] = (book['bid_price1'] * book['ask_size1'] +
                   book['ask_price1'] * book['bid_size1']) / (
                        book['bid_size1'] + book['ask_size1'])

    # log_return, differenced inside each time_id: the first row of a time_id
    # becomes NaN instead of being compared with the last row of the previous
    # time_id. mean()/std() below skip NaN, so no rows need to be dropped.
    book['log_return'] = np.log(book['wap']).groupby(book['time_id']).diff()

    # one row of book statistics per time_id
    book_features = book.groupby('time_id').agg(
        wap_mean=('wap', 'mean'),
        wap_std=('wap', 'std'),
        log_return_mean=('log_return', 'mean'),
        log_return_std=('log_return', 'std'),
    )

    # read trading data
    trade_data = pd.read_parquet('/kaggle/input/optiver-realized-volatility-prediction/trade_test.parquet/stock_id='+idtp)

    # range of each trade column per time_id, scaled by the stock-wide median
    trade_by_time = trade_data.groupby('time_id')[test_trade_cols]
    trade_features = ((trade_by_time.max() - trade_by_time.min())
                      / trade_data[test_trade_cols].median()).rename(columns=test_trade_feature_names)

    # a time_id without any trade keeps its book features and gets NaN trade features
    stock_features = book_features.join(trade_features, how='left').reset_index()
    stock_features.insert(0, 'stock_id', stock_id)
    test_frames.append(stock_features)

if test_frames:
    # concatenate once instead of growing the frame row by row
    test_clean = pd.concat(test_frames, ignore_index=True)[test_feature_columns]
else:
    test_clean = pd.DataFrame(columns=test_feature_columns)

In [None]:
# Keep only the model input columns of the test table and show the first rows
test_excluded_columns = ['stock_id','time_id','wap_std','size_minmax']
X = test_clean.drop(columns=test_excluded_columns)
X.head()

In [None]:
# Standardization of the test features with the saved Xmean / Xstd
X = X.sub(Xmean).div(Xstd)

In [None]:
# Predict the target for the standardized test features (note: reuses the name y)
y = model.predict(X)

In [None]:
# submission file
# One output row per row of test_clean: row_id is "<stock_id>-<time_id>" and
# target is the prediction at the same position in y.

dic={'row_id':[],'target':[]}
# Iterate by position so each row gets its own ids and prediction; label-based
# lookups are avoided because the index of test_clean may contain duplicates.
for id1, id2, pred in zip(test_clean['stock_id'], test_clean['time_id'], y):
    # int() keeps the ids free of a trailing ".0" if the columns are not integer typed
    dic['row_id'].append(str(int(id1))+'-'+str(int(id2)))
    dic['target'].append(pred)

submission=pd.DataFrame(dic)
submission.to_csv('submission.csv', index=False)