In [None]:
import pandas as pd
import numpy as np
# Training table; later cells read its stock_id and time_id columns to build row_id.
train = pd.read_csv('../input/optiver-realized-volatility-prediction/train.csv')
#test = pd.read_csv('../input/optiver-realized-volatility-prediction/test.csv')
# NOTE(review): book_train is not referenced again in the visible notebook; the
# files under book_train.parquet/ are re-read one at a time by
# realized_volatility_per_time_id. Confirm whether this full read is still needed.
book_train = pd.read_parquet('../input/optiver-realized-volatility-prediction/book_train.parquet')
#trade_train = pd.read_parquet('../input/optiver-realized-volatility-prediction/trade_train.parquet')
#book_test = pd.read_parquet('../input/optiver-realized-volatility-prediction/book_train.parquet')
#print(train.head(2))
#print(test.head(2))
#print(book_train.head(2))
#print(trade_train.head(2))
#print(trade_test.head(2))

In [None]:
def log_return(list_stock_prices):
    """Return consecutive differences of the natural log of a price series.

    The input must support ``.diff()`` after ``np.log`` is applied (e.g. a
    pandas Series); the first element of the result is NaN because it has no
    predecessor.
    """
    log_prices = np.log(list_stock_prices)
    return log_prices.diff()

In [None]:
def realized_volatility(series_log_return):
    """Return the square root of the sum of squared log returns."""
    sum_of_squares = np.sum(series_log_return ** 2)
    return np.sqrt(sum_of_squares)

In [None]:
# To calculate WAP and realized volatility per stock id and time id
def realized_volatility_per_time_id(file_path, prediction_column_name):
    """Compute one realized-volatility value per time_id from one order-book file.

    A weighted average price (WAP) is built from the level-1 bid/ask prices and
    sizes, log returns of the WAP are taken within each ``time_id``, and
    ``realized_volatility`` is applied to each ``time_id``'s returns.

    Parameters
    ----------
    file_path : str
        Path handed to ``pd.read_parquet``. The segment following the first
        ``'='`` is used as the stock id (e.g. ``'.../stock_id=0'`` -> ``'0'``).
    prediction_column_name : str
        Name given to the volatility column of the result.

    Returns
    -------
    pandas.DataFrame
        Columns ``['row_id', prediction_column_name]`` where ``row_id`` is
        ``'<stock_id>-<time_id>'``.

    Side effects
    ------------
    Prints ``file_path`` as a progress trace.
    """
    print(file_path)
    df_book_data = pd.read_parquet(file_path)

    weighted_price = (df_book_data['bid_price1'] * df_book_data['ask_size1']
                      + df_book_data['ask_price1'] * df_book_data['bid_size1'])
    total_size = df_book_data['bid_size1'] + df_book_data['ask_size1']
    df_book_data['wap'] = weighted_price / total_size

    # transform() always returns a result aligned to the original row index.
    # groupby(...).apply() prepends the group key to the index on pandas >= 2.0,
    # which makes the column assignment fail.
    df_book_data['log_return'] = df_book_data.groupby('time_id')['wap'].transform(log_return)

    # The first row of every time_id has no previous price, hence a NaN return.
    df_book_data = df_book_data[df_book_data['log_return'].notnull()]

    df_realized_vol_per_stock = (
        df_book_data.groupby('time_id')['log_return']
        .agg(realized_volatility)
        .reset_index(name=prediction_column_name)
    )

    stock_id = file_path.split('=')[1]
    df_realized_vol_per_stock['row_id'] = (
        f'{stock_id}-' + df_realized_vol_per_stock['time_id'].astype(str)
    )
    return df_realized_vol_per_stock[['row_id', prediction_column_name]]

In [None]:
def past_realized_volatility_per_stock(list_file,prediction_column_name):
    """Stack the per-file realized-volatility tables for every order-book file.

    Parameters
    ----------
    list_file : iterable of str
        Paths, each passed to ``realized_volatility_per_time_id``.
    prediction_column_name : str
        Name of the volatility column in every per-file table.

    Returns
    -------
    pandas.DataFrame
        The per-file tables concatenated row-wise (original indexes kept), or
        an empty DataFrame when ``list_file`` yields nothing.
    """
    # Collect first and concatenate once: calling pd.concat inside the loop
    # re-copies every previously accumulated row on each iteration.
    per_file_frames = [
        realized_volatility_per_time_id(file, prediction_column_name)
        for file in list_file
    ]
    if not per_file_frames:
        # pd.concat raises on an empty list; keep the empty-frame result instead.
        return pd.DataFrame()
    return pd.concat(per_file_frames)

In [None]:
import os
import glob
# glob.glob returns paths in arbitrary, filesystem-dependent order; sort them so
# the row order of the aggregated frame is the same on every run.
list_order_book_file_train = sorted(
    glob.glob('/kaggle/input/optiver-realized-volatility-prediction/book_train.parquet/*')
)
df_past_realized_train = past_realized_volatility_per_stock(list_file=list_order_book_file_train,
                                                           prediction_column_name='pred')

In [None]:
# glob.glob returns paths in arbitrary, filesystem-dependent order; sort them so
# the row order of the aggregated frame is the same on every run.
list_order_book_file_test = sorted(
    glob.glob('/kaggle/input/optiver-realized-volatility-prediction/book_test.parquet/*')
)
df_past_realized_test = past_realized_volatility_per_stock(list_file=list_order_book_file_test,
                                                           prediction_column_name='test_past_vol')

In [None]:
df_past_realized_train.head()

In [None]:
df_past_realized_test.head()

In [None]:
# Build the same "<stock_id>-<time_id>" key that realized_volatility_per_time_id emits.
train['row_id'] = train['stock_id'].astype(str) + '-' + train['time_id'].astype(str)

# Rename and drop by column name rather than overwriting .columns positionally,
# so an extra or reordered column cannot silently mislabel the features.
df_joined = (
    train.merge(df_past_realized_train[['row_id', 'pred']], on='row_id', how='left')
    .rename(columns={'pred': 'past_vol'})
    .drop(columns='row_id')
)

In [None]:
df_joined.head()

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.feature_selection import RFE
from sklearn.linear_model import LinearRegression
import statsmodels.api as sm
from statsmodels.stats.outliers_influence import variance_inflation_factor
from sklearn.metrics import r2_score
import warnings
warnings.filterwarnings('ignore')

In [None]:
np.random.seed(0) # to select same sample everytime
df_train, df_test = train_test_split(df_joined, train_size=0.7, test_size=0.3, random_state=100)

In [None]:
print(df_train.shape)
print(df_test.shape)

In [None]:
df_train.info()

In [None]:
scaler = MinMaxScaler()

In [None]:
#num_vars = ['stock_id', 'time_id', 'target', 'past_vol']
#df_train[num_vars] = scaler.fit_transform(df_train[num_vars])

In [None]:
## Dividing into X & y sets for model building

# Select instead of pop(): pop() removes 'target' from df_train in place, so
# running this cell a second time raised KeyError. This form is re-runnable.
X_train = df_train.drop(columns='target')
y_train = df_train['target']

In [None]:
## Fitting the model

lm = LinearRegression()
lm.fit(X_train, y_train)
#print(X_train.info())
#print(y_train.info())

In [None]:
# n_features_to_select must be passed by keyword: current scikit-learn releases
# reject it as a positional argument.
rfe = RFE(lm, n_features_to_select=3)
rfe.fit(X_train, y_train)

In [None]:
list(zip(X_train.columns,rfe.support_, rfe.ranking_))

In [None]:
col = X_train.columns[rfe.support_]
col

In [None]:
X_train_rfe = X_train[col]

In [None]:
# Variance inflation factor for each RFE-selected feature, largest first.
X = X_train_rfe
vif = pd.DataFrame({
    'Features': X.columns,
    'VIF': [variance_inflation_factor(X.values, idx) for idx in range(X.shape[1])],
})
vif['VIF'] = vif['VIF'].round(2)
vif = vif.sort_values(by="VIF", ascending=False)
vif

In [None]:
X_train_rfe

In [None]:
## First model

# Adding a constant variable 
X_train_lm = sm.add_constant(X_train_rfe)

## Running the model
lm1 = sm.OLS(y_train, X_train_lm).fit()

In [None]:
lm1.summary()

In [None]:
y_train_pred = lm1.predict(X_train_lm)

In [None]:
X_train_lm

In [None]:
y_train_pred

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
# Distribution of the in-sample residuals (y_train minus the lm1 fitted values).
fig = plt.figure()
# NOTE(review): distplot is deprecated in recent seaborn releases — confirm the
# installed version and consider histplot/displot before upgrading.
sns.distplot((y_train - y_train_pred), bins = 20)
fig.suptitle('Error Terms', fontsize = 20)                  # Plot heading 
plt.xlabel('Errors', fontsize = 18)

In [None]:
#num_vars = ['stock_id', 'time_id', 'target', 'past_vol']
#df_test[num_vars] = scaler.fit_transform(df_test[num_vars])

In [None]:
# Select instead of pop(): pop() removes 'target' from df_test in place, so
# running this cell a second time raised KeyError. This form is re-runnable.
X_test = df_test.drop(columns='target')
y_test = df_test['target']

In [None]:
X_test.head()

In [None]:
# Now let's use our model to make predictions.

# Keep exactly the RFE-selected columns (`col`) that lm1 was fitted on, in the
# same order, then prepend the intercept column as was done for X_train_lm.
X_test_new = sm.add_constant(X_test[col])

In [None]:
X_test_new.head()

In [None]:
y_test_pred = lm1.predict(X_test_new)

In [None]:
# Plotting y_test and y_pred to understand the spread.
import matplotlib.pyplot as plt
fig = plt.figure()
plt.scatter(y_test,y_test_pred)
fig.suptitle('y_test vs y_test_pred', fontsize=20)              # Plot heading 
plt.xlabel('y_test', fontsize=18)                          # X-label
plt.ylabel('y_test_pred', fontsize=16)   

In [None]:
r2_score(y_true=y_test, y_pred=y_test_pred)

In [None]:
book_test = pd.read_parquet('../input/optiver-realized-volatility-prediction/book_test.parquet')
print(book_test)

In [None]:
trade_test = pd.read_parquet('../input/optiver-realized-volatility-prediction/trade_test.parquet')
trade_test

In [None]:
print(df_past_realized_test)

In [None]:
test = pd.read_csv('../input/optiver-realized-volatility-prediction/test.csv')
test

In [None]:
# `test` is merged on its existing row_id column, so no key needs to be built here.
# Rename and drop by column name rather than overwriting .columns positionally,
# so an extra or reordered column cannot silently mislabel the features.
df_test_joined = (
    test.merge(df_past_realized_test[['row_id', 'test_past_vol']], on='row_id', how='left')
    .rename(columns={'test_past_vol': 'past_vol'})
    .drop(columns='row_id')
)
print(df_test_joined.head())

In [None]:
#num_vars = ['stock_id', 'time_id', 'test_past_vol']
#df_test_joined[num_vars] = scaler.fit_transform(df_test_joined[num_vars])
df_test_joined

In [None]:
X_test_joined = df_test_joined

In [None]:
# Feed lm1 exactly the RFE-selected columns it was fitted on, in the same order.
# has_constant='add' forces the intercept column: the default ('skip') omits it
# when some column is already a non-zero constant, which can happen on a very
# small test frame and would make lm1.predict fail on a column-count mismatch.
X_test_joined = sm.add_constant(df_test_joined[col], has_constant='add')
print(X_test_joined)

In [None]:
# Impute rather than drop: rows removed here get no prediction, and the later
# index-aligned concat with df_test_joined then leaves NaN targets in the
# submission. Rows without order-book data fall back to the training-set mean
# of past_vol so every test row is scored.
X_test_joined = X_test_joined.fillna({'past_vol': X_train['past_vol'].mean()})
X_test_joined

In [None]:
#X_test_joined = sm.add_constant(X_test_joined)
y_test_joined_pred = lm1.predict(X_test_joined)

In [None]:
y_test_joined_pred

In [None]:
# Assemble the submission frame: the row_id key plus the predicted target,
# aligned on the row index of df_test_joined.
results = pd.DataFrame({
    'row_id': df_test_joined['stock_id'].astype(str) + '-' + df_test_joined['time_id'].astype(str),
    'target': y_test_joined_pred,
})
results

In [None]:
# index=False: the submission must contain only row_id and target; the default
# would write the DataFrame index as an extra unnamed first column.
results.to_csv('submission.csv', index=False)