In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
import glob
from tqdm.notebook import tqdm
import gc
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression

In [None]:
# Load train.csv from the competition input directory and preview the first rows.
df_train = pd.read_csv('../input/optiver-realized-volatility-prediction/train.csv')
df_train.head()

In [None]:
# Load test.csv from the competition input directory and preview the first rows.
df_test = pd.read_csv('../input/optiver-realized-volatility-prediction/test.csv')
df_test.head()

In [None]:
# Load sample_submission.csv and preview the first rows.
submission = pd.read_csv('../input/optiver-realized-volatility-prediction/sample_submission.csv')
submission.head()

In [None]:
def read_data(path):
    """Read the parquet data located at ``path`` and return it as a DataFrame."""
    return pd.read_parquet(path)


def consol_book_df(path):
    """Build one row of order-book features per (stock_id, time_id).

    The parquet data at ``path`` is loaded with ``read_data`` and must provide
    the columns ``time_id``, ``bid_price1/2``, ``ask_price1/2``,
    ``bid_size1/2`` and ``ask_size1/2``.

    Parameters
    ----------
    path : str
        Location of one stock's book data. The stock id is parsed from the
        text that follows the first ``=`` in the path, so that text must be
        an integer (e.g. ``.../stock_id=0``).

    Returns
    -------
    pandas.DataFrame
        Columns ``stock_id`` and ``time_id`` plus:

        1.  real_vol_1: realized volatility of the WAP1 log returns
        2.  real_vol_2: realized volatility of the WAP2 log returns
        3.  price_spread1: mean relative spread between ask_price1 and bid_price1
        4.  price_spread2: mean relative spread between ask_price2 and bid_price2
        5.  bid_spread: mean difference between the two bid prices
        6.  ask_spread: mean difference between the two ask prices
        7.  bid_ask_price_ratio1: mean of bid_price1 / ask_price1
        8.  bid_ask_price_ratio2: mean of bid_price2 / ask_price2
        9.  total_volume: sum of all four book sizes
        10. volume_imbalance: mean absolute ask-size minus bid-size difference
    """
    # Read the stock's parquet data.
    df = read_data(path)

    # Add the stock-id column; the id is the text after the first "=" in the path.
    df['stock_id'] = int(path.split("=")[1])

    # Weighted average prices for both book levels.
    df['WAP1'] = WAP1(df)
    df['WAP2'] = WAP2(df)

    # Log returns within each time_id. `transform` (rather than `apply`) is
    # used because it always returns a result aligned to df's own index;
    # `apply` prepends the group key to the index on pandas >= 2.0, which
    # makes the column assignment fail. The first row of every time_id has no
    # predecessor, so its NaN return is replaced with 0.
    df['book_log_ret1'] = df.groupby('time_id')['WAP1'].transform(log_return).fillna(0)
    df['book_log_ret2'] = df.groupby('time_id')['WAP2'].transform(log_return).fillna(0)

    # Spread features. Per the original author's reading of the dataset
    # description, a wider bid/ask spread goes with a more volatile stock.
    df['price_spread1'] = (df['ask_price1'] - df['bid_price1']) / ((df['ask_price1'] + df['bid_price1']) / 2)
    df['price_spread2'] = (df['ask_price2'] - df['bid_price2']) / ((df['ask_price2'] + df['bid_price2']) / 2)
    df['bid_spread'] = df['bid_price1'] - df['bid_price2']
    df['ask_spread'] = df['ask_price1'] - df['ask_price2']
    # NOTE(review): bid_ask_spread is computed but not part of the aggregation
    # below, so it never reaches the returned frame — confirm whether it
    # should be added as a feature or dropped.
    df["bid_ask_spread"] = abs(df['bid_spread'] - df['ask_spread'])
    df['bid_ask_price_ratio1'] = df['bid_price1'] / df['ask_price1']
    df['bid_ask_price_ratio2'] = df['bid_price2'] / df['ask_price2']

    # Volume features across both book levels.
    ask_volume = df['ask_size1'] + df['ask_size2']
    bid_volume = df['bid_size1'] + df['bid_size2']
    df['total_volume'] = ask_volume + bid_volume
    df['volume_imbalance'] = abs(ask_volume - bid_volume)

    # Collapse the per-row values into one feature row per (stock_id, time_id).
    final_book = df.groupby(['stock_id', 'time_id']).agg(
        real_vol_1=('book_log_ret1', realized_volatility),
        real_vol_2=('book_log_ret2', realized_volatility),
        price_spread1=('price_spread1', 'mean'),
        price_spread2=('price_spread2', 'mean'),
        bid_spread=('bid_spread', 'mean'),
        ask_spread=('ask_spread', 'mean'),
        bid_ask_price_ratio1=('bid_ask_price_ratio1', 'mean'),
        bid_ask_price_ratio2=('bid_ask_price_ratio2', 'mean'),
        total_volume=('total_volume', 'sum'),
        volume_imbalance=('volume_imbalance', 'mean'),
    ).reset_index()
    return final_book



# consol_trade_df works on the trade_train data.
# It returns the realized volatility calculated from the price column of the trade data.

def consol_trade_df(path):
    """Build one row of trade features per (time_id, stock_id).

    The parquet data at ``path`` is loaded with ``read_data`` and must provide
    the columns ``time_id`` and ``price``.

    Parameters
    ----------
    path : str
        Location of one stock's trade data. The stock id is parsed from the
        text that follows the first ``=`` in the path, so that text must be
        an integer (e.g. ``.../stock_id=0``).

    Returns
    -------
    pandas.DataFrame
        Columns ``time_id``, ``stock_id`` and ``real_vol_trade`` (realized
        volatility of the trade-price log returns within the time_id).
    """
    # Read the stock's parquet data.
    df = read_data(path)

    # Add the stock-id column; the id is the text after the first "=" in the path.
    df['stock_id'] = int(path.split("=")[1])

    # Log return of the traded price within each time_id. `transform` keeps
    # the result aligned to df's index on every pandas version, whereas
    # `apply` prepends the group key on pandas >= 2.0 and breaks the
    # assignment. The first trade of each time_id has no predecessor, so its
    # NaN return is replaced with 0.
    df['trade_log_ret'] = df.groupby('time_id')['price'].transform(log_return).fillna(0)

    # Trade features: one realized-volatility value per (time_id, stock_id).
    final_trade = df.groupby(['time_id', 'stock_id']).agg(
        real_vol_trade=('trade_log_ret', realized_volatility)).reset_index()

    return final_trade

In [None]:
def WAP1(df):
    """Weighted average price of the first book level.

    Each side's price is weighted by the size quoted on the opposite side:
    (bid_price1 * ask_size1 + ask_price1 * bid_size1) / (bid_size1 + ask_size1).
    """
    bid_px, ask_px = df['bid_price1'], df['ask_price1']
    bid_qty, ask_qty = df['bid_size1'], df['ask_size1']
    cross_weighted = bid_px * ask_qty + ask_px * bid_qty
    return cross_weighted / (bid_qty + ask_qty)

def WAP2(df):
    """Weighted average price of the second book level.

    Each side's price is weighted by the size quoted on the opposite side:
    (bid_price2 * ask_size2 + ask_price2 * bid_size2) / (bid_size2 + ask_size2).
    """
    bid_px, ask_px = df['bid_price2'], df['ask_price2']
    bid_qty, ask_qty = df['bid_size2'], df['ask_size2']
    cross_weighted = bid_px * ask_qty + ask_px * bid_qty
    return cross_weighted / (bid_qty + ask_qty)

def log_return(WAP):
    """Return the difference between consecutive natural logs of ``WAP``.

    The first element has no predecessor and is therefore NaN.
    """
    log_prices = np.log(WAP)
    return log_prices.diff()

def realized_volatility(log_r):
    """Return the square root of the sum of squared values in ``log_r``."""
    squared = log_r ** 2
    return np.sqrt(squared.sum())

In [None]:
def create_dataSet(df, book_paths, trade_paths):
    """Assemble book features, trade features and ``df`` into one frame.

    Parameters
    ----------
    df : pandas.DataFrame
        Table with ``stock_id`` and ``time_id`` columns; its remaining
        columns are left-joined onto the features.
    book_paths, trade_paths : iterable of str
        Per-stock book and trade paths. They are paired positionally with
        ``zip``, so both must list the stocks in the same order and any
        surplus entries in the longer one are ignored.

    Returns
    -------
    pandas.DataFrame
        The per-stock merged frames stacked in input order, each keeping its
        own row index. An empty DataFrame if no path pairs were given.
    """
    merged_frames = []
    for book_path, trade_path in tqdm(zip(book_paths, trade_paths)):
        book = consol_book_df(book_path)
        trade = consol_trade_df(trade_path)
        # Left joins keep every book row even when there is no matching
        # trade row or no matching row in df.
        merged_frames.append(
            pd.merge(book, trade, on=['stock_id', 'time_id'], how='left')
              .merge(df, on=['stock_id', 'time_id'], how='left')
        )
        gc.collect()

    # pd.concat raises on an empty list, so handle "no inputs" explicitly.
    if not merged_frames:
        return pd.DataFrame()

    # Concatenate once: growing a frame inside the loop re-copies every
    # accumulated row on each iteration.
    return pd.concat(merged_frames)


In [None]:
# Plotting setup: render figures inline and default every figure to 10x6 inches.
import matplotlib
from matplotlib import pyplot as plt
%matplotlib inline
matplotlib.rcParams['figure.figsize'] = (10,6)

In [None]:
# NOTE(review): these are the same train.csv / test.csv files already loaded
# into df_train / df_test earlier in the notebook; the analysis below works on
# these second copies (df, test).
df = pd.read_csv('../input/optiver-realized-volatility-prediction/train.csv')
test= pd.read_csv('../input/optiver-realized-volatility-prediction/test.csv')

In [None]:
# Preview the first rows of the training frame.
df.head()

In [None]:
# (rows, columns) of the training frame.
df.shape

In [None]:
# Preview the first rows of the test frame.
test.head()

In [None]:
# Summary statistics of the training frame via describe().
df.describe()

In [None]:
# NOTE(review): repeats the df.shape check from a few cells earlier; df has not been reassigned in between.
df.shape

In [None]:
# (rows, columns) of the test frame.
test.shape

In [None]:
# Number of missing values in each column of the training frame.
df.isnull().sum()

In [None]:
# NOTE(review): verbatim repeat of the earlier plotting-setup cell (same
# imports, same inline magic, same 10x6 figure size); re-running it changes nothing.
import matplotlib
from matplotlib import pyplot as plt
%matplotlib inline
matplotlib.rcParams['figure.figsize'] = (10,6)

In [None]:
# Histogram of the raw target values: 20 bins, bar heights are row counts.
plt.hist(df['target'], bins=20, rwidth=0.8)
plt.gca().set(xlabel='target', ylabel='Count')
plt.show()

In [None]:
from scipy.stats import norm
import numpy as np

# Density-normalised histogram of the target with a normal curve overlaid.
# The curve uses the sample mean and standard deviation of the target.
plt.hist(df.target, bins=20, rwidth=0.8, density=True)
plt.xlabel('target')
# density=True rescales the bars to integrate to 1, so the axis shows a
# density rather than a count.
plt.ylabel('Density')

# Evaluate the pdf on a fixed number of points spanning the observed range.
# A fixed step (previously 0.1) yields a single point, and therefore no
# visible curve, whenever the target spans less than one step.
rng = np.linspace(df.target.min(), df.target.max(), 200)
plt.plot(rng, norm.pdf(rng, df.target.mean(), df.target.std()))
plt.show()

In [None]:
# Upper outlier bound: four standard deviations above the target mean.
upper_limit = df['target'].mean() + 4 * df['target'].std()
upper_limit

In [None]:
# Lower outlier bound: four standard deviations below the target mean.
lower_limit = df['target'].mean() - 4 * df['target'].std()
lower_limit

In [None]:
# Rows whose target lies strictly outside [lower_limit, upper_limit].
df.loc[(df['target'] > upper_limit) | (df['target'] < lower_limit)]

In [None]:
# Keep only rows whose target lies strictly between the two limits.
df2 = df.loc[(df['target'] < upper_limit) & (df['target'] > lower_limit)]
df2.head()

In [None]:
# (rows, columns) remaining after the limit-based filter.
df2.shape

In [None]:
# First and third quartiles of the filtered target, used for the IQR fence.
# Q1/Q3 are by definition the 25th and 75th percentiles. With 0.45/0.55 the
# "IQR" is only a 10-percentile band around the median, so a 1.5*IQR fence
# built from it would discard a large share of ordinary rows, not just outliers.
Q1 = df2.target.quantile(0.25)
Q3 = df2.target.quantile(0.75)
Q1, Q3

In [None]:
# Width of the band between the two quantiles computed in the previous cell.
IQR = Q3 - Q1
IQR

In [None]:
# Fence placed 1.5 * IQR below Q1 and above Q3.
# NOTE(review): this rebinds lower_limit / upper_limit, which earlier held the
# mean -/+ 4*std bounds. Re-running the earlier filter cells after this one
# would silently use these values instead.
lower_limit = Q1 - 1.5*IQR
upper_limit = Q3 + 1.5*IQR
lower_limit, upper_limit

In [None]:
# Rows of df2 whose target lies strictly outside [lower_limit, upper_limit].
df2.loc[(df2['target'] < lower_limit) | (df2['target'] > upper_limit)]

In [None]:
# Keep only rows of df2 whose target lies strictly between the two limits.
df3 = df2.loc[(df2['target'] > lower_limit) & (df2['target'] < upper_limit)]
df3