In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import r2_score
import glob
import tqdm as tqdm
from sklearn import linear_model
from sklearn.preprocessing import PolynomialFeatures
from xgboost import XGBRegressor
from sklearn import model_selection
from sklearn.model_selection import KFold
import xgboost as xgb

In [None]:
# For quickly switching between training and test data
def train_test(mode):
    """Load one of the competition's top-level CSV files.

    Parameters
    ----------
    mode : str
        Split name ("train" or "test"); it is spliced into the file name.

    Returns
    -------
    pandas.DataFrame
        Contents of ``<mode>.csv`` from the competition input directory.
    """
    data_dir = '../input/optiver-realized-volatility-prediction/'
    csv_path = data_dir + mode + '.csv'
    return pd.read_csv(csv_path)

def my_metrics(y_true, y_pred):
    """Root mean squared percentage error of ``y_pred`` relative to ``y_true``.

    Each error is divided by the corresponding true value before squaring,
    so ``y_true`` must not contain zeros.
    """
    relative_error = (y_true - y_pred) / y_true
    mean_squared = np.mean(np.square(relative_error))
    return np.sqrt(mean_squared)

def rmspe(y_true, y_pred):
    """Wrap ``my_metrics`` in a ``(name, value, flag)`` triple.

    Returns
    -------
    tuple
        ``('rmspe', <score>, False)`` where ``<score>`` is
        ``my_metrics(y_true, y_pred)``.
    """
    return 'rmspe', my_metrics(y_true, y_pred), False

def is_in(stock_groups, stock_id):
    """Return the position of the first group that contains ``stock_id``.

    Returns ``None`` when no group in ``stock_groups`` contains it.
    """
    for position, members in enumerate(stock_groups):
        if stock_id in members:
            return position
    return None
    
# custom aggregate function
def wap2vol(df):
    """Turn a WAP series into its realized volatility.

    Takes tick-to-tick log returns (first difference of the log prices) and
    returns the square root of the sum of their squares.
    """
    tick_returns = np.log(df).diff()
    sum_of_squares = np.sum(tick_returns**2)
    return np.sqrt(sum_of_squares)

def rv(series_log_return):
    """Realized volatility: square root of the sum of squared log returns."""
    squared_returns = series_log_return**2
    return np.sqrt(np.sum(squared_returns))

def log_return(list_stock_prices):
    """Log returns of a price series: first difference of the natural log.

    The first element of the result is NaN because it has no predecessor.
    """
    log_prices = np.log(list_stock_prices)
    return log_prices.diff()

def agg(df_book, stock_stat, feature, func, new_name = None, rename = False):
    """Aggregate ``feature`` per ``time_id`` and left-merge ``stock_stat`` onto it.

    Parameters
    ----------
    df_book : pandas.DataFrame
        Frame holding ``time_id`` and ``feature`` columns.
    stock_stat : pandas.DataFrame
        Frame with a ``time_id`` column; it is the right-hand side of the merge.
    feature : str
        Column of ``df_book`` to aggregate.
    func : str or callable
        Aggregation passed to ``GroupBy.agg``.
    new_name : str, optional
        Name given to the aggregated column when ``rename`` is true.
    rename : bool
        Whether to rename the aggregated column to ``new_name``.

    Returns
    -------
    pandas.DataFrame
        ``time_id``, the aggregated column, then the remaining columns of
        ``stock_stat``.
    """
    per_bucket = df_book.groupby(by = ['time_id'])[feature].agg(func).reset_index()
    if rename:
        per_bucket = per_bucket.rename(columns = {feature : new_name})
    # The fresh aggregate is the left frame, so it lands right after time_id.
    return pd.merge(per_bucket, stock_stat, on = ['time_id'], how = 'left')

# for inference
def linear_inference(models, stock_id, stock_groups, volatility_features, degree, grouped = True):
    """Predict the target for a single row of features.

    Parameters
    ----------
    models : mapping
        Trained models, keyed by group index when ``grouped`` is true and by
        ``stock_id`` otherwise.
    stock_id : int
        Stock whose model should be used.
    stock_groups : list of list
        Groups of stock ids; only consulted when ``grouped`` is true.
    volatility_features : list
        Feature values for one row.
    degree : int
        Degree of the polynomial expansion applied before predicting.
    grouped : bool
        Select the model through ``is_in`` (True) or directly by ``stock_id``.

    Returns
    -------
    The first element of the model's prediction for the expanded row.
    """
    model_key = is_in(stock_groups, stock_id) if grouped else stock_id
    model = models[model_key]
    expanded_row = PolynomialFeatures(degree = degree).fit_transform([volatility_features])
    return model.predict(expanded_row)[0]

In [None]:
# Read train.csv through the train_test helper and preview its first rows.
train = train_test("train")
train.head()

In [None]:
# Read test.csv directly; this is the same file train_test("test") would load.
test = pd.read_csv('../input/optiver-realized-volatility-prediction/test.csv')

In [None]:
def trade_stock_stat(path, stock_id):
    """Attach per-``time_id`` trade aggregates to one stock's trade rows.

    Parameters
    ----------
    path : str
        Location of the stock's trade parquet data, passed to
        ``pd.read_parquet``.
    stock_id : int
        Identifier written into the ``stock_id`` column.

    Returns
    -------
    pandas.DataFrame
        One row per trade row. Besides the raw trade columns, ``stock_id`` and
        ``trade_log_return``, it carries ``<feature>_mean/_max/_min/_sum`` for
        ``price``, ``size``, ``order_count`` and ``trade_log_return``, plus
        ``trade_log_return_rv``. Aggregates are computed per ``time_id`` and
        therefore repeat on every row of that ``time_id``.
    """
    trade = pd.read_parquet(path)
    trade['stock_id'] = stock_id
    # transform() always returns a result aligned to the original row index.
    # groupby(...).apply() returns a (time_id, row) MultiIndex on pandas >= 2.0,
    # which cannot be assigned back as a column.
    trade['trade_log_return'] = trade.groupby(['time_id'])['price'].transform(log_return).fillna(0)

    trade_features = ["price", "size", "order_count", "trade_log_return"]
    # The first merge is against the raw trade rows, which is why the raw
    # columns survive in the result. Each later merge puts its new column
    # directly after time_id.
    stock_stat = trade
    for feature in trade_features:
        # String names select pandas' own reductions; passing the builtins
        # max/min/sum is deprecated and resolves to these same reductions.
        for func_name in ("mean", "max", "min", "sum"):
            stock_stat = agg(trade, stock_stat, feature, func = func_name,
                             rename = True, new_name = feature + "_" + func_name)
        if feature == "trade_log_return":
            stock_stat = agg(trade, stock_stat, feature, func = rv, rename = True, new_name = feature + "_rv")
    return stock_stat

def get_trade_overall(book):
    """Compute trade statistics for every stock path and stack them.

    Parameters
    ----------
    book : iterable of str
        Paths whose text after the first ``=`` is the integer stock id
        (e.g. ``.../stock_id=12``).

    Returns
    -------
    pandas.DataFrame
        Row-wise concatenation of ``trade_stock_stat`` for each path, or an
        empty frame when ``book`` is empty.
    """
    per_stock_frames = []
    for i in tqdm.tqdm(book):
        temp_stock = int(i.split("=")[1])
        per_stock_frames.append(trade_stock_stat(path = i, stock_id = temp_stock))
    # Concatenate once: growing a frame inside the loop re-copies all
    # previously collected rows on every iteration.
    if not per_stock_frames:
        return pd.DataFrame()
    return pd.concat(per_stock_frames)

In [None]:
def get_stock_stat(path, stock_id):
    """Build per-``time_id`` order-book features for one stock.

    Parameters
    ----------
    path : str
        Location of the stock's order-book parquet data, passed to
        ``pd.read_parquet``.
    stock_id : int
        Identifier written into the ``stock_id`` column of the result.

    Returns
    -------
    pandas.DataFrame
        One row per ``time_id`` holding the realized volatility of four WAP
        log-return series (``vol_wap1`` .. ``vol_wap4``), the mean of ``bas``,
        the max/min/sum/mean of six spread measures, and ``stock_id``.
        Because every ``agg`` call inserts its column right after ``time_id``,
        the feature columns appear in reverse order of computation.
    """
    df_book = pd.read_parquet(path)

    # Size-weighted price numerators and size totals for book levels 1 and 2.
    num1 = df_book['bid_price1'] * df_book['ask_size1'] + df_book['ask_price1'] * df_book['bid_size1']
    den1 = df_book['bid_size1'] + df_book['ask_size1']
    num2 = df_book['bid_price2'] * df_book['ask_size2'] + df_book['ask_price2'] * df_book['bid_size2']
    den2 = df_book['bid_size2'] + df_book['ask_size2']

    df_book['wap1'] = num1 / den1                       # level 1 only
    df_book['wap2'] = num2 / den2                       # level 2 only
    # Both levels pooled; the sizes are summed in the same order as before.
    df_book['wap3'] = (num1 + num2) / (den1 + df_book['bid_size2'] + df_book['ask_size2'])
    df_book['wap4'] = (df_book['wap1'] + df_book['wap2']) / 2   # plain average of wap1 and wap2

    # Log returns within each time_id. transform() aligns the result to the
    # original row index, so it does not depend on the rows being sorted by
    # time_id or on how groupby.apply labels its output.
    for level in ("1", "2", "3", "4"):
        df_book['vol_wap' + level] = (
            df_book.groupby(by = ['time_id'])['wap' + level].transform(log_return).fillna(0)
        )

    # Best ask over best bid across both levels, minus one.
    df_book['bas'] = (df_book[['ask_price1', 'ask_price2']].min(axis = 1)
                                / df_book[['bid_price1', 'bid_price2']].max(axis = 1) - 1)

    # different spreads
    df_book['h_spread_l1'] = df_book['ask_price1'] - df_book['bid_price1']
    df_book['h_spread_l2'] = df_book['ask_price2'] - df_book['bid_price2']
    df_book['v_spread_b'] = df_book['bid_price1'] - df_book['bid_price2']
    df_book['v_spread_a'] = df_book['ask_price1'] - df_book['ask_price2']
    df_book['spread_dif1'] = df_book['ask_price1'] - df_book['bid_price2']
    df_book['spread_dif2'] = df_book['ask_price2'] - df_book['bid_price1']

    # Start the per-time_id table with the wap1 volatility and the mean bas.
    stock_stat = pd.merge(
        df_book.groupby(by = ['time_id'])['vol_wap1'].agg(rv).reset_index(),
        df_book.groupby(by = ['time_id'], as_index = False)['bas'].mean(),
        on = ['time_id'], how = 'left'
    )

    # attach volatilities based on the remaining WAPs
    vol_features = ["vol_wap2", "vol_wap3", "vol_wap4"]
    spread_features = ["h_spread_l1", 'h_spread_l2', 'v_spread_b', 'v_spread_a', "spread_dif1", "spread_dif2"]
    for feature in vol_features:
        stock_stat = agg(df_book, stock_stat, feature, rv)

    for feature in spread_features:
        # String names select pandas' own reductions; passing the builtins
        # max/min/sum is deprecated and resolves to these same reductions.
        for func_name in ("max", "min", "sum", "mean"):
            stock_stat = agg(df_book, stock_stat, feature, func = func_name,
                             rename = True, new_name = feature + "_" + func_name)

    stock_stat['stock_id'] = stock_id
    return stock_stat

In [None]:
def get_overall(book):
    """Compute order-book statistics for every stock path and stack them.

    Parameters
    ----------
    book : iterable of str
        Paths whose text after the first ``=`` is the integer stock id
        (e.g. ``.../stock_id=12``).

    Returns
    -------
    pandas.DataFrame
        Row-wise concatenation of ``get_stock_stat`` for each path, or an
        empty frame when ``book`` is empty.
    """
    per_stock_frames = []
    for i in tqdm.tqdm(book):
        temp_stock = int(i.split("=")[1])
        per_stock_frames.append(get_stock_stat(i, temp_stock))
    # Concatenate once: growing a frame inside the loop re-copies all
    # previously collected rows on every iteration.
    if not per_stock_frames:
        return pd.DataFrame()
    return pd.concat(per_stock_frames)

def linear_training(X,y,degree):
    """Fit a weighted regressor on polynomial features of ``X``.

    Parameters
    ----------
    X : array-like
        Feature rows; a 1-D input is treated as a single feature column.
    y : array-like
        Targets. They must be non-zero because each sample is weighted by
        ``1 / y**2``.
    degree : int
        Degree of the ``PolynomialFeatures`` expansion.

    Returns
    -------
    XGBRegressor
        The fitted estimator (``fit`` returns the estimator itself).
    """
    polyfeat = PolynomialFeatures(degree = degree)
    targets = np.asarray(y).ravel()
    # Weighting squared errors by 1/y^2 turns them into squared relative errors.
    weights = 1 / np.square(targets)

    # sample_weight is an argument of fit(), not of the constructor. The
    # previous eval_metric=rmspe is dropped: no eval_set is supplied, so it was
    # never evaluated, and rmspe returns a (name, value, flag) tuple.
    linreg = XGBRegressor()

    x = np.asarray(X)
    if x.ndim == 1:
        x = x.reshape(-1, 1)
    X_ = polyfeat.fit_transform(x)

    # The original returned `clf.fit(...)`, but `clf` was never defined.
    return linreg.fit(X_, targets, sample_weight = weights)

def transform(X, polyfeat):
    """Convert ``X`` to a 2-D array and run it through ``polyfeat.fit_transform``.

    The column count is taken from the length of the first row, so ``X`` must
    hold at least one row.
    """
    values = np.array(X)
    n_columns = len(values[0])
    return polyfeat.fit_transform(values.reshape(-1, n_columns))

def Xgboost(X, y, degree = 1, params = {"reg_alpha" : 20, "reg_lambda" : 20, "max_depth" : 5, "n_estimators" : 500}, folds=10):
    """Fit an ``XGBRegressor`` on the final split of a shuffled K-fold.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature rows (indexed positionally with ``.iloc``).
    y : pandas.Series
        Targets; must be non-zero because samples are weighted by ``1 / y**2``.
    degree : int
        Degree of the ``PolynomialFeatures`` expansion applied to both parts.
    params : dict
        Keyword arguments for ``XGBRegressor``. The default dict is only
        unpacked, never mutated.
    folds : int
        Number of K-fold splits (shuffled with ``random_state=42``).

    Returns
    -------
    XGBRegressor
        Model trained on the training part of the last split, with early
        stopping evaluated on that split's held-out part.
    """
    polyfeat = PolynomialFeatures(degree = degree)
    skf = KFold(n_splits=folds, shuffle=True, random_state=42)

    # Only the model from the last split was ever returned; the models fitted
    # on earlier splits were overwritten unused. Fitting just that split gives
    # the same result at roughly 1/folds of the cost.
    tr_idx, ts_idx = list(skf.split(X))[-1]

    x_tr, y_tr = X.iloc[tr_idx], y.iloc[tr_idx]
    x_ts, y_ts = X.iloc[ts_idx], y.iloc[ts_idx]

    x_tr = transform(x_tr, polyfeat)
    x_ts = transform(x_ts, polyfeat)

    # 1/y^2 weights turn squared errors into squared relative errors, for both
    # the training loss and the early-stopping evaluation.
    weights = np.array(1/np.square(y_tr))
    eval_weights = np.array(1/np.square(y_ts))

    model = XGBRegressor(**params)
    model.fit(x_tr, y_tr,
              eval_set=[(x_ts, y_ts)],
              early_stopping_rounds=10, sample_weight = weights, sample_weight_eval_set = [eval_weights],
              verbose=False)
    return model

#Groups stocks
def chunks(L, n):
    """Split ``L`` into consecutive slices of length ``n`` (the last may be shorter)."""
    pieces = []
    for start in range(0, len(L), n):
        pieces.append(L[start: start + n])
    return pieces

def train_score(joined, models, stock_groups, feature_list, degree = 1):
    """Score the models on the rows of ``joined`` itself.

    Predictions are produced with ``create_submission(..., merge = True)``.
    After that merge the original target is ``target_x`` and the prediction is
    ``target_y``.

    Returns
    -------
    tuple
        Whatever ``rmspe`` returns for (actual, predicted).
    """
    train_pred = create_submission(joined, joined, models, stock_groups, feature_list, degree = degree, merge = True)
    actual = np.array(train_pred["target_x"])
    predicted = np.array(train_pred["target_y"])
    return rmspe(actual, predicted)

# Build the submission frame (row_id and predicted target); optionally merge it back onto `joined`
def create_submission(test_df, joined, models, stock_groups, feature_list, degree = 1, merge = False):
    """Predict the target for every row of ``test_df``.

    Parameters
    ----------
    test_df : pandas.DataFrame
        Rows to predict; needs ``stock_id``, ``time_id`` and ``feature_list``.
    joined : pandas.DataFrame
        Frame the predictions are merged onto when ``merge`` is true.
    models, stock_groups, degree
        Forwarded to ``linear_inference`` (called with its default
        ``grouped`` setting).
    feature_list : list of str
        Columns whose values form each row's feature vector.
    merge : bool
        If true, return ``joined`` left-merged with the predictions on
        ``stock_id`` and ``time_id``; otherwise return the submission frame.

    Returns
    -------
    pandas.DataFrame
        Either the ``row_id`` / ``target`` submission frame or the merged frame.
    """
    def _row_id(row):
        return str(int(row.stock_id)) + '-' + str(int(row.time_id))

    def _predict(row):
        return linear_inference(models, row.stock_id, stock_groups, list(row[feature_list]), degree)

    submission = pd.DataFrame({"row_id" : [], "target" : []})
    submission["row_id"] = test_df.apply(_row_id, axis=1)
    submission["target"] = test_df.apply(_predict, axis = 1)

    if not merge:
        return submission

    # Integer keys are needed so the predictions can be joined back onto `joined`.
    submission["stock_id"] = test_df.apply(lambda row: int(row.stock_id), axis = 1)
    submission["time_id"] = test_df.apply(lambda row: int(row.time_id), axis=1)
    return joined.merge(submission, on = ["stock_id", "time_id"], how = "left")
    
def get_model(train, joined, feature_list, group_number = 5, group = True, degree = 1, xgboost = False):
    """Train one model per stock group, or one per stock.

    Parameters
    ----------
    train : pandas.DataFrame
        Supplies the stock ids to iterate over when ``group`` is false.
    joined : pandas.DataFrame
        Training table with ``stock_id``, ``target`` and ``feature_list``.
    feature_list : list of str
        Feature columns used as model input.
    group_number : int
        Chunk size handed to ``chunks``, i.e. the number of stocks per group.
    group : bool
        True: one model per group, keyed by group index.
        False: one model per stock, keyed by stock id.
    degree : int
        Polynomial degree forwarded to the training routine.
    xgboost : bool
        Train with ``Xgboost`` (5 folds) instead of ``linear_training``.

    Returns
    -------
    dict
        Trained models keyed as described above.
    """
    # Stocks ordered by mean target, then cut into consecutive groups.
    stock_groups = chunks(list(joined.groupby("stock_id").mean()["target"].reset_index().sort_values(by = "target")["stock_id"]), group_number)
    stock_id_train = train.stock_id.unique()

    def _fit_subset(rows):
        X = rows[feature_list]
        y = rows["target"]
        if xgboost:
            return Xgboost(X, y, degree = degree, folds=5)
        return linear_training(X,y,degree)

    models = {}
    if group:
        for group_idx in tqdm.tqdm(range(len(stock_groups))):
            members = stock_groups[group_idx]
            models[group_idx] = _fit_subset(joined[joined["stock_id"].isin(members)])
    else:
        for sid in tqdm.tqdm(stock_id_train):
            models[sid] = _fit_subset(joined[joined["stock_id"]==sid])
    return models

def merge_frames(train = True):
    """Build the combined book + trade feature table for one split.

    Parameters
    ----------
    train : bool
        True reads the ``*_train.parquet`` directories, False the
        ``*_test.parquet`` ones.

    Returns
    -------
    pandas.DataFrame
        Order-book features left-merged with the de-duplicated trade
        aggregates on ``stock_id`` and ``time_id``.
    """
    if train:
        book_pattern = '/kaggle/input/optiver-realized-volatility-prediction/book_train.parquet/*'
        trade_pattern = '/kaggle/input/optiver-realized-volatility-prediction/trade_train.parquet/*'
    else:
        book_pattern = '/kaggle/input/optiver-realized-volatility-prediction/book_test.parquet/*'
        trade_pattern = '/kaggle/input/optiver-realized-volatility-prediction/trade_test.parquet/*'

    book_features = get_overall(glob.glob(book_pattern))

    # get_trade_overall returns one row per trade; dropping the raw columns and
    # de-duplicating leaves the per-time_id aggregates.
    raw_trade_columns = ["seconds_in_bucket", "price", "size", "order_count", "trade_log_return"]
    trade_features = get_trade_overall(glob.glob(trade_pattern)).drop(columns = raw_trade_columns).drop_duplicates()

    return book_features.merge(trade_features, on = ["stock_id","time_id"], how = "left")

def combined(train, save = True, read = False):
    """Produce the joined training table and the list of its feature columns.

    Parameters
    ----------
    train : pandas.DataFrame
        Training rows with ``stock_id``, ``time_id`` and ``target``; used only
        when ``read`` is false.
    save : bool
        Write the joined table to ``Joined1.csv`` in the working directory.
    read : bool
        Load a previously saved ``Joined1.csv`` instead of recomputing it.

    Returns
    -------
    (list of str, pandas.DataFrame)
        Feature column names (everything except ``stock_id``, ``time_id`` and
        ``target``) and the joined table.
    """
    if read:
        joined = pd.read_csv("../input/weighted-regression-with-stock-grouping-and-eda/Joined1.csv")
    else:
        # Features are built only when needed: this call used to run even when
        # the cached CSV was read, and its result was then discarded.
        total_df = merge_frames()
        joined = train.merge(total_df, on = ["stock_id","time_id"], how = "left")

    if save:
        joined.to_csv("Joined1.csv", index = False)

    feature_list = list(joined.drop(columns = ["stock_id", "time_id", "target"]).columns)
    return feature_list, joined


In [None]:
# Build the joined training table (also written to Joined1.csv because save=True)
# and show the resulting feature column names.
feature_list, joined = combined(train, save = True)
feature_list

In [None]:
# Preview the first five rows of the joined training table.
joined.head(5)

In [None]:
# Heatmap: correlations between the per-stock means of every feature and the target.
fig, ax = plt.subplots(figsize=(25,25))
per_stock_means = joined.groupby(by = "stock_id").mean()
corr_matrix = per_stock_means[feature_list + ["target"]].corr()
sns.heatmap(corr_matrix, center = 0, annot = True, cmap="YlGnBu", linewidths = .05)
plt.title("Correlation heatmap")
# Bar chart (drawn on a new figure): mean target per stock, sorted ascending.
mean_target_sorted = per_stock_means["target"].reset_index().sort_values(by = "target")
mean_target_sorted.plot(x = "stock_id", y = "target", kind = "bar", figsize = (20, 8), title = "Mean target for each stock sorted")


In [None]:
# Assemble the book + trade feature table for the test split and display it.
test_df = merge_frames(train = False)
test_df

In [None]:
# Rows with no order_count_max value. The trade aggregates are left-merged onto
# the book features in merge_frames, so they can be missing.
joined[joined["order_count_max"].isnull()]

In [None]:
def fill_nan(joined):
    """Fill missing values in the trailing feature columns, stock by stock.

    Only stocks with at least one null ``trade_log_return_rv`` are processed.
    For each of them, and for each column from position 32 onward, nulls are
    replaced by that stock's mean of the column. If the stock has no value at
    all in the column, the mean over the whole frame is used instead.

    The frame is modified in place and also returned. Fills are applied
    sequentially, so a whole-frame mean already reflects earlier fills.
    """
    # NOTE(review): the slice is positional and presumably selects the trade
    # features; verify if the column layout of `joined` changes.
    trailing_features = list(joined.columns)[32:]
    has_gap = joined["trade_log_return_rv"].isnull()
    for stock_id in list(joined[has_gap]["stock_id"].unique()):
        stock_rows = joined["stock_id"] == stock_id
        for feature in trailing_features:
            stock_values = joined.loc[stock_rows, feature]
            if stock_values.isnull().all():
                fill_value = joined[feature].mean()
            else:
                fill_value = stock_values.mean()
            joined.loc[stock_rows, feature] = stock_values.fillna(fill_value)
    return joined
# fill_nan modifies `joined` in place and returns the same frame.
joined = fill_nan(joined)
# isnull().any().sum() counts the columns that still contain at least one null.
print("Missing count after filling", joined.isnull().any().sum())

In [None]:
# Rows whose size_sum is still null after fill_nan.
joined[joined["size_sum"].isnull()]

In [None]:
# Loop over group sizes and polynomial degrees, training XGBoost models for each
# combination. With the single-element lists below exactly one configuration runs.
# group_number is the chunk size passed to chunks(), so 1 puts every stock in its own group.
group_numbers = [1]
degrees = [1]
scores = {}  # "<group_number> - <degree>" -> train score; only filled when validate is True
validate = False
for group_number in group_numbers:
    for degree in degrees:
        # Same grouping expression that get_model evaluates internally; kept here
        # because inference needs the groups to pick a model.
        stock_groups = chunks(list(joined.groupby("stock_id").mean()["target"].reset_index().sort_values(by = "target")["stock_id"]), group_number)
        models = get_model(train, joined, feature_list, group_number = group_number, group = True, degree = degree, xgboost = True)
        if validate:
            overall_train_score = train_score(joined, models, stock_groups, feature_list, degree = degree)
            print("Train score for group number {} and degree {}:".format(group_number, degree), overall_train_score)
            scores[str(group_number) + " - " + str(degree)] = overall_train_score

In [None]:
# Predict the test rows with the `models` / `stock_groups` left by the training
# loop (its last configuration), write submission.csv, then display the frame.
submission = create_submission(test_df, joined, models, stock_groups, feature_list, degree = 1, merge = False)
submission.to_csv("submission.csv", index = False)
submission