In [None]:
import pandas as pd
from sklearn.feature_selection import RFECV
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans

import joblib


import numpy as np
import matplotlib.pyplot as plt
import os
import glob

In [None]:
from sklearn.tree import export_graphviz
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV

In [None]:
def log_return(stock_price):
    """
    Log return of a price series.

    stock_price: a series of stock prices (e.g. one group from a groupby)
    return: the first difference of the natural log of the prices;
            the first element has no predecessor and is therefore NaN
    """
    log_price = np.log(stock_price)
    return log_price.diff()

def realized_vol(stock_log_return):
    """
    Realized volatility of a sequence of log returns.

    stock_log_return: the output of log_return; its first element is skipped
    return: the square root of the sum of the squared remaining returns
    """
    squared_returns = stock_log_return[1:] ** 2
    return np.sqrt(np.sum(squared_returns))

def realized_vol_seconds(stock_log_return,seconds):
    """
    Realized volatility over the tail of a bucket.

    stock_log_return: frame with 'seconds_in_bucket' and 'log_return' columns
    seconds: only rows with seconds_in_bucket >= seconds are used
    return: the square root of the sum of squared log returns of those rows
    Designed for feature engineering (volatility of the last part of the bucket).
    """
    in_window = stock_log_return.seconds_in_bucket >= seconds
    window_returns = stock_log_return[in_window]['log_return']
    return np.sqrt(np.sum(window_returns ** 2))

def book_train_feature(book_train_0):
    """
    Build one row of order-book features per time_id.

    book_train_0: order-book snapshots with the columns time_id,
        seconds_in_bucket, bid_price1/2, ask_price1/2, bid_size1/2 and
        ask_size1/2. The frame is modified in place: the row-level feature
        columns computed below are added to it.
    return: a DataFrame with one row per time_id holding the aggregated
        features (column order is fixed by `feature_columns`).
    """
    book = book_train_0  # alias only; the caller's frame is still the one mutated

    def _iqr(series):
        # Inter-quartile range of a series.
        return series.quantile(0.75) - series.quantile(0.25)

    # Weighted average price per level: each price is weighted by the opposite side's size.
    book['wap1'] = (book['bid_price1'] * book['ask_size1'] + book['ask_price1'] * book['bid_size1']) / (book['bid_size1'] + book['ask_size1'])
    book['wap2'] = (book['bid_price2'] * book['ask_size2'] + book['ask_price2'] * book['bid_size2']) / (book['bid_size2'] + book['ask_size2'])
    book['wap_spread'] = book['wap1'] - book['wap2']

    book['bidask_spread'] = (book['ask_price1'] - book['bid_price1']) / book['bid_price1']
    book['bidask_spread2'] = (book['ask_price2'] - book['bid_price2']) / book['bid_price2']

    # Where wap1 sits between bid and ask: difference between its distance to the
    # mean ask price and its distance to the mean bid price.
    mean_ask = (book['ask_price2'] + book['ask_price1']) / 2
    mean_bid = (book['bid_price2'] + book['bid_price1']) / 2
    book['wap_bid_ask_position'] = ((mean_ask - book['wap1']).abs() - (mean_bid - book['wap1']).abs()).abs()

    book['total_ask_size'] = book['ask_size1'] + book['ask_size2']
    book['total_bid_size'] = book['bid_size1'] + book['bid_size2']
    book['total_ask_amount'] = book['ask_price1'] * book['ask_size1'] + book['ask_price2'] * book['ask_size2']
    book['total_bid_amount'] = book['bid_price1'] * book['bid_size1'] + book['bid_price2'] * book['bid_size2']
    book['size_imbalance'] = book['total_ask_size'] / book['total_bid_size'] - 1
    book['amount_imbalance'] = book['total_ask_amount'] / book['total_bid_amount'] - 1

    # transform (not apply) keeps the original row index, so the result can be
    # assigned back as a column regardless of the groupby `group_keys` default.
    book['log_return'] = book.groupby('time_id')['wap1'].transform(log_return).fillna(0)
    book['log_return_wap2'] = book.groupby('time_id')['wap2'].transform(log_return).fillna(0)

    feature_columns = [
        'time_id', 'realized_vol', 'realized_vol_seconds_300', 'realized_vol_seconds_480',
        'realized_vol_seconds_540', 'bidask_spread_range', 'bidask_spread_range2',
        'wap_bid_ask_imbalance_range', 'bidask_spread_cv', 'size_imbalance_range',
        'amount_imbalance_range', 'wap_range', 'wap_bid_ask_imbalance',
        'wap_bid_ask_imbalance_last_480', 'wap_bid_ask_imbalance_last_540',
    ]

    # volatility feature
    rows = []
    for time_id, snapshots in book.groupby('time_id'):
        last_480 = snapshots[snapshots.seconds_in_bucket >= 480]
        last_540 = snapshots[snapshots.seconds_in_bucket >= 540]
        spread = snapshots['bidask_spread']
        position = snapshots['wap_bid_ask_position']

        rows.append([
            time_id,
            # named vol_* locally so the module-level realized_vol function is not shadowed
            realized_vol_seconds(snapshots, seconds=0),
            realized_vol_seconds(snapshots, seconds=300),
            realized_vol_seconds(snapshots, seconds=480),
            realized_vol_seconds(snapshots, seconds=540),
            _iqr(spread),
            # both quartiles come from the level-2 spread (the lower one used to be
            # taken from the level-1 spread by mistake)
            _iqr(snapshots['bidask_spread2']),
            _iqr(position),
            spread.std() / spread.mean(),
            _iqr(snapshots['size_imbalance']),
            _iqr(snapshots['amount_imbalance']),
            snapshots['wap1'].max() / snapshots['wap1'].min(),
            position.sum(),
            last_480['wap_bid_ask_position'].sum(),
            last_540['wap_bid_ask_position'].sum(),
        ])

    volatility_feature_book = pd.DataFrame(rows, columns=feature_columns)
    return volatility_feature_book

In [None]:
def price_cv_seconds(stock_series,seconds):
    """
    Keep only the rows whose seconds_in_bucket is at least `seconds`.

    stock_series: frame with a 'seconds_in_bucket' column
    return: the filtered frame (original index preserved)
    """
    late_enough = stock_series.seconds_in_bucket >= seconds
    return stock_series[late_enough]

def trade_train_feature(trade_train_0):
    """
    Build one row of trade features per time_id.

    trade_train_0: trades with the columns time_id, seconds_in_bucket, price,
        size and order_count. The frame is modified in place: the 'amount'
        and 'avg_order_amount' columns are added to it.
    return: a DataFrame with one row per time_id holding the aggregated
        features (column order is fixed by `feature_columns`). Windows that
        contain fewer than two trades yield NaN for std / cv features.
    """
    trade_train_0['amount'] = trade_train_0['size'] * trade_train_0['price']
    trade_train_0['avg_order_amount'] = trade_train_0['amount'] / trade_train_0['order_count']

    def _cv(frame, column):
        # Coefficient of variation (std / mean) of one column.
        return frame[column].std() / frame[column].mean()

    # The last label used to be misspelled 'size_cv_54'; it holds the value for the >=540s window.
    feature_columns = [
        'time_id', 'total_trade_order', 'total_trade_order_300', 'total_trade_order_480',
        'total_trade_order_540', 'price_range', 'price_std', 'price_std_480', 'price_std_540',
        'price_std_300', 'price_cv', 'price_cv_300', 'price_cv_480', 'price_cv_540',
        'size_cv', 'size_cv_300', 'size_cv_540',
    ]

    # feature extraction in trade_train
    rows = []
    for time_id, trades in trade_train_0.groupby('time_id'):
        # Tail windows of the bucket: trades at or after 300s / 480s / 540s.
        last_300 = price_cv_seconds(trades, 300)
        last_480 = price_cv_seconds(trades, 480)
        last_540 = price_cv_seconds(trades, 540)

        rows.append([
            time_id,
            trades['order_count'].sum(),
            last_300['order_count'].sum(),
            last_480['order_count'].sum(),
            last_540['order_count'].sum(),
            trades['price'].max() / trades['price'].min(),
            trades['price'].std(),
            last_480['price'].std(),
            last_540['price'].std(),
            last_300['price'].std(),
            _cv(trades, 'price'),
            _cv(last_300, 'price'),
            _cv(last_480, 'price'),
            _cv(last_540, 'price'),
            _cv(trades, 'size'),
            _cv(last_300, 'size'),
            _cv(last_540, 'size'),
        ])

    volatility_feature_trade = pd.DataFrame(rows, columns=feature_columns)
    return volatility_feature_trade

In [None]:
# data_dir = '../input/optiver-realized-volatility-prediction/book_train.parquet'
# stock_list = sorted([int(_.split('=')[1]) for _ in os.listdir(data_dir)])
# target_df = pd.read_csv("../input/optiver-realized-volatility-prediction/train.csv")
# train_df = pd.DataFrame()
# for stock_id in stock_list:
#     print(stock_id)
#     # Generate features from both datasets and combine them with the target of each id
#     book_train_0 = pd.read_parquet('../input/optiver-realized-volatility-prediction/book_train.parquet/'+f'stock_id={stock_id}')
#     trade_train_0 = pd.read_parquet('../input/optiver-realized-volatility-prediction/trade_train.parquet/'+f'stock_id={stock_id}')
#     target_df_stock_0 = target_df[target_df.stock_id==stock_id][['time_id','target']]
#     # Merge the trade and book features
#     volatility_feature = trade_train_feature(trade_train_0).merge(book_train_feature(book_train_0),on='time_id')
#     volatility_feature = volatility_feature.fillna(method = 'ffill')
#     volatility_feature = volatility_feature.fillna(0)    
# #     volatility_feature_1 = pd.DataFrame(StandardScaler().fit_transform(volatility_feature.iloc[:,1:]),columns=volatility_feature.iloc[:,1:].columns)
# #     volatility_feature_1 = 
#     volatility_feature['stock_id'] = stock_id
# #     volatility_feature_1['time_id'] = volatility_feature['time_id']
#     train_feature_temp = volatility_feature.merge(target_df_stock_0,on='time_id')
#     # Concatenate the features of all stock ids
#     train_df = pd.concat([train_df,train_feature_temp])
# # train_df = pd.read_csv('../input/train-data/train_df.csv')
# train_df.head()

In [None]:
# Load the pre-computed training features instead of rebuilding them.
# NOTE(review): presumably this CSV was produced by the commented-out
# feature-building cell above (un-normalised, per the file name) -- confirm.
train_df = pd.read_csv('../input/traindata/train_df_without_normalize.csv')
# Preview the first rows of the loaded frame.
train_df.head()

In [None]:
# Cluster the stocks by the distribution of their target (mean, inter-quartile
# range and full range). Despite the 'classification' label used below, this is
# unsupervised K-means clustering, not a classifier.
norm_feature_groupby = train_df.groupby('stock_id')
target_by_stock = norm_feature_groupby['target']
information_stock_df = pd.DataFrame({
    'avg_target_vol': target_by_stock.mean(),
    'iqr_target': target_by_stock.quantile(0.75) - target_by_stock.quantile(0.25),
    'range_target': target_by_stock.max() - target_by_stock.min(),
}).reset_index()

# Fixed seed and explicit n_init so the cluster ids (and therefore the
# per-cluster models trained later) are reproducible across runs.
estimator = KMeans(n_clusters=10, n_init=10, random_state=42)
y_pred = estimator.fit_predict(information_stock_df.iloc[:, 1:])
information_stock_df['classification'] = y_pred

# stock_id -> cluster id lookup used for both the training and the test set
check_dict = dict(zip(information_stock_df['stock_id'], information_stock_df['classification']))

In [None]:
# Every stock has its own characteristics, so each one is tagged with the
# K-means cluster derived from its target statistics
# (see also kaggle_train_feature.ipynb).

# .to_numpy() keeps the assignment positional, like the original list() wrapper.
train_df['classification'] = train_df['stock_id'].map(check_dict).to_numpy()
# drop() already returns a new frame; `columns=` replaces the positional axis
# argument, which pandas 2.0 no longer accepts.
train_df_train = train_df.drop(columns=['time_id', 'stock_id', 'target'])

In [None]:
# Train one random forest per stock cluster.
train_df_groupby_class = train_df.groupby('classification')
model_dict = {}
for class_, df_ in train_df_groupby_class:
    print(len(df_))
    train_y = df_['target'].to_numpy()
    # `columns=` replaces the positional axis argument removed in pandas 2.0.
    train_x = df_.drop(columns=['time_id', 'stock_id', 'target', 'classification']).to_numpy()
    # Larger clusters get a bigger, more regularised forest; random_state makes
    # the fitted models reproducible.
    if len(df_) > 10000:
        rf = RandomForestRegressor(n_estimators=10, max_depth=30, min_samples_leaf=5, max_features=20, random_state=42)
    else:
        rf = RandomForestRegressor(n_estimators=5, max_depth=30, min_samples_leaf=1, max_features=15, random_state=42)
    # cluster id -> fitted model (fit returns the estimator itself)
    model_dict[class_] = rf.fit(train_x, train_y)

In [None]:
# In-sample check: predict every training row with the model of its cluster.
train_df_groupby_class = train_df.groupby('classification')
pred_parts = []
real_parts = []
for class_, df_ in train_df_groupby_class:
    test_y = df_['target']
    # `columns=` replaces the positional axis argument removed in pandas 2.0.
    test_x = df_.drop(columns=['time_id', 'stock_id', 'target', 'classification']).to_numpy()
    pred_train_y = model_dict[class_].predict(test_x)
    # Collect the pieces and concatenate once; predictions and targets are
    # appended in the same order, so they stay aligned row by row.
    pred_parts.append(pd.Series(pred_train_y, index=df_.index))
    real_parts.append(test_y)
result_pred = pd.concat(pred_parts)
result_pred_real = pd.concat(real_parts)
    
from sklearn.metrics import r2_score
def rmspe(y_true, y_pred):
    """Root mean squared percentage error of y_pred relative to y_true."""
    relative_error = (y_true - y_pred) / y_true
    return np.sqrt(np.mean(np.square(relative_error)))
# pred_train_y = model.predict(np.array(train_df_train))
# Score the per-cluster predictions against the targets they were trained on
# (both metrics are rounded to three decimals before printing).
pred_train_y = result_pred.to_numpy()
print(pred_train_y)
R2, RMSPE = (
    round(metric(y_true=result_pred_real, y_pred=pred_train_y), 3)
    for metric in (r2_score, rmspe)
)
print(f'Performance of the naive prediction: R2 score: {R2}, RMSPE: {RMSPE}')

In [None]:
# Random forest model: parameters are chosen by grid search and features are chosen by RFECV
# rf2=RandomForestRegressor(n_estimators=100,max_depth=30,min_samples_leaf=2,max_features = 30 ,max_samples = 0.6)
# test_y = np.array(train_df['target'])
# test_x = np.array(train_df_train)
# model = rf2.fit(test_x,test_y)

In [None]:
# from sklearn.metrics import r2_score
# def rmspe(y_true, y_pred):
#     return  (np.sqrt(np.mean(np.square((y_true - y_pred) / y_true))))
# # pred_train_y = model.predict(np.array(train_df_train))

# R2 = round(r2_score(y_true = train_df['target'], y_pred =pred_train_y),3)
# RMSPE = round(rmspe(y_true = train_df['target'], y_pred =pred_train_y),3)
# print(f'Performance of the naive prediction: R2 score: {R2}, RMSPE: {RMSPE}')

In [None]:
def model_socre(max_sample,n_estimators,min_samples_leaf):
    """
    Fit one random forest on the whole training set and report its in-sample fit.

    max_sample: forwarded to RandomForestRegressor as max_samples
    n_estimators: number of trees
    min_samples_leaf: minimum number of samples per leaf
    return: (R2, RMSPE), both rounded to three decimals; the same values are
        also printed. The scores are computed on the data the model was fitted
        on, so they measure fit, not generalisation.

    Reads the notebook-level frames train_df and train_df_train.
    """
    # Convert once and reuse the same arrays for fitting and predicting.
    test_x = train_df_train.to_numpy()
    test_y = train_df['target'].to_numpy()
    rf2 = RandomForestRegressor(n_estimators=n_estimators, max_depth=30, min_samples_leaf=min_samples_leaf, max_features=30, max_samples=max_sample)
    model = rf2.fit(test_x, test_y)
    pred_train_y = model.predict(test_x)
    R2 = round(r2_score(y_true=test_y, y_pred=pred_train_y), 3)
    RMSPE = round(rmspe(y_true=test_y, y_pred=pred_train_y), 3)
    # Label each parameter: the three values used to be printed back to back,
    # which made e.g. (0.2, 10, 5) unreadable as "0.2105".
    print(f'Performance of the <max_samples={max_sample}, n_estimators={n_estimators}, min_samples_leaf={min_samples_leaf}> prediction: R2 score: {R2}, RMSPE: {RMSPE}')
    return R2, RMSPE
    

In [None]:
# Test-set data: build the same trade/book features for every stock found in
# the test parquet directory.
data_dir_test = '../input/optiver-realized-volatility-prediction/book_test.parquet'
# Partition directories are named 'stock_id=<n>'.
stock_list_test = sorted(int(entry.split('=')[1]) for entry in os.listdir(data_dir_test))
test_feature_frames = []
for stock_id in stock_list_test:
    book_test_0 = pd.read_parquet('../input/optiver-realized-volatility-prediction/book_test.parquet/'+f'stock_id={stock_id}')
    trade_test_0 = pd.read_parquet('../input/optiver-realized-volatility-prediction/trade_test.parquet/'+f'stock_id={stock_id}')
    # Inner merge (the default): a time_id must appear in both trade and book data to be kept.
    volatility_feature = trade_train_feature(trade_test_0).merge(book_train_feature(book_test_0), on='time_id')
    # Forward-fill gaps first, then zero whatever is still missing.
    # .ffill() replaces the deprecated fillna(method='ffill').
    volatility_feature = volatility_feature.ffill().fillna(0)
    volatility_feature['stock_id'] = stock_id
    test_feature_frames.append(volatility_feature)

# Concatenate once instead of growing the frame inside the loop.
train_df_test = pd.concat(test_feature_frames)

# Stocks that are missing from check_dict get NaN here.
train_df_test['classification'] = train_df_test['stock_id'].map(check_dict).to_numpy()
# `columns=` replaces the positional axis argument removed in pandas 2.0.
train_df_test_1 = train_df_test.drop(columns=['time_id', 'stock_id'])

In [None]:
# # Test-set prediction
# # print('finished successfully')
# pred_test_y = model.predict(np.array(train_df_test_1))

# train_df_test['target'] = pred_test_y
# train_df_test['row_id'] = train_df_test['stock_id'].astype(str) + '-' + train_df_test['time_id'].astype(str)
# result = train_df_test[['row_id','target']].copy()
# result_dict = dict(zip(train_df_test['row_id'],train_df_test['target']))
# # result
# test_for_index = pd.read_csv('../input/optiver-realized-volatility-prediction/test.csv')
# test_for_index['target'] = list(test_for_index['row_id'].map(result_dict))
# test_for_index = test_for_index.drop(['stock_id','time_id'],1)
# test_for_index = test_for_index.fillna(method = 'ffill')
# test_for_index.to_csv('submission.csv',index = False)


In [None]:
# Predict the test set with the model of each row's cluster, then write the submission file.
train_df_groupby_class = train_df_test.groupby('classification')
pred_frames = []
for class_, df_ in train_df_groupby_class:
    # `columns=` replaces the positional axis argument removed in pandas 2.0.
    test_x = df_.drop(columns=['time_id', 'stock_id', 'classification']).to_numpy()
    # Work on an explicit copy so no column is assigned onto a groupby slice.
    pred_frame = df_[['stock_id', 'time_id']].copy()
    pred_frame['pred'] = model_dict[class_].predict(test_x)
    pred_frames.append(pred_frame)
# Concatenate once instead of growing the frame inside the loop.
result_pred_test = pd.concat(pred_frames)

# row_id has the form '<stock_id>-<time_id>' and is the key used to look up predictions.
result_pred_test['row_id'] = result_pred_test['stock_id'].astype(str) + '-' + result_pred_test['time_id'].astype(str)
result = result_pred_test[['row_id', 'pred']].copy()
result_dict = dict(zip(result['row_id'], result['pred']))

test_for_index = pd.read_csv('../input/optiver-realized-volatility-prediction/test.csv')
test_for_index['target'] = test_for_index['row_id'].map(result_dict)
test_for_index = test_for_index.drop(columns=['stock_id', 'time_id'])
# Rows without a prediction inherit the previous row's value.
# .ffill() replaces the deprecated fillna(method='ffill').
test_for_index = test_for_index.ffill()
test_for_index.to_csv('submission.csv', index=False)


In [None]:
# Display the submission frame (row_id and target) as the cell output.
test_for_index