In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import glob

# https://zhuanlan.zhihu.com/p/180347090
from joblib import Parallel, delayed

import xgboost as xgb
from xgboost.sklearn import XGBRegressor
# Input data files are available in the read-only "../input/" directory
# The loop below walks the current working directory ('.') and prints the path of every file it finds

import os
# Print the path of every file found under the current working directory.
for dirname, _, filenames in os.walk('.'):
    for filename in filenames:
        file_path = os.path.join(dirname, filename)
        print(file_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
def log_return(list_stock_prices):
    """Return the first difference of the natural log of the prices.

    The input must expose ``.diff()`` after ``np.log`` is applied (e.g. a
    pandas Series); the first element of the result is NaN because it has
    no predecessor.
    """
    log_prices = np.log(list_stock_prices)
    return log_prices.diff()

def realized_volatility(series_log_return):
    """Return the square root of the sum of squared log returns."""
    squared_returns = series_log_return ** 2
    sum_of_squares = np.sum(squared_returns)
    return np.sqrt(sum_of_squares)

def calculate_wap(df):
    """Return the mean of the level-1 and level-2 weighted average prices.

    For each book level the bid price is weighted by the ask size and the
    ask price by the bid size, then divided by the total size at that level.
    The two per-level prices are averaged with equal weight.
    """
    bid_px1, ask_sz1 = df['bid_price1'], df['ask_size1']
    ask_px1, bid_sz1 = df['ask_price1'], df['bid_size1']
    bid_px2, ask_sz2 = df['bid_price2'], df['ask_size2']
    ask_px2, bid_sz2 = df['ask_price2'], df['bid_size2']

    wap_level1 = (bid_px1 * ask_sz1 + ask_px1 * bid_sz1) / (bid_sz1 + ask_sz1)
    wap_level2 = (bid_px2 * ask_sz2 + ask_px2 * bid_sz2) / (bid_sz2 + ask_sz2)

    return (wap_level1 + wap_level2) / 2


def calculate_wap2(df):
    """Return a single weighted average price pooled over both book levels.

    The cross-weighted price*size terms of level 1 and level 2 are summed
    and divided by the total of all four sizes.
    """
    level1_notional = df['bid_price1'] * df['ask_size1'] + df['ask_price1'] * df['bid_size1']
    level2_notional = df['bid_price2'] * df['ask_size2'] + df['ask_price2'] * df['bid_size2']
    total_size = df['bid_size1'] + df['ask_size1'] + df['bid_size2'] + df['ask_size2']

    return (level1_notional + level2_notional) / total_size

def calculate_wap3(df):
    """Return the level-1 weighted average price.

    The bid price is weighted by the ask size and the ask price by the bid
    size; the sum is divided by the total level-1 size.
    """
    cross_weighted = df['bid_price1'] * df['ask_size1'] + df['ask_price1'] * df['bid_size1']
    level1_size = df['bid_size1'] + df['ask_size1']
    return cross_weighted / level1_size

def rmspe(y_true, y_pred):
    """Return the root mean squared percentage error of ``y_pred`` against ``y_true``.

    Each error is divided by the corresponding true value before squaring,
    so zeros in ``y_true`` produce non-finite results.
    """
    relative_error = (y_true - y_pred) / y_true
    mean_squared_relative_error = np.mean(np.square(relative_error))
    return np.sqrt(mean_squared_relative_error)

from sklearn.metrics import r2_score

In [None]:
def get_stock_stat(stock_id : int, dataType = 'train'):
    """Build per-``time_id`` features for one stock from its order-book parquet partition.

    Parameters
    ----------
    stock_id : int
        Stock whose partition ``book_{dataType}.parquet/stock_id={stock_id}/`` is read.
    dataType : str, default 'train'
        Substituted into the parquet directory name (this notebook passes
        'train' or 'test').

    Returns
    -------
    pandas.DataFrame
        One row per ``time_id`` with columns ``stock_id``, ``time_id``,
        ``log_return`` and ``bas``. ``log_return`` holds the realized
        volatility of the WAP log returns within the ``time_id`` (the column
        name is kept because later cells select it by that name); ``bas`` is
        the mean of the per-row spread ratio computed below.
    """
    book = pd.read_parquet(f'../input/optiver-realized-volatility-prediction/book_{dataType}.parquet/stock_id={stock_id}/')

    # sort_values returns a new frame, so the result must be kept; the index is
    # rebuilt so that it matches the sorted row order.
    book = book.sort_values(by=['time_id', 'seconds_in_bucket']).reset_index(drop = True)

    # Lowest ask over both levels divided by highest bid over both levels, minus 1.
    book['bas'] = (book[['ask_price1', 'ask_price2']].min(axis = 1)
                   / book[['bid_price1', 'bid_price2']].max(axis = 1)
                   - 1)

    book['wap'] = calculate_wap(book)

    # transform returns a result indexed like the input, so every log return is
    # written back to its own row regardless of how the groups are laid out.
    # The first row of each time_id has no predecessor (NaN) and is set to 0.
    book['log_return'] = (book.groupby(by = ['time_id'])['wap']
                          .transform(log_return)
                          .fillna(0))

    # One pass over the groups produces both features, keyed by time_id.
    stock_stat = (book.groupby(by = ['time_id'], as_index = False)
                  .agg(log_return = ('log_return', realized_volatility),
                       bas = ('bas', 'mean')))

    stock_stat.insert(0, "stock_id", stock_id)  # insert as the first column

    return stock_stat

In [None]:
def get_dataSet(stock_ids : list, dataType = 'train'):
    """Compute ``get_stock_stat`` for every stock in parallel and stack the results.

    Parameters
    ----------
    stock_ids : list
        Stock ids to process; each one is passed to ``get_stock_stat``.
    dataType : str, default 'train'
        Forwarded unchanged to ``get_stock_stat``.

    Returns
    -------
    pandas.DataFrame
        The per-stock frames concatenated with a fresh 0..n-1 index.
    """
    # n_jobs=-1 lets joblib use all available workers.
    jobs = (delayed(get_stock_stat)(stock_id, dataType) for stock_id in stock_ids)
    per_stock_frames = Parallel(n_jobs=-1)(jobs)

    return pd.concat(per_stock_frames, ignore_index = True)

In [None]:
# Load the competition's train.csv and preview the first rows.
# NOTE: the name `train` is rebound later in this notebook to the training split of `df`.
train = pd.read_csv('../input/optiver-realized-volatility-prediction/train.csv')
train.head()

In [None]:
#book_train_subset = pd.read_parquet('../input/optiver-realized-volatility-prediction/book_train.parquet/stock_id=0')
#book_train_subset

In [None]:
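# One-off feature build, kept for reference only. It merges the per-stock statistics onto
# train.csv and writes them to a CSV (presumably the file the next cell loads from ../input/).
#train_stock_stat_df = get_dataSet(stock_ids = train['stock_id'].unique(), dataType = 'train')
#train_dataSet = pd.merge(train, train_stock_stat_df, on = ['stock_id', 'time_id'], how = 'left')
#train_dataSet.to_csv("optiver-realized-volatility-datasets.csv",index=False)
#train_dataSet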

In [None]:
# Load the pre-computed training features from CSV and preview the first rows.
# Later cells read the columns 'target', 'log_return' and 'bas' from this frame.
train_dataSet = pd.read_csv("../input/optiverrealizedvolatilitydatasets/optiver-realized-volatility-datasets.csv")
train_dataSet.head()

In [None]:
# Keep only the regression target and the two engineered features.
model_columns = ['target', 'log_return', 'bas']
df = train_dataSet[model_columns]
df

In [None]:
import matplotlib.pyplot as plt
%matplotlib inline

# Scatter of the target against the `log_return` feature.
fig, ax = plt.subplots(figsize=(8,5))
x_data, y_data = (df["log_return"].values, df["target"])
ax.plot(x_data, y_data, 'ro')
ax.set_ylabel('target')
ax.set_xlabel('log_return')
plt.show()

In [None]:
# Difference between the `log_return` feature and the target for every row.
# Only one series is passed to plot(), so the x axis is the frame's index,
# not `log_return`; the axis labels state what is actually drawn.
plt.figure(figsize=(8,5))
x_data, y_data = (df["log_return"], df["target"])
plt.plot(x_data - y_data, 'ro')
plt.ylabel('log_return - target')
plt.xlabel('row index')
plt.show()

In [None]:
# Random ~80/20 split of `df` into a training part and a hold-out part.
# The generator is seeded so the same rows are selected on every run, which
# keeps the fitted coefficients and the evaluation metrics reproducible.
# NOTE: `train` is rebound here; it previously held the raw train.csv frame.
SPLIT_SEED = 42
TRAIN_FRACTION = 0.8
msk = np.random.default_rng(SPLIT_SEED).random(len(df)) < TRAIN_FRACTION
train = df[msk]
test = df[~msk]

# Polynomial regression

In [None]:
from sklearn.preprocessing import PolynomialFeatures
from sklearn import linear_model
#train_x = np.asanyarray(train[['log_return','bas']])
#train_y = np.asanyarray(train[['target']])
# Feature matrices and target vectors for the training and hold-out splits.
feature_cols = ['log_return', 'bas']

train_x, train_y = train[feature_cols], train['target']
test_x, test_y = test[feature_cols], test['target']

# Degree-3 polynomial expansion, fitted on the training features.
poly = PolynomialFeatures(degree=3)
train_x_poly = poly.fit_transform(train_x)
train_x_poly

In [None]:
# Weight every sample by 1/target^2: the weighted squared error then equals the
# squared relative error, the quantity that rmspe() averages.
weights = 1 / np.square(train['target'])

clf = linear_model.LinearRegression()
# fit() returns the estimator itself, so train_y_ refers to the same object as clf.
train_y_ = clf.fit(train_x_poly, train_y, sample_weight=weights)

# Fitted parameters
print('Coefficients: ', clf.coef_)
print('Intercept: ', clf.intercept_)

In [None]:
# Number of fitted coefficients (the model was fitted on the columns of train_x_poly).
len(clf.coef_)

In [None]:
# Expand the hold-out features with the transformer fitted on the training
# data; re-fitting on evaluation data is avoided so both sets are guaranteed
# to go through the exact same expansion.
test_x_poly = poly.transform(test_x)
test_x_poly

# Evaluation

In [None]:
from sklearn.metrics import r2_score

# Expand the hold-out features with the already-fitted transformer (no re-fit).
test_x_poly = poly.transform(test_x)
test_y_ = clf.predict(test_x_poly)

# Six decimals instead of two: presumably the volatility targets are well
# below 1, in which case two decimals would print 0.00 — TODO confirm.
print("Mean absolute error: %.6f" % np.mean(np.absolute(test_y_ - test_y)))
# This is the mean of the squared residuals (MSE), not their sum.
print("Mean squared error (MSE): %.6f" % np.mean((test_y_ - test_y) ** 2))
print("R2-score: %.2f" % r2_score(test_y,test_y_ ) )
# Relative-error metric defined earlier in this notebook; it matches the
# 1/target^2 sample weights used when fitting.
print("RMSPE: %.4f" % rmspe(test_y, test_y_))

# Predict

In [None]:
# NOTE: rebinds `test` (previously the hold-out split of `df`) to the rows of test.csv.
test = pd.read_csv('../input/optiver-realized-volatility-prediction/test.csv')

# Per-stock book features for every stock in test.csv, joined onto its rows.
test_stock_stat_df = get_dataSet(stock_ids = test['stock_id'].unique(), dataType = 'test')
test_dataSet = pd.merge(test, test_stock_stat_df, on = ['stock_id', 'time_id'], how = 'left')
test_dataSet = test_dataSet.drop(['stock_id', 'time_id'], axis = 1)

# Frame that will receive the predictions, keyed by row_id.
y_pred = test_dataSet[['row_id']]
# Select the features by name, in the same order used to fit `poly`, rather
# than relying on whichever columns happen to remain after the drops.
# Rows without matching book statistics (NaN after the left merge) get 0.
X_test = test_dataSet[['log_return', 'bas']].fillna(0)
X_test

In [None]:
# Expand the submission features with the transformer fitted on the training
# data (transform only, no re-fit), matching the columns the model was fitted on.
X_test_poly = poly.transform(X_test)
X_test_poly

In [None]:
# Attach the model's predictions to the row ids and write the submission file.
predicted_target = clf.predict(X_test_poly)
y_pred = y_pred.assign(target = predicted_target)
y_pred.to_csv('submission.csv',index = False)

In [None]:
# Display the submission frame (row_id plus the predicted target).
y_pred