In [None]:
import numpy as np 
import pandas as pd 
from typing import List, Iterator, Callable
from time import time
import glob
import os
from functools import lru_cache
from sklearn.metrics import r2_score
from tqdm import tqdm
from sklearn.linear_model import LinearRegression, Ridge
from sklearn.pipeline import Pipeline
from sklearn.model_selection import cross_val_score
from sklearn.base import BaseEstimator
from sklearn.preprocessing import PolynomialFeatures
from utils import DataLoader

import statsmodels.api as sm

In [None]:
# Run-size knobs for quick experiments.
# MAX_STOCKS = None (or 0) disables the stock sub-sampling done right after the labels are loaded.
# MAX_TIMEIDS is only forwarded to DataLoader; presumably it caps the time_ids read per stock — TODO confirm in utils.DataLoader.
MAX_TIMEIDS = None
MAX_STOCKS = None
# Uncomment the two lines below for a small, fast debugging run.
# MAX_TIMEIDS = 10
# MAX_STOCKS = 30
# NOTE(review): DataLoader is project-local (utils); the meaning of the second positional
# argument (False) is not visible in this notebook — confirm against its signature.
data_loader = DataLoader(MAX_TIMEIDS, False)

In [None]:
# Load the training labels and, when MAX_STOCKS is set, keep only a random subset of stocks.
STOCK_SAMPLING_SEED = 42  # fixed so the sampled subset is identical on every Restart & Run All

train_df = data_loader.read_labels(test=False)
available_stocks = train_df["stock_id"].unique()
if MAX_STOCKS and MAX_STOCKS < len(available_stocks):
    print("Sampling {} stocks for training.".format(MAX_STOCKS))
    # A dedicated, seeded generator avoids depending on (or disturbing) numpy's global random state.
    stock_sampler = np.random.default_rng(STOCK_SAMPLING_SEED)
    sampled_stocks = stock_sampler.choice(available_stocks, size=MAX_STOCKS, replace=False)
    train_df = train_df[train_df["stock_id"].isin(sampled_stocks)]

In [None]:
# helpers to extract features from book
def calc_realized_volatility(log_returns: pd.Series) -> float:
    """Return the realized volatility of a series of log returns.

    Computed as the square root of the sum of squared returns. When a pandas
    Series is passed, ``np.sum`` defers to the Series' own sum, which skips
    NaN by default, so the leading NaN produced by ``.diff()`` does not
    poison the result.
    """
    sum_of_squares = np.sum(np.square(log_returns))
    return np.sqrt(sum_of_squares)


def aggregate_book_for_stock_and_time_id(book_time_slice: pd.DataFrame) -> pd.Series:
    """Summarise one (stock, time_id) slice of the order book into volatility features.

    Expects the columns ``WAP1``, ``WAP2`` and ``WAP3`` to be present. For each
    of them two features are produced, in this order:

    * ``volatility_<i>``      - realized volatility of the log returns over the whole slice
    * ``volatility_tail_<i>`` - the same, restricted to the last 100 rows of the slice

    Returns a Series with those six entries.
    """
    # The tail window is the same for every WAP column, so slice it once.
    recent_rows = book_time_slice.tail(100)

    features = {}
    for level in (1, 2, 3):
        wap_column = f"WAP{level}"
        features[f"volatility_{level}"] = calc_realized_volatility(
            np.log(book_time_slice[wap_column]).diff()
        )
        features[f"volatility_tail_{level}"] = calc_realized_volatility(
            np.log(recent_rows[wap_column]).diff()
        )

    return pd.Series(features)
  
    

def aggregate_book_for_stock(df: pd.DataFrame) -> pd.DataFrame:
    """Compute per-time_id volatility features for one stock's order book.

    Three weighted average prices are derived from the two visible book levels:

    * ``WAP1`` - level 1 only
    * ``WAP2`` - level 2 only
    * ``WAP3`` - levels 1 and 2 pooled (summed numerators over summed sizes)

    The rows are then grouped by ``time_id`` and each group is reduced with
    ``aggregate_book_for_stock_and_time_id``.

    Parameters
    ----------
    df : DataFrame
        Book snapshots with ``time_id`` and the ``bid_price{1,2}``,
        ``ask_price{1,2}``, ``bid_size{1,2}``, ``ask_size{1,2}`` columns.
        It is NOT modified: the WAP columns are added to a copy, so calling
        this function twice on the same frame is safe.

    Returns
    -------
    DataFrame
        The result of the grouped apply (one set of features per ``time_id``).
    """
    # Each price is weighted by the size on the opposite side of the book.
    level1_weighted = df['bid_price1'] * df['ask_size1'] + df['ask_price1'] * df['bid_size1']
    level1_size = df['bid_size1'] + df['ask_size1']
    level2_weighted = df['bid_price2'] * df['ask_size2'] + df['ask_price2'] * df['bid_size2']
    level2_size = df['bid_size2'] + df['ask_size2']

    book_with_wap = df.assign(
        WAP1=level1_weighted / level1_size,
        WAP2=level2_weighted / level2_size,
        WAP3=(level1_weighted + level2_weighted) / (level1_size + level2_size),
    )

    per_time_id = book_with_wap.groupby(["time_id"], as_index=False)
    return per_time_id.apply(aggregate_book_for_stock_and_time_id)

def get_book_features_for_all_stocks(stock_ids, test):
    """Compute book features for every stock in ``stock_ids`` and stack them.

    For each stock the book is read through the notebook-level ``data_loader``
    (``test`` is forwarded to ``read_book``), aggregated with
    ``aggregate_book_for_stock`` and tagged with a leading ``stock_id`` column.

    Returns the row-wise concatenation of the per-stock frames. The original
    per-stock index values are kept, so the index of the result is not unique.
    """
    print("Getting book features for {} stocks.".format(len(stock_ids)))

    per_stock_frames = []
    for stock_id in tqdm(stock_ids):
        stock_features = aggregate_book_for_stock(data_loader.read_book(stock_id, test=test))
        # Tag the rows so they can be joined back to the labels later.
        stock_features.insert(0, "stock_id", stock_id)
        per_stock_frames.append(stock_features)

    return pd.concat(per_stock_frames)
    


In [None]:
# Book-derived features for every stock that appears in the training labels.
book_features_train = get_book_features_for_all_stocks(
    stock_ids=train_df["stock_id"].unique(),
    test=False,
)

In [None]:
# Attach the book features to the labels. No join keys are given, so pandas joins on
# every column name the two frames share; rows without a match on either side are dropped.
train_enriched = pd.merge(train_df, book_features_train, how="inner")
train_enriched.head()

In [None]:
def make_X(feature_df, stock_ids=None):
    """Build the design matrix: ``volatility_1`` plus one indicator column per stock.

    The indicators are built from the ``stock_id`` *Series*. Passing a
    one-column DataFrame to ``pd.get_dummies`` only encodes object, string or
    category columns, so a numeric ``stock_id`` would be passed through
    unchanged and end up as a single numeric feature instead of per-stock
    indicators.

    Parameters
    ----------
    feature_df : DataFrame
        Must contain the columns ``stock_id`` and ``volatility_1``.
    stock_ids : iterable, optional
        Stock ids that should get an indicator column, in this order. Use the
        same list for training and prediction so both matrices have identical
        columns. Stocks in ``feature_df`` that are not listed get all-zero
        indicators. When omitted, the stocks present in ``feature_df`` are used.

    Returns
    -------
    DataFrame
        ``volatility_1`` followed by float 0/1 columns named ``stock_id_<id>``,
        indexed like ``feature_df``.
    """
    stock_column = feature_df["stock_id"]
    if stock_ids is not None:
        # A categorical with fixed categories yields a column for every listed
        # stock, whether or not it occurs in this particular frame.
        categories = list(dict.fromkeys(stock_ids))
        stock_column = pd.Series(
            pd.Categorical(stock_column, categories=categories),
            index=feature_df.index,
            name="stock_id",
        )

    # dtype=float keeps the matrix purely numeric (newer pandas defaults to bool).
    stock_dummies = pd.get_dummies(stock_column, prefix="stock_id", dtype=float)

    return pd.concat([feature_df[["volatility_1"]], stock_dummies], axis=1)

## Cross Validate Model

In [None]:
def rmspe(y_true: pd.Series, y_pred: pd.Series) -> float:
    """Root mean squared percentage error.

    ``sqrt(mean(((y_true - y_pred) / y_true) ** 2))``. The error is taken
    relative to ``y_true``, so a zero in ``y_true`` yields inf/NaN.
    """
    relative_error = (y_true - y_pred) / y_true
    mean_squared = np.mean(np.square(relative_error))
    return np.sqrt(mean_squared)

def score_model_on_rmspe(model, X: pd.DataFrame, y_true: pd.Series) -> float:
    """Score a fitted model by the RMSPE of its predictions on ``X``.

    Has the ``(estimator, X, y)`` shape so it can be handed to
    ``cross_val_score`` as ``scoring``. The value is an error, so lower is better.
    """
    return rmspe(y_true, model.predict(X))

In [None]:
# Regression target and design matrix for the training rows.
y = train_enriched.loc[:, "target"]
X_train = make_X(feature_df=train_enriched)

In [None]:
# Plain least squares without an intercept term.
model = LinearRegression(fit_intercept=False)

# NOTE(review): an integer `cv` gives consecutive, unshuffled folds for a regressor.
# If the rows are ordered by stock_id, whole stocks are held out per fold — confirm
# that this is the intended validation scheme.
n_splits = 10
cv_results = cross_val_score(
    estimator=model,
    X=X_train,
    y=y,
    scoring=score_model_on_rmspe,
    cv=n_splits,
)

# Mean fold score and 1.96 standard deviations of the fold scores.
cv_mean = cv_results.mean()
cv_spread = 1.96 * cv_results.std()
print("RMSPE from cross-validation:")
print(round(cv_mean, 3),  " +/-", round(cv_spread, 2))

## Final Fit and Predict

In [None]:
# Refit on all training rows; this fitted model produces the submission predictions.
model.fit(X=X_train, y=y)

In [None]:
# Build the test-side feature table the same way as the training one.
test_df = data_loader.read_labels(test=True)
book_features_test = get_book_features_for_all_stocks(
    stock_ids=test_df["stock_id"].unique(),
    test=True,
)
# Left join on purpose: every test row is kept, and rows without book features
# show up as NaN instead of silently disappearing as they would with an inner join.
test_enriched = pd.merge(test_df, book_features_test, how="left")

In [None]:
# Report the share of missing values per column (only columns that have any),
# then fill the gaps in numeric columns with that column's mean over the test rows.
print("Missing value prevalence in X_test (will be mean-imputed):")
na_pct = test_enriched.isna().mean()
print(na_pct.loc[na_pct.gt(0)])
numeric_column_means = test_enriched.mean(numeric_only=True)
test_enriched = test_enriched.fillna(numeric_column_means)

In [None]:
# Build the test design matrix and pin it to the training columns, in the training order.
# A column that exists only in X_train is added as zeros and a column unknown to the model
# is dropped, so predict() always sees the feature layout it was fitted on. When the two
# matrices already have the same columns this is a no-op.
X_test = make_X(test_enriched).reindex(columns=X_train.columns, fill_value=0)
y_predicted = model.predict(X_test)

In [None]:
def prepare_submissions(test_df, y_predicted):
    """Write ``submission.csv`` with one ``row_id``/``target`` pair per test row.

    Parameters
    ----------
    test_df : DataFrame
        Test labels; must contain ``row_id``. It is not modified.
    y_predicted : array-like
        Predictions in the same row order as ``test_df``. They are matched
        by position: any index carried by a pandas Series is ignored, so a
        differently-labelled Series cannot silently turn into NaNs.

    Raises
    ------
    ValueError
        If the number of predictions differs from the number of test rows.

    Missing values in the submission are replaced by the mean predicted
    target. The file is written to the current working directory.
    """
    predictions = np.asarray(y_predicted)
    if len(predictions) != len(test_df):
        raise ValueError(
            "y_predicted has {} entries but test_df has {} rows".format(
                len(predictions), len(test_df)
            )
        )

    # Work on a fresh frame so the caller's test_df does not gain a "target" column.
    submission_df = test_df[["row_id"]].assign(target=predictions)
    submission_df = submission_df.fillna(submission_df["target"].mean())
    submission_df.to_csv('submission.csv',index = False)
    print("submissions prepared - done")
    
# Write submission.csv from the test labels and the model's predictions.
prepare_submissions(test_df=test_df, y_predicted=y_predicted)

In [None]:

#     stock_ids = test_df["stock_id"].unique()

In [None]:
# # simple utils
# def flatten_hierachical_column_index(df: pd.DataFrame) -> None:
#     df.columns = ['_'.join(col).strip() for col in df.columns.values]
    
# def columns_are_primary_key(df: pd.DataFrame, colnames: List[str]) -> bool:
#     if not df[colnames].duplicated().any():
#         print(f"({', '.join(colnames)}) is a primary key")
#         return True

In [None]:

 
    
# def aggregate_trades_for_stock_and_time_id(trade_time_slice: pd.DataFrame) -> pd.Series:
#     trade_time_slice = trade_time_slice.assign(
#         trade_price_1 = lambda df: df.price
#     )
#     volatilities = dict()
#     for i in range(1, 2):
#         log_returns = np.log(trade_time_slice[f"trade_price_{i}"]).diff()
#         volatilities[f"trade_volatility_{i}"] = calc_realized_volatility(log_returns)

#     return pd.Series(volatilities)
    
# def aggregate_trades_for_stock(df: pd.DataFrame) -> pd.DataFrame:
#     g = df.groupby(["time_id"], as_index=False)
#     result = g.apply(aggregate_trades_for_stock_and_time_id)
#     result["stock_id"] = df["stock_id"].iloc[0]
#     return result
    
    
# # @lru_cache(maxsize=MAX_CACHE)
# def get_stock_x_time_df_by_stock_id(stock_id: int, test: bool, verbose: bool =True) -> pd.DataFrame:
#     book_df = df_from_parquet_for_stock(stock_id, test=test, verbose=verbose)
#     book_agg = aggregate_book_for_stock(book_df)
    
#     trade_df = df_from_parquet_for_stock(stock_id, test=test, book_or_trade="trade", verbose=verbose)
#     trade_agg = aggregate_trades_for_stock(trade_df)
    
#     joined = book_agg.merge(trade_agg, on=["time_id", "stock_id"], how="left")

#     return joined


# def iter_stock_x_time_dfs(stock_ids: int, test: bool, verbose: bool =False) -> Iterator[pd.DataFrame]:
#     for stock_id in tqdm(stock_ids):
#         yield get_stock_x_time_df_by_stock_id(stock_id, test=test, verbose=verbose)
        
        
    
# get_stock_x_time_df_by_stock_id(37, test=False, verbose=False)    


### Compile Training Data

In [None]:
# train_df = pd.read_csv(os.path.join(INPUT_DIR, 'train.csv'))
# # 
# columns_are_primary_key(train_df, ["stock_id", "time_id"])

In [None]:
# # compile training df
# possible_stocks = train_df["stock_id"].unique()
# stock_ids = possible_stocks[:MAX_STOCKS_FOR_TRAINING]

# prepared_training_dfs = list()
# for i, stock_x_time_df in enumerate(iter_stock_x_time_dfs(stock_ids, test=False)):
#     if i >= MAX_STOCKS_FOR_TRAINING:
#         break     
#     prepared_training_dfs.append(stock_x_time_df)

    
# prepared_training_df = pd.concat(prepared_training_dfs)
# prepared_training_df = prepared_training_df.merge(train_df, on=["time_id", "stock_id"], how="inner")
# prepared_training_df.describe()

In [None]:
# # check if there are any null values
# null_count = prepared_training_df.isnull().sum()
# null_count[null_count > 0].to_frame("null count")

In [None]:
# prepared_training_df = prepared_training_df.fillna(prepared_training_df.mean())

### Look at simple OLS regression stats for selected features

In [None]:
# feature_names = ["volatility_1", "volatility_tail_1"]
# y = prepared_training_df["target"]
# X = prepared_training_df[feature_names]

In [None]:
# lm = sm.OLS(endog=y, exog=X)
# lm = lm.fit()
# lm.summary()

### Cross validate model using target metric

In [None]:
# simple_model = LinearRegression(fit_intercept=False)

# n_splits = 10
# cv_results = cross_val_score(simple_model, X, y, scoring=score_model_on_rmspe, cv=n_splits)
# print("OLS on target - RMSPE from cross-validation:")
# print(round(cv_results.mean(), 3),  " +/-", round(1.96 * cv_results.std(), 2))

In [None]:
# pf = PolynomialFeatures(interaction_only=True, include_bias=False)
# X_plus = pf.fit_transform(X)
# cv_results = cross_val_score(simple_model, X_plus, y, scoring=score_model_on_rmspe, cv=n_splits)
# print("OLS on target - RMSPE from cross-validation:")
# print(round(cv_results.mean(), 3),  " +/-", round(1.96 * cv_results.std(), 2))

## Predict on test set and submit

In [None]:
# def predict_and_prepare_submission(fitted_model) -> pd.DataFrame:
    
#     test_df = pd.read_csv('../input/optiver-realized-volatility-prediction/test.csv')
#     stock_ids = test_df["stock_id"].unique()
# #     print(test_df)
#     predictions = list()
#     for stock_x_time_df in iter_stock_x_time_dfs(stock_ids, test=True):
# #         print(stock_x_time_df)
#         X = stock_x_time_df[feature_names].copy()
#         X.fillna(0, inplace=True)
#         y_pred = fitted_model.predict(X)
#         stock_x_time_df["target"] = y_pred
        
#         predictions.append(stock_x_time_df[["stock_id", "time_id", "target"]])
        
#     predictions = pd.concat(predictions)
#     test_df = test_df.merge(predictions, on=["time_id", "stock_id"], how="left")
#     return test_df[["row_id", "target"]].fillna(test_df["target"].mean())

In [None]:
# simple_model.fit(X, y)

In [None]:
# submission_df = predict_and_prepare_submission(simple_model)
# submission_df.to_csv('submission.csv',index = False)