# Simple MRA

In [1]:
from pathlib import Path
import os
import warnings
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import seaborn as sns
import matplotlib.pyplot as plt
import tensorflow as tf

warnings.filterwarnings("ignore")

PREV_MAX = 80
if os.environ.get("KAGGLE_DATA_PROXY_TOKEN") != None:
    BASE_OUTPUT_PATH = Path(f'/kaggle/working')
    BASE_INPUT_PATH = Path(f'/kaggle/input/optiver-trading-at-the-close')
    TRAIN_FILE = Path(f'{BASE_INPUT_PATH}/train.csv')
    TEST_FILE = Path(f'{BASE_INPUT_PATH}/test.csv')
    IS_OFFLINE = False

    # subprocess.check_call([sys.executable, "-m", "pip", "install", "talib_binary", "--no-index", "--find-links", "file:///kaggle/input/ta-lib/"])
else:
    BASE_OUTPUT_PATH = Path(f'../output')
    BASE_INPUT_PATH = Path(f'../kaggle/input/optiver-trading-at-the-close')
    TRAIN_FILE = Path(f'{BASE_INPUT_PATH}/train.csv')
    TEST_FILE = Path(f'{BASE_INPUT_PATH}/test.csv')
    SAMPLE_SUBMISSION_FILE = Path(f'{BASE_INPUT_PATH}/sample_submission.csv')
    REVEALED_TARGETS_FILE = Path(f'{BASE_INPUT_PATH}/revealed_targets.csv')
    IS_OFFLINE = True
print(f"BASE_OUTPUT_PATH: {BASE_OUTPUT_PATH}")
print(f"BASE_INPUT_PATH: {BASE_INPUT_PATH}")
print(f"TRAIN_FILE: {TRAIN_FILE}")
print(f"TEST_FILE: {TEST_FILE}")
print(f"IS_OFFLINE: {IS_OFFLINE}")


BASE_OUTPUT_PATH: ../output
BASE_INPUT_PATH: ../kaggle/input/optiver-trading-at-the-close
TRAIN_FILE: ../kaggle/input/optiver-trading-at-the-close/train.csv
TEST_FILE: ../kaggle/input/optiver-trading-at-the-close/test.csv
IS_OFFLINE: True


In [2]:
from typing import Sequence, Tuple

import pandas as pd


class MockApi:
    def __init__(self):
        '''
        YOU MUST UPDATE THE FIRST THREE LINES of this method.
        They've been intentionally left in an invalid state.

        Variables to set:
            input_paths: a list of two or more paths to the csv files to be served
            group_id_column: the column that identifies which groups of rows the API should serve.
                A call to iter_test serves all rows of all dataframes with the current group ID value.
            export_group_id_column: if true, the dataframes iter_test serves will include the group_id_column values.
        '''
        self.input_paths: Sequence[str] = [TEST_FILE, REVEALED_TARGETS_FILE, SAMPLE_SUBMISSION_FILE]
        self.group_id_column: str = 'time_id'
        self.export_group_id_column: bool = True
        # iter_test is only designed to support at least two dataframes, such as test and sample_submission
        assert len(self.input_paths) >= 2

        self._status = 'initialized'
        self.predictions = []

    def iter_test(self) -> Tuple[pd.DataFrame]:
        '''
        Loads all of the dataframes specified in self.input_paths,
        then yields all rows in those dataframes that equal the current self.group_id_column value.
        '''
        if self._status != 'initialized':

            raise Exception('WARNING: the real API can only iterate over `iter_test()` once.')

        dataframes = []
        for pth in self.input_paths:
            dataframes.append(pd.read_csv(pth, low_memory=False))
        group_order = dataframes[0][self.group_id_column].drop_duplicates().tolist()
        dataframes = [df.set_index(self.group_id_column) for df in dataframes]

        for group_id in group_order:
            self._status = 'prediction_needed'
            current_data = []
            for df in dataframes:
                cur_df = df.loc[group_id].copy()
                # returning single line dataframes from df.loc requires special handling
                if not isinstance(cur_df, pd.DataFrame):
                    cur_df = pd.DataFrame({a: b for a, b in zip(cur_df.index.values, cur_df.values)}, index=[group_id])
                    cur_df.index.name = self.group_id_column
                cur_df = cur_df.reset_index(drop=not(self.export_group_id_column))
                current_data.append(cur_df)
            yield tuple(current_data)

            while self._status != 'prediction_received':
                print('You must call `predict()` successfully before you can continue with `iter_test()`', flush=True)
                yield None

        with open('submission.csv', 'w') as f_open:
            pd.concat(self.predictions).to_csv(f_open, index=False)
        self._status = 'finished'

    def predict(self, user_predictions: pd.DataFrame):
        '''
        Accepts and stores the user's predictions and unlocks iter_test once that is done
        '''
        if self._status == 'finished':
            raise Exception('You have already made predictions for the full test set.')
        if self._status != 'prediction_needed':
            raise Exception('You must get the next test sample from `iter_test()` first.')
        if not isinstance(user_predictions, pd.DataFrame):
            raise Exception('You must provide a DataFrame.')

        self.predictions.append(user_predictions)
        self._status = 'prediction_received'


def make_env():
    return MockApi()

In [3]:
train_dataset = pd.read_csv(TRAIN_FILE)

In [4]:
features = ['imbalance_size', 'imbalance_buy_sell_flag', 'reference_price', 'matched_size', 
            'bid_price', 'bid_size', 'ask_price', 'ask_size', 'wap']

train_dataset_drop = train_dataset[features+['target', 'stock_id']].dropna()


In [5]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
import statsmodels.api as sm

# 重回帰モデルの結果を格納するための辞書
model_stats = {}
models = {}

# 'stock_id'ごとにデータをループ処理
for stock_id, group_data in train_dataset_drop.groupby('stock_id'):
    # 目的変数と説明変数の分離
    X = group_data[features]
    y = group_data['target']

    # 説明変数に定数項（切片）を追加
    X = sm.add_constant(X)

    if stock_id == 0:
        print(X.head())
        print(X.shape)
        print(X.info())

    # OLS（最小二乗法）モデルの訓練
    model = sm.OLS(y, X).fit()

    # 重回帰モデルによる予測値の算出
    y_pred = model.predict(X)
    train_dataset_drop.loc[train_dataset_drop['stock_id'] == stock_id, 'target'] = y_pred

    # 各種統計値の取得
    aic = model.aic
    f_stat = model.fvalue
    r_squared = model.rsquared

    # 統計値を辞書に保存
    model_stats[stock_id] = {'AIC': aic, 'F-Statistic': f_stat, 'R-Squared': r_squared}
    # モデルを辞書に保存
    models[stock_id] = model


     const  imbalance_size  imbalance_buy_sell_flag  reference_price  \
0      1.0      3180602.69                        1         0.999812   
191    1.0      1299772.70                        1         1.000026   
382    1.0      1299772.70                        1         0.999919   
573    1.0      1299772.70                        1         1.000133   
764    1.0      1218204.43                        1         1.000455   

     matched_size  bid_price  bid_size  ask_price  ask_size       wap  
0     13380276.64   0.999812  60651.50   1.000026   8493.03  1.000000  
191   15261106.63   0.999812  13996.50   1.000026  23519.16  0.999892  
382   15261106.63   0.999812   4665.50   0.999919  12131.60  0.999842  
573   15261106.63   1.000026  55998.00   1.000133  46203.30  1.000085  
764   15342674.90   1.000241  14655.95   1.000455  26610.45  1.000317  
(26455, 10)
<class 'pandas.core.frame.DataFrame'>
Index: 26455 entries, 0 to 5237780
Data columns (total 10 columns):
 #   Column      

# Test

In [7]:
if IS_OFFLINE:
    env = make_env()
else:
    import optiver2023
    env = optiver2023.make_env()
iter_test = env.iter_test()


for (test, revealed_targets, sample_prediction) in iter_test:
    # 特徴量を追加し、stock_id でグループ化
    test = sm.add_constant(test[features + ['stock_id']], has_constant='add')
    grouped = test.groupby('stock_id', sort=True)  # stock_id でグループ化し、同時にソート

    predictions = []

    for stock_id, group in grouped:
        model = models.get(stock_id, None)
        if model is not None:
            # stock_id ごとの特徴量を取得
            X = group.drop(columns=['stock_id'])

            # モデルを使用して予測
            stock_predictions = model.predict(X)
            predictions.extend(stock_predictions)
        else:
            # stock_id に対応するモデルがない場合、デフォルトの値を設定
            predictions.extend([0.0] * len(group))

    # sample_prediction に予測結果をセット
    print(type(predictions[0]))
    sample_prediction['target'] = predictions
    

    # 予測を提出
    env.predict(sample_prediction)


AttributeError: 'float' object has no attribute 'dtype'