## ライブラリの読み込み

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found under the Kaggle working directory.
for root, _, names in os.walk('/kaggle/working'):
    for name in names:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

## データの確認

In [None]:
stock_price_df = pd.read_csv("../input/jpx-tokyo-stock-exchange-prediction/train_files/stock_prices.csv")

In [None]:
print('stock_price_df.shape: ', stock_price_df.shape)

In [None]:
stock_price_df.info()

## APIの利用方法

In [None]:
# import jpx_tokyo_market_prediction
# env = jpx_tokyo_market_prediction.make_env()
# iter_test = env.iter_test()

# count = 0
# for (prices, options, financials, trades, secondary_prices, sample_prediction) in iter_test:
#     print(prices.head())
#     env.predict(sample_prediction)
#     count += 1
#     break

## 欠損値の確認

In [None]:
stock_price_df.isnull().sum()

## データサンプル

In [None]:
stock_price_df['SecuritiesCode'].tail()

In [None]:
# 9994 やまや
sample = stock_price_df[stock_price_df['SecuritiesCode'] == 9994]
print(sample.shape)
sample.head()

In [None]:
import matplotlib.pyplot as plt
plt.rcParams["font.size"] = 15

In [None]:
# sample.index = sample['Date']

In [None]:
# Line plot of the sample security's Close, skipping rows where Close is missing.
# The x axis is the frame's index, since Date is not set as the index.
plt.figure(figsize=(14, 4))
sample['Close'].dropna().plot()
plt.grid(True)
plt.title('Close')
plt.xticks(rotation=30)
plt.show()

In [None]:
# 30-bin histogram of the sample security's Target column.
plt.figure(figsize=(10, 4))
plt.hist(sample['Target'], alpha=0.5, bins=30)
plt.grid(True)
plt.title('Target')
plt.show()

In [None]:
def make_dollar_bar(df):
    """Resample one security's price rows into dollar bars.

    Rows whose ``Close`` is missing are dropped, then ``Close * Volume`` is
    accumulated row by row. Whenever the running total exceeds the threshold,
    the current row is emitted as a bar and the total restarts from zero.
    The threshold is 70% of (mean ``Volume`` * mean ``Close``), truncated to
    an integer, with both means taken over the non-missing values of ``df``.

    Args:
        df (pd.DataFrame): price rows with ``Close`` and ``Volume`` columns,
            already in the order they should be accumulated.
    Returns:
        pd.DataFrame: the emitted rows, re-indexed 0..n-1, with the same
        columns as ``df``. Empty (columns preserved) when every ``Close``
        is missing or no row pushes the total over the threshold.
    """
    df_dropped = df.dropna(subset=['Close']).reset_index(drop=True)
    if df_dropped.empty:
        # Nothing to accumulate; returning here also avoids int(nan) from
        # taking the mean of an empty array below.
        return df_dropped

    threshold = int(df['Volume'].dropna().values.mean() * df['Close'].dropna().values.mean() * 0.7)

    # Traded value per row; walking a plain array avoids converting every row
    # to a dict (and the 'record' orient that pandas 2.x no longer accepts).
    turnover = (df_dropped['Close'] * df_dropped['Volume']).to_numpy()

    bar_positions = []
    accumulated = 0
    for position, traded_value in enumerate(turnover):
        accumulated += traded_value
        if accumulated > threshold:
            bar_positions.append(position)
            accumulated = 0

    # Positional selection keeps the original columns even when no bar was emitted.
    return df_dropped.iloc[bar_positions].reset_index(drop=True)

In [None]:
dollar = make_dollar_bar(sample)
dollar.shape

In [None]:
dollar.head(10)

In [None]:
# Look ahead within the dollar-bar series: shift(-1) / shift(-2) put the Close of
# the next bar and of the bar after that on the current row.
dollar["Close_shift1"] = dollar["Close"].shift(-1)
dollar["Close_shift2"] = dollar["Close"].shift(-2)

# Relative change from the next bar's Close to the following bar's Close.
# NOTE(review): presumably meant to mirror how Target is defined on the daily rows — confirm.
dollar["rate"] = (dollar["Close_shift2"] - dollar["Close_shift1"]) / dollar["Close_shift1"]

In [None]:
# Overlay two density-normalised histograms: Target from the original rows
# ('TimeBar') and the rate computed on the dollar bars ('DollarBar').
plt.figure(figsize=(10, 4))
plt.hist(sample['Target'], alpha=0.5, bins=30, density=True, label='TimeBar')
plt.hist(dollar['rate'], alpha=0.5, bins=30, density=True, label='DollarBar')
plt.legend()
plt.grid(True)
plt.title('rate')
plt.show()

## 移動平均線

In [None]:
# Rolling mean of Close over 5, 25 and 75 rows of `dollar`; min_periods=1 means the
# first rows average over however many rows are available instead of being NaN.
# ma_cols starts with "Close" so the raw series is plotted next to its averages.
periods = [5, 25, 75]
ma_cols = ["Close"]
for period in periods:
    col = "{}MA".format(period)
    dollar[col] = dollar['Close'].rolling(period, min_periods=1).mean()
    ma_cols.append(col)

# Plot
fig, ax = plt.subplots(figsize=(20, 8))

for col in ma_cols:
    ax.plot(dollar[col], label=col)
ax.set_ylabel("Price")
# ax.set_xlabel("Date")
ax.grid(True)
ax.legend()
plt.show()

In [None]:
dollar.head()

## 価格変化率

In [None]:
# 5日、25日、75日の価格変化率を算出
periods = [5, 25, 75]
return_cols = []
for period in periods:
    col = "{}PriceChangeRate".format(period)
    dollar[col] = dollar["Close"].pct_change(period) * 100
    return_cols.append(col)

# プロット
fig, ax = plt.subplots(figsize=(20, 8))
    
for col in return_cols:
    ax.plot(dollar[col], label=col)
ax.set_ylabel("PriceChangeRate (%)",fontsize=16)
ax.grid(True)
ax.legend()
plt.show()

## ヒストリカル・ボラティリティ

In [None]:
# Rolling standard deviation (over 5, 25 and 75 rows) of the row-to-row difference in Close.
# No min_periods is given, so the first rows of each window are NaN.
periods = [5, 25, 75]
vol_cols = []
for period in periods:
    col = "{}Volatility".format(period)
    dollar[col] = dollar["Close"].diff().rolling(period).std()
    vol_cols.append(col)

# Plot
fig, ax = plt.subplots(figsize=(20, 8))
    
for col in vol_cols:
    ax.plot(dollar[col], label=col)
ax.set_ylabel("Volatility")
ax.grid(True)
ax.legend()
plt.show()

In [None]:
dollar.columns

In [None]:
# One figure with three stacked panels: moving averages, price change rates and
# volatilities, using the column lists built in the cells above.
fig, ax = plt.subplots(nrows=3 ,figsize=(30, 12))

for col in ma_cols:
    ax[0].plot(dollar[col], label=col)

for col in return_cols:
    ax[1].plot(dollar[col], label=col)
    
for col in vol_cols:
    ax[2].plot(dollar[col], label=col)
    
ax[0].set_ylabel("Price",fontsize=16)
ax[1].set_ylabel("PriceChangeRate(%)")
ax[2].set_ylabel("Volatility")
    
# Same grid and legend on every panel.
for _ax in ax:
    _ax.grid(True)
    _ax.legend()

## 特徴量の欠損値処理

In [None]:
# Name the engineered columns explicitly instead of taking "the last nine columns":
# a positional slice silently picks the wrong columns if cells are re-run or reordered.
# ma_cols[0] is the raw "Close" series, so it is skipped.
feature_columns = ma_cols[1:] + return_cols + vol_cols
# Warm-up rows of the rolling / pct_change features are NaN; treat them as 0.
dollar[feature_columns] = dollar[feature_columns].fillna(0)
# pct_change yields +/-inf when the earlier Close is 0; treat those as 0 as well.
dollar[feature_columns] = dollar[feature_columns].replace([np.inf, -np.inf], 0)

In [None]:
dollar.isnull().sum()

In [None]:
def make_features(df):
    """Add moving-average, price-change-rate and volatility columns to ``df``.

    For each period in (5, 25, 75) rows the following columns are written:

    * ``{period}MA``         - rolling mean of ``Close`` (``min_periods=1``)
    * ``{period}PCR``        - percentage change of ``Close`` vs. ``period`` rows earlier
    * ``{period}Volatility`` - rolling standard deviation of the row-to-row ``Close`` difference

    NaN and +/-inf values in exactly these nine columns are replaced with 0.
    The columns are tracked by name, so calling the function again on a frame
    that already carries them (or that gained other columns in between) never
    touches unrelated columns.

    Args:
        df (pd.DataFrame): frame with a ``Close`` column. Modified in place.
    Returns:
        pd.DataFrame: the same ``df`` object with the feature columns added.
    """
    periods = [5, 25, 75]
    feature_columns = []

    for period in periods:
        col = "{}MA".format(period)
        df[col] = df['Close'].rolling(period, min_periods=1).mean()
        feature_columns.append(col)

    for period in periods:
        col = "{}PCR".format(period)
        df[col] = df["Close"].pct_change(period) * 100
        feature_columns.append(col)

    for period in periods:
        col = "{}Volatility".format(period)
        df[col] = df["Close"].diff().rolling(period).std()
        feature_columns.append(col)

    # Warm-up rows are NaN and a zero earlier Close gives +/-inf; both become 0.
    df[feature_columns] = df[feature_columns].fillna(0)
    df[feature_columns] = df[feature_columns].replace([np.inf, -np.inf], 0)

    return df

In [None]:
code = stock_price_df['SecuritiesCode'].unique()
code.shape

In [None]:
from tqdm import tqdm

In [None]:
# Build the dollar bars and features one security at a time, then concatenate once.
# Growing a DataFrame with pd.concat inside the loop re-copies everything gathered
# so far on every iteration.
frames = []
for c in tqdm(code):
    dollar = make_dollar_bar(stock_price_df[stock_price_df['SecuritiesCode'] == c])
    if dollar.empty:
        # No bar was produced for this security, so there is nothing to featurise.
        continue
    dollar = make_features(dollar)
    frames.append(dollar)
df = pd.concat(frames, axis=0) if frames else pd.DataFrame()

In [None]:
# print('元データ', stock_price_df.shape)
print('変換後', df.shape)

In [None]:
df.reset_index(drop=True, inplace=True)

In [None]:
df.tail(10)

In [None]:
# df.to_csv('/kaggle/working/my_train.csv', index=False)

## データセットの作成

In [None]:
# df = pd.read_csv('./my_train.csv')

In [None]:
code = np.sort(df['SecuritiesCode'].unique())
code

In [None]:
list(code).index(1301)

## 学習

In [None]:
from sklearn.model_selection import train_test_split
import lightgbm as lgb

In [None]:
df.columns

In [None]:
def trainer(df):
    """Fit a LightGBM regressor that predicts ``Target`` from the raw price columns.

    The rows are split 80/20 without shuffling, so the validation part is the
    tail of ``df`` in its existing order. Both parts are passed as evaluation
    sets; per-iteration evaluation logging is switched off.

    Args:
        df (pd.DataFrame): rows with ``Open``, ``High``, ``Low``, ``Close``,
            ``Volume`` and ``Target`` columns.
    Returns:
        lgb.LGBMRegressor: the fitted model.
    """
    # Only the raw price columns are used; the engineered MA / PCR / Volatility
    # columns are not part of the model input.
    c = ['Open', 'High', 'Low', 'Close', 'Volume']
    X = df[c]
    Y = df['Target']
    # Split into training and validation data (no shuffling).
    X_train, X_validation, y_train, y_validation = train_test_split(X, Y, test_size=0.2, shuffle=False)

    SEED = 42

    model = lgb.LGBMRegressor(
        random_state=SEED,
    )

    model.fit(
        X_train,
        y_train,
        eval_set=[(X_validation, y_validation), (X_train, y_train)],
        # `verbose=` on fit() is deprecated in LightGBM 3.3 and removed in 4.x;
        # the log_evaluation callback with period=0 silences evaluation logs instead.
        callbacks=[lgb.log_evaluation(period=0)],
    )

    # To inspect the learning curves, call lgb.plot_metric(model) on the result.
    return model

In [None]:
# Train one model per security. The models are stored in the same order as `code`,
# so a model can be found through the position of its security code.
model_list = [
    trainer(df[df['SecuritiesCode'] == security_code])
    for security_code in tqdm(code)
]

In [None]:
len(model_list)

## 提出

In [None]:
past_df = pd.read_csv('../input/jpx-tokyo-stock-exchange-prediction/supplemental_files/secondary_stock_prices.csv')

In [None]:
# 時系列APIのロード
import jpx_tokyo_market_prediction
env = jpx_tokyo_market_prediction.make_env()
iter_test = env.iter_test()

In [None]:
def merge_data(prices, options, financials, trades, secondary_prices, stock_list=None):
    """Combine the per-day inputs into one frame based on ``prices``.

    Only ``prices`` and ``stock_list`` are used. ``options``, ``financials``,
    ``trades`` and ``secondary_prices`` are accepted so the signature matches
    the tuple produced by the time-series API, but they are not merged.

    Args:
        prices (pd.DataFrame): base rows; must contain ``SecuritiesCode``
            when ``stock_list`` is given.
        options, financials, trades, secondary_prices: unused.
        stock_list (pd.DataFrame | None): per-security attributes keyed by
            ``SecuritiesCode``. Its ``Close`` column is renamed to ``Close_x``
            so it cannot collide with ``prices['Close']``. When omitted, no
            join is performed.
    Returns:
        pd.DataFrame: a copy of ``prices``, left-joined with ``stock_list``
        when one is supplied. The inputs are never modified.
    """
    # stock_prices is the base table.
    base_df = prices.copy()

    if stock_list is None:
        # Callers that have no stock list still get a usable base frame.
        return base_df

    # Join with stock_list; rename() returns a new frame, so the caller's
    # stock_list stays untouched.
    _stock_list = stock_list.rename(columns={'Close': 'Close_x'})
    base_df = base_df.merge(_stock_list, on='SecuritiesCode', how="left")

    return base_df

In [None]:
def adjust_price(price):
    """Add ``CumulativeAdjustmentFactor`` and ``AdjustedClose`` to a price frame.

    Per ``SecuritiesCode``, ``AdjustmentFactor`` is multiplied cumulatively
    from the newest row back to the oldest. ``AdjustedClose`` is that factor
    times ``Close``, rounded half-up to one decimal place; values equal to 0
    are treated as missing and forward-filled within the security.

    Args:
        price (pd.DataFrame): rows with ``Date`` (``%Y-%m-%d`` strings or
            datetimes), ``SecuritiesCode``, ``Close`` and ``AdjustmentFactor``.
            The input frame is not modified.
    Returns:
        pd.DataFrame: a new frame sorted by ``SecuritiesCode`` then ``Date``,
        re-indexed 0..n-1, with ``Date`` converted to datetime and the two
        generated columns appended.
    """
    # Imported here because the file has no top-level import of decimal.
    from decimal import ROUND_HALF_UP, Decimal

    def _round_half_up(value):
        # Decimal(str(x)) rounds the printed value, avoiding binary-float surprises.
        return float(Decimal(str(value)).quantize(Decimal('0.1'), rounding=ROUND_HALF_UP))

    # Work on a copy so the caller's frame keeps its original Date column.
    price = price.copy()
    price["Date"] = pd.to_datetime(price["Date"], format="%Y-%m-%d")

    # Oldest-to-newest within each security; the fresh index makes the
    # index-aligned assignments below unambiguous.
    price = price.sort_values(["SecuritiesCode", "Date"]).reset_index(drop=True)

    # Walk each security from newest to oldest to accumulate the factor;
    # the result is aligned back to `price` by index.
    newest_first = price.iloc[::-1]
    price["CumulativeAdjustmentFactor"] = (
        newest_first.groupby("SecuritiesCode")["AdjustmentFactor"].cumprod()
    )

    adjusted = (price["CumulativeAdjustmentFactor"] * price["Close"]).map(_round_half_up)
    # A value of 0 is treated as missing so it can be forward-filled.
    adjusted = adjusted.mask(adjusted == 0)
    price["AdjustedClose"] = adjusted.groupby(price["SecuritiesCode"]).ffill()

    return price

In [None]:
def collector(prices, options, financials, trades, secondary_prices, stock_list=None):
    """Merge one day's inputs and add the adjusted close.

    Args:
        prices, options, financials, trades, secondary_prices: the frames
            produced for one day; passed on to ``merge_data``.
        stock_list (pd.DataFrame | None): optional per-security attributes to
            join onto ``prices``. ``merge_data`` is only called when this is
            given, because it needs a stock list to join against.
    Returns:
        pd.DataFrame: the merged frame after ``adjust_price``.
    """
    # Consolidate the loaded data into a single frame.
    if stock_list is None:
        base_df = prices.copy()
    else:
        base_df = merge_data(prices, options, financials, trades, secondary_prices, stock_list)
    # Generate the AdjustedClose column.
    base_df = adjust_price(base_df)

    return base_df

In [None]:
def add_rank(df, col_name="pred"):
    """Write a zero-based ``Rank`` column, ranking ``col_name`` within each ``Date``.

    The highest value on a date gets rank 0; ties are broken by row order.

    Args:
        df (pd.DataFrame): frame with ``Date`` and ``col_name`` columns. Modified in place.
        col_name (str): column holding the scores to rank.
    Returns:
        pd.DataFrame: the same ``df`` with an integer ``Rank`` column.
    """
    scores_by_date = df.groupby("Date")[col_name]
    # rank() is one-based, so shift it down to start at 0.
    df["Rank"] = scores_by_date.rank(method="first", ascending=False) - 1
    df["Rank"] = df["Rank"].astype("int")
    return df

In [None]:
df.head()

In [None]:
# feature_df = feature_df[feature_df['Date'] == current_date]
    
# result_df = pd.DataFrame()
# for s in feature_df['SecuritiesCode'].unique():
#     data = feature_df[feature_df['SecuritiesCode'] == s]
#     model = model_list[list(code).index(int(s))]
#     c = ['Open', 'High', 'Low', 'Close', 'Volume', '5MA', '25MA', '75MA', '5PCR', '25PCR', '75PCR', '5Volatility', '25Volatility', '75Volatility']
#     preds = model.predict(data[c])
#     data['pred'] = preds
#     result_df = pd.concat([result_df, data])

# result_df.reset_index(drop=True, inplace=True)

# # 推論結果からRANKを導出し、提出データに反映
# result_df = add_rank(result_df)
# feature_map = result_df.set_index('SecuritiesCode')['Rank'].to_dict()
# sample_prediction['Rank'] = sample_prediction['SecuritiesCode'].map(feature_map)

# # 結果を登録
# env.predict(sample_prediction)

# model_list was filled in the same order as `code`, so pair them up once instead of
# searching `code` for every security on every day.
model_by_code = {int(security_code): fitted for security_code, fitted in zip(code, model_list)}
# Columns the models were trained on.
c = ['Open', 'High', 'Low', 'Close', 'Volume']

for i, (prices, options, financials, trades, secondary_prices, sample_prediction) in enumerate(tqdm(iter_test)):
    current_date = prices["Date"].iloc[0]

    if i == 0:
        # To prevent leakage, drop history dated on or after the first date
        # served by the time-series API.
        past_df = past_df[past_df["Date"] < current_date]
    # Drop history older than 80 business days to keep memory usage down.
    threshold = (pd.Timestamp(current_date) - pd.offsets.BDay(80)).strftime("%Y-%m-%d")
    past_df = past_df[past_df["Date"] >= threshold]

    # Append the rows received from the time-series API to the history.
    past_df = pd.concat([past_df, prices]).reset_index(drop=True)
    # NOTE(review): make_features runs over all securities at once, so its rolling
    # windows span different securities. Harmless while the models only read the raw
    # price columns in `c`, but it must be done per security before those features are used.
    feature_df = make_features(past_df)
    feature_df = feature_df[feature_df['Date'] == current_date]

    scored = []
    for s in feature_df['SecuritiesCode'].unique():
        # Copy the slice so that adding the prediction column does not write into a view.
        data = feature_df[feature_df['SecuritiesCode'] == s].copy()
        data['pred'] = model_by_code[int(s)].predict(data[c])
        scored.append(data)

    # Concatenate once per day rather than growing a frame inside the loop.
    result_df = pd.concat(scored).reset_index(drop=True)

    # Derive the rank from the predictions and write it into the submission frame.
    result_df = add_rank(result_df)
    feature_map = result_df.set_index('SecuritiesCode')['Rank'].to_dict()
    sample_prediction['Rank'] = sample_prediction['SecuritiesCode'].map(feature_map)

    # Register the result.
    env.predict(sample_prediction)

In [None]:
sample_prediction