In [None]:
import os
import warnings, gc
import numpy as np 
import pandas as pd
import matplotlib.colors
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
from plotly.offline import init_notebook_mode
from datetime import datetime, timedelta
from sklearn.model_selection import TimeSeriesSplit
from sklearn.metrics import mean_squared_error, mean_absolute_error
import lightgbm as lgb
from decimal import ROUND_HALF_UP, Decimal
from tqdm import tqdm
import pickle
import jpx_tokyo_market_prediction

# Silence every warning for the rest of the session.
# NOTE(review): this also hides library deprecation warnings.
warnings.filterwarnings("ignore")
import plotly.figure_factory as ff

# Call plotly.offline's notebook initialiser with connected=True.
init_notebook_mode(connected = True)
# Shared plotly layout (font family/size, figure width) and plotly's qualitative colour list.
# NOTE(review): neither `temp` nor `colors` is referenced in the cells visible here.
temp = dict(layout=go.Layout(font=dict(family="Franklin Gothic", size=12), width=800))
colors = px.colors.qualitative.Plotly

# What to do in this Notebook
This notebook computes features using the fractional differencing introduced in Chapter 5 (Part 1) of "Advances in Financial Machine Learning", trains a model on them, and submits predictions.  
The fractional differences are computed by getWeights_FFD and fracDiff_FFD: calc_feature_per_SecuritiesCode calls fracDiff_FFD, which in turn calls getWeights_FFD.


ファイナンス機械学習Part1の5章で紹介された分数次差分による特徴量でモデルを学習してsubmitする。  
分数次差分の計算を定義している関数はgetWeights_FFDおよびfracDiff_FFDで、これらをcalc_feature_per_SecuritiesCodeから呼び出している。

# What is fractional differencing?
Price series are not suitable as direct input to an ML model because the distribution of their values is not constant over time (they are non-stationary).  
One way to make the distribution of a feature constant is to take the first-order (integer) difference (=df.Close.diff() or df.Close/df.Close.shift()-1), but this removes more information than necessary from the prices themselves.  
Fractional differencing is therefore used to make the distribution constant without discarding more information than necessary.  
  
価格系列は値の分布が一定でないため、そのままMLモデルに入力するには不向きである。  
分布を一定にする方法として階差を取る(=df.Close.diff() or df.Close/df.Close.shift()-1)方法があるが、この方法は価格自体が持つ情報を必要以上に削いでしまう。  
そこで必要以上に情報を落とすことなく分布を一定にする方法として分数次差分を使う。


# But...
Computing these features is very time-consuming, so this approach cannot be used for the private submission.

処理に非常に時間がかかり、private向けのsubmitには使えない

# Load Data

In [None]:
# Root folder of the competition data.
data_dir="../input/jpx-tokyo-stock-exchange-prediction"
# Stack the main training prices and the supplemental prices into one frame.
# ignore_index=True builds a fresh 0..n-1 index without leaving each file's
# own row numbers behind as an extra "index" column.
df_price=pd.concat(
    [
        pd.read_csv(os.path.join(data_dir,subdir,'stock_prices.csv'))
        for subdir in ('train_files','supplemental_files')
    ],
    ignore_index=True,
)

# Preprocessing

In [None]:
def adjust_price(df_price):
    """
    Adjust prices and volume by the cumulative AdjustmentFactor, per SecuritiesCode.

    For every security the rows are sorted by Date and a
    CumulativeAdjustmentFactor column is added: the running product of
    AdjustmentFactor shifted down by one row (1.0 on the first row).
    Open/High/Low/Close are divided by that factor and Volume is multiplied
    by it. Results are rounded to one decimal place (ROUND_HALF_UP) and exact
    zeros are treated as missing. Missing Close is forward-filled, missing
    Open/High/Low take the (filled) Close of the same row, and missing Volume
    becomes 0. NaN in ExpectedDividend is replaced by 0.

    df_price: DataFrame with at least the columns Date, SecuritiesCode, Open,
        High, Low, Close, Volume, AdjustmentFactor and ExpectedDividend.
        The input frame is not modified.
    return: new DataFrame ordered by SecuritiesCode then Date, with a fresh
        0..n-1 index and the extra CumulativeAdjustmentFactor column.
    """
    def round_half_up(x):
        # Round to one decimal with ROUND_HALF_UP; NaN passes through as NaN.
        return float(Decimal(str(x)).quantize(Decimal('0.1'), rounding = ROUND_HALF_UP))

    def generate_adjusted_features(df):
        # sort data to generate CumulativeAdjustmentFactor (sort_values returns a new frame)
        df = df.sort_values("Date", ascending = True).copy()

        # a row's own factor only applies to the rows after it, hence shift(1)
        df["CumulativeAdjustmentFactor"] = df["AdjustmentFactor"].cumprod().shift(1).fillna(1.0)

        # Close must come first: Open/High/Low fall back to the already filled Close
        for column in ["Close", "Open", "High", "Low", "Volume"]:
            if column == "Volume":
                adjusted = df[column] * df["CumulativeAdjustmentFactor"]
            else:
                adjusted = df[column] / df["CumulativeAdjustmentFactor"]
            adjusted = adjusted.map(round_half_up)
            # zeros are treated as missing values
            adjusted = adjusted.mask(adjusted == 0)

            # NaN handling
            if column == "Close":
                adjusted = adjusted.ffill()
            elif column in ("Open", "High", "Low"):
                adjusted = adjusted.fillna(df["Close"])
            else:
                adjusted = adjusted.fillna(0)

            # Whole-column assignment replaces the column, so an integer input
            # column never has to be upcast in place.
            df[column] = adjusted

        return df

    # assign() returns a new frame, so the caller's data is left untouched
    df_price = df_price.assign(ExpectedDividend = df_price["ExpectedDividend"].fillna(0))
    df_price = df_price.sort_values(["SecuritiesCode", "Date"], ascending = True)

    # Iterating the groups (instead of groupby.apply) always hands the helper the
    # full rows, including SecuritiesCode, independent of the pandas version.
    adjusted_groups = [generate_adjusted_features(group) for _, group in df_price.groupby("SecuritiesCode")]
    if not adjusted_groups:
        # empty input: nothing to adjust, but still expose the extra column
        return generate_adjusted_features(df_price).reset_index(drop = True)
    return pd.concat(adjusted_groups, ignore_index = True)

In [None]:
%%time
# Replace the raw price frame with the output of adjust_price.
df_price=adjust_price(df_price)
# Per-column count of missing values, displayed as the cell output.
df_price.isna().sum()

# Calc Features

In [None]:
def getWeights_FFD(d,th):
    """
    calc weights for fixed-width-window fractional differencing

    Starting from w_0 = 1 the weights follow
        w_k = -w_{k-1} / k * (d - k + 1)
    and generation stops at the first weight whose absolute value is below
    th (that weight is not kept).

    d:differencing order (frac)
    th:threshold, must be > 0
    return: float array of shape (n_weights, 1), ordered from the largest
        lag down to lag 0, so the last entry is always 1
    raises ValueError: th is not strictly positive (the loop would then have
        no guaranteed stopping point)
    """
    # `not th>0` also rejects NaN, for which `abs(w_)<th` could never become true
    if not th>0:
        raise ValueError(f'th must be a positive number, got {th!r}')
    w,k = [1.0],1
    while True:
        w_=-w[-1]/k*(d-k+1)
        if abs(w_)<th:break
        w.append(w_);k+=1
    # reverse so that the weight for the oldest observation comes first
    w=np.array(w[::-1]).reshape(-1,1)
    return w

def fracDiff_FFD(df_,d=0.4,th=1e-4,weights=None):
    """
    calc fracDiff feature (fixed-width-window fractional differencing)

    Every column of df_ is forward-filled and then replaced by the weighted
    sum of its current and previous `width` values, where width = number of
    weights - 1. Rows that still contain a NaN after forward-filling (i.e.
    leading gaps) and the first `width` usable rows have no full window and
    are returned as NaN.

    df_:DataFrame of numeric columns, rows in chronological order
    d:frac (used only when weights is None)
    th:threshold (used only when weights is None)
    weights:optional precomputed weights in the layout returned by
        getWeights_FFD (oldest lag first, weight of the current row last);
        lets a caller compute them once and reuse them for many frames
    return: DataFrame with the same shape and column names as df_ and a
        default 0..n-1 index; row i belongs to row i of df_
    """
    if weights is None:
        weights=getWeights_FFD(d,th)
    kernel=np.asarray(weights,dtype=float).ravel()
    if kernel.size==0:
        raise ValueError('weights must contain at least one value')
    width=len(kernel)-1

    values=np.full(df_.shape,np.nan)
    filled=df_.ffill()
    # Positions (in df_) of the rows that are complete after forward-filling.
    # Results are written back to these positions so that leading NaN rows
    # do not shift the features onto the wrong rows.
    valid_pos=np.flatnonzero(filled.notna().all(axis=1).to_numpy())
    clean=filled.to_numpy(dtype=float)[valid_pos]

    # np.convolve would silently swap its arguments if the window were longer
    # than the series, so only run it when at least one full window exists.
    if clean.shape[0]>width:
        out_pos=valid_pos[width:]
        # 'valid' convolution with the reversed kernel equals the dot product of
        # the weights with each window, computed for all rows at once.
        flipped=kernel[::-1]
        for j in range(clean.shape[1]):
            values[out_pos,j]=np.convolve(clean[:,j],flipped,mode='valid')

    ret_df=pd.DataFrame(values,columns=df_.columns)
    return ret_df

def calc_feature_per_SecuritiesCode(df_price):
    """
    calc simple features & FracDiff features for one block of rows

    Added columns:
      ror{i}     : Close / Close shifted by i rows - 1        (i = 1..5)
      high{i}    : Close / rolling-i max of High - 1          (i = 1..5)
      low{i}     : Close / rolling-i min of Low - 1           (i = 1..5)
      ma5, ma10  : Close / rolling mean of Close - 1
      vma5, vma10: Volume / rolling mean of Volume - 1
      Open_frac, High_frac, Low_frac, Close_frac: output of fracDiff_FFD

    df_price: DataFrame with Date, Open, High, Low, Close and Volume columns;
        presumably the rows of a single SecuritiesCode (the caller groups by
        it). The input frame is not modified.
    return: Date-sorted copy of df_price with the feature columns appended
    """
    # Work on a sorted copy: sorting or adding columns in place would mutate
    # the group object handed in by groupby.
    df_price=df_price.sort_values(by='Date').copy()
    close=df_price['Close']
    volume=df_price['Volume']

    # simple features
    for i in range(1,5+1):
        df_price[f'ror{i}']=close/close.shift(i)-1
        df_price[f'high{i}']=close/df_price['High'].rolling(i).max()-1
        df_price[f'low{i}']=close/df_price['Low'].rolling(i).min()-1
    for window in (5,10):
        df_price[f'ma{window}']=close/close.rolling(window).mean()-1
    for window in (5,10):
        df_price[f'vma{window}']=volume/volume.rolling(window).mean()-1

    # FracDiff feature
    cols=['Open','High','Low','Close']
    df_frac=fracDiff_FFD(df_price[cols])
    df_frac.columns=[f'{col}_frac' for col in cols]
    # df_frac has a default 0..n-1 index, so copy the values over by position
    for c in df_frac.columns:
        df_price[c]=df_frac[c].values
    return df_price
def calc_feature(df_price):
    """
    calc features for every security and stack the results

    df_price is split by SecuritiesCode, calc_feature_per_SecuritiesCode is
    run on each part and the parts are concatenated in ascending
    SecuritiesCode order.

    df_price: DataFrame containing a SecuritiesCode column plus the columns
        needed by calc_feature_per_SecuritiesCode
    return: DataFrame with the feature columns added; rows keep their
        original index labels and SecuritiesCode stays an ordinary column
    """
    # Iterating the groups (instead of groupby.apply) keeps SecuritiesCode in
    # each part and never adds it as an index level, so later merges and
    # group-bys on that column cannot become ambiguous.
    per_security=[calc_feature_per_SecuritiesCode(group) for _,group in df_price.groupby('SecuritiesCode')]
    if not per_security:
        # empty input: still return a frame that carries the feature columns
        return calc_feature_per_SecuritiesCode(df_price)
    return pd.concat(per_security)


In [None]:
%%time
# Quicker experiment on a subset of securities (the filter goes inside the indexing brackets):
#df_price=calc_feature(df_price[df_price['SecuritiesCode']<3000])
# Replace the frame with its feature-augmented version.
df_price=calc_feature(df_price)
# Per-column count of missing values, displayed as the cell output.
df_price.isna().sum()

# Train Model

In [None]:
# Half-open [FROM, TO) date windows for the three splits.
# NOTE(review): the bounds are compared with df_price.Date as-is; this presumes
# Date holds ISO 'YYYY-MM-DD' strings (or datetimes) - confirm.
TRAIN_FROM,TRAIN_TO='2017-01-01','2021-09-01'
VAL_FROM,VAL_TO='2021-09-01','2021-12-01'
TEST_FROM,TEST_TO='2021-12-01','2022-03-01'

# Drop, in place, every row that has a missing value in any column.
df_price.dropna(inplace=True)

# Index labels of the rows that fall inside each window.
train_idx=df_price.index[(df_price.Date>=TRAIN_FROM)&(df_price.Date<TRAIN_TO)]
val_idx=df_price.index[(df_price.Date>=VAL_FROM)&(df_price.Date<VAL_TO)]
test_idx=df_price.index[(df_price.Date>=TEST_FROM)&(df_price.Date<TEST_TO)]

# Model inputs: lagged return/high/low ratios, moving-average ratios, FracDiff columns.
features=(
    [f'{kind}{lag}' for lag in range(1,5+1) for kind in ('ror','high','low')]
    +['ma5','ma10','vma5','vma10']
    +[f'{col}_frac' for col in ('Open','High','Low','Close')]
)
# Name of the target column.
y='Target'

In [None]:
%%time
# LightGBM hyper-parameters.
# The booster type is given once through the constructor's own `boosting_type`
# argument rather than through the `boosting` alias plus `boosting_type=None`.
lgb_params={
    'random_state': 0,
    'verbose': -1,
    'max_depth': 3,
    'objective': 'regression',
    'boosting_type': 'dart',
}

model=lgb.LGBMRegressor(**lgb_params)
# Early stopping is requested once, through the callback. The `verbose`
# keyword of fit() is not used because newer LightGBM releases no longer
# accept it; logging is already silenced by 'verbose': -1 above.
# NOTE(review): LightGBM reports that early stopping is not available in dart
# mode, so with boosting_type='dart' this callback is expected to be inactive.
# Switch to 'gbdt' if early stopping on the validation window is wanted.
model.fit(
    df_price.loc[train_idx,features],df_price.loc[train_idx,y],
    eval_set=[(df_price.loc[val_idx,features], df_price.loc[val_idx,y])],
    callbacks=[lgb.early_stopping(stopping_rounds=10, verbose=False)]
)

# Submit
Computing the features is very time-consuming, so this approach cannot be used for the private submission.

In [None]:
%%time
# Score every row of the feature frame once, up front.
df_price['pred']=model.predict(df_price[features])
# Loop-invariant lookup table of predictions keyed by (Date, SecuritiesCode).
# The index is reset so that only the columns can serve as merge keys.
pred_lookup=df_price[['Date','SecuritiesCode','pred']].reset_index(drop=True)
env = jpx_tokyo_market_prediction.make_env()
for (prices, options, financials, trades, secondary_prices, sample_prediction) in env.iter_test():
    sample_prediction=pd.merge(sample_prediction,pred_lookup,on=['Date','SecuritiesCode'],how='left')
    # Rows without a precomputed prediction get a score of 0. Assigning the
    # result back (instead of an in-place fillna on the selected column)
    # guarantees the frame itself is updated.
    sample_prediction['pred']=sample_prediction['pred'].fillna(0)
    # rank(method='first') numbers the predictions 1..N in ascending order with
    # no ties, so N - rank gives 0 to the largest prediction and N-1 to the smallest.
    sample_prediction['Rank']=len(sample_prediction['pred'])-sample_prediction['pred'].rank(method='first').astype(np.int16)
    env.predict(sample_prediction[['Date','SecuritiesCode','Rank']])