# このノートブックは？
* CreateInitialData.ipynbで作成したデータを利用し、特徴量作成、MLPでの学習、予測結果のファイル出力を行う
* NASDAQ, AMEXのみを対象に学習したが、精度が劣るため、この予測結果は利用しないこととした

## Google Driveのマウント

In [None]:
from google.colab import drive
# Mount Google Drive into the Colab runtime at /content/drive.
drive.mount('/content/drive')
# IPython magic: make the project folder on Drive the working directory,
# so the relative `src` imports and data paths used below resolve.
%cd /content/drive/'My Drive'/'PROBSPACE'/'StockPricePrediction'

Mounted at /content/drive
/content/drive/My Drive/PROBSPACE/StockPricePrediction


## ライブラリのインストール、インポート

In [None]:
import src.install_libraries
import category_encoders as ce
from src.common_functions import *
from src.create_features import *
import tensorflow as tf
from tensorflow.keras import layers, activations
!pip install -q -U tensorflow_addons
import tensorflow_addons as tfa
from collections import defaultdict

Collecting category_encoders
  Downloading category_encoders-2.3.0-py2.py3-none-any.whl (82 kB)
Installing collected packages: category-encoders
Successfully installed category-encoders-2.3.0
Collecting ta
  Downloading ta-0.8.0.tar.gz (24 kB)
Building wheels for collected packages: ta
  Building wheel for ta (setup.py): started
  Building wheel for ta (setup.py): finished with status 'done'
  Created wheel for ta: filename=ta-0.8.0-py3-none-any.whl size=28895 sha256=81fee465a120cf3028514512088aebc13182dbd824cd90a92731a6052ff6f7e2
  Stored in directory: /root/.cache/pip/wheels/7e/da/86/65cba22446ae2ef148de2079907264ef27feecfb7f51a45e0d
Successfully built ta
Installing collected packages: ta
Successfully installed ta-0.8.0


  import pandas.util.testing as tm


[?25l[K     |▎                               | 10 kB 23.6 MB/s eta 0:00:01[K     |▋                               | 20 kB 28.2 MB/s eta 0:00:01[K     |▉                               | 30 kB 32.9 MB/s eta 0:00:01[K     |█▏                              | 40 kB 23.6 MB/s eta 0:00:01[K     |█▌                              | 51 kB 15.3 MB/s eta 0:00:01[K     |█▊                              | 61 kB 14.7 MB/s eta 0:00:01[K     |██                              | 71 kB 15.8 MB/s eta 0:00:01[K     |██▍                             | 81 kB 17.1 MB/s eta 0:00:01[K     |██▋                             | 92 kB 15.0 MB/s eta 0:00:01[K     |███                             | 102 kB 12.8 MB/s eta 0:00:01[K     |███▎                            | 112 kB 12.8 MB/s eta 0:00:01[K     |███▌                            | 122 kB 12.8 MB/s eta 0:00:01[K     |███▉                            | 133 kB 12.8 MB/s eta 0:00:01[K     |████▏                           | 143 kB 12.8 MB/s eta 0:

## データの読み込み

In [None]:
# Load the price, company and submission frames through the project helper
# (presumably the data prepared by CreateInitialData.ipynb -- confirm in src).
price_df2, company_df2, submission_df = read_initial_data()

In [None]:
# Fix the random seeds; SEED is also read by build_model and train_evaluate.
SEED = 42
seed_everything(SEED)

# Apply the project's pandas options (helper defined outside this notebook).
set_pandas_options()

## 特徴量追加

In [None]:
def create_features(df, group_col, val_col1, val_col2, prefix):
  """Assemble the technical-indicator features for ``df``.

  Each indicator is computed by a project helper inside a ``Timer`` block
  and joined, in order, onto an initially empty frame that shares the index
  of ``df``.

  Args:
    df: Source frame containing ``val_col1``, ``val_col2`` and, when given,
      ``group_col``.
    group_col: Column to group by before computing indicators. When falsy,
      a plain copy of ``df`` is handed to the helpers instead of a groupby
      object, and the KST oscillator is skipped.
    val_col1: Column passed to the lag, rolling and EMA-gap helpers.
    val_col2: Column passed to the HV, kurtosis and KST helpers.
    prefix: Forwarded to every helper and prepended to the Timer labels.

  Returns:
    A DataFrame indexed like ``df`` holding the joined helper outputs.

  Note:
    Further indicators (Return, Skew, Var, MIN/MAX, Up/Down, ATR, RSI, MACD,
    Bollinger Bands, Stochastic Oscillator, Parabolic SAR) are disabled in
    this notebook and are not computed.
  """
  features = pd.DataFrame(index=df.index)
  if group_col:
    grouped = df.groupby(group_col)
  else:
    grouped = df.copy()

  # Lag features
  with Timer(f'{prefix}ラグ特徴量'):
    features = features.join(create_lags(grouped, val_col1, [1, 2, 3, 4], prefix))

  # Simple moving average and standard deviation
  with Timer(f'{prefix}単純移動平均'):
    features = features.join(create_rolls(grouped, val_col1, [1], [52], prefix))

  # EMA gap
  with Timer(f'{prefix}EMA_GAP'):
    features = features.join(
      create_ema_gap(df, grouped, group_col, val_col1, [13, 26], prefix))

  # HV
  with Timer(f'{prefix}HV'):
    features = features.join(
      create_hv(df, grouped, group_col, val_col2, [13, 26], prefix))

  # Kurtosis
  with Timer(f'{prefix}Kurt'):
    features = features.join(create_kurt(grouped, val_col2, [13, 26], prefix))

  # The KST oscillator is only computed for grouped input.
  if not group_col:
    return features

  with Timer(f'{prefix}KST Oscillator'):
    features = features.join(create_kst(grouped, val_col2, prefix))

  return features


def add_feature(df):
  """Return a copy of ``df`` extended with the model's input features.

  Steps, in order:
    1. Calendar columns derived from ``Date``.
    2. M-estimate target encoding of Sector/Industry/List1/List2 against
       ``y_diff_norm`` (the raw ``List1`` value is kept as ``List``).
    3. Per-``id`` technical features from ``create_features``.
    4. A market-average series per ``Date`` with its own features, merged on
       ``Date``. With ``replace_columns`` empty, every market column is
       dropped again, so none reaches the returned frame.
    5. PCA and K-Means columns from the project helpers.
    6. Removal of selected columns and missing-value imputation.

  Args:
    df: Price frame with at least Date, id, y, y_prev, y_diff_norm, IPOyear,
      Sector, Industry, List1 and List2 columns.

  Returns:
    A new DataFrame; ``df`` itself is not modified.
  """
  res = df.copy()

  # Calendar features
  res['Year'] = res.Date.dt.year
  res['Month'] = res.Date.dt.month
  res['Day'] = res.Date.dt.day
  res['WeekOfYear'] = res.Date.dt.isocalendar().week.astype(int)

  # Target encoding
  # NOTE(review): the encoder is fitted on every row of `res`, not only on
  # training rows -- confirm this is intended.
  target_columns = ['Sector', 'Industry', 'List1', 'List2']
  ms = ce.MEstimateEncoder(cols=target_columns, m=1.0)
  ms_df = ms.fit_transform(res[target_columns], res.y_diff_norm)
  res['List'] = res.List1
  # Replace the columns outright instead of `res.loc[:, cols] = ...`: recent
  # pandas sets `.loc[:, cols]` in place, which would leave the encoded
  # floats stored in object-dtype columns.
  res[target_columns] = ms_df[target_columns]

  # Per-id technical features
  res = res.join(create_features(res, 'id', 'y_diff_norm', 'y_prev', ''))

  # Market average price: mean of the de-logged price per date, logged again
  prefix = 'Market_'
  tmp_market_df = res[['Date']].copy()
  tmp_market_df['y'] = np.expm1(res.y)
  market_df = tmp_market_df.groupby('Date').mean().apply(np.log1p).reset_index()
  # Previous value of the market series; the first row is back-filled.
  market_df['y_prev'] = market_df['y'].shift(1).bfill()
  market_df['y_diff'] = market_df['y'] - market_df['y_prev']
  # NOTE(review): despite its name this is the std of 'y', not of 'y_diff'.
  # It has no effect on the result while all market columns are dropped.
  market_df['y_diff_std'] = market_df['y'].std()
  market_df['y_diff_norm'] = market_df['y_diff'] / market_df['y_diff_std']

  market_columns = [prefix + c if c != 'Date' else c for c in market_df.columns]
  market_df.columns = market_columns

  # Market-level features, merged back onto every row by date
  market_df = market_df.join(create_features(market_df, None, 'Market_y_diff_norm', 'Market_y_prev', prefix))
  res = res.merge(market_df, on='Date')
  market_columns = market_df.drop('Date', axis=1).columns.values.tolist()

  # Columns to express relative to the market. Currently empty; candidates
  # are lag_*, rmean_*, rstd_*, EMA_GAP_*, Return_*, HV_*, Skew_*, Kurt_*
  # and Var_*.
  replace_columns = []

  # Per-id feature minus the matching market feature
  replace_columns_market = [prefix + c for c in replace_columns]
  feature_from_market = pd.DataFrame(index=res.index)
  for c1, c2 in zip(replace_columns, replace_columns_market):
    feature_from_market[f'{c1}_from_Market'] = res[c1] - res[c2]
  res = res.drop(replace_columns+market_columns, axis=1).join(feature_from_market).join(res[market_columns])

  # Drop the raw market columns
  drop_columns = []
  drop_columns += market_columns
  res.drop(drop_columns, axis=1, inplace=True, errors='ignore')

  # PCA
  with Timer('PCA'):
    res = res.join(pca(res, 2))

  # K-Means
  with Timer('K-Means'):
    res = res.join(k_means(res, 3))

  # Drop features that are not used; names that are absent are ignored
  drop_columns = []
  drop_columns += ['EMA_GAP_26', 'PCA1', 'd', 'HV_26']
  res.drop(drop_columns, axis=1, errors='ignore', inplace=True)

  # Missing-value imputation. Results are assigned back explicitly because
  # chained `inplace=True` fills are deprecated and do nothing under pandas
  # Copy-on-Write; `fillna(method='bfill')` is deprecated in favour of bfill().
  res['IPOyear'] = res['IPOyear'].fillna(1970)
  fill_columns = [c for c in res.columns.values.tolist() if any(s in c for s in ['lag', 'rmean', 'rstd'])]
  if fill_columns:
    # Back-fill runs down the whole frame in row order, not per id.
    res[fill_columns] = res[fill_columns].bfill()

  return res


# Build the feature frame from the loaded prices; the bare expression on the
# last line makes the notebook display the result.
price_df3 = add_feature(price_df2)
price_df3

2021-12-14 14:00:34 ラグ特徴量 実行時間: 0.15分
2021-12-14 14:00:39 単純移動平均 実行時間: 0.08分
2021-12-14 14:00:43 EMA_GAP 実行時間: 0.08分
2021-12-14 14:00:49 HV 実行時間: 0.10分
2021-12-14 14:00:51 Kurt 実行時間: 0.02分
2021-12-14 14:01:49 KST Oscillator 実行時間: 0.97分
2021-12-14 14:01:49 Market_ラグ特徴量 実行時間: 0.00分
2021-12-14 14:01:49 Market_単純移動平均 実行時間: 0.00分
2021-12-14 14:01:49 Market_EMA_GAP 実行時間: 0.00分
2021-12-14 14:01:49 Market_HV 実行時間: 0.00分
2021-12-14 14:01:49 Market_Kurt 実行時間: 0.00分
2021-12-14 14:01:57 PCA 実行時間: 0.11分
2021-12-14 14:02:16 K-Means 実行時間: 0.32分


Unnamed: 0,Date,id,y,y_prev,y_diff,y_diff_std,y_diff_norm,IPOyear,Sector,Industry,List1,List2,Year,Month,Day,WeekOfYear,List,lag_1,lag_2,lag_3,lag_4,rmean_1_52,rstd_1_52,EMA_GAP_13,HV_13,Kurt_13,Kurt_26,KST,KST_Diff,KST_Sig,PCA,PCA2,ClusterId
0,2011-11-13,VGSH,4.042036,4.042036,0.0,0.017516,0.0,1970.0,0.007648,0.007648,0.003676,0.003665,2011,11,13,45,NASDAQ,0.000000,0.000000,0.000000,0.000000,0.005307,0.037524,0.000000,0.000000,0.000000,0.000000,-5.495105,0.000000,-5.495105,-0.077640,1.878460,1
1,2011-11-13,JEF,2.925793,2.925793,0.0,0.141216,0.0,1970.0,0.006327,0.003095,0.005265,0.005255,2011,11,13,45,NYSE,0.000000,0.000000,0.000000,0.000000,0.005307,0.037524,0.000000,0.000000,0.000000,0.000000,-33.662716,0.000000,-33.662716,-0.560283,1.963261,1
2,2011-11-13,IVZ,2.692657,2.692657,0.0,0.215237,0.0,1970.0,0.006327,0.005636,0.005265,0.005255,2011,11,13,45,NYSE,0.000000,0.000000,0.000000,0.000000,0.005307,0.037524,0.000000,0.000000,0.000000,0.000000,-156.800124,0.000000,-156.800124,-0.781531,1.763871,1
3,2011-11-13,KTCC,1.752672,1.752672,0.0,0.207799,0.0,1983.0,0.003404,0.003106,0.003676,0.003665,2011,11,13,45,NASDAQ,0.000000,0.000000,0.000000,0.000000,0.005307,0.037524,0.000000,0.000000,0.000000,0.000000,-212.206689,0.000000,-212.206689,0.240175,1.420523,1
4,2011-11-13,FBZ,2.695899,2.695899,0.0,0.224060,0.0,1970.0,0.007648,0.007648,0.003676,0.003665,2011,11,13,45,NASDAQ,0.000000,0.000000,0.000000,0.000000,0.005307,0.037524,0.000000,0.000000,0.000000,0.000000,73.153084,0.000000,73.153084,0.010766,2.020492,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1376755,2019-11-24,TYG,,2.884177,,0.141783,,2004.0,0.007648,0.007648,0.005265,0.005255,2019,11,24,47,NYSE,-0.087223,-0.051072,-0.207735,0.031837,-0.018841,0.221444,0.114073,0.019387,-0.267399,-1.262845,-38.823243,-15.940639,-22.882604,-0.783213,1.783268,1
1376756,2019-11-24,VIRC,,1.543298,,0.264580,,1970.0,0.005102,0.003960,0.003676,0.003665,2019,11,24,47,NASDAQ,-0.196693,-0.120716,0.059876,0.037910,-0.010175,0.153233,3.696419,0.122900,-0.233640,-0.782196,-34.104688,-24.206450,-9.898239,0.507918,1.272884,1
1376757,2019-11-24,BIS,,2.754241,,1.087817,,1970.0,0.007648,0.007648,0.003676,0.003665,2019,11,24,47,NASDAQ,-0.059202,-0.017845,-0.009060,-0.071397,-0.005512,0.064397,1.669589,0.014808,1.796140,-0.629806,1.578791,-6.811259,8.390050,-0.046868,1.668727,1
1376758,2019-11-24,WOOD,,4.184112,,0.225542,,1970.0,0.007648,0.007648,0.003676,0.003665,2019,11,24,47,NASDAQ,-0.047620,0.020729,0.060736,0.044388,0.004269,0.127733,-2.333882,0.008173,0.882411,-0.375038,4.120755,11.128808,-7.008053,-0.062332,1.661861,1


## 学習、予測

In [None]:
def build_model(params):
  """Build and compile a fully connected regression network.

  Args:
    params: Mapping of hyper-parameters. Keys read here:
      layers (required): iterable of hidden-layer widths, one Dense layer each.
      dropout_rate: Dropout rate after every hidden layer (default 0).
      kernel_initializer: 'he_uniform' or 'he_normal'; anything else,
        including a missing key, selects Glorot uniform.
      use_batch_norm: add BatchNormalization after each Dense (default False).
      activation: 'ReLU', 'PReLU' or 'Swish'; anything else selects ReLU.
      optimizer: 'RMSprop', 'Adam' or 'AMSGrad' (Adam with amsgrad=True);
        anything else, including a missing key, selects Adam.
      lr: learning rate or schedule passed to the optimizer; the optimizer
        default is used when absent.

  Returns:
    A compiled ``tf.keras.Sequential`` with a single linear output unit,
    mean-squared-error loss and 'mse'/'msle' metrics.
  """
  tf.random.set_seed(SEED)

  dropout_rate = params.get('dropout_rate', 0)
  activation = params.get('activation', 'ReLU')
  use_batch_norm = params.get('use_batch_norm', False)
  optimizer_name = params.get('optimizer', 'Adam')

  initializer_classes = {
    'he_uniform': tf.keras.initializers.HeUniform,
    'he_normal': tf.keras.initializers.HeNormal,
  }
  initializer_cls = initializer_classes.get(
    params.get('kernel_initializer'), tf.keras.initializers.GlorotUniform)

  def make_initializer(layer_index):
    # One initializer per layer, each with its own seed. Recent Keras
    # releases document that a seeded initializer returns the same values on
    # every call, so sharing a single instance would start all equally
    # shaped layers from identical weights.
    return initializer_cls(seed=SEED + layer_index)

  def activation_layer(name):
    if name == 'PReLU':
      return layers.PReLU()
    if name == 'Swish':
      return layers.Activation(activations.swish)
    return layers.ReLU()

  hidden_units = list(params['layers'])
  model = tf.keras.Sequential()
  for i, unit in enumerate(hidden_units):
    # Layer order per hidden block: Dense -> (BatchNorm) -> Dropout -> activation
    model.add(layers.Dense(unit, kernel_initializer=make_initializer(i)))
    if use_batch_norm:
      model.add(layers.BatchNormalization())
    model.add(layers.Dropout(dropout_rate, seed=SEED))
    model.add(activation_layer(activation))
  model.add(layers.Dense(1, kernel_initializer=make_initializer(len(hidden_units))))

  optimizer_cls = tf.keras.optimizers.RMSprop if optimizer_name == 'RMSprop' else tf.keras.optimizers.Adam
  optimizer_kwargs = {}
  if 'lr' in params:
    optimizer_kwargs['learning_rate'] = params['lr']
  if optimizer_name == 'AMSGrad':
    # Configure AMSGrad through the constructor rather than by patching the
    # attribute on an already-built optimizer.
    optimizer_kwargs['amsgrad'] = True
  optimizer = optimizer_cls(**optimizer_kwargs)

  model.compile(loss='mean_squared_error',
                optimizer=optimizer,
                metrics=['mse', 'msle'])

  return model

In [None]:
def train_evaluate(price_df, submission_df, test_size=0.01):
  """Train the MLP with 5-fold grouped CV and fill in the submission frame.

  Rows dated before ``Config.submission_date`` form the training pool and
  rows on that date are predicted. Only ids listed on NASDAQ or AMEX are
  trained on. Folds are split by id, so one id never appears in both the
  training and validation part of a fold. The submission prediction is the
  mean over the fold models.

  Args:
    price_df: Feature frame produced by ``add_feature``.
    submission_df: Frame that receives the prediction columns. It is
      modified in place and also returned. Values are assigned by row
      position/index, so it is assumed to be ordered like the
      submission-date rows of ``price_df`` -- TODO confirm against
      ``read_initial_data``.
    test_size: Fraction of NASDAQ/AMEX ids held out of training entirely.
      ``0`` keeps every NASDAQ/AMEX row.

  Returns:
    Tuple ``(model, submission_df)`` where ``model`` is the model of the
    last fold only.
  """
  # Split into training rows and rows to predict
  train_df = price_df.loc[price_df.Date < Config.submission_date].reset_index(drop=True)
  submit_df = price_df.loc[price_df.Date == Config.submission_date].reset_index(drop=True)

  in_target_market = train_df.List.isin(['NASDAQ', 'AMEX'])
  if test_size == 0:
    train_df = train_df.loc[in_target_market, :]
  else:
    unique_id = train_df.loc[in_target_market, :].id.unique()
    # The held-out ids are simply excluded; they are not evaluated here.
    train_id, _ = train_test_split(unique_id, test_size=test_size, random_state=SEED)
    train_df = train_df.loc[train_df.id.isin(train_id), :]

  # Targets, identifiers and raw market columns are not model inputs
  useless_cols = [
    'y', 'y_prev', 'y_diff', 'y_diff_std', 'y_diff_norm', 'Date', 'id',
    'Market_y', 'Market_y_prev', 'Market_y_diff', 'Market_y_diff_std', 'Market_y_diff_norm', 'List'
  ]
  usable_cols = train_df.columns[~train_df.columns.isin(useless_cols)]
  target_col = 'y_diff_norm'

  x_train = train_df[usable_cols]
  y_train = train_df[target_col]
  x_submit = submit_df[usable_cols]

  # Hyper-parameters
  params = defaultdict(None)
  params['NUM_ROUND'] = 30
  params['layers'] = (128, 128, 128, 128)
  params['kernel_initializer'] = 'glorot_uniform'
  params['use_batch_norm'] = True
  params['dropout_rate'] = 0.5
  params['activation'] = 'ReLU'
  params['BATCH_SIZE'] = 32
  # NOTE(review): computed from the full training set although each fold
  # fits on a subset; with equal initial and maximal rates below the
  # schedule has identical bounds -- confirm this is intended.
  steps_per_epoch = len(x_train) // params['BATCH_SIZE']
  params['lr'] = tfa.optimizers.CyclicalLearningRate(
      initial_learning_rate=0.001,
      maximal_learning_rate=0.001,
      scale_fn=lambda x: 1/(2.**(x-1)),
      step_size=2 * steps_per_epoch
  )
  params['optimizer'] = 'AMSGrad'
  # NOTE(review): monitors the training 'mse' although validation data is
  # passed to fit(); 'val_mse' may have been intended.
  es = tf.keras.callbacks.EarlyStopping(monitor='mse', patience=2, mode='min')

  # Training: one model per fold, each predicting the submission rows
  y_submit = []

  unique_id = train_df.id.unique()
  kf = KFold(n_splits=5, shuffle=True, random_state=SEED)
  for fold, (tr_group_idx, va_group_idx) in enumerate(kf.split(unique_id)):
    print(f'fold: {fold+1} start.')

    tr_groups, va_groups = unique_id[tr_group_idx], unique_id[va_group_idx]
    tr_idx = train_df.id.isin(tr_groups)
    vl_idx = train_df.id.isin(va_groups)

    x_tr_fold = x_train[tr_idx]
    y_tr_fold = y_train[tr_idx]
    x_vl_fold = x_train[vl_idx]
    y_vl_fold = y_train[vl_idx]

    model = build_model(params)
    model.fit(
      x_tr_fold,
      y_tr_fold,
      epochs=params['NUM_ROUND'],
      validation_data=(x_vl_fold, y_vl_fold),
      shuffle=True,
      callbacks=[es],
      verbose=1,
      batch_size=params['BATCH_SIZE'],
    )

    y_submit.append(model.predict(x_submit))

  # Average the fold predictions once; the result is the normalised diff
  pred_y_diff_norm = np.mean(y_submit, axis=0).flatten()

  # Build the submission data
  submission_df['y'] = submit_df['y']
  submission_df['y_prev'] = submit_df['y_prev']
  submission_df['y_diff_std'] = submit_df['y_diff_std']
  submission_df['y_diff_norm'] = submit_df['y_diff_norm']
  # Undo the normalisation: diff * std + previous value
  submission_df['pred_y'] = pred_y_diff_norm * submit_df['y_diff_std'].values + submit_df['y_prev'].values
  # Invert the log1p transform and clip negative values to zero
  submission_df['exp_pred_y'] = np.expm1(submission_df['pred_y'])
  submission_df.loc[submission_df['exp_pred_y'] < 0, 'exp_pred_y'] = 0
  submission_df['pred_y_diff_norm'] = pred_y_diff_norm
  submission_df['List'] = submit_df['List']
  submission_df['List1'] = submit_df['List1']

  return model, submission_df


# Run training and prediction; `submission_df` is rebound to the frame with
# the prediction columns added, `model` is the model of the last fold.
model, submission_df = train_evaluate(price_df3, submission_df)

## submissionファイル作成

In [None]:
# Keep only the id and the de-logged prediction, renamed to the column name
# 'y' used in the output file.
submission_df2 = submission_df[['id', 'exp_pred_y']]
submission_df2.columns = ['id', 'y']

In [None]:
# Write the predictions, without the index, into the intermediate directory.
submission_df2.to_csv(f'{Config.intermediate_dir_name}/submission_mlp_NASDAQ.csv', index=False)