In [None]:
# imports
%matplotlib inline
import pandas as pd
from lightgbm import LGBMRegressor
import xgboost as xgb
import optuna.integration.lightgbm as lgb
import optuna
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from sklearn.metrics import mean_squared_error
import jpx_tokyo_market_prediction

# Use seaborn's "notebook" plotting context for any figures drawn below.
sns.set_context("notebook")

In [None]:
# load stock price train and test dataset
stock_list = pd.read_csv("../input/jpx-tokyo-stock-exchange-prediction/stock_list.csv")
prices = pd.read_csv("../input/jpx-tokyo-stock-exchange-prediction/train_files/stock_prices.csv")
financials = pd.read_csv("../input/jpx-tokyo-stock-exchange-prediction/train_files/financials.csv")
options = pd.read_csv("../input/jpx-tokyo-stock-exchange-prediction/train_files/options.csv")
sprices = pd.read_csv("../input/jpx-tokyo-stock-exchange-prediction/train_files/secondary_stock_prices.csv")
supplemental_prices = pd.read_csv("../input/jpx-tokyo-stock-exchange-prediction/supplemental_files/stock_prices.csv")
supplemental_sprices = pd.read_csv("../input/jpx-tokyo-stock-exchange-prediction/supplemental_files/secondary_stock_prices.csv")
testprices = pd.read_csv("../input/jpx-tokyo-stock-exchange-prediction/example_test_files/sample_submission.csv")
teststockprices = pd.read_csv("../input/jpx-tokyo-stock-exchange-prediction/example_test_files/stock_prices.csv")

# Stack the primary, secondary and supplemental price files into one frame.
# A single pd.concat replaces the chained DataFrame.append calls: append is
# deprecated (and removed in pandas 2.0) and re-copied the frame on each call.
prices = pd.concat(
    [prices, sprices, supplemental_prices, supplemental_sprices],
    ignore_index=True,
)
# RowId and ExpectedDividend are not used as model inputs; dropping them before
# dropna() also keeps their missing values from discarding rows.
prices = prices.drop(['RowId', 'ExpectedDividend'], axis=1)
prices = prices.dropna()
# Encode the boolean flag as 0/1 so every remaining feature column is numeric.
prices['SupervisionFlag'] = prices['SupervisionFlag'].map({True: 1, False: 0})

# df_test = df_test.drop(['RowId','ExpectedDividend'],axis=1)
# df_test = df_test.dropna()
# df_test['ExpectedDividend'] = df_test['ExpectedDividend'].fillna(0)
# df_test['SupervisionFlag'] = df_test['SupervisionFlag'].map({True: 1, False: 0})

In [None]:
# Parse the date strings, then split chronologically at 2022-01-01:
# xprices holds the rows dated before the cutoff, yprices the rows on/after it.
prices["Date"] = pd.to_datetime(prices["Date"], infer_datetime_format=True)
xprices = prices.loc[prices["Date"].lt('2022-01-01')]
yprices = prices.loc[prices["Date"].ge('2022-01-01')]

In [None]:
# Show the latest and earliest dates present in the combined price data.
print(prices.Date.max(), prices.Date.min())

In [None]:
def featuring(train):
    """Return a copy of ``train`` with engineered features added and ``Date`` removed.

    Added columns, in this order (the order is part of the contract because
    the fitted model sees columns by name and position):

    * ``delta``  -- ``High - Low``
    * ``change`` -- ``Close - Open``
    * ``RA_20``, ``RA_40``, ``RA_60``, ``RA_80``, ``RA_100`` -- trailing mean of
      ``Close`` per ``SecuritiesCode`` in date order, over 5, 10, 15, 20 and 30
      rows respectively (``min_periods=1``, so early rows use a shorter window)
    * ``year``, ``month``, ``day`` -- calendar parts of ``Date`` as floats

    Parameters
    ----------
    train : pandas.DataFrame
        Must contain ``Date`` (datetime64), ``SecuritiesCode``, ``Open``,
        ``High``, ``Low`` and ``Close``. It is not modified.

    Returns
    -------
    pandas.DataFrame
        Same rows, row order and index as ``train``.
    """
    # Output column -> rolling window length in rows. The numeric suffixes do
    # not match the window lengths; the names are kept as-is because models
    # fitted on this function's output depend on them.
    windows = {'RA_20': 5, 'RA_40': 10, 'RA_60': 15, 'RA_80': 20, 'RA_100': 30}

    # Work on a positionally indexed copy: features are aligned back by row
    # position, which stays correct even if the caller's index has duplicate
    # labels and avoids merging on (Date, SecuritiesCode), where duplicated
    # keys would multiply rows.
    feats = train.reset_index(drop=True)

    feats['delta'] = (feats['High'] - feats['Low']).astype(float)
    feats['change'] = (feats['Close'] - feats['Open']).astype(float)

    # One grouped pass per window instead of concatenating a frame per security.
    # mergesort is stable, so rows sharing a date keep their input order.
    closes_by_code = (
        feats.sort_values(by=['Date'], ascending=True, kind='mergesort')
        .groupby('SecuritiesCode', sort=False)['Close']
    )
    for column, window in windows.items():
        rolled = closes_by_code.rolling(window, min_periods=1).mean()
        # Drop the group-key level so the values align with feats by row label.
        feats[column] = rolled.droplevel(0)

    feats['year'] = feats['Date'].dt.year.astype(float)
    feats['month'] = feats['Date'].dt.month.astype(float)
    feats['day'] = feats['Date'].dt.day.astype(float)

    # Hand the caller's index back and drop the raw timestamp column.
    feats.index = train.index
    return feats.drop(['Date'], axis=1)

In [None]:
y_train = xprices.pop("Target")
y_val = yprices.pop("Target")
X_train = featuring(xprices)
X_val = featuring(yprices)

In [None]:
# https://xgboost.readthedocs.io/en/stable/parameter.html
def objective(trial, X_train=X_train, y_train=y_train, X_val=X_val, y_val=y_val):
    """Optuna objective: fit an XGBRegressor with sampled settings and score it.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial used to sample the hyper-parameters.
    X_train, y_train : training features and targets (bound to the notebook's
        globals when this cell is executed).
    X_val, y_val : hold-out features and targets, used both for early stopping
        and for the returned score.

    Returns
    -------
    float
        RMSE of the fitted model on ``(X_val, y_val)``; lower is better.
    """
    param = {
        # Fixed, not sampled: train on the GPU to speed up each trial. Because it is
        # not suggested through `trial`, it does not appear in `trial.params`.
        'tree_method': 'gpu_hist',
        # suggest_float(..., log=True) is the supported replacement for the
        # deprecated suggest_loguniform and samples from the same distribution.
        'lambda': trial.suggest_float('lambda', 1e-3, 10.0, log=True),
        'alpha': trial.suggest_float('alpha', 1e-3, 10.0, log=True),
        'colsample_bytree': trial.suggest_categorical('colsample_bytree', [0.3,0.4,0.5,0.6,0.7,0.8,0.9, 1.0]),
        'subsample': trial.suggest_categorical('subsample', [0.4,0.5,0.6,0.7,0.8,0.9,1.0]),
        'learning_rate': trial.suggest_categorical('learning_rate', [0.008,0.01,0.012,0.014,0.016,0.018, 0.02]),
        'n_estimators': trial.suggest_categorical('n_estimators', [100, 400, 700, 800, 1000]),
        'max_depth': trial.suggest_categorical('max_depth', [5,7,9,11,13,15,17]),
        # Single-choice categoricals: constant values that are still recorded in
        # `trial.params`, so they carry over when the best params are reused.
        'random_state': trial.suggest_categorical('random_state', [2020]),
        'min_child_weight': trial.suggest_int('min_child_weight', low=1, high=300),
        'max_bin': trial.suggest_categorical('max_bin', [128,256,512]),
        'grow_policy': trial.suggest_categorical('grow_policy', ['depthwise', 'lossguide']),
        'gamma': trial.suggest_categorical('gamma', [0, 1e-4, 1e-3]),
        'missing': trial.suggest_categorical('missing', [-999])
    }

    model = xgb.XGBRegressor(**param)
    # NOTE(review): newer xgboost releases expect early_stopping_rounds on the
    # estimator constructor rather than fit() -- confirm against the installed version.
    model.fit(X_train, y_train, eval_set=[(X_val, y_val)],early_stopping_rounds=100, verbose=False)
    preds = model.predict(X_val)
    # Take the root explicitly instead of passing squared=False, which recent
    # scikit-learn releases deprecate and then remove.
    rmse = float(np.sqrt(mean_squared_error(y_val, preds)))

    return rmse

In [None]:
# Seed the sampler so the hyper-parameter search proposes the same trials on
# every Restart & Run All (TPE is Optuna's default single-objective sampler).
study = optuna.create_study(
    direction='minimize',
    sampler=optuna.samplers.TPESampler(seed=2020),
)
study.optimize(objective, n_trials=10)

best_trial = study.best_trial.params
# `objective` hard-codes tree_method='gpu_hist' instead of suggesting it, so it is
# absent from best_trial; pass it explicitly so the final model is trained with the
# same tree method the other parameters (max_bin, grow_policy, ...) were tuned for.
model = xgb.XGBRegressor(tree_method='gpu_hist', **best_trial)
model.fit(X_train, y_train, early_stopping_rounds=100, eval_set=[(X_val, y_val)], verbose=1)

In [None]:
# Create the competition environment and the iterator that the submission loop
# consumes; each item it yields is unpacked there as a 6-tuple of frames.
# NOTE(review): presumably make_env() may only be called once per kernel session -- confirm.
env = jpx_tokyo_market_prediction.make_env()
iter_test = env.iter_test()

In [None]:
# test loop ala submission
for (df_test, options, financials, trades, secondary_prices, df_pred) in iter_test:
    # Apply the same column drops / encodings that were applied to the training prices.
    df_test = df_test.drop(['RowId', 'ExpectedDividend'], axis=1)
    df_test['SupervisionFlag'] = df_test['SupervisionFlag'].map({True: 1, False: 0})
    df_test["Date"] = pd.to_datetime(df_test['Date'], infer_datetime_format=True)

    # NOTE(review): featuring() only sees the rows of this batch. If a batch holds a
    # single date per security (TODO confirm), every RA_* rolling mean collapses to
    # that day's Close, unlike the multi-day averages the model was trained on.
    x_test = featuring(df_test)
    y_pred = model.predict(x_test)

    # Attach predictions by SecuritiesCode rather than by row position, so the
    # ranking stays correct even if df_pred and df_test are ordered differently.
    # featuring() keeps df_test's row order, so y_pred lines up with its codes.
    pred_by_code = pd.Series(y_pred, index=df_test['SecuritiesCode'].to_numpy())
    pred_by_code = pred_by_code[~pred_by_code.index.duplicated(keep='last')]
    df_pred['Target'] = df_pred['SecuritiesCode'].map(pred_by_code)

    # Rank 0 goes to the highest predicted target; codes without a prediction
    # (NaN) sort last and therefore receive the worst ranks.
    df_pred = df_pred.sort_values(by="Target", ascending=False)
    df_pred['Rank'] = np.arange(len(df_pred.index))
    df_pred = df_pred.sort_values(by="SecuritiesCode", ascending=True)
    # Selecting the three submission columns already leaves Target out; the earlier
    # df_pred.drop(["Target"], axis=1) call discarded its result and was a no-op.
    submission = df_pred[["Date", "SecuritiesCode", "Rank"]]
    env.predict(submission)