In [None]:
# talib install
!pip install ../input/talib-source/talib_binary-0.4.19-cp37-cp37m-manylinux1_x86_64.whl
import talib as ta

In [None]:
import numpy as np
import pandas as pd
import jpx_tokyo_market_prediction
from lightgbm import LGBMRegressor
import optuna.integration.lightgbm as lgb
import matplotlib.pyplot as plt
import talib
import warnings

# Widen pandas' console/notebook rendering so wide feature frames are shown
# without truncation.
pd.options.display.max_rows = 500
pd.options.display.max_columns = 500
pd.options.display.width = 1000
# Disable the SettingWithCopy check for the whole session.
pd.set_option("mode.chained_assignment", None)

# Silence every warning category for the rest of the notebook.
warnings.filterwarnings(action="ignore")

# Columns taken directly from the price frame (plus the derived DateInt).
_fit_columns_base = [
    "DateInt",
    "SecuritiesCode",
    "High",
    "Open",
    "Close",
    "Low",
    "Volume",
]

# Technical-indicator columns, grouped by indicator family.
# The order matters: it defines the feature order seen by the model.
# 'TEMA' and 'TRIX' are intentionally left out.
_fit_columns_ta = [
    # overlap studies
    "BBANDS_upperband", "BBANDS_middleband", "BBANDS_lowerband",
    "DEMA", "EMA", "HT_TRENDLINE", "KAMA", "MA",
    "MIDPOINT", "SMA", "T3", "TRIMA", "WMA",
    # momentum
    "ADX", "ADXR", "APO",
    "AROON_aroondown", "AROON_aroonup", "AROONOSC",
    "BOP", "CCI", "DX",
    "MACD_macd", "MACD_macdsignal", "MACD_macdhist",
    "MFI", "MINUS_DI", "MINUS_DM", "MOM", "PLUS_DI", "PLUS_DM", "RSI",
    "STOCH_slowk", "STOCH_slowd",
    "STOCHF_fastk", "STOCHF_fastd",
    "STOCHRSI_fastk", "STOCHRSI_fastd",
    "ULTOSC", "WILLR",
    # volume
    "AD", "ADOSC", "OBV",
    # volatility
    "ATR", "NATR", "TRANGE",
    # cycle
    "HT_DCPERIOD", "HT_DCPHASE",
    "HT_PHASOR_inphase", "HT_PHASOR_quadrature",
    "HT_SINE_sine", "HT_SINE_leadsine",
    "HT_TRENDMODE",
    # statistics
    "BETA", "CORREL",
    "LINEARREG", "LINEARREG_ANGLE", "LINEARREG_INTERCEPT", "LINEARREG_SLOPE",
    "STDDEV",
]

# Full, ordered feature list used for both fitting and prediction.
fit_columns = [*_fit_columns_base, *_fit_columns_ta]

def add_technical(df):
    """Return a copy of ``df`` with TA-Lib indicator columns appended.

    The indicators are computed from the ``Open``, ``High``, ``Low``,
    ``Close`` and ``Volume`` columns. Overlap studies (moving averages,
    Bollinger bands, ...) are stored as an offset from the bar mid-price
    ``(High + Low) / 2``; ``LINEARREG`` and ``LINEARREG_INTERCEPT`` are stored
    as an offset from ``Close``. All other indicators are stored as returned
    by TA-Lib.

    The input frame is not modified: the work is done on a copy, because this
    function is used as a ``groupby(...).apply`` callback and pandas does not
    support callbacks that mutate the group they are handed.

    NOTE(review): the indicators are rolling/cumulative, so this presumably
    expects the rows of a single security in chronological order — confirm
    against the callers' grouping and sort order.

    Parameters
    ----------
    df : pandas.DataFrame
        Price rows containing ``Open``, ``High``, ``Low``, ``Close`` and
        ``Volume``.

    Returns
    -------
    pandas.DataFrame
        A new frame with the same index as ``df`` plus the indicator columns.
    """
    # Work on a private copy so the caller's (or groupby's) frame is untouched.
    df = df.copy()

    op = df['Open']
    hi = df['High']
    lo = df['Low']
    cl = df['Close']
    volume = df['Volume']
    # Bar mid-price, used to turn price-level overlays into relative offsets.
    hilo = (hi + lo) / 2

    # --- overlap studies (stored relative to the mid-price) ---
    df['BBANDS_upperband'], df['BBANDS_middleband'], df['BBANDS_lowerband'] = ta.BBANDS(cl, timeperiod=5, nbdevup=2, nbdevdn=2, matype=0)
    df['BBANDS_upperband'] -= hilo
    df['BBANDS_middleband'] -= hilo
    df['BBANDS_lowerband'] -= hilo
    df['DEMA'] = ta.DEMA(cl, timeperiod=30) - hilo
    df['EMA'] = ta.EMA(cl, timeperiod=30) - hilo
    df['HT_TRENDLINE'] = ta.HT_TRENDLINE(cl) - hilo
    df['KAMA'] = ta.KAMA(cl, timeperiod=30) - hilo
    df['MA'] = ta.MA(cl, timeperiod=30, matype=0) - hilo
    df['MIDPOINT'] = ta.MIDPOINT(cl, timeperiod=14) - hilo
    df['SMA'] = ta.SMA(cl, timeperiod=30) - hilo
    df['T3'] = ta.T3(cl, timeperiod=5, vfactor=0) - hilo
    # TEMA is deliberately disabled:
    # df['TEMA'] = ta.TEMA(cl, timeperiod=30) - hilo
    df['TRIMA'] = ta.TRIMA(cl, timeperiod=30) - hilo
    df['WMA'] = ta.WMA(cl, timeperiod=30) - hilo

    # --- momentum ---
    df['ADX'] = ta.ADX(hi, lo, cl, timeperiod=14)
    df['ADXR'] = ta.ADXR(hi, lo, cl, timeperiod=14)
    df['APO'] = ta.APO(cl, fastperiod=12, slowperiod=26, matype=0)
    df['AROON_aroondown'], df['AROON_aroonup'] = ta.AROON(hi, lo, timeperiod=14)
    df['AROONOSC'] = ta.AROONOSC(hi, lo, timeperiod=14)
    df['BOP'] = ta.BOP(op, hi, lo, cl)
    df['CCI'] = ta.CCI(hi, lo, cl, timeperiod=14)
    df['DX'] = ta.DX(hi, lo, cl, timeperiod=14)
    df['MACD_macd'], df['MACD_macdsignal'], df['MACD_macdhist'] = ta.MACD(cl, fastperiod=12, slowperiod=26, signalperiod=9)
    # MACDEXT and MACDFIX are skipped.
    df['MFI'] = ta.MFI(hi, lo, cl, volume, timeperiod=14)
    df['MINUS_DI'] = ta.MINUS_DI(hi, lo, cl, timeperiod=14)
    df['MINUS_DM'] = ta.MINUS_DM(hi, lo, timeperiod=14)
    df['MOM'] = ta.MOM(cl, timeperiod=10)
    df['PLUS_DI'] = ta.PLUS_DI(hi, lo, cl, timeperiod=14)
    df['PLUS_DM'] = ta.PLUS_DM(hi, lo, timeperiod=14)
    df['RSI'] = ta.RSI(cl, timeperiod=14)
    df['STOCH_slowk'], df['STOCH_slowd'] = ta.STOCH(hi, lo, cl, fastk_period=5, slowk_period=3, slowk_matype=0, slowd_period=3, slowd_matype=0)
    df['STOCHF_fastk'], df['STOCHF_fastd'] = ta.STOCHF(hi, lo, cl, fastk_period=5, fastd_period=3, fastd_matype=0)
    df['STOCHRSI_fastk'], df['STOCHRSI_fastd'] = ta.STOCHRSI(cl, timeperiod=14, fastk_period=5, fastd_period=3, fastd_matype=0)
    # TRIX is deliberately disabled:
    # df['TRIX'] = ta.TRIX(cl, timeperiod=30)
    df['ULTOSC'] = ta.ULTOSC(hi, lo, cl, timeperiod1=7, timeperiod2=14, timeperiod3=28)
    df['WILLR'] = ta.WILLR(hi, lo, cl, timeperiod=14)

    # --- volume ---
    df['AD'] = ta.AD(hi, lo, cl, volume)
    df['ADOSC'] = ta.ADOSC(hi, lo, cl, volume, fastperiod=3, slowperiod=10)
    df['OBV'] = ta.OBV(cl, volume)

    # --- volatility ---
    df['ATR'] = ta.ATR(hi, lo, cl, timeperiod=14)
    df['NATR'] = ta.NATR(hi, lo, cl, timeperiod=14)
    df['TRANGE'] = ta.TRANGE(hi, lo, cl)

    # --- cycle ---
    df['HT_DCPERIOD'] = ta.HT_DCPERIOD(cl)
    df['HT_DCPHASE'] = ta.HT_DCPHASE(cl)
    df['HT_PHASOR_inphase'], df['HT_PHASOR_quadrature'] = ta.HT_PHASOR(cl)
    df['HT_SINE_sine'], df['HT_SINE_leadsine'] = ta.HT_SINE(cl)
    df['HT_TRENDMODE'] = ta.HT_TRENDMODE(cl)

    # --- statistics (regression outputs stored relative to Close) ---
    df['BETA'] = ta.BETA(hi, lo, timeperiod=5)
    df['CORREL'] = ta.CORREL(hi, lo, timeperiod=30)
    df['LINEARREG'] = ta.LINEARREG(cl, timeperiod=14) - cl
    df['LINEARREG_ANGLE'] = ta.LINEARREG_ANGLE(cl, timeperiod=14)
    df['LINEARREG_INTERCEPT'] = ta.LINEARREG_INTERCEPT(cl, timeperiod=14) - cl
    df['LINEARREG_SLOPE'] = ta.LINEARREG_SLOPE(cl, timeperiod=14)
    df['STDDEV'] = ta.STDDEV(cl, timeperiod=5, nbdev=1)

    return df


# Load CSV

In [None]:
# Read the training prices, keep only the columns used downstream, and drop
# every row that has a missing value in any of them.
prices = (
    pd.read_csv("../input/jpx-tokyo-stock-exchange-prediction/train_files/stock_prices.csv")
    .loc[:, ['RowId', 'Date', 'SecuritiesCode', 'Open', 'High', 'Low', 'Close', 'Volume', 'Target']]
    .dropna(axis=0)
)
prices

# Add features

In [None]:
# Compute the technical indicators separately for every security, then drop
# the rows that still contain NaN in any column.
# group_keys=False keeps the original row index on every pandas version;
# otherwise newer pandas prepends SecuritiesCode as an extra index level,
# which then clashes with the column of the same name.
prices = (
    prices
    .groupby('SecuritiesCode', group_keys=False)
    .apply(add_technical)
    .dropna(axis=0)
)
print(len(prices))
# Percentage of missing values per column after the dropna above.
display((pd.isna(prices).sum() / len(prices) * 100).to_frame(name='NaN Ratio').T)

# Fit

In [None]:
# Parse the date strings and derive an integer YYYYMMDD column from them.
prices["Date"] = pd.to_datetime(prices["Date"])
prices["DateInt"] = prices["Date"].dt.strftime("%Y%m%d").astype(int)

# Feature matrix, single-column target frame, and the distinct security codes.
X = prices.loc[:, fit_columns]
y = prices.loc[:, ["Target"]]
codes = X["SecuritiesCode"].unique()

In [None]:
# LightGBM regressor with fixed hyper-parameters; fitted on the full feature
# matrix X against the target frame y.
model_o = LGBMRegressor(
    n_estimators=655,
    num_leaves=1263,
    max_bin=95,
    learning_rate=0.6818202991034834,
    random_seed=0,
)
model_o.fit(X, y)

# Predict

**Known issue: the prediction cell below currently raises an error when the notebook is submitted to the leaderboard (LB). A fix is pending.**

In [None]:
# Inference loop: for every test day delivered by the competition API,
# rebuild the indicators on a trailing window of prices, predict, rank the
# securities by predicted value and submit the ranking.
env = jpx_tokyo_market_prediction.make_env()
iter_test = env.iter_test()

# Rolling price history; seeded with the training frame and extended with
# each day's test prices.
past_df = prices.copy()

for i, (_prices, options, financials, trades, secondary_prices, sample_prediction) in enumerate(iter_test):
    current_date = _prices["Date"].iloc[0]
    print(f"current_date: {current_date}")

    if i == 0:
        # Keep only history strictly before the first test day.
        past_df = past_df.loc[past_df["Date"] < current_date]

    # Trim the history to the last 80 business days before the current date.
    threshold = (pd.Timestamp(current_date) - pd.offsets.BDay(80)).strftime("%Y-%m-%d")
    print(f"threshold: {threshold}")
    past_df = past_df.loc[past_df["Date"] >= threshold]

    _prices['Date'] = pd.to_datetime(_prices['Date'])
    _prices['DateInt'] = _prices['Date'].dt.strftime("%Y%m%d").astype(int)

    # drop=True is required: without it every pass inserts the old index as a
    # new column ("index", then "level_0"), and the call raises as soon as
    # that column name already exists.
    past_df = pd.concat([past_df, _prices]).reset_index(drop=True)

    # Recompute all indicators per security on the trailing window.
    # group_keys=False keeps the flat row index regardless of pandas version.
    past_df = past_df.groupby('SecuritiesCode', group_keys=False).apply(add_technical)

    df = past_df.query(f'Date == "{current_date}"')

    # Attach predictions by SecuritiesCode instead of by row position, so a
    # differing row order or row count cannot mis-assign or crash.
    pred_by_code = pd.Series(
        model_o.predict(df[fit_columns]),
        index=df["SecuritiesCode"].to_numpy(),
    )
    pred_by_code = pred_by_code[~pred_by_code.index.duplicated(keep="last")]
    sample_prediction["Prediction"] = sample_prediction["SecuritiesCode"].map(pred_by_code)

    # Highest prediction gets rank 0. The rank range follows the actual number
    # of rows rather than a hard-coded 2000.
    sample_prediction = sample_prediction.sort_values(by="Prediction", ascending=False).drop_duplicates(subset=['SecuritiesCode'])
    sample_prediction["Rank"] = np.arange(len(sample_prediction))

    sample_prediction = sample_prediction.sort_values(by="SecuritiesCode", ascending=True)

    # Selecting the three submission columns also leaves "Prediction" behind.
    submission = sample_prediction[["Date","SecuritiesCode","Rank"]]
    env.predict(submission)