In [None]:
import pandas as pd
import os
import matplotlib.pyplot as plt


class DataExtractor:
    """Load the train and test price CSVs and z-score them with training statistics.

    Both files go through the same pipeline (see ``__getRaw``): rows older than
    a cut-off date are dropped, missing values are replaced by 0 and a numeric
    ``Timestamp`` column (seconds since 1970-01-01) is derived from ``Date``.

    The selected feature columns and the ``Target`` column are then
    standardised with the mean and population standard deviation (``ddof=0``)
    of the *training* file only; the test file reuses those statistics.

    Attributes:
        meansX: per-column mean of the training features.
        stdsX: per-column scale used to normalise the features. This is the
            training standard deviation, except that a value of 0 is replaced
            by 1 so that constant columns normalise to 0 instead of NaN/inf.
        meansY: mean of the training targets.
        stdsY: scale used to normalise the targets (same zero guard as stdsX).
    """

    __trainDataFile = '../input/jpx-tokyo-stock-exchange-prediction/train_files/stock_prices.csv'
    __testDataFile = '../input/jpx-tokyo-stock-exchange-prediction/supplemental_files/stock_prices.csv'
    __normalizedTrainDf: pd.DataFrame
    __normalizedTrainTargetsSr: pd.Series
    __normalizedTestDf: pd.DataFrame
    __normalizedTestTargetsSr: pd.Series
    meansX: pd.Series
    stdsX: pd.Series
    meansY: float
    stdsY: float

    def __init__(self, trainColumns, trainDataFile=None, testDataFile=None, startDate="2021-10-01"):
        """Read both CSV files and compute the normalised train/test sets.

        Args:
            trainColumns: names of the feature columns to keep; may include
                the derived ``'Timestamp'`` column.
            trainDataFile: path of the training CSV; ``None`` selects the
                class default path.
            testDataFile: path of the test CSV; ``None`` selects the class
                default path.
            startDate: rows whose ``Date`` string compares lower than this
                value are discarded; ``None`` keeps every row.
        """
        trainFile = self.__trainDataFile if trainDataFile is None else trainDataFile
        testFile = self.__testDataFile if testDataFile is None else testDataFile

        rawTrainDf, trainTargets = self.__getRaw(trainFile, trainColumns, startDate)
        rawTestDf, testTargets = self.__getRaw(testFile, trainColumns, startDate)

        # Feature statistics come from the training file only.
        self.meansX = rawTrainDf.mean(axis=0)
        self.stdsX = self.__safeScale(rawTrainDf.std(axis=0, ddof=0))
        self.__normalizedTrainDf = (rawTrainDf - self.meansX) / self.stdsX
        self.__normalizedTestDf = (rawTestDf - self.meansX) / self.stdsX

        # Same treatment for the targets.
        self.meansY = trainTargets.mean()
        self.stdsY = self.__safeScale(trainTargets.std(ddof=0))
        self.__normalizedTrainTargetsSr = (trainTargets - self.meansY) / self.stdsY
        self.__normalizedTestTargetsSr = (testTargets - self.meansY) / self.stdsY

    def getTrainX(self):
        """Return the normalised training features."""
        return self.__normalizedTrainDf

    def getTrainY(self):
        """Return the normalised training targets."""
        return self.__normalizedTrainTargetsSr

    def getTestX(self):
        """Return the test features, normalised with the training statistics."""
        return self.__normalizedTestDf

    def getTestY(self):
        """Return the test targets, normalised with the training statistics."""
        return self.__normalizedTestTargetsSr

    @staticmethod
    def __safeScale(scale):
        """Replace a zero standard deviation by 1 (works for a Series or a scalar).

        Dividing by 0 would turn a constant column into NaN/inf; with a scale
        of 1 the centred column simply becomes 0.
        """
        if isinstance(scale, pd.Series):
            return scale.mask(scale == 0, 1.0)
        return 1.0 if scale == 0 else scale

    def __getRaw(self, dataFile, trainColumns, startDate="2021-10-01"):
        """Read one CSV file and return ``(features, targets)`` before normalisation.

        Args:
            dataFile: path of the CSV file; its first column becomes the
                initial index and it must contain ``Date`` and ``Target``.
            trainColumns: feature columns to return.
            startDate: lower bound for the ``Date`` column, or ``None``.

        Returns:
            A tuple ``(rawDf, targets)``: the selected feature columns and the
            ``Target`` column, both indexed by the derived ``Timestamp``.
        """
        rawDf = pd.read_csv(dataFile, index_col=0)
        if startDate is not None:
            # Plain string comparison on the Date column, as stored in the CSV.
            rawDf = rawDf[rawDf['Date'] >= startDate]
        rawDf = rawDf.fillna(0)

        # Vectorised conversion to seconds since the Unix epoch. A date that
        # cannot be parsed becomes NaT and therefore a NaN timestamp.
        parsedDates = pd.to_datetime(rawDf['Date'], errors='coerce')
        timestamps = (parsedDates - pd.Timestamp("1970-01-01")) / pd.Timedelta(seconds=1)

        # Keep Timestamp both as a column (it can be a feature) and as the index.
        rawDf = rawDf.assign(Timestamp=timestamps).set_index('Timestamp', drop=False)
        targets = rawDf['Target']

        return rawDf[trainColumns], targets


In [None]:
import pandas as pd
import numpy as np

# Keep printed DataFrames compact: at most 20 columns/rows, 4 decimals,
# and lines up to 200 characters wide.
for _optionName, _optionValue in {
    'display.max_columns': 20,
    'display.max_rows': 20,
    'display.precision': 4,
    'display.width': 200,
}.items():
    pd.set_option(_optionName, _optionValue)

# NumPy arrays: 75-character lines, floats printed with 3 significant digits.
np.set_printoptions(
    linewidth=75,
    formatter={'float': lambda value: "%.3g" % value},
)

In [None]:
from lightgbm import LGBMRegressor
from sklearn.metrics import mean_absolute_error

# Build the normalised train/test splits from the selected feature columns.
featureColumns = ['SecuritiesCode', 'Open', 'High', 'Low', 'Close', 'Volume', 'Timestamp']
dataExtractor = DataExtractor(featureColumns)

trainX, trainY = dataExtractor.getTrainX(), dataExtractor.getTrainY()
testX, testY = dataExtractor.getTestX(), dataExtractor.getTestY()

# Print the four splits (train X/Y, then test X/Y) for a quick visual check.
for split in (trainX, trainY, testX, testY):
    print(split)

# Gradient-boosted trees; hyper-parameters are gathered in one place.
lgbmParams = dict(
    n_estimators=500,
    max_depth=500,
    num_leaves=400,
    learning_rate=0.000001,
    boosting_type='gbdt',
)
regressor = LGBMRegressor(**lgbmParams)
regressor.fit(trainX, trainY)

# Mean absolute error on the test split, measured on the normalised targets.
predict = regressor.predict(testX)
mae = mean_absolute_error(testY, predict)
print(f'LGBM MAE:  {mae:.12f}')

In [None]:
import lightgbm
import jpx_tokyo_market_prediction
import pandas as pd
import numpy as np

# `regressor` and `dataExtractor` are the objects created in the training cell.
# To predict from a saved model instead, load it with:
# regressor = lightgbm.Booster(model_file='../input/LGBM-models-2/model')

trainColumns = ['SecuritiesCode', 'Open', 'High', 'Low', 'Close', 'Volume', 'Timestamp']

# Loop-invariant values are resolved once, before the environment is created,
# so a missing `dataExtractor` fails here rather than in the middle of the loop.
meansTrain = dataExtractor.meansX
stdsTrain = dataExtractor.stdsX
unixEpoch = pd.Timestamp("1970-01-01")
oneSecond = pd.Timedelta(seconds=1)

env = jpx_tokyo_market_prediction.make_env()
iter_test = env.iter_test()

for (prices, options, financials, trades, secondary_prices, sample_prediction) in iter_test:
    # Same preparation as the training data: fill gaps with 0 and derive a
    # numeric Timestamp (seconds since the Unix epoch) from the Date column.
    rawTrainDf = prices.fillna(0)
    parsedDates = pd.to_datetime(rawTrainDf['Date'], errors='coerce')
    rawTrainDf = rawTrainDf.assign(Timestamp=(parsedDates - unixEpoch) / oneSecond)
    rawTrainDf.index = rawTrainDf['Timestamp']
    rawTrainDf = rawTrainDf[trainColumns]

    # Normalise with the statistics of the training set.
    normalizedTrainDf = (rawTrainDf - meansTrain) / stdsTrain

    predicts = regressor.predict(normalizedTrainDf)

    # Predictions are attached by position.
    # NOTE(review): assumes `prices` and `sample_prediction` list the
    # securities in the same order - confirm against the competition API.
    sample_prediction['Target'] = predicts

    # Highest predicted target gets rank 0. The rank range follows the actual
    # number of rows instead of a hard-coded 2000.
    sample_prediction = sample_prediction.sort_values(by="Target", ascending=False)
    sample_prediction['Rank'] = np.arange(len(sample_prediction))
    sample_prediction = sample_prediction.sort_values(by="SecuritiesCode", ascending=True)
    sample_prediction = sample_prediction[["Date", "SecuritiesCode", "Rank"]]

    print(sample_prediction)
    env.predict(sample_prediction)