In [None]:
from pandas import DataFrame
from pandas import Series
import pandas as pd
import numpy as np


class DataPreparer:
    """Turn a frame of prices into row-over-row relative changes and back.

    ``prepareData`` converts ``structuredData`` to percentage changes and,
    when ``normalize`` is enabled, standardises every column with statistics
    taken from the training rows only. ``restoreData`` reverses the process
    for a single predicted row.
    """
    normalizedData: DataFrame
    differentiatedData: DataFrame
    structuredData: DataFrame
    stds: Series
    means: Series

    def __init__(self, structuredData: DataFrame, trainMaxIndex, normalize: bool = False):
        """Store the input frame and the preparation settings.

        :param structuredData: frame with one column per series.
        :param trainMaxIndex: number of leading rows treated as training data;
            only used to compute the statistics when ``normalize`` is True.
        :param normalize: when True the percentage changes are standardised
            per column. Defaults to False, which keeps the raw changes.
        """
        self.structuredData = structuredData
        self.trainMaxIndex = trainMaxIndex
        self.normalize = normalize
        # Stay empty until a normalising prepareData() run fills them in.
        self.stds = Series(dtype='float64')
        self.means = Series(dtype='float64')

    def prepareData(self):
        """Run differentiation and (optional) normalisation.

        :return: the prepared frame, also stored in ``normalizedData``.
        """
        self.__differentiateData()
        self.__normalizeData()

        return self.normalizedData

    def restoreData(self, predicts: Series, previousRow: Series):
        """Convert predicted relative changes back to absolute values.

        :param predicts: predicted (prepared) values, indexed by series label.
        :param previousRow: absolute values of the preceding row; must contain
            every label of ``predicts`` (a missing label raises ``KeyError``).
        :return: Series indexed like ``predicts`` holding
            ``previous * (change + 1)``.
        """
        labels = predicts.index
        deviation = predicts.to_numpy()
        # Statistics only exist after a normalising prepareData() run, so the
        # scaling is undone exactly when it was applied.
        if not self.stds.empty:
            deviation = deviation * self.stds.loc[labels].to_numpy() + self.means.loc[labels].to_numpy()

        previousValues = previousRow.loc[labels].to_numpy()
        return Series(previousValues * (deviation + 1), index=labels)

    def __differentiateData(self):
        """Store row-over-row percentage changes; the first row and gaps become 0."""
        self.differentiatedData = self.structuredData.pct_change().fillna(0)

    def __normalizeData(self):
        """Standardise the differentiated data when normalisation is enabled."""
        if not self.normalize:
            self.normalizedData = self.differentiatedData.copy()
            return

        # Both statistics come from the training rows so that nothing from
        # the later rows leaks into the scaling.
        trainRows = self.differentiatedData.iloc[:self.trainMaxIndex]
        mean = trainRows.mean(axis=0)
        std = trainRows.std(axis=0, ddof=0)
        # A column that never moves in training has std 0; dividing by it
        # would produce inf/NaN, so leave such columns unscaled.
        std = std.where(std != 0, 1.0)

        self.normalizedData = (self.differentiatedData - mean) / std
        self.means = mean
        self.stds = std

In [None]:
import pandas as pd
from pandas import DataFrame
import os.path


class DataExtractor:
    """Read the competition price files and reshape them for modelling."""
    rawExtData: DataFrame
    mainDataFile = '/kaggle/input/jpx-tokyo-stock-exchange-prediction/train_files/stock_prices.csv'
    extDataFile = '/kaggle/input/jpx-tokyo-stock-exchange-prediction/supplemental_files/stock_prices.csv'
    structuredDataFile = 'data_close.csv'

    def getRawExt(self):
        """Return the supplemental price file as-is, indexed by its first column."""
        return pd.read_csv(self.extDataFile, index_col=0)

    def getStructuredData(self):
        """Build a Date x SecuritiesCode frame of ``Close`` values.

        The main file and the supplemental file are stacked, then every
        security's closes are re-indexed by ``Date`` and placed side by side.

        :return: DataFrame with one column per security code (in order of
            first appearance) and one row per date.
        """
        sources = (self.mainDataFile, self.extDataFile)
        combined = pd.concat([pd.read_csv(path, index_col=0) for path in sources], axis=0)

        codes = combined['SecuritiesCode']
        closeByCode = {
            code: combined[codes == code].set_index('Date')['Close']
            for code in codes.unique()
        }

        return DataFrame(closeByCode)

In [None]:
from pandas import DataFrame
import numpy as np


class DataGenerator:
    """Produce sliding-window samples from a prepared time-series frame.

    Rows ``[0, trainMaxIndex]`` feed training, the next ``countTests`` rows
    feed validation, and everything after that feeds the test split. A sample
    is ``lookBack`` consecutive rows; its target is the row that follows.
    """

    def __init__(self, preparedData: DataFrame, lookBack: int, batchSize: int, trainMaxIndex: int, countTests: int):
        """
        :param preparedData: frame with one row per time step.
        :param lookBack: number of rows in every input window.
        :param batchSize: number of samples per generated batch.
        :param trainMaxIndex: upper row bound of the training range.
        :param countTests: number of rows reserved for validation.
        """
        self.preparedData = preparedData
        self.lookBack = lookBack
        self.batchSize = batchSize
        self.trainMaxIndex = trainMaxIndex
        self.countTests = countTests

    def getTrainArray(self):
        """Return ``(X, y)`` arrays drawn from the training range."""
        trainSteps, _, _ = self.getStepsCounts()
        return self.__getArray(self.getTrainGen(), trainSteps)

    def getValArray(self):
        """Return ``(X, y)`` arrays drawn from the validation range."""
        _, valSteps, _ = self.getStepsCounts()
        return self.__getArray(self.getValGen(), valSteps)

    def getTestArray(self):
        """Return ``(X, y)`` arrays drawn from the test range."""
        _, _, testSteps = self.getStepsCounts()
        # Must read from the test generator; reading from the validation
        # generator here would return validation rows under a test label.
        return self.__getArray(self.getTestGen(), testSteps)

    def __getArray(self, generator, countSteps):
        """Pull ``countSteps`` batches from ``generator`` and stack them.

        :return: ``X`` of shape (samples, lookBack, features) and ``y`` of
            shape (samples, features). Only rows the generator actually
            yielded are included, so a short batch adds no zero-filled rows.
        """
        featureCount = self.preparedData.shape[-1]
        sampleBatches = []
        targetBatches = []
        for _ in range(countSteps):
            samples, targets = next(generator)
            sampleBatches.append(samples)
            targetBatches.append(targets)

        if not sampleBatches:
            # No steps requested: return correctly shaped empty arrays.
            return np.zeros((0, self.lookBack, featureCount)), np.zeros((0, featureCount))

        return np.concatenate(sampleBatches), np.concatenate(targetBatches)

    def getTrainGen(self):
        """Sequential generator over rows ``0 .. trainMaxIndex``."""
        return self.getGenerator(0, self.trainMaxIndex, False)

    def getValGen(self):
        """Sequential generator over the ``countTests`` rows after training."""
        minIndex = self.trainMaxIndex + 1
        maxIndex = self.trainMaxIndex + self.countTests
        return self.getGenerator(minIndex, maxIndex, False)

    def getTestGen(self):
        """Sequential generator over the rows after the validation range."""
        minIndex = self.trainMaxIndex + self.countTests + 1
        return self.getGenerator(minIndex, None, False)

    def getStepsCounts(self):
        """Return the number of batches per split as ``(train, val, test)``."""
        trainSteps = (self.trainMaxIndex - self.lookBack) // self.batchSize
        valSteps = (self.countTests - self.lookBack) // self.batchSize
        testSteps = (len(self.preparedData) - self.trainMaxIndex - self.countTests - self.lookBack) // self.batchSize

        return trainSteps, valSteps, testSteps

    def getGenerator(self, minIndex, maxIndex, shuffle):
        """Endlessly yield ``(samples, targets)`` batches from a row range.

        :param minIndex: first row that may appear in a window.
        :param maxIndex: exclusive upper bound for target rows; ``None`` means
            ``len(data) - 1``.
        :param shuffle: draw target rows at random instead of sequentially.
        """
        data = self.preparedData
        lookBack = self.lookBack
        batchSize = self.batchSize
        # One conversion up front instead of a DataFrame lookup per sample.
        values = data.to_numpy()
        if maxIndex is None:
            maxIndex = len(data) - 1
        i = minIndex + lookBack
        while 1:
            if shuffle:
                rows = np.random.randint(minIndex + lookBack, maxIndex, size=batchSize)
            else:
                # Wrap to the start once a full batch no longer fits.
                if i + batchSize >= maxIndex:
                    i = minIndex + lookBack
                rows = np.arange(i, min(i + batchSize, maxIndex))
                i += len(rows)

            samples = np.zeros((len(rows), lookBack, data.shape[-1]))
            targets = np.zeros((len(rows), data.shape[-1]))

            for j, row in enumerate(rows):
                samples[j] = values[row - lookBack:row]
                targets[j] = values[row]

            yield samples, targets

In [None]:
import numpy as np
from pandas import Series


class NaivePredictor:
    """Baseline that always predicts a zero change for every series."""
    stds: Series
    means: Series

    def __init__(self, valSteps, valGen, stds, means):
        """
        :param valSteps: number of batches to draw from ``valGen``.
        :param valGen: generator yielding ``(samples, targets)`` batches.
        :param stds: stored but not used by ``evaluate``.
        :param means: stored but not used by ``evaluate``.
        """
        self.valSteps, self.valGen = valSteps, valGen
        self.stds, self.means = stds, means

    def evaluate(self):
        """Return the mean over batches of the all-zero prediction's MAE."""

        def zeroPredictionMae(targets):
            # One zero per series, broadcast against every row of the batch.
            baseline = np.zeros(targets.shape[-1])
            return np.mean(np.abs(baseline - targets))

        batchMaes = [zeroPredictionMae(next(self.valGen)[1]) for _ in range(self.valSteps)]

        return np.mean(batchMaes)


In [None]:
from tensorflow.keras.models import Sequential
from tensorflow.keras import layers
from tensorflow.keras.optimizers import RMSprop
from tensorflow.keras import callbacks
import datetime


class GruPredictorArray:
    """Single-layer GRU regressor trained on in-memory arrays."""

    def makeModelAndFit(self, floatData, X, y, X_val, y_val, numEpochs, batchSize, units=4, checkpointPath='trained.h5'):
        """Build, compile and fit the model, checkpointing the best epoch.

        :param floatData: only ``floatData.shape[-1]`` is read; it sets both
            the input feature count and the width of the output layer.
        :param X: training inputs passed to ``model.fit``.
        :param y: training targets passed to ``model.fit``.
        :param X_val: validation inputs.
        :param y_val: validation targets.
        :param numEpochs: number of training epochs.
        :param batchSize: mini-batch size used by ``model.fit``.
        :param units: number of GRU units (default 4).
        :param checkpointPath: file that receives the weights of the epoch
            with the lowest ``val_loss`` (default ``'trained.h5'``).
        :return: the ``History`` object returned by ``model.fit``.
        """
        featureCount = floatData.shape[-1]

        model = Sequential()
        model.add(layers.GRU(units, input_shape=(None, featureCount)))
        model.add(layers.Dense(featureCount))

        callbacksList = [
            callbacks.ModelCheckpoint(filepath=checkpointPath, monitor='val_loss', save_best_only=True)
        ]

        optimizer = RMSprop()
        # NOTE(review): 'acc' is kept so the history keys stay unchanged, but
        # accuracy looks uninformative with an MAE loss - confirm and drop.
        model.compile(optimizer=optimizer, loss='mae', metrics=['acc'])
        # summary() prints the table itself and returns None, so it is not
        # wrapped in print().
        model.summary()

        history = model.fit(X, y, epochs=numEpochs, batch_size=batchSize, callbacks=callbacksList, validation_data=(X_val, y_val))

        return history

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from tensorflow.keras.models import load_model

# Compact console output for pandas frames.
for optionName, optionValue in (
    ('display.max_columns', 20),
    ('display.max_rows', 20),
    ('display.precision', 4),
    ('display.width', 200),
):
    pd.set_option(optionName, optionValue)

# Print numpy floats with three significant digits on 75-character lines.
np.set_printoptions(linewidth=75, formatter={'float': lambda value: "%.3g" % value})

In [None]:
# Split configuration shared by the cells below:
#   trainMaxIndex - upper row bound of the training range
#   countTests    - number of rows after it that DataGenerator uses for validation
#   lookBack      - number of past rows in every model input window
trainMaxIndex = 1100
countTests = 158
lookBack = 60
# Close prices reshaped to one row per date and one column per security.
dataExtractor = DataExtractor()
structuredData = dataExtractor.getStructuredData()

# Convert the prices into row-over-row percentage changes.
dataPreparer = DataPreparer(structuredData, trainMaxIndex)
preparedData = dataPreparer.prepareData()


In [None]:
# Materialise the training and validation windows as arrays for model.fit.
batchSize = 25
dataGenerator = DataGenerator(preparedData=preparedData, lookBack=lookBack, batchSize=batchSize, trainMaxIndex=trainMaxIndex, countTests=countTests)
X, y = dataGenerator.getTrainArray()
X_val, y_val = dataGenerator.getValArray()

In [None]:
# Baseline to beat: MAE of predicting a zero change on the validation batches.
valGen = dataGenerator.getValGen()
trainSteps, valSteps, testSteps = dataGenerator.getStepsCounts()
predictor = NaivePredictor(valSteps, valGen, dataPreparer.stds, dataPreparer.means)
maeNaive = predictor.evaluate()
print(maeNaive)

In [None]:
# Train the GRU model; `predictor` is rebound from the naive baseline to the GRU wrapper here.
numEpochs = 10
predictor = GruPredictorArray()
history = predictor.makeModelAndFit(preparedData, X, y, X_val, y_val, numEpochs, batchSize)

In [None]:
# Per-epoch losses recorded by model.fit.
loss = history.history['loss']
valLoss = history.history['val_loss']
epochs = range(1, len(loss) + 1)

# Explicit figure/axes so the plot carries its own labels when skimmed.
fig, ax = plt.subplots()
ax.plot(epochs, loss, 'bo', label='Training Loss')
ax.plot(epochs, valLoss, 'b', label='Validation Loss')
ax.set_title('Training and validation loss')
ax.set_xlabel('Epoch')
ax.set_ylabel('Loss (MAE)')
ax.legend()

# Compare the best validation epoch with the zero-change baseline.
print('\n')
print(f'MAE naive: {maeNaive:.4f}')
print(f'MAE Sequential: {min(valLoss):.4f}')

In [None]:
# Predict the next-day change for every date of the supplemental file, rank
# the securities per date (0 = largest predicted rise) and write the result
# to submission2.csv.
rawExtData = dataExtractor.getRawExt()
model = load_model('trained.h5')

dates = rawExtData['Date'].unique()
if len(dates) == 0:
    raise ValueError('The supplemental price file contains no dates')

# Row position of every date in the prepared frame; get_indexer marks a date
# that is absent with -1, which the history check below also rejects.
endRows = preparedData.index.get_indexer(dates)
tooEarly = endRows < lookBack
if tooEarly.any():
    raise ValueError(
        f'Dates missing from preparedData or with fewer than {lookBack} rows of history: {list(dates[tooEarly])}'
    )

# One predict() call over all windows instead of one call per date.
preparedValues = preparedData.to_numpy()
inputBatch = np.stack([preparedValues[endRow - lookBack:endRow] for endRow in endRows])
predictedRows = model.predict(inputBatch)

deltaRows = []
rankRows = []
submissionParts = []

for currDate, endRow, predictedRow in zip(dates, endRows, predictedRows):
    previousRow = structuredData.iloc[endRow - 1]
    lastRow = pd.Series(predictedRow, index=previousRow.index)

    restoredRow = dataPreparer.restoreData(lastRow, previousRow)
    delta = (restoredRow - previousRow) / previousRow

    # Largest delta gets rank 0. Securities without a usable delta (NaN) are
    # ranked last, which is where sorting would have placed them.
    rankRow = delta.rank(method='first', ascending=False, na_option='bottom').astype(int) - 1

    deltaRows.append(delta.to_numpy())
    rankRows.append(rankRow.to_numpy())
    submissionParts.append(
        pd.DataFrame({'Date': currDate, 'SecuritiesCode': rankRow.index, 'Rank': rankRow.to_numpy()})
    )

# Assemble every frame once; growing a DataFrame with concat inside the loop
# is quadratic in the number of rows.
allDeltas = pd.DataFrame(np.vstack(deltaRows), index=dates, columns=structuredData.columns)
allRanks = pd.DataFrame(np.vstack(rankRows), index=dates, columns=structuredData.columns)

print(allDeltas)
print(allRanks)

resultDf = pd.concat(submissionParts, ignore_index=True)

print(resultDf)
resultDf.to_csv('submission2.csv', index=False)

In [None]:
import pandas as pd
# Read the ranks back from the CSV written by the inference cell.
predictsDf = pd.read_csv('submission2.csv')
print(predictsDf)

In [None]:
import jpx_tokyo_market_prediction
env = jpx_tokyo_market_prediction.make_env()   # initialize the environment
iter_test = env.iter_test()    # an iterator which loops over the test files
# NOTE(review): the same precomputed predictsDf (all dates from submission2.csv)
# is registered on every iteration, and neither `prices` nor `sample_prediction`
# is used. Presumably env.predict expects ranks for the rows of the current
# `sample_prediction` only - confirm against the competition API before submitting.
for (prices, options, financials, trades, secondary_prices, sample_prediction) in iter_test:
#     sample_prediction_df['Rank'] = predictsDf['Rank']  # make your predictions here
    env.predict(predictsDf)   # register your predictions
