In [None]:
# Install pytorch_forecasting 0.10.1 from a local wheel under ../input
# (no index lookup is requested by this command); stdout is discarded.
!pip install ../input/pfpacgake/pf/pytorch_forecasting-0.10.1-py3-none-any.whl > /dev/null

In [None]:
# Copy the bundled `ta` source tree into the current directory and install it
# from source with setup.py.
!cp -r ../input/tapackage/ta/ .
%cd ./ta/ta-0.10.1/ta-0.10.1
!python setup.py install
# Return to the notebook working directory so later relative paths resolve from there.
%cd /kaggle/working/

In [None]:
import warnings, gc
import numpy as np 
import pandas as pd
import matplotlib.colors
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
from plotly.offline import init_notebook_mode
from datetime import datetime, timedelta
from sklearn.model_selection import TimeSeriesSplit
from sklearn.metrics import mean_squared_error,mean_absolute_error
from lightgbm import LGBMRegressor
from decimal import ROUND_HALF_UP, Decimal

import os
from decimal import ROUND_HALF_UP, Decimal

import pickle
from lightgbm import LGBMRegressor
from tqdm import tqdm

import ta
import matplotlib.pyplot as plt
import matplotlib as mpl

from ta import add_all_ta_features
from ta.utils import dropna

from pytorch_forecasting.data import (
    TimeSeriesDataSet,
    GroupNormalizer
)

from pytorch_forecasting.models.temporal_fusion_transformer.tuning import optimize_hyperparameters

import copy
from pathlib import Path
import warnings

import pytorch_lightning as pl
from pytorch_lightning.callbacks import EarlyStopping, LearningRateMonitor
from pytorch_lightning.loggers import TensorBoardLogger
import torch

from pytorch_forecasting import Baseline, TemporalFusionTransformer, TimeSeriesDataSet
from pytorch_forecasting.data import GroupNormalizer
from pytorch_forecasting.metrics import SMAPE, PoissonLoss, QuantileLoss
from pytorch_forecasting.models.temporal_fusion_transformer.tuning import optimize_hyperparameters
from pytorch_forecasting.data.encoders import NaNLabelEncoder

import warnings
from sklearn.exceptions import DataConversionWarning
warnings.filterwarnings(action='ignore', category=UserWarning)

# matplotlib 3.6 renamed the bundled seaborn style sheets to 'seaborn-v0_8*' and
# later releases no longer accept the bare 'seaborn' name, so use whichever name
# this matplotlib installation actually provides.
_seaborn_style = 'seaborn' if 'seaborn' in mpl.style.available else 'seaborn-v0_8'
mpl.style.use(_seaborn_style)

# Silence every remaining warning category for the rest of the notebook.
warnings.filterwarnings("ignore")
import plotly.figure_factory as ff

In [None]:
def adjust_price(price):
    """Add split-adjusted close prices to a stock price table.

    The input frame is left untouched; all work happens on a copy.

    Args:
        price (pd.DataFrame): stock prices with at least the columns
            "Date" (parseable as %Y-%m-%d), "SecuritiesCode",
            "AdjustmentFactor" and "Close".
    Returns:
        pd.DataFrame: rows sorted by SecuritiesCode then Date, indexed by
            Date (datetime), with two added columns:
            "CumulativeAdjustmentFactor" and "AdjustedClose".
    """
    # Work on a copy so the caller's frame is not modified, and assign the
    # column directly so its dtype really becomes datetime (a full-column
    # `.loc[:, col] = ...` may write in place and keep the old dtype).
    price = price.copy()
    price["Date"] = pd.to_datetime(price["Date"], format="%Y-%m-%d")
    price = price.sort_values(["SecuritiesCode", "Date"]).reset_index(drop=True)

    # CumulativeAdjustmentFactor: product of AdjustmentFactor from the newest
    # row back to (and including) the current row, per security. Walking the
    # sorted frame in reverse gives newest-first order inside every group;
    # assignment re-aligns the result on the index.
    newest_first = price.iloc[::-1]
    price["CumulativeAdjustmentFactor"] = (
        newest_first.groupby("SecuritiesCode", sort=False)["AdjustmentFactor"].cumprod()
    )

    def _round_half_up(value):
        # Round to one decimal place with ROUND_HALF_UP (not banker's rounding).
        return float(
            Decimal(str(value)).quantize(Decimal('0.1'), rounding=ROUND_HALF_UP)
        )

    adjusted = (price["CumulativeAdjustmentFactor"] * price["Close"]).map(_round_half_up)
    # A price of 0 is treated as missing so it can be forward filled.
    adjusted = adjusted.mask(adjusted == 0)
    # Forward fill inside each security only, never across securities.
    price["AdjustedClose"] = adjusted.groupby(price["SecuritiesCode"]).ffill()

    return price.set_index("Date")

In [None]:
def get_features_for_predict(price, code):
    """Build the technical-indicator feature table for one security.

    Args:
        price (pd.DataFrame): price table containing "SecuritiesCode", "Open",
            "High", "Low", "Volume" and "AdjustedClose" columns
        code (int): SecuritiesCode whose rows are selected
    Returns:
        pd.DataFrame: the selected rows plus every column added by
            ta.add_all_ta_features, with NaN and +/-inf replaced by 0 and the
            "AdjustedClose" column removed
    """
    close_col = "AdjustedClose"

    # Copy the rows of this security so the indicator columns are not written
    # onto the caller's frame.
    rows_for_code = price.loc[price["SecuritiesCode"] == code].copy()

    # Indicators are computed on the adjusted close, not the raw close.
    # (For adding only selected indicators instead, see
    # https://github.com/bukosabino/ta/blob/master/examples_to_use/bollinger_band_features_example.py)
    with_indicators = ta.add_all_ta_features(
        rows_for_code, "Open", "High", "Low", close_col, "Volume", fillna=False
    )

    # Missing values first, then infinities, both become 0.
    cleaned = with_indicators.fillna(0).replace([np.inf, -np.inf], 0)

    return cleaned.drop(columns=[close_col])

In [None]:
def get_label(price, code):
    """Extract the label column for a single security.

    Args:
        price (pd.DataFrame): price table with "SecuritiesCode" and "Target"
        code (int): SecuritiesCode whose rows are selected
    Returns:
        pd.DataFrame: columns ["SecuritiesCode", "label"], where "label" is a
            copy of "Target"; the input frame is not modified
    """
    is_code = price["SecuritiesCode"] == code
    labelled = price.loc[is_code].assign(label=lambda rows: rows["Target"])
    return labelled[["SecuritiesCode", "label"]]

In [None]:
# split data into TRAIN and TEST
TRAIN_END = "2019-12-31"
# We put a week gap between TRAIN_END and TEST_START
# to avoid leakage of test data information from label
TEST_START = "2020-01-06"

def get_features_and_label(price, codes, features, train_end=None, test_start=None):
    """Build aligned train/test feature and label sets for the given codes.

    For every code, rows containing NaN are dropped from both the features and
    the labels, the two are restricted to their common index values, and the
    result is split by index label into a train part (up to ``train_end``) and
    a test part (from ``test_start``). Codes with no usable rows are skipped.

    Args:
        price (pd.DataFrame): loaded price data (must provide "SecuritiesCode"
            and "Target", as required by get_label)
        codes (array): target codes
        features (pd.DataFrame): features with a "SecuritiesCode" column and an
            index that supports label slicing with the split dates
        train_end (str, optional): last index label of the train split;
            defaults to the module-level TRAIN_END at call time
        test_start (str, optional): first index label of the test split;
            defaults to the module-level TEST_START at call time
    Returns:
        train_X (pd.DataFrame): training data
        train_y (pd.Series): label for train_X
        test_X (pd.DataFrame): test data
        test_y (pd.Series): label for test_X
    Raises:
        ValueError: if no code in ``codes`` has both features and labels
    """
    # Resolve defaults at call time so later changes to the globals are honoured.
    train_end = TRAIN_END if train_end is None else train_end
    test_start = TEST_START if test_start is None else test_start

    # to store splited data
    trains_X, tests_X = [], []
    trains_y, tests_y = [], []

    # generate feature one by one
    for code in tqdm(codes):
        feats = features[features["SecuritiesCode"] == code].dropna()
        labels = get_label(price, code).dropna()

        if len(feats) == 0 or len(labels) == 0:
            continue

        # keep only the rows present on both sides
        labels = labels.loc[labels.index.isin(feats.index)]
        feats = feats.loc[feats.index.isin(labels.index)]

        assert (labels.loc[:, "SecuritiesCode"] == feats.loc[:, "SecuritiesCode"]).all()
        labels = labels.loc[:, "label"]

        # split data into TRAIN and TEST (label-based, both ends inclusive)
        _train_X = feats.loc[:train_end]
        _test_X = feats.loc[test_start:]

        _train_y = labels.loc[:train_end]
        _test_y = labels.loc[test_start:]

        assert len(_train_X) == len(_train_y)
        assert len(_test_X) == len(_test_y)

        # store features
        trains_X.append(_train_X)
        tests_X.append(_test_X)
        # store labels
        trains_y.append(_train_y)
        tests_y.append(_test_y)

    if not trains_X:
        # pd.concat would raise an opaque "No objects to concatenate" here.
        raise ValueError(
            "No code in `codes` has both features and labels; nothing to split"
        )

    # combine features for each codes
    train_X = pd.concat(trains_X)
    test_X = pd.concat(tests_X)
    # combine label for each codes
    train_y = pd.concat(trains_y)
    test_y = pd.concat(tests_y)

    return train_X, train_y, test_X, test_y

In [None]:
def set_rank(df):
    """Rank rows by their prediction, best first.

    Args:
        df (pd.DataFrame): frame with a "predict" column
    Returns:
        pd.DataFrame: a copy of df ordered by descending "predict" with a
            "Rank" column counting up from 0 in that order
    """
    ranked = df.sort_values(by="predict", ascending=False)
    # Highest prediction gets Rank 0, the next one Rank 1, and so on.
    positions = np.arange(ranked.shape[0])
    ranked.loc[:, "Rank"] = positions
    return ranked

In [None]:
def calc_spread_return_sharpe(df: pd.DataFrame, portfolio_size: int = 200, toprank_weight_ratio: float = 2) -> float:
    """Sharpe ratio of the daily long/short spread return.

    Args:
        df (pd.DataFrame): predicted results with "Date", "Rank" and "Target"
        portfolio_size (int): # of equities to buy/sell
        toprank_weight_ratio (float): the relative weight of the most highly ranked stock compared to the least.
    Returns:
        (float): mean of the daily spread returns divided by their standard deviation
    """
    # Weights fall linearly from toprank_weight_ratio down to 1; they do not
    # depend on the day, so they are built once.
    weights = np.linspace(start=toprank_weight_ratio, stop=1, num=portfolio_size)

    def _spread_for_day(day):
        """Weighted return of the top-ranked names minus that of the bottom-ranked names."""
        ranks = day['Rank']
        assert ranks.min() == 0
        assert ranks.max() == len(ranks) - 1

        best_first = day.sort_values(by='Rank')['Target'].head(portfolio_size)
        worst_first = day.sort_values(by='Rank', ascending=False)['Target'].head(portfolio_size)

        long_leg = (best_first * weights).sum() / weights.mean()
        short_leg = (worst_first * weights).sum() / weights.mean()
        return long_leg - short_leg

    daily_spread = df.groupby('Date').apply(_spread_for_day)
    return daily_spread.mean() / daily_spread.std()

In [None]:
# Load a pickled `features` object from the input dataset.
# NOTE(review): pickle.load can execute arbitrary code while unpickling; only
# load files from a trusted source.
# NOTE(review): no later cell in this view reads the global `features`
# (get_features_and_label takes it as a parameter) -- confirm it is still needed.
with open('../input/sortedfeatures/sorted_features.pkl', 'rb') as f:
  features = pickle.load(f)

In [None]:
# Restore the Temporal Fusion Transformer from a saved checkpoint; `best_tft`
# is the model used for prediction in the submission loop.
best_model_path = '../input/lastmodel/epoch-18-step-134577.ckpt'
best_tft = TemporalFusionTransformer.load_from_checkpoint(best_model_path)

In [None]:
# Load the historical price file and the stock list, then attach six
# per-security metadata columns from the stock list to every price row.
# The merge uses the default join type (inner), so price rows whose
# SecuritiesCode is absent from stock_list are dropped.
df_price = pd.read_csv('../input/jpx-tokyo-stock-exchange-prediction/train_files/stock_prices.csv')
stock_list = pd.read_csv('../input/jpx-tokyo-stock-exchange-prediction/stock_list.csv')
df_price = df_price.merge(stock_list[['SecuritiesCode', 'Section/Products', 'NewMarketSegment', '33SectorName', '17SectorCode', 'NewIndexSeriesSize', 'NewIndexSeriesSizeCode']], on='SecuritiesCode')

In [None]:
# Create the competition environment and its test iterator; the submission
# loop consumes `iter_test` and sends each result back through `env.predict`.
# NOTE(review): presumably the environment can only be created once per kernel
# session -- confirm before re-running this cell.
import jpx_tokyo_market_prediction
env = jpx_tokyo_market_prediction.make_env()
iter_test = env.iter_test()

In [None]:
# Fallback category for each static per-security column. Missing values reach
# the replacement step as 0 because get_features_for_predict fills NaN with 0.
STATIC_CATEGORY_DEFAULTS = {
    '17SectorCode': '10',
    'NewIndexSeriesSizeCode': '7',
    'NewIndexSeriesSize': 'TOPIX Small 2',
    'Section/Products': 'First Section (Domestic)',
    '33SectorName': 'Information & Communication',
    'NewMarketSegment': 'Prime Market',
}
# Only history rows dated strictly after this day seed the rolling price window.
HISTORY_START = '2021-01-01'
# Rows whose time_idx is not greater than this value are dropped before prediction.
MIN_TIME_IDX = 21

counter = 0
for (prices, options, financials, trades, secondary_prices, sample_prediction) in iter_test:
    print(counter)
    current_date = prices["Date"].iloc[0]
    if counter == 0:
        # Seed the window once with the history that precedes the first test day.
        df_price_raw = df_price.loc[(df_price["Date"] < current_date) & (df_price["Date"] > HISTORY_START)]

    # The streamed prices do not carry the stock_list metadata that was merged
    # into df_price. Without this merge the new rows would end up with the
    # generic fallback category for every security.
    missing_static = [col for col in STATIC_CATEGORY_DEFAULTS if col not in prices.columns]
    if missing_static:
        static_info = stock_list[["SecuritiesCode"] + missing_static].drop_duplicates("SecuritiesCode")
        prices = prices.merge(static_info, on="SecuritiesCode", how="left")

    df_price_raw = pd.concat([df_price_raw, prices]).reset_index(drop=True)
    df_price_test = adjust_price(df_price_raw)

    df_price_test['Date'] = df_price_test.index
    # NOTE(review): this index is not a true day count -- e.g. Jan 31 and Feb 1
    # of the same year both give 61. It is kept unchanged because it presumably
    # has to match the time_idx used when the checkpoint was trained; confirm.
    df_price_test["time_idx"] = df_price_test['Date'].apply(lambda x: x.year * 365 + x.month * 30 + x.day)
    df_price_test["time_idx"] -= df_price_test["time_idx"].min()

    # Build the indicator features security by security, in ascending code order.
    codes = sorted(df_price_test["SecuritiesCode"].unique())
    feat = pd.concat([get_features_for_predict(df_price_test, code) for code in tqdm(codes)])

    feat = feat[feat['time_idx'] > MIN_TIME_IDX].reset_index(drop=True)
    for column, default in STATIC_CATEGORY_DEFAULTS.items():
        feat[column] = feat[column].replace(0, default)

    result = best_tft.predict(feat, mode="prediction", return_x=True)

    # One prediction per security for the current day.
    # NOTE(review): this assumes the model returns its predictions in the same
    # order as the current-day rows of `feat` -- confirm against the dataset
    # parameters stored in the checkpoint.
    today = feat.loc[feat["Date"] == current_date, ["SecuritiesCode"]].copy()
    today["pred"] = np.array(result[0])[:, 0]

    # Left-merge so every requested security stays in the submission, and rank
    # over the submission rows themselves: Rank is then always 0..N-1, with
    # securities that received no prediction placed at the bottom.
    sample_prediction = sample_prediction.drop(columns=['Rank']).merge(today, on='SecuritiesCode', how='left')
    sample_prediction["Rank"] = (
        sample_prediction["pred"].rank(method="first", ascending=False, na_option="bottom") - 1
    ).astype(int)
    sample_prediction = sample_prediction.drop(columns=["pred"])
    display(sample_prediction.head())

    assert sample_prediction["Rank"].notna().all()
    assert sample_prediction["Rank"].min() == 0
    assert sample_prediction["Rank"].max() == len(sample_prediction["Rank"]) - 1

    env.predict(sample_prediction)
    counter += 1