In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found
input_file_paths = (
    os.path.join(folder, file_name)
    for folder, _, file_names in os.walk('/kaggle/input')
    for file_name in file_names
)
for input_file_path in input_file_paths:
    print(input_file_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Read the raw daily price file; df_prices keeps every original column
df_prices = pd.read_csv('../input/jpx-tokyo-stock-exchange-prediction/train_files/stock_prices.csv')

# Working copy restricted to the modelling columns, with Date parsed to
# datetime and every row containing a missing value removed
stock_columns = ['AdjustmentFactor', 'Open', 'Close', 'High', 'Low', 'Volume', 'Date', 'SecuritiesCode', 'Target']
df_stocks = (
    df_prices[stock_columns]
    .assign(Date=lambda frame: pd.to_datetime(frame['Date']))
    .dropna()
)

In [None]:
from decimal import ROUND_HALF_UP, Decimal

def adjust_price(price):
    """
    Add split-adjusted close prices to a stock price table.

    For every SecuritiesCode the AdjustmentFactor is accumulated from the most
    recent date backwards (cumulative product) and multiplied with Close. The
    result is rounded half-up to one decimal place; values equal to 0 are
    treated as missing and forward-filled within the same security.

    The input frame is NOT modified: all work happens on a copy.

    Args:
        price (pd.DataFrame): must contain the columns "Date" (datetime or
            "%Y-%m-%d" strings), "SecuritiesCode", "AdjustmentFactor" and
            "Close".
    Returns:
        pd.DataFrame: copy of ``price`` sorted by SecuritiesCode and Date, with
            the extra columns "CumulativeAdjustmentFactor" and "AdjustedClose".
            "Date" is both the index and a regular (datetime) column.
    """
    def round_half_up_to_tenth(value):
        # Decimal avoids binary-float surprises; ROUND_HALF_UP sends ties away from zero
        return float(Decimal(str(value)).quantize(Decimal('0.1'), rounding=ROUND_HALF_UP))

    # Fresh frame with a clean positional index: the caller's data stays
    # untouched and an index that is already named "Date" cannot clash with
    # the "Date" column when sorting.
    price = price.reset_index(drop=True).copy()
    # Plain column assignment really changes the dtype (``.loc[:, col] = ...``
    # writes into the existing column and can leave it as object dtype).
    price["Date"] = pd.to_datetime(price["Date"], format="%Y-%m-%d")
    # groupby ignores rows without a group key, so drop them explicitly
    price = price.dropna(subset=["SecuritiesCode"])

    # Newest-first within each security so the cumulative product runs from
    # the latest date back in time.
    price = price.sort_values(["SecuritiesCode", "Date"], ascending=[True, False])
    price["CumulativeAdjustmentFactor"] = (
        price.groupby("SecuritiesCode")["AdjustmentFactor"].cumprod()
    )
    price["AdjustedClose"] = (
        price["CumulativeAdjustmentFactor"] * price["Close"]
    ).map(round_half_up_to_tenth)

    # Back to chronological order before filling gaps
    price = price.sort_values(["SecuritiesCode", "Date"])
    # to fill AdjustedClose, replace 0 with np.nan
    price.loc[price["AdjustedClose"] == 0, "AdjustedClose"] = np.nan
    # forward fill within each security only, never across securities
    price["AdjustedClose"] = price.groupby("SecuritiesCode")["AdjustedClose"].ffill()

    price = price.reset_index(drop=True)
    return price.set_index("Date", drop=False)

# Compute adjusted closes, overwrite Close with them, and drop the helper
# columns that were only needed for the adjustment.
# NOTE(review): only Close is replaced; Open/High/Low stay unadjusted.
df_stocks = (
    adjust_price(df_stocks)
    .drop(columns=['AdjustmentFactor', 'CumulativeAdjustmentFactor'])
    .assign(Close=lambda frame: frame['AdjustedClose'])
    .drop(columns=['AdjustedClose'])
)

In [None]:
from sklearn.preprocessing import StandardScaler

# Standardise the four price columns and Volume; the fitted scaler `ss`
# is kept so the same transform can be applied to new data later
ss = StandardScaler()
scaled_columns = ['Open', 'Close', 'High', 'Low', 'Volume']
df_stocks[scaled_columns] = ss.fit_transform(df_stocks[scaled_columns])

In [None]:
# Create one data frame per stock.
# A single groupby pass replaces one full boolean scan of df_stocks per code.
# The list keeps the order in which codes first appear in df_prices, and codes
# that have no rows left in df_stocks (e.g. all removed by dropna) are skipped
# instead of producing empty frames that break the later feature steps.
frames_by_code = {code: frame for code, frame in df_stocks.groupby('SecuritiesCode')}
df_stock = [
    frames_by_code[id_val]
    for id_val in df_prices['SecuritiesCode'].unique()
    if id_val in frames_by_code
]

In [None]:
# create list of time step columns
def create_column_list(time_steps, features):
    """
    Build the names of the lagged feature columns.

    Args:
        time_steps (int): number of lags; each feature gets time_steps + 1 columns
        features (list of str): base feature names
    Returns:
        list of str: "<feature>0" ... "<feature><time_steps>" for every
            feature, grouped by feature in the order given
    """
    return [
        f"{feature}{step}"
        for feature in features
        for step in range(time_steps + 1)
    ]


# Shift to create time-series data format dataframe for each stock
def shift_reorder(df_stock, time_steps, features, other_cols):
    """
    Turn each per-stock frame into a lagged "window" layout, in place in the list.

    For every feature the current value is stored in "<feature><time_steps>"
    and the value k rows earlier in "<feature><time_steps - k>"; rows without
    enough history get NaN. Each list entry is replaced by a new frame that
    contains exactly the lag columns followed by ``other_cols``. The frames
    originally stored in the list are not modified.

    Args:
        df_stock (list of pd.DataFrame): one frame per stock, rows in time order
        time_steps (int): number of lagged steps to add per feature
        features (list of str): columns to expand into lag columns
        other_cols (list of str): columns to carry over unchanged
    Returns:
        None
    """
    cols = create_column_list(time_steps, features) + list(other_cols)
    newest_names = {feature: f"{feature}{time_steps}" for feature in features}

    for ii, frame in enumerate(df_stock):
        # rename returns a new frame, so the caller's original stays intact
        frame = frame.rename(columns=newest_names)

        for feature in features:
            # Build each older step from the next newer one, one row further back.
            # np.nan (lowercase): the np.NaN alias no longer exists in NumPy 2.
            for step in range(time_steps - 1, -1, -1):
                frame[f"{feature}{step}"] = frame[f"{feature}{step + 1}"].shift(
                    periods=1, fill_value=np.nan
                )

        # reorder columns
        df_stock[ii] = frame[cols].copy(deep=True)

# Number of lagged steps added per feature; together with the current value
# every feature ends up with time_steps + 1 columns.
time_steps = 8

# create shifted time-series format dataframes
# feats are expanded into lag columns; others are carried over unchanged.
feats = ['Open', 'Close', 'High', 'Low', 'Volume']
others = ['Date', 'SecuritiesCode', 'Target']

# Replaces every entry of df_stock in place; the call itself returns nothing.
shift_reorder(df_stock, time_steps, features=feats, other_cols=others)

In [None]:
# Load the per-security stock list whose sector / section / market-segment
# columns are attached to each stock below.
# Approach adapted from https://www.kaggle.com/code/kellibelcher/jpx-stock-market-analysis-prediction-with-lgbm
df_stock_list = pd.read_csv('../input/jpx-tokyo-stock-exchange-prediction/stock_list.csv')

def add_sl_features(stocks, stock_list):
    """
    Attach stock-list categories to every per-stock frame, in place in the list.

    Each entry of ``stocks`` is replaced by a copy with four extra constant
    columns: SectorName17, SectorName33, SectionProducts and NewMarketSegment,
    looked up through the frame's first SecuritiesCode value. If a code occurs
    several times in ``stock_list``, its first row is used.

    As a side effect ``stock_list`` gains (or refreshes) the normalised columns
    SectorName17, SectorName33 and SectionProducts: trailing whitespace
    stripped, then lower-cased with the first character capitalised. Missing
    names stay missing instead of raising.

    Args:
        stocks (list of pd.DataFrame): per-stock frames with a "SecuritiesCode" column
        stock_list (pd.DataFrame): needs "SecuritiesCode", "17SectorName",
            "33SectorName", "Section/Products" and "NewMarketSegment"
    Returns:
        None
    Raises:
        IndexError: if a frame's SecuritiesCode is not present in stock_list
    """
    normalised_sources = {
        'SectorName17': '17SectorName',
        'SectorName33': '33SectorName',
        'SectionProducts': 'Section/Products',
    }
    for target, source in normalised_sources.items():
        stock_list[target] = stock_list[source].str.rstrip().str.lower().str.capitalize()

    # Column order matters: it fixes the order of the columns added to each frame
    feature_cols = list(normalised_sources) + ['NewMarketSegment']

    # One lookup table for all stocks instead of four full scans per stock
    lookup = (
        stock_list
        .drop_duplicates('SecuritiesCode', keep='first')
        .set_index('SecuritiesCode')[feature_cols]
    )

    for ii, frame in enumerate(stocks):
        if frame.empty:
            # No code to look up: add the columns (empty) so the schema stays uniform
            stocks[ii] = frame.assign(**{col: None for col in feature_cols})
            continue

        code = frame['SecuritiesCode'].iloc[0]
        if code not in lookup.index:
            raise IndexError(f"SecuritiesCode {code!r} not found in stock_list")

        stocks[ii] = frame.assign(**lookup.loc[code].to_dict())

# Replaces each frame in df_stock with a copy carrying the stock-list columns;
# df_stock_list also gains the normalised name columns as a side effect.
add_sl_features(df_stock, df_stock_list)

In [None]:
# Stack the per-stock frames back into one frame for feature building and training
df_stock_full = pd.concat(df_stock)

In [None]:
import datetime as dt

# from https://analyticsindiamag.com/how-to-use-xgboost-for-time-series-analysis/
def create_time_features(dataframe):
    """
    Add calendar columns derived from the 'Date' column, in place.

    Adds, in this order: hour, dayofweek, quarter, month, year, dayofyear and
    dayofmonth. The frame is modified directly and nothing is returned.

    Args:
        dataframe (pd.DataFrame): frame with a datetime 'Date' column
    """
    timestamps = dataframe['Date'].copy()

    # (new column name, attribute of the .dt accessor it is read from)
    calendar_fields = (
        ('hour', 'hour'),
        ('dayofweek', 'dayofweek'),
        ('quarter', 'quarter'),
        ('month', 'month'),
        ('year', 'year'),
        ('dayofyear', 'dayofyear'),
        ('dayofmonth', 'day'),
    )
    for column_name, accessor_attr in calendar_fields:
        dataframe[column_name] = getattr(timestamps.dt, accessor_attr)

# Derive the calendar columns in place, then discard the raw Date column
# now that its information lives in those columns
create_time_features(df_stock_full)
df_stock_full = df_stock_full.drop(columns=['Date'])

In [None]:
# One-hot encode the stock-list categories; missing values get their own
# indicator column and the first level of each category is dropped
categorical_columns = ['SectorName17', 'SectorName33', 'SectionProducts', 'NewMarketSegment']
df_stock_full = pd.get_dummies(
    df_stock_full,
    columns=categorical_columns,
    dummy_na=True,
    drop_first=True,
)

In [None]:
from sklearn.model_selection import train_test_split
import xgboost as xgb

def split_train_target(stocks):
    """
    Separate the 'Target' column from the remaining columns.

    Args:
        stocks (pd.DataFrame): frame containing a 'Target' column
    Returns:
        tuple: (frame without 'Target', single-column frame with a copy of 'Target')
    """
    target_columns = ['Target']

    features_only = stocks.drop(columns=target_columns)
    target_only = stocks.loc[:, target_columns].copy()

    return features_only, target_only

# define training set and target
df_stock_train, df_stock_target = split_train_target(df_stock_full)

# train-test split
# A fixed random_state makes the shuffled split (and everything trained on it)
# reproducible across runs.
# NOTE(review): the rows are time-ordered lag windows, so a shuffled split puts
# overlapping windows from the same period on both sides; consider a
# chronological split if the validation score should reflect unseen dates.
X_train, X_valid, y_train, y_valid = train_test_split(
    df_stock_train,
    df_stock_target,
    test_size=0.2,
    shuffle=True,
    random_state=0,
)

In [None]:
#from scipy.stats import uniform
#from sklearn.model_selection import RandomizedSearchCV


## Searching for regularization parameters
#distributions = {'alpha': uniform(loc=0.1, scale=100),
#                 'gamma': uniform(loc=0.1, scale=20),
#                 'lambda': uniform(loc=0.1, scale=20),
#                 'learning_rate': uniform(loc=0.001, scale=1),
#                 'max_depth': [4, 8, 12, 16, 20],
#                 'eval_metric': ['mae'],
#                 'n_estimators': [25, 50, 75, 100, 125, 150],
#                 'min_child_weight': [5, 10, 15, 20],
#                 'colsample_bytree': uniform(loc=0.1, scale=0.89),
#                 'objective': ['reg:squarederror'],
#                 'subsample': uniform(loc=0.1, scale=0.89)}

#clf = RandomizedSearchCV(estimator=xgb.XGBRegressor(), param_distributions=distributions, n_iter=25, random_state=0)
#search = clf.fit(X_train, y_train)
#print(search.best_params_)

In [None]:
# Wrap both splits in XGBoost's DMatrix container
X_train_D = xgb.DMatrix(X_train, label=y_train)
X_valid_D = xgb.DMatrix(X_valid, label=y_valid)

# Booster settings: squared-error regression trained on GPU 0
# NOTE(review): "rmsle" is the monitored metric although the objective is
# squared error - confirm this is intended.
params = {
    "eval_metric": "rmsle",
    "objective": "reg:squarederror",
    "max_depth": 20,
    "tree_method": "gpu_hist",
    "gpu_id": 0,
    "learning_rate": 0.1,
}

# Both sets are evaluated every round; training runs for at most 250 rounds
# with an early-stopping patience of 5 rounds
watchlist = [(X_train_D, "training"), (X_valid_D, "validation")]
clf = xgb.train(
    params,
    X_train_D,
    num_boost_round=250,
    early_stopping_rounds=5,
    evals=watchlist,
)

# Alternative kept for reference: load a previously saved model instead
#clf = xgb.XGBRegressor()
#clf.load_model("model.json")

In [None]:
import matplotlib.pyplot as plt

# Plot feature importance of the trained booster under three importance
# types (cover, gain, weight), one call each, limited to the 20 top features.
xgb.plot_importance(clf, importance_type='cover', title='Feature Importance: Cover', max_num_features=20)
xgb.plot_importance(clf, importance_type='gain', title='Feature Importance: Gain', max_num_features=20)
xgb.plot_importance(clf, importance_type='weight', title='Feature Importance: Weight', max_num_features=20)

In [None]:
# Persist the trained booster to model.json in the current working directory
clf.save_model("model.json")

In [None]:
# used https://www.kaggle.com/code/kellibelcher/jpx-stock-market-analysis-prediction-with-lgbm

import jpx_tokyo_market_prediction
env = jpx_tokyo_market_prediction.make_env()
iter_test = env.iter_test()

cols=['AdjustmentFactor', 'Date', 'SecuritiesCode', 'High', 'Low', 'Open', 'Close', 'Volume']
# Recent history used to build the lag features for the first test days.
# Stored under its own name so df_prices stays intact and re-running this or
# earlier cells does not depend on a previously filtered df_prices.
df_prices_recent = df_prices.loc[df_prices.Date >= '2021-08-01', cols]

# Same columns, in the same order, as the ones the scaler was fitted on
inference_scaled_cols = ['Open', 'Close', 'High', 'Low', 'Volume']
inference_feats = ['Open', 'Close', 'High', 'Low', 'Volume']
# No 'Target' at inference time
inference_other_cols = ['Date', 'SecuritiesCode']
inference_category_cols = ['SectorName17', 'SectorName33', 'SectionProducts', 'NewMarketSegment']

counter = 0
df_price_raw = None
for (prices, options, financials, trades, secondary_prices, sample_prediction) in iter_test:

    current_date = prices["Date"].iloc[0]
    if df_price_raw is None:
        # Seed the rolling history with everything strictly before the first test date
        df_price_raw = df_prices_recent.loc[df_prices_recent["Date"] < current_date]

    # Drop incomplete rows from the history only; today's rows are always kept
    # so that every security in `prices` receives a prediction.
    df_price_raw = pd.concat([df_price_raw.dropna(axis=0), prices[cols]]).reset_index(drop=True)

    # Pass a copy so the raw history is never altered by the adjustment step
    df_price = adjust_price(df_price_raw.copy())
    # Mirror the training preprocessing exactly: Close becomes the adjusted
    # close and the helper columns are removed (previously the model was fed
    # the unadjusted Close here, unlike during training).
    df_price = df_price.drop(columns=['AdjustmentFactor', 'CumulativeAdjustmentFactor'])
    df_price['Close'] = df_price['AdjustedClose']
    df_price = df_price.drop(columns=['AdjustedClose'])

    df_price['Date'] = pd.to_datetime(df_price['Date'])
    df_price[inference_scaled_cols] = ss.transform(df_price[inference_scaled_cols])

    # One frame per security, in ascending SecuritiesCode order
    stock_frames = [frame for _, frame in df_price.groupby('SecuritiesCode')]

    shift_reorder(stock_frames, time_steps, features=inference_feats, other_cols=inference_other_cols)
    add_sl_features(stock_frames, df_stock_list)
    df_price_full = pd.concat(stock_frames)
    create_time_features(df_price_full)
    df_price_full = pd.get_dummies(df_price_full, dummy_na=True, columns=inference_category_cols, drop_first=True)

    # Keep today's rows only, with exactly the training columns in training order.
    # The explicit copy makes the column assignments below safe.
    feat = df_price_full[df_price_full.Date == current_date].drop(columns=['Date'])
    feat = feat[X_train.columns].copy()
    feat["pred"] = clf.predict(xgb.DMatrix(feat))
    # ascending=False: the highest predicted target gets Rank 0
    feat["Rank"] = (feat["pred"].rank(method="first", ascending=False)-1).astype(int)

    # Assign ranks by SecuritiesCode rather than by row position, so the result
    # stays correct even if sample_prediction is ordered differently from feat.
    rank_by_code = feat.set_index("SecuritiesCode")["Rank"]
    sample_prediction["Rank"] = sample_prediction["SecuritiesCode"].map(rank_by_code)
    display(sample_prediction.head())

    # Ranks must be complete and span 0 .. n-1
    assert sample_prediction["Rank"].notna().all()
    assert sample_prediction["Rank"].min() == 0
    assert sample_prediction["Rank"].max() == len(sample_prediction["Rank"]) - 1

    print(f'Count = {counter}')

    env.predict(sample_prediction)
    counter += 1