![](https://storage.googleapis.com/kaggle-competitions/kaggle/34349/logos/header.png?t=2022-03-09-00-33-57)

## Data Loading

In [1]:
import pandas as pd
import numpy as np
pd.set_option('display.max_columns', 500)

In [2]:
trainDataPath = "Data/train_files"
dataPath = "Data"
supDataPath = "Data/supplemental_files"

In [3]:
stock_prices = pd.read_csv(f'{trainDataPath}/stock_prices.csv')
stock_prices["Date"] = pd.to_datetime(stock_prices["Date"])
stock_prices

Unnamed: 0,RowId,Date,SecuritiesCode,Open,High,Low,Close,Volume,AdjustmentFactor,ExpectedDividend,SupervisionFlag,Target
0,20170104_1301,2017-01-04,1301,2734.0,2755.0,2730.0,2742.0,31400,1.0,,False,0.000730
1,20170104_1332,2017-01-04,1332,568.0,576.0,563.0,571.0,2798500,1.0,,False,0.012324
2,20170104_1333,2017-01-04,1333,3150.0,3210.0,3140.0,3210.0,270800,1.0,,False,0.006154
3,20170104_1376,2017-01-04,1376,1510.0,1550.0,1510.0,1550.0,11300,1.0,,False,0.011053
4,20170104_1377,2017-01-04,1377,3270.0,3350.0,3270.0,3330.0,150800,1.0,,False,0.003026
...,...,...,...,...,...,...,...,...,...,...,...,...
2332526,20211203_9990,2021-12-03,9990,514.0,528.0,513.0,528.0,44200,1.0,,False,0.034816
2332527,20211203_9991,2021-12-03,9991,782.0,794.0,782.0,794.0,35900,1.0,,False,0.025478
2332528,20211203_9993,2021-12-03,9993,1690.0,1690.0,1645.0,1645.0,7200,1.0,,False,-0.004302
2332529,20211203_9994,2021-12-03,9994,2388.0,2396.0,2380.0,2389.0,6500,1.0,,False,0.009098


In [4]:
stock_list = pd.read_csv(f'{dataPath}/stock_list.csv')
stock_list['SectorName']=[i.rstrip().lower().capitalize() for i in stock_list['17SectorName']]
stock_list['Name']=[i.rstrip().lower().capitalize() for i in stock_list['Name']]

In [5]:
supplemental_prices = pd.read_csv(f"{supDataPath}/stock_prices.csv")

## EDA

Reference: [JPX Stock Market Analysis & Prediction with LGBM](https://www.kaggle.com/code/kellibelcher/jpx-stock-market-analysis-prediction-with-lgbm)

In [6]:
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
from plotly.offline import init_notebook_mode
from datetime import datetime, timedelta

init_notebook_mode(connected=True)
temp = dict(layout=go.Layout(font=dict(family="Franklin Gothic", size=12), width=800))
colors=px.colors.qualitative.Plotly

In [7]:
def avg_plot(stock_prices):
    train_date = stock_prices.Date.unique()
    returns = stock_prices.groupby('Date')['Target'].mean().mul(
        100).rename('Average Return')
    close_avg = stock_prices.groupby(
        'Date')['Close'].mean().rename('Closing Price')
    vol_avg = stock_prices.groupby('Date')['Volume'].mean().rename('Volume')

    fig = make_subplots(rows=3, cols=1,
                        shared_xaxes=True)
    for i, j in enumerate([returns, close_avg, vol_avg]):
        fig.add_trace(go.Scatter(x=train_date, y=j,
                                mode='lines',
                                name=j.name,
                                marker_color=colors[i]),
                    row=i+1, col=1)
    fig.update_xaxes(rangeslider_visible=False,
                    rangeselector=dict(
                        buttons=list([
                            dict(count=6,
                                label="6m",
                                step="month",
                                stepmode="backward"),
                            dict(count=1,
                                label="1y",
                                step="year",
                                stepmode="backward"),
                            dict(count=2,
                                label="2y",
                                step="year",
                                stepmode="backward"),
                            dict(step="all")])),
                    row=1, 
                    col=1)
    fig.update_layout(template=temp, 
                    title='JPX Market Average Stock Return, Closing Price, and Shares Traded',
                    hovermode='x unified',
                    height=700,
                    width=1700,
                    yaxis1=dict(title='Stock Return', ticksuffix='%'),
                    yaxis2_title='Closing Price',
                    yaxis3_title='Shares Traded',
                    showlegend=False)
    fig.show()
avg_plot(stock_prices)

In [8]:
def sector_return_plot(stock_prices):
    train_df = stock_prices.merge(stock_list[['SecuritiesCode','Name','SectorName']], on='SecuritiesCode', how='left')
    train_df['Year'] = train_df['Date'].dt.year
    years = {year: pd.DataFrame() for year in train_df.Year.unique()[::-1]}
    for key in years.keys():
        df=train_df[train_df.Year == key]
        years[key] = df.groupby('SectorName')['Target'].mean().mul(100).rename("Avg_return_{}".format(key))
    df=pd.concat((years[i].to_frame() for i in years.keys()), axis=1)
    df=df.sort_values(by="Avg_return_2021")

    fig = make_subplots(rows=1, cols=5, shared_yaxes=True)
    for i, col in enumerate(df.columns):
        x = df[col]
        mask = x<=0
        fig.add_trace(go.Bar(x=x[mask], y=df.index[mask],orientation='h', 
                            text=x[mask], texttemplate='%{text:.2f}%',textposition='auto',
                            hovertemplate='Average Return in %{y} Stocks = %{x:.4f}%',
                            marker=dict(color='red', opacity=0.7),name=col[-4:]), 
                    row=1, col=i+1)
        fig.add_trace(go.Bar(x=x[~mask], y=df.index[~mask],orientation='h', 
                            text=x[~mask], texttemplate='%{text:.2f}%', textposition='auto', 
                            hovertemplate='Average Return in %{y} Stocks = %{x:.4f}%',
                            marker=dict(color='green', opacity=0.7),name=col[-4:]), 
                    row=1, col=i+1)
        fig.update_xaxes(range=(x.min()-.15,x.max()+.15), title='{} Returns'.format(col[-4:]), 
                        showticklabels=False, row=1, col=i+1)
    fig.update_layout(template=temp,
                      title='Yearly Average Stock Returns by Sector', 
                      hovermode='closest',margin=dict(l=250,r=50),
                      height=600,
                      width=1700, 
                      showlegend=False)
    fig.show()
sector_return_plot(stock_prices)

In [9]:
def sector_price_plot(stock_prices):
    train_df = stock_prices.merge(
        stock_list[['SecuritiesCode', 'Name', 'SectorName']], on='SecuritiesCode', how='left')
    train_df['Year'] = train_df['Date'].dt.year
    train_date = train_df.Date.unique()
    sectors = train_df.SectorName.unique().tolist()
    sectors.insert(0, 'All')
    open_avg = train_df.groupby('Date')['Open'].mean()
    high_avg = train_df.groupby('Date')['High'].mean()
    low_avg = train_df.groupby('Date')['Low'].mean()
    close_avg = train_df.groupby('Date')['Close'].mean()
    buttons = []

    fig = go.Figure()
    for i in range(18):
        if i != 0:
            open_avg = train_df[train_df.SectorName == sectors[i]].groupby('Date')[
                'Open'].mean()
            high_avg = train_df[train_df.SectorName == sectors[i]].groupby('Date')[
                'High'].mean()
            low_avg = train_df[train_df.SectorName == sectors[i]].groupby('Date')[
                'Low'].mean()
            close_avg = train_df[train_df.SectorName == sectors[i]].groupby('Date')[
                'Close'].mean()

        fig.add_trace(go.Candlestick(x=train_date, open=open_avg, high=high_avg,
                                     low=low_avg, close=close_avg, name=sectors[i],
                                     visible=(True if i == 0 else False)))

        visibility = [False]*len(sectors)
        visibility[i] = True
        button = dict(label=sectors[i],
                      method="update",
                      args=[{"visible": visibility}])
        buttons.append(button)

    fig.update_xaxes(rangeslider_visible=True,
                     rangeselector=dict(
                         buttons=list([
                             dict(count=3, label="3m", step="month",
                                  stepmode="backward"),
                             dict(count=6, label="6m", step="month",
                                  stepmode="backward"),
                             dict(step="all")]), xanchor='left', yanchor='bottom', y=1.16, x=.01))
    fig.update_layout(template=temp, title='Stock Price Movements by Sector',
                      hovermode='x unified',
                      showlegend=False,
                      width=1700,
                      height=800,
                      updatemenus=[dict(active=0, type="dropdown",
                                        buttons=buttons, xanchor='left',
                                        yanchor='bottom', y=1.01, x=.01)],
                      yaxis=dict(title='Stock Price'))
    fig.show()


sector_price_plot(stock_prices)


## Feature Engineering

In [10]:
stock_prices

Unnamed: 0,RowId,Date,SecuritiesCode,Open,High,Low,Close,Volume,AdjustmentFactor,ExpectedDividend,SupervisionFlag,Target
0,20170104_1301,2017-01-04,1301,2734.0,2755.0,2730.0,2742.0,31400,1.0,,False,0.000730
1,20170104_1332,2017-01-04,1332,568.0,576.0,563.0,571.0,2798500,1.0,,False,0.012324
2,20170104_1333,2017-01-04,1333,3150.0,3210.0,3140.0,3210.0,270800,1.0,,False,0.006154
3,20170104_1376,2017-01-04,1376,1510.0,1550.0,1510.0,1550.0,11300,1.0,,False,0.011053
4,20170104_1377,2017-01-04,1377,3270.0,3350.0,3270.0,3330.0,150800,1.0,,False,0.003026
...,...,...,...,...,...,...,...,...,...,...,...,...
2332526,20211203_9990,2021-12-03,9990,514.0,528.0,513.0,528.0,44200,1.0,,False,0.034816
2332527,20211203_9991,2021-12-03,9991,782.0,794.0,782.0,794.0,35900,1.0,,False,0.025478
2332528,20211203_9993,2021-12-03,9993,1690.0,1690.0,1645.0,1645.0,7200,1.0,,False,-0.004302
2332529,20211203_9994,2021-12-03,9994,2388.0,2396.0,2380.0,2389.0,6500,1.0,,False,0.009098


In [39]:
from decimal import ROUND_HALF_UP, Decimal
import datetime


def adjust_price(price):
    """
    Ref: https://www.kaggle.com/code/smeitoma/train-demo#Generating-AdjustedClose-price
    Args:
        price (pd.DataFrame)  : pd.DataFrame include stock_price
    Returns:
        price DataFrame (pd.DataFrame): stock_price with generated Adjusted Prices
    """

    def generate_adjusted_price(df):
        """
        Args:
            df (pd.DataFrame)  : stock_price for a single SecuritiesCode
        Returns:
            df (pd.DataFrame): stock_price with Adjusted Price for a single SecuritiesCode
        """
        # sort data to generate CumulativeAdjustmentFactor
        df = df.sort_values("Date", ascending=False)
        # generate CumulativeAdjustmentFactor
        df["CumulativeAdjustmentFactor"] = df["AdjustmentFactor"].cumprod()
        # generate AdjustedClose
        df["AdjustedClose"] = (df["CumulativeAdjustmentFactor"] * df["Close"]).map(
            lambda x: float(Decimal(str(x)).quantize(Decimal('0.1'), rounding=ROUND_HALF_UP)))
        df["AdjustedOpen"] = (
            df["CumulativeAdjustmentFactor"] * df["Open"]
        ).map(lambda x: float(
            Decimal(str(x)).quantize(Decimal('0.1'), rounding=ROUND_HALF_UP)
        ))
        df["AdjustedLow"] = (
            df["CumulativeAdjustmentFactor"] * df["Low"]
        ).map(lambda x: float(
            Decimal(str(x)).quantize(Decimal('0.1'), rounding=ROUND_HALF_UP)
        ))
        df["AdjustedHigh"] = (
            df["CumulativeAdjustmentFactor"] * df["High"]
        ).map(lambda x: float(
            Decimal(str(x)).quantize(Decimal('0.1'), rounding=ROUND_HALF_UP)
        ))
        # reverse order
        df = df.sort_values("Date")
        # to fill AdjustedClose, replace 0 into np.nan
        df.loc[df["AdjustedClose"] == 0, "AdjustedClose"] = np.nan
        # forward fill AdjustedClose
        df.loc[:, "AdjustedClose"] = df.loc[:, "AdjustedClose"].ffill()

        df.loc[df["AdjustedOpen"] == 0, "AdjustedOpen"] = np.nan
        df.loc[:, "AdjustedOpen"] = df.loc[:, "AdjustedOpen"].ffill()

        df.loc[df["AdjustedLow"] == 0, "AdjustedLow"] = np.nan
        df.loc[:, "AdjustedLow"] = df.loc[:, "AdjustedLow"].ffill()

        df.loc[df["AdjustedHigh"] == 0, "AdjustedHigh"] = np.nan
        df.loc[:, "AdjustedHigh"] = df.loc[:, "AdjustedHigh"].ffill()

        return df

    # generate AdjustedClose
    price = price.sort_values(["SecuritiesCode", "Date"])
    price = price.groupby("SecuritiesCode").apply(
        generate_adjusted_price).reset_index(drop=True)

    return price


In [40]:
def create_features(price):
    """
    Args:
        price (pd.DataFrame)  : pd.DataFrame include stock_price
    Returns:
        price DataFrame (pd.DataFrame): stock_price with new generated features
    """

    def generate_features_single_stock(df):
        """
        Args:
            df (pd.DataFrame)  : stock_price for a single SecuritiesCode
        Returns:
            df (pd.DataFrame): stock_price with new features for a single SecuritiesCode
        """
        
        df['Close_1week_mean'] = df['AdjustedClose'].rolling(window = 5).mean().fillna(0)
        df['Close_4weeks_mean'] = df['AdjustedClose'].rolling(window = 20).mean().fillna(0)
        df['Close_1week_std'] = df['AdjustedClose'].rolling(window = 5).std().fillna(0)
        df['Close_4weeks_std'] = df['AdjustedClose'].rolling(window = 20).std().fillna(0)
        df['Close_1week_min'] = df['AdjustedClose'].rolling(window = 5).min().fillna(0)
        df['Close_4weeks_min'] = df['AdjustedClose'].rolling(window = 20).min().fillna(0)
        df['Close_1week_max'] = df['AdjustedClose'].rolling(window = 5).max().fillna(0)
        df['Close_4weeks_max'] = df['AdjustedClose'].rolling(window = 20).max().fillna(0)
        df['Close_2weeks_return'] = df['AdjustedClose'].rolling(11).apply(lambda x: (x.iloc[-2]-x.iloc[0])/x.iloc[0])
        return df

    price = price.sort_values(["SecuritiesCode", "Date"])
    price = price.groupby("SecuritiesCode").apply(generate_features_single_stock).reset_index(drop=True)

    return price

In [45]:
def process_raw_data(df, min_timestamp=None):
    df["Date"] = pd.to_datetime(df["Date"])
    df['DayOfWeek'] = df['Date'].apply(lambda x: x.dayofweek)
    df['Month'] = df['Date'].apply(lambda x: x.month)
    df['Day'] = df['Date'].apply(lambda x: x.day)
    df['Hour'] = df['Date'].apply(lambda x: x.hour)
    df['Minute'] = df['Date'].apply(lambda x: x.minute)
    df['Timestamp'] = df['Date']
    df['id'] = df["SecuritiesCode"]
    df['Weight'] = 1
    df = adjust_price(df)
    df = create_features(df)
    df['ExpectedDividend'].fillna(0, inplace=True)
    df['DayOfYear'] = df['Date'].apply(lambda x: x.dayofyear)
    df['Year'] = df['Date'].apply(lambda x: x.year)
    df['IsMonday'] = (df['DayOfWeek'] == 0).astype(int)
    # Add continuous timeline
    if min_timestamp:
        min_timestamp = pd.to_datetime(min_timestamp)
    else:
        min_timestamp = min(df['Timestamp'])
    df['Timestep'] = df['Timestamp'].apply(lambda x: (x-min_timestamp)/datetime.timedelta(days=1))
    df['Timestep_id'] = df['Timestep']
    #df = df.set_index('Date')
    return df

In [46]:
from os.path import exists
if exists('train_df.csv'):
    train_df = pd.read_csv('train_df.csv')
else:
    train_df = process_raw_data(stock_prices)
    train_df.to_csv('train_df.csv')

In [15]:
# train_df = train_df.loc[train_df["Date"] >= "2021-01-01"].reset_index(drop=True)

In [47]:
train_df.dropna()

Unnamed: 0,RowId,Date,SecuritiesCode,Open,High,Low,Close,Volume,AdjustmentFactor,ExpectedDividend,SupervisionFlag,Target,DayOfWeek,Month,Day,Hour,Minute,Timestamp,id,Weight,CumulativeAdjustmentFactor,AdjustedClose,AdjustedOpen,AdjustedLow,AdjustedHigh,Close_1week_mean,Close_4weeks_mean,Close_1week_std,Close_4weeks_std,Close_1week_min,Close_4weeks_min,Close_1week_max,Close_4weeks_max,Close_2weeks_return,DayOfYear,Year,IsMonday,Timestep,Timestep_id
10,20170119_1301,2017-01-19,1301,2694.0,2717.0,2694.0,2699.0,13200,1.0,0.0,False,-0.006211,3,1,19,0,0,2017-01-19,1301,1,1.0,2699.0,2694.0,2694.0,2717.0,2701.0,0.00,13.490738,0.000000,2686.0,0.0,2722.0,0.0,-0.017505,19,2017,0,15.0,15.0
11,20170120_1301,2017-01-20,1301,2724.0,2737.0,2705.0,2737.0,26000,1.0,0.0,False,0.002574,4,1,20,0,0,2017-01-20,1301,1,1.0,2737.0,2724.0,2705.0,2737.0,2704.0,0.00,19.608672,0.000000,2686.0,0.0,2737.0,0.0,-0.014244,20,2017,0,16.0,16.0
12,20170123_1301,2017-01-23,1301,2730.0,2731.0,2712.0,2720.0,12600,1.0,0.0,False,0.001467,0,1,23,0,0,2017-01-23,1301,1,1.0,2720.0,2730.0,2712.0,2731.0,2707.2,0.00,20.873428,0.000000,2686.0,0.0,2737.0,0.0,-0.001095,23,2017,1,19.0,19.0
13,20170124_1301,2017-01-24,1301,2718.0,2728.0,2718.0,2727.0,21300,1.0,0.0,False,0.000732,1,1,24,0,0,2017-01-24,1301,1,1.0,2727.0,2718.0,2718.0,2728.0,2715.4,0.00,18.365729,0.000000,2694.0,0.0,2737.0,0.0,-0.010189,24,2017,0,20.0,20.0
14,20170125_1301,2017-01-25,1301,2732.0,2737.0,2724.0,2731.0,15100,1.0,0.0,False,-0.002195,2,1,25,0,0,2017-01-25,1301,1,1.0,2731.0,2732.0,2724.0,2737.0,2722.8,0.00,14.669697,0.000000,2699.0,0.0,2737.0,0.0,-0.006557,25,2017,0,21.0,21.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2332526,20211129_9997,2021-11-29,9997,678.0,679.0,665.0,668.0,320800,1.0,0.0,False,0.026987,0,11,29,0,0,2021-11-29,9997,1,1.0,668.0,678.0,665.0,679.0,695.4,739.35,17.686153,33.309197,668.0,668.0,712.0,798.0,-0.089947,333,2021,1,1790.0,1790.0
2332527,20211130_9997,2021-11-30,9997,670.0,689.0,667.0,667.0,296300,1.0,0.0,False,-0.001460,1,11,30,0,0,2021-11-30,9997,1,1.0,667.0,670.0,667.0,689.0,686.4,732.80,18.555323,34.041152,667.0,667.0,706.0,775.0,-0.105756,334,2021,0,1791.0,1791.0
2332528,20211201_9997,2021-12-01,9997,661.0,688.0,660.0,685.0,339100,1.0,0.0,False,0.017544,2,12,1,0,0,2021-12-01,9997,1,1.0,685.0,661.0,660.0,688.0,682.8,728.30,16.115210,34.117598,667.0,667.0,706.0,770.0,-0.087551,335,2021,0,1792.0,1792.0
2332529,20211202_9997,2021-12-02,9997,681.0,692.0,680.0,684.0,342900,1.0,0.0,False,0.014368,3,12,2,0,0,2021-12-02,9997,1,1.0,684.0,681.0,680.0,692.0,678.4,724.45,10.064790,34.574824,667.0,667.0,688.0,770.0,-0.066757,336,2021,0,1793.0,1793.0


In [34]:
features = ['Date',
            'SecuritiesCode',
            'Volume',
            'DayOfWeek',
            'ExpectedDividend',
            'SupervisionFlag',
            'Month',
            'Day',
            'AdjustedClose',
            'AdjustedOpen',
            'AdjustedLow',
            'AdjustedHigh',
            'Close_1week_mean',
            'Close_4weeks_mean',
            'Close_1week_std',
            'Close_4weeks_std',
            'Close_1week_min',
            'Close_4weeks_min',
            'Close_1week_max',
            'Close_4weeks_max',
            'Close_2weeks_return',
            'DayOfYear',
            'Year',
            'IsMonday'
            ]
target = ['Target']

In [48]:
X_train = train_df.dropna()[features]
y_train = train_df.dropna()[target]

In [49]:
X_train

Unnamed: 0,Date,SecuritiesCode,Volume,DayOfWeek,ExpectedDividend,SupervisionFlag,Month,Day,AdjustedClose,AdjustedOpen,AdjustedLow,AdjustedHigh,Close_1week_mean,Close_4weeks_mean,Close_1week_std,Close_4weeks_std,Close_1week_min,Close_4weeks_min,Close_1week_max,Close_4weeks_max,Close_2weeks_return,DayOfYear,Year,IsMonday
10,2017-01-19,1301,13200,3,0.0,False,1,19,2699.0,2694.0,2694.0,2717.0,2701.0,0.00,13.490738,0.000000,2686.0,0.0,2722.0,0.0,-0.017505,19,2017,0
11,2017-01-20,1301,26000,4,0.0,False,1,20,2737.0,2724.0,2705.0,2737.0,2704.0,0.00,19.608672,0.000000,2686.0,0.0,2737.0,0.0,-0.014244,20,2017,0
12,2017-01-23,1301,12600,0,0.0,False,1,23,2720.0,2730.0,2712.0,2731.0,2707.2,0.00,20.873428,0.000000,2686.0,0.0,2737.0,0.0,-0.001095,23,2017,1
13,2017-01-24,1301,21300,1,0.0,False,1,24,2727.0,2718.0,2718.0,2728.0,2715.4,0.00,18.365729,0.000000,2694.0,0.0,2737.0,0.0,-0.010189,24,2017,0
14,2017-01-25,1301,15100,2,0.0,False,1,25,2731.0,2732.0,2724.0,2737.0,2722.8,0.00,14.669697,0.000000,2699.0,0.0,2737.0,0.0,-0.006557,25,2017,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2332526,2021-11-29,9997,320800,0,0.0,False,11,29,668.0,678.0,665.0,679.0,695.4,739.35,17.686153,33.309197,668.0,668.0,712.0,798.0,-0.089947,333,2021,1
2332527,2021-11-30,9997,296300,1,0.0,False,11,30,667.0,670.0,667.0,689.0,686.4,732.80,18.555323,34.041152,667.0,667.0,706.0,775.0,-0.105756,334,2021,0
2332528,2021-12-01,9997,339100,2,0.0,False,12,1,685.0,661.0,660.0,688.0,682.8,728.30,16.115210,34.117598,667.0,667.0,706.0,770.0,-0.087551,335,2021,0
2332529,2021-12-02,9997,342900,3,0.0,False,12,2,684.0,681.0,680.0,692.0,678.4,724.45,10.064790,34.574824,667.0,667.0,688.0,770.0,-0.066757,336,2021,0


In [50]:
train_df

Unnamed: 0,RowId,Date,SecuritiesCode,Open,High,Low,Close,Volume,AdjustmentFactor,ExpectedDividend,SupervisionFlag,Target,DayOfWeek,Month,Day,Hour,Minute,Timestamp,id,Weight,CumulativeAdjustmentFactor,AdjustedClose,AdjustedOpen,AdjustedLow,AdjustedHigh,Close_1week_mean,Close_4weeks_mean,Close_1week_std,Close_4weeks_std,Close_1week_min,Close_4weeks_min,Close_1week_max,Close_4weeks_max,Close_2weeks_return,DayOfYear,Year,IsMonday,Timestep,Timestep_id
0,20170104_1301,2017-01-04,1301,2734.0,2755.0,2730.0,2742.0,31400,1.0,0.0,False,0.000730,2,1,4,0,0,2017-01-04,1301,1,1.0,2742.0,2734.0,2730.0,2755.0,0.0,0.00,0.000000,0.000000,0.0,0.0,0.0,0.0,,4,2017,0,0.0,0.0
1,20170105_1301,2017-01-05,1301,2743.0,2747.0,2735.0,2738.0,17900,1.0,0.0,False,0.002920,3,1,5,0,0,2017-01-05,1301,1,1.0,2738.0,2743.0,2735.0,2747.0,0.0,0.00,0.000000,0.000000,0.0,0.0,0.0,0.0,,5,2017,0,1.0,1.0
2,20170106_1301,2017-01-06,1301,2734.0,2744.0,2720.0,2740.0,19900,1.0,0.0,False,-0.001092,4,1,6,0,0,2017-01-06,1301,1,1.0,2740.0,2734.0,2720.0,2744.0,0.0,0.00,0.000000,0.000000,0.0,0.0,0.0,0.0,,6,2017,0,2.0,2.0
3,20170110_1301,2017-01-10,1301,2745.0,2754.0,2735.0,2748.0,24200,1.0,0.0,False,-0.005100,1,1,10,0,0,2017-01-10,1301,1,1.0,2748.0,2745.0,2735.0,2754.0,0.0,0.00,0.000000,0.000000,0.0,0.0,0.0,0.0,,10,2017,0,6.0,6.0
4,20170111_1301,2017-01-11,1301,2748.0,2752.0,2737.0,2745.0,9300,1.0,0.0,False,-0.003295,2,1,11,0,0,2017-01-11,1301,1,1.0,2745.0,2748.0,2737.0,2752.0,2742.6,0.00,3.974921,0.000000,2738.0,0.0,2748.0,0.0,,11,2017,0,7.0,7.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2332526,20211129_9997,2021-11-29,9997,678.0,679.0,665.0,668.0,320800,1.0,0.0,False,0.026987,0,11,29,0,0,2021-11-29,9997,1,1.0,668.0,678.0,665.0,679.0,695.4,739.35,17.686153,33.309197,668.0,668.0,712.0,798.0,-0.089947,333,2021,1,1790.0,1790.0
2332527,20211130_9997,2021-11-30,9997,670.0,689.0,667.0,667.0,296300,1.0,0.0,False,-0.001460,1,11,30,0,0,2021-11-30,9997,1,1.0,667.0,670.0,667.0,689.0,686.4,732.80,18.555323,34.041152,667.0,667.0,706.0,775.0,-0.105756,334,2021,0,1791.0,1791.0
2332528,20211201_9997,2021-12-01,9997,661.0,688.0,660.0,685.0,339100,1.0,0.0,False,0.017544,2,12,1,0,0,2021-12-01,9997,1,1.0,685.0,661.0,660.0,688.0,682.8,728.30,16.115210,34.117598,667.0,667.0,706.0,770.0,-0.087551,335,2021,0,1792.0,1792.0
2332529,20211202_9997,2021-12-02,9997,681.0,692.0,680.0,684.0,342900,1.0,0.0,False,0.014368,3,12,2,0,0,2021-12-02,9997,1,1.0,684.0,681.0,680.0,692.0,678.4,724.45,10.064790,34.574824,667.0,667.0,688.0,770.0,-0.066757,336,2021,0,1793.0,1793.0


## Model

In [22]:
import optuna

In [23]:
def calc_spread_return_sharpe(df: pd.DataFrame, portfolio_size: int = 200, toprank_weight_ratio: float = 2) -> float:
    """
    Args:
        df (pd.DataFrame): predicted results
        portfolio_size (int): # of equities to buy/sell
        toprank_weight_ratio (float): the relative weight of the most highly ranked stock compared to the least.
    Returns:
        (float): sharpe ratio
    """
    def _calc_spread_return_per_day(df, portfolio_size, toprank_weight_ratio):
        """
        Args:
            df (pd.DataFrame): predicted results
            portfolio_size (int): # of equities to buy/sell
            toprank_weight_ratio (float): the relative weight of the most highly ranked stock compared to the least.
        Returns:
            (float): spread return
        """
        assert df['Rank'].min() == 0
        assert df['Rank'].max() == len(df['Rank']) - 1
        weights = np.linspace(start=toprank_weight_ratio, stop=1, num=portfolio_size)
        purchase = (df.sort_values(by='Rank')['Target'][:portfolio_size] * weights).sum() / weights.mean()
        short = (df.sort_values(by='Rank', ascending=False)['Target'][:portfolio_size] * weights).sum() / weights.mean()
        return purchase - short

    buf = df.groupby('Date').apply(_calc_spread_return_per_day, portfolio_size, toprank_weight_ratio)
    sharpe_ratio = buf.mean() / buf.std()
    return sharpe_ratio

In [24]:
from sklearn.model_selection import TimeSeriesSplit
from sklearn.metrics import mean_squared_error

In [51]:
import lightgbm as lgb


def objective_lgb(trial, X_train, y_train):

    params = {
        'objective': 'regression',
        'metric': 'rmse',
        'verbosity': -1,
        'boosting_type': 'gbdt',
        'reg_alpha': trial.suggest_loguniform('reg_alpha', 1e-6, 1e-3),
        'reg_lambda': trial.suggest_loguniform('reg_lambda', 1e-8, 10.0),
        'num_leaves': trial.suggest_int('num_leaves', 2, 20),
        # 'learning_rate': trial.suggest_loguniform('learning_rate', 1e-8, 1.0),
        'learning_rate': 0.01,
        'n_estimators': trial.suggest_int('n_estimators', 4500, 8500),
        'colsample_bytree': trial.suggest_uniform('colsample_bytree', 0.8, 0.9),
        'subsample': trial.suggest_uniform('subsample', 0.001, 1.0),
        #'bagging_freq': trial.suggest_int('bagging_freq', 1, 7),
        'min_child_samples': trial.suggest_int('min_child_samples', 1, 5)
    }

    lgbm = lgb.LGBMRegressor(random_state=42,
                             **params,
                             device='gpu')
    # pruning_callback = optuna.integration.LightGBMPruningCallback(trial, "rmse")
    callbacks = [lgb.early_stopping(300, verbose=0),
                 lgb.log_evaluation(period=0),
                #  pruning_callback
                 ]

    ts_fold = TimeSeriesSplit(n_splits=5, gap=10000)
    sharpe_ratio = []
    for fold, (train_idx, val_idx) in enumerate(ts_fold.split(X_train, y_train)):
        
        print(f'fitting fold {fold+1}')
        
        X_trainS, y_trainS = X_train.iloc[train_idx], y_train.iloc[train_idx]
        X_validS, y_validS = X_train.iloc[val_idx], y_train.iloc[val_idx]

        X_tra = X_trainS[X_trainS.columns[~X_trainS.columns.isin(
            ['Date', 'SecuritiesCode'])]]
        X_val = X_validS[X_validS.columns[~X_validS.columns.isin(
            ['Date', 'SecuritiesCode'])]]

        lgbm.fit(
            X_tra,
            y_trainS,
            eval_set=[(X_val, y_validS)],
            callbacks=callbacks
        )

        y_pred = lgbm.predict(X_val)

        X_validS = X_validS.assign(Target=y_pred)
        X_validS = X_validS.assign(Rank=(X_validS.groupby("Date")['Target'].rank(
            method="first", ascending=False)-1).astype(int))

        sharpe = calc_spread_return_sharpe(X_validS)

        del X_trainS, y_trainS,  X_validS, y_validS
        
        sharpe_ratio.append(sharpe)

    return np.mean(sharpe_ratio), np.std(sharpe_ratio)


study_lgb = optuna.create_study(study_name="LGBM Tuner", directions=["maximize","minimize"])
study_lgb.optimize(
    lambda trial: objective_lgb(
        trial,
        X_train,
        y_train),
    n_trials=300)

[32m[I 2022-06-25 03:08:22,987][0m A new study created in memory with name: LGBM Tuner[0m


fitting fold 1


In [None]:
optuna.visualization.plot_optimization_history(study_lgb, target=lambda t: t.values[1], target_name="Stv")

In [None]:
optuna.visualization.plot_optimization_history(study_lgb, target=lambda t: t.values[0], target_name="Sharpe")

In [None]:
trial_with_highest_accuracy = max(study_lgb.best_trials, key=lambda t: t.values[1])
print(f"Trial with highest accuracy: ")
print(f"\tnumber: {trial_with_highest_accuracy.number}")
print(f"\tparams: {trial_with_highest_accuracy.params}")
print(f"\tvalues: {trial_with_highest_accuracy.values}")

Trial with highest accuracy: 
	number: 11
	params: {'reg_alpha': 0.0002916652833783697, 'reg_lambda': 7.718851027678918, 'num_leaves': 20, 'n_estimators': 5644, 'colsample_bytree': 0.8488881916266258, 'subsample': 0.5472863817142423, 'min_child_samples': 5}
	values: [1.6230123046287557, 0.27115214395966614]


In [None]:
optuna.visualization.plot_slice(study_lgb, target=lambda t: t.values[0], target_name="Sharpe")

In [None]:
optuna.visualization.plot_slice(study_lgb, target=lambda t: t.values[1], target_name="Stv")

In [None]:
import xgboost as xgb


def objective_xgb(trial, X_train, y_train):
    """
    Objective function to tune an `XGBRegressor` model.
    """

    # Define Parameter Grid to Tune
    params = {
        'n_estimators': trial.suggest_int("n_estimators", 100, 5000),
        'reg_alpha': trial.suggest_loguniform("reg_alpha", 1e-10, 1e-1),
        'reg_lambda': trial.suggest_loguniform("reg_lambda", 1e-8, 100.0),
        "subsample": trial.suggest_float("subsample", 0.8, 1.0, step=0.001),
        "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.1, log=True),
        'max_depth': trial.suggest_int("max_depth", 2, 9),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0, 1.0, step=0.05),
        'colsample_bylevel': trial.suggest_float('colsample_bylevel', 0.5, 1.0, step=0.005),
        'colsample_bynode': trial.suggest_float('colsample_bynode', 0, 1.0, step=0.05),
        "gamma": trial.suggest_float("gamma", 0, 0.1, step=0.0005),
        "min_child_weight": trial.suggest_int("min_child_weight", 0, 10),
        'nthread': -1,
        'booster': "gbtree",
        'objective': "reg:squarederror",
        'early_stopping_rounds': 300,
        'random_state': 42,
        'eval_metric': "rmse"
    }
    
    # Check and enable GPU accelerate
    GPU_ENABLED = True
    if GPU_ENABLED:
        params["tree_method"] = "gpu_hist"
        params["predictor"] = "gpu_predictor"
        
    # Tune pruning
    # from optuna.integration import XGBoostPruningCallback
    # pruning_callback = XGBoostPruningCallback(trial, "validation_0-rmse")

    model = xgb.XGBRegressor(
        # callbacks=[pruning_callback],
        **params
    )




    # K-Fold CV
    ts_fold = TimeSeriesSplit(n_splits=5, gap=10000)
    sharpe_ratio = []
    for fold, (train_idx, val_idx) in enumerate(ts_fold.split(X_train, y_train)):

        print(f'fitting fold {fold+1}')
        
        X_trainS, y_trainS = X_train.iloc[train_idx], y_train.iloc[train_idx]
        X_validS, y_validS = X_train.iloc[val_idx], y_train.iloc[val_idx]

        X_tra = X_trainS[X_trainS.columns[~X_trainS.columns.isin(
            ['Date', 'SecuritiesCode'])]]
        X_val = X_validS[X_validS.columns[~X_validS.columns.isin(
            ['Date', 'SecuritiesCode'])]]

        model.fit(
            X_tra,
            y_trainS,
            eval_set=[(X_val, y_validS)],
            verbose=False
        )

        y_pred = model.predict(X_val)

        X_validS = X_validS.assign(Target=y_pred)
        X_validS = X_validS.assign(Rank=(X_validS.groupby("Date")['Target'].rank(
            method="first", ascending=False)-1).astype(int))

        sharpe = calc_spread_return_sharpe(X_validS)

        del X_trainS, y_trainS,  X_validS, y_validS, X_tra, X_val

        sharpe_ratio.append(sharpe)

    return np.mean(sharpe_ratio), np.std(sharpe_ratio)


study_xgb = optuna.create_study(study_name="XGB Tuner", directions=["maximize","minimize"])
study_xgb.optimize(
    lambda trial: objective_xgb(
        trial,
        X_train,
        y_train),
    n_trials=300)
xgbBestPara = study_xgb.best_params


In [132]:
optuna.visualization.plot_optimization_history(study_xgb, target=lambda t: t.values[1], target_name="Stv")

In [133]:
optuna.visualization.plot_optimization_history(study_xgb, target=lambda t: t.values[0], target_name="Sharpe")

In [122]:
trial_with_highest_accuracy = max(study_xgb.best_trials, key=lambda t: t.values[1])
print(f"Trial with highest accuracy: ")
print(f"\tnumber: {trial_with_highest_accuracy.number}")
print(f"\tparams: {trial_with_highest_accuracy.params}")
print(f"\tvalues: {trial_with_highest_accuracy.values}")

Trial with highest accuracy: 
	number: 60
	params: {'n_estimators': 3524, 'reg_alpha': 0.0036814497591722635, 'reg_lambda': 2.5413032105195913e-07, 'subsample': 0.915, 'learning_rate': 0.030567340251603986, 'max_depth': 5, 'colsample_bytree': 0.8, 'colsample_bylevel': 0.7, 'colsample_bynode': 0.0, 'gamma': 0.0005, 'min_child_weight': 1}
	values: [1.3998142051296039, 0.1888585576895188]


In [134]:
optuna.visualization.plot_slice(study_xgb, target=lambda t: t.values[0], target_name="Sharpe")

In [135]:
optuna.visualization.plot_slice(study_xgb, target=lambda t: t.values[1], target_name="Stv")

## Submission

In [None]:
import jpx_tokyo_market_prediction
env = jpx_tokyo_market_prediction.make_env()
iter_test = env.iter_test()