# Introduction

This is the inference part of my previous notebook: [**【JPX】Window-based Regression with XGBoost**](https://www.kaggle.com/code/daosword/jpx-window-based-regression-with-xgboost-train).  

In this notebook, I will load the trained XGBoost model, preprocess the test data, make predictions, and submit them.

In [None]:
import pandas as pd
import numpy as np
import joblib
from tqdm.notebook import tqdm
import jpx_tokyo_market_prediction

# Load history data

In [None]:
%%time
# Read the full training price history, then keep only the rows dated
# 2021-10-01 or later (string comparison on the 'Date' column) and renumber
# the index from zero.
df_prices = (
    pd.read_csv('../input/jpx-tokyo-stock-exchange-prediction/train_files/stock_prices.csv')
    .loc[lambda frame: frame['Date'] >= '2021-10-01']
    .reset_index(drop=True)
)
df_prices

# Preprocess data

In [None]:
# Date features
def get_date_features(df, date_col):
    """
    Return a copy of ``df`` with calendar features derived from ``date_col``.

    ``date_col`` is parsed with the ``%Y-%m-%d`` format, used to derive the
    columns ``year``, ``month``, ``week`` (ISO week number), ``day``,
    ``dayofweek`` and ``dayofyear``, and is then stored back as strings.
    The input dataframe is not modified.
    """
    enriched = df.copy()
    stamps = pd.to_datetime(enriched[date_col], format='%Y-%m-%d')
    parts = stamps.dt

    # Insertion order of this mapping fixes the order of the new columns.
    derived = {
        'year': parts.year,
        'month': parts.month,
        'week': parts.isocalendar().week,
        'day': parts.day,
        'dayofweek': parts.dayofweek,
        'dayofyear': parts.dayofyear,
    }
    for column, values in derived.items():
        enriched[column] = values

    # Callers compare dates as strings, so hand the column back as text.
    enriched[date_col] = stamps.astype(str)
    return enriched

# Shadow features - https://www.kaggle.com/code/satoshidatamoto/jpx-xgboost-with-gpu-fit-in-1-min
def get_shadow_features(df):
    """
    Return a copy of ``df`` with candlestick shadow columns added.

    ``upper_shadow`` is ``High`` minus the larger of ``Close``/``Open``;
    ``lower_shadow`` is the smaller of ``Close``/``Open`` minus ``Low``.
    The input dataframe is not modified.
    """
    result = df.copy()
    body_top = np.maximum(result['Close'], result['Open'])
    body_bottom = np.minimum(result['Close'], result['Open'])
    result['upper_shadow'] = result['High'] - body_top
    result['lower_shadow'] = body_bottom - result['Low']
    return result

In [None]:
def _windows_for_group(frame, all_features, target_col, fill_missing,
                       num_periods_input, num_periods_output, backward_gap):
    """Build sliding (input, label) windows for one group's rows, in row order."""
    # Either carry the last seen value forward or discard incomplete rows.
    frame = frame.ffill() if fill_missing else frame.dropna()

    # Matrix layout: feature columns first, target as the last column.
    targets = frame[target_col].values.astype('float32').reshape(len(frame), 1)
    data = np.append(frame[all_features].values, targets, axis=1)

    limit = num_periods_output + num_periods_input + backward_gap
    inputs, labels = [], []
    # range() is empty when the group has fewer than `limit` rows.
    for start in range(len(data) - limit + 1):
        stop = start + num_periods_input
        shift = 1 + backward_gap
        history_targets = data[start:stop, -1].reshape(num_periods_input, 1)
        # Covariate rows are taken `1 + backward_gap` rows later than the
        # target-history rows they are paired with.
        covariates = data[start + shift:stop + shift, :-1]
        inputs.append(np.append(covariates, history_targets, axis=1))
        labels.append(data[stop + backward_gap:stop + backward_gap + num_periods_output, -1])
    return inputs, labels


def preprocessing(df, date_col, feature_col, target_col, group_col, 
                  training_cutoff, num_periods_input, num_periods_output, 
                  fill_missing_train=False, fill_missing_test=False, training=True, 
                  backward_gap=1):
    """
    Preprocess training and testing data into sliding-window batches.

    Date and shadow features are appended to ``df``, the rows are split into
    a train part and a test part around ``training_cutoff``, and for every
    value of ``group_col`` (in order of first appearance in ``df``) windows
    of ``num_periods_input`` rows are built. Each window row holds the
    feature columns followed by one historical target value.

    Parameters
    ----------
    df : DataFrame holding ``date_col``, ``group_col``, ``target_col``, the
        columns in ``feature_col`` and Open/High/Low/Close.
    date_col : name of the date column (``%Y-%m-%d`` strings).
    feature_col : list of raw feature column names.
    target_col : name of the target column.
    group_col : name of the column identifying each series.
    training_cutoff : date string; the first row dated on or after it marks
        the start of the test period.
    num_periods_input : number of rows per input window.
    num_periods_output : number of target values per label.
    fill_missing_train, fill_missing_test : forward-fill missing values when
        True, otherwise drop rows containing missing values.
    training : when False, no training windows are built and only the test
        arrays are returned.
    backward_gap : extra row offset between the target history and the
        covariates/label of a window.

    Returns
    -------
    ``(x_train, y_train, x_test, y_test)`` when ``training`` is True, else
    ``(x_test, y_test)``. ``y`` arrays have shape
    ``(-1, num_periods_output, 1)``.

    Raises
    ------
    IndexError
        If no row of ``df`` is dated on or after ``training_cutoff``.
    """
    # Add date features
    print("Adding date features..")
    df = get_date_features(df, date_col)
    date_features = ['year', 'month', 'week', 'day', 'dayofweek', 'dayofyear']

    # Add shadow features
    print("Adding shadow features..")
    df = get_shadow_features(df)
    shadow_features = ['upper_shadow', 'lower_shadow']

    # Train test split
    print("Spliting train and test data..")
    # NOTE(review): this picks the first matching row in frame order, so it
    # presumes `df` is sorted chronologically - confirm against callers.
    real_training_cutoff = df[df[date_col] >= training_cutoff][date_col].values[0]
    # Use `date_col` here (the original hard-coded `df.Date`).
    all_dates = sorted(df[date_col].unique().tolist())
    cutoff_idx = all_dates.index(real_training_cutoff)
    # Move the split back far enough that the first test window can end on
    # the cutoff date. Clamp at 0: a negative index would silently wrap to
    # a date near the END of the history when there is too little data.
    split_idx = max(cutoff_idx - num_periods_input - backward_gap, 0)
    training_cutoff_adjust = all_dates[split_idx]

    # Get all features and target
    all_features = feature_col + date_features + shadow_features
    selected = [group_col] + all_features + [target_col]

    df_test = df.loc[df[date_col] >= training_cutoff_adjust, selected].copy()
    if training:
        # Rows without a target cannot serve as training labels.
        df_train = (
            df.loc[df[date_col] < training_cutoff_adjust, selected]
            .dropna(subset=[target_col])
            .reset_index(drop=True)
        )

    x_batches, y_batches = [], []
    x_testbatches, y_testbatches = [], []
    window_args = (num_periods_input, num_periods_output, backward_gap)

    # Create train and test batches
    print("Constructing training and testing batches..")
    for group in tqdm(df[group_col].unique()):
        # Train windows are only needed (and only built) in training mode.
        if training:
            group_train = df_train[df_train[group_col] == group].reset_index(drop=True)
            group_train = group_train.drop(group_col, axis=1)
            inputs, labels = _windows_for_group(
                group_train, all_features, target_col, fill_missing_train, *window_args
            )
            x_batches.extend(inputs)
            y_batches.extend(labels)

        group_test = df_test[df_test[group_col] == group].reset_index(drop=True)
        group_test = group_test.drop(group_col, axis=1)
        inputs, labels = _windows_for_group(
            group_test, all_features, target_col, fill_missing_test, *window_args
        )
        x_testbatches.extend(inputs)
        y_testbatches.extend(labels)

    if training:
        x_batches = np.asarray(x_batches)
        y_batches = np.asarray(y_batches)
        y_batches = y_batches.reshape(-1, num_periods_output, 1)
        print("X_train: {}, y_train: {}".format(x_batches.shape, y_batches.shape))

    x_testbatches = np.asarray(x_testbatches)
    y_testbatches = np.asarray(y_testbatches)
    y_testbatches = y_testbatches.reshape(-1, num_periods_output, 1)
    print("X_test:  {}, y_test:  {}".format(x_testbatches.shape, y_testbatches.shape))

    if training:
        return x_batches, y_batches, x_testbatches, y_testbatches
    return x_testbatches, y_testbatches

In [None]:
%%time
# Raw columns used as model features, and the window sizes passed to
# preprocessing(); these names are reused by the submission loop below.
feature_col = ['Open', 'High', 'Low', 'Close', 'Volume']
num_periods_input, num_periods_output = 5, 1

# Options for the inference-only run: no training windows are built and
# missing test values are forward-filled.
inference_options = dict(
    date_col='Date',
    feature_col=feature_col,
    target_col='Target',
    group_col='SecuritiesCode',
    num_periods_input=num_periods_input,
    num_periods_output=num_periods_output,
    fill_missing_train=False,
    fill_missing_test=True,
    training=False,
    backward_gap=1,
)

X_test, y_test = preprocessing(
    df=df_prices,
    training_cutoff='2021-12-01',
    **inference_options,
)

In [None]:
def create_test_instances(X_test):
    """
    Flatten windowed inputs into one feature row per window.

    For a window of ``w`` rows with ``f`` values each, the output row is the
    last value of each of the first ``w - 1`` rows followed by all ``f``
    values of the final row, giving ``w - 1 + f`` columns.

    Parameters
    ----------
    X_test : array-like of shape ``(n_windows, w, f)``.

    Returns
    -------
    numpy.ndarray of shape ``(n_windows, w - 1 + f)``. Input that is empty
    and not 3-D (e.g. ``np.asarray([])``) yields an empty ``(0, 0)`` array
    instead of raising ``IndexError``.

    The work is done with a single vectorised slice-and-concatenate, so no
    progress bar is shown.
    """
    windows = np.asarray(X_test)
    if windows.ndim != 3 and windows.size == 0:
        # No windows at all, so the column count is unknown.
        return np.empty((0, 0))

    last_values = windows[:, :-1, -1]   # last column of every row but the final one
    final_rows = windows[:, -1, :]      # the final row of each window, in full
    flat = np.concatenate((last_values, final_rows), axis=1)

    # The previous loop-based version seeded each row from an empty float64
    # array, so its result was promoted to at least float64 (object input
    # stayed object). Keep that output dtype for callers.
    return flat.astype(np.result_type(np.float64, flat.dtype), copy=False)

In [None]:
# Flatten the windowed test inputs into one feature row per window.
All_Testing_Instances = create_test_instances(X_test)
# Cell output: shape of the flattened array.
All_Testing_Instances.shape

# Load trained model

In [None]:
# Load trained xgboost model
# NOTE(review): joblib.load unpickles the file, which can execute arbitrary
# code - only load model files from a source you trust.
model = joblib.load('../input/jpx-trained-models-v3/JPX_xgboost_2017_2021.pkl')
# Cell output: the loaded model object.
model

In [None]:
# Test predictions
# Run the loaded model once on the flattened history windows and print the
# output shape plus the first ten predictions as a quick check.
predictions = model.predict(All_Testing_Instances)
print(predictions.shape)
print(predictions[:10])

# Submission

In [None]:
# Make predictions and submission
env = jpx_tokyo_market_prediction.make_env()   # initialize the environment
iter_test = env.iter_test()    # an iterator which loops over the test files
for (prices, options, financials, trades, secondary_prices, sample_prediction) in iter_test:
    # Combine history data with incoming new data
    display(prices)
    df_prices = pd.concat([df_prices, prices], ignore_index=True)
    # The date of the newly received rows becomes the cutoff, so the test
    # windows built below end on this date.
    training_cutoff = prices['Date'].values[0]
    print("Training cutoff: ", training_cutoff)
    
    # Get processed test data
    X_test, _ = preprocessing(
        df=df_prices, 
        date_col='Date', 
        feature_col=feature_col, 
        target_col='Target', 
        group_col='SecuritiesCode', 
        training_cutoff=training_cutoff, 
        num_periods_input=num_periods_input, 
        num_periods_output=num_periods_output,
        fill_missing_train=False,
        fill_missing_test=True,
        training=False, 
        backward_gap=1,
    )
    X_test = create_test_instances(X_test)
    
    # Make predictions
    # NOTE(review): this assignment presumes the model output has exactly one
    # row per row of sample_prediction, in the same security order - confirm.
    sample_prediction['target_pred'] = model.predict(X_test)
    # Rank 0 goes to the highest predicted target.
    sample_prediction = sample_prediction.sort_values(by="target_pred", ascending=False)
    # Size the ranks from the frame itself rather than a hard-coded 2000, so
    # a batch with a different number of securities does not raise.
    sample_prediction['Rank'] = np.arange(len(sample_prediction))
    sample_prediction = sample_prediction.sort_values(by="SecuritiesCode", ascending=True)
    display(sample_prediction)
    # Drop the helper column before handing the frame back to the API.
    sample_prediction.drop(['target_pred'], axis=1, inplace=True)
    env.predict(sample_prediction)  # register your predictions