In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import os
import datetime as dt
import matplotlib.pyplot as plt
from sklearn.preprocessing import MinMaxScaler
from keras import regularizers
from keras.models import Model, Sequential
from keras.layers import Conv1D, Dense, Flatten, Dropout, BatchNormalization
from keras.callbacks import EarlyStopping, ModelCheckpoint

import jpx_tokyo_market_prediction

# Silence pandas' chained-assignment warning for the whole notebook: later
# cells assign columns on frames that were obtained by slicing other frames.
pd.options.mode.chained_assignment = None  # default='warn'

# Print the path of every file found under the Kaggle input directory.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))


### Going really basic: predict the Close price
We can reconstruct the target from it later.

In [None]:
df_stock_prices = pd.read_csv("../input/jpx-tokyo-stock-exchange-prediction/train_files/stock_prices.csv")
print("What does stock_prices data look like?")
display(df_stock_prices.head(2))

## Close is enough to recompute the target, so keep only the columns needed.
## The frame is built in a single chain (column subset -> parsed dates -> Date
## index) instead of mutating a slice of df_stock_prices, so the result does
## not depend on the chained-assignment warning being switched off.
df_stock_prices_mdf = (
    df_stock_prices[['Date', 'SecuritiesCode', 'Close']]
    .assign(Date=lambda frame: pd.to_datetime(frame['Date']))
    .set_index('Date')
)

print("stock_prices minimum data required")
display(df_stock_prices_mdf.head(2))


In [None]:
## Reconstruct per securities code
# The SecuritiesCode column holds one entry per row of every security's time
# series, so de-duplicate it; unique() keeps first-appearance order.
l_securities=list(df_stock_prices_mdf.SecuritiesCode.unique())
l_single_frames = []
## Get first 15
for security in l_securities[:15]:
    # .copy() so the new column is written to an independent frame, not a slice
    single_stock_prices = df_stock_prices_mdf[df_stock_prices_mdf['SecuritiesCode']==security].copy()
    ## Thanks to @chumajin for the target calculation:
    ## (Close[t+2] - Close[t+1]) / Close[t+1]
    next_close = single_stock_prices["Close"].shift(-1)
    single_stock_prices["calc_target"] = (single_stock_prices["Close"].shift(-2) - next_close) / next_close
    l_single_frames.append(single_stock_prices)
# Concatenate once instead of growing the frame inside the loop.
rebuild_stock_prices = pd.concat(l_single_frames, axis=0)

# Displayed only (the sorted copy is not stored).
rebuild_stock_prices.sort_values(["Date", "SecuritiesCode"],ascending = [True, True])

In [None]:
# plot first 15 securities / Close
# Wide layout: one Close column per security, rows keyed by the existing index.
df = rebuild_stock_prices.pivot(columns='SecuritiesCode', values='Close')
ax = df.plot(subplots=False, figsize=(24, 14), fontsize=10)
ax.set_xlabel('Date', fontsize=12)
ax.set_ylabel('Close', fontsize=12)
ax.set_title('1301-1518', fontsize=13)
# Legend sits outside the axes, to the right.
ax.legend(loc='center left', bbox_to_anchor=(1.0, 0.5))
plt.show()

In [None]:
# plot first 15 securities / Target
# Wide layout: one calc_target column per security, rows keyed by the existing index.
df = rebuild_stock_prices.pivot(columns='SecuritiesCode', values='calc_target')
ax = df.plot(subplots=False, figsize=(24, 14), fontsize=10)
ax.set_xlabel('Date', fontsize=12)
ax.set_ylabel('Target', fontsize=12)
ax.set_title('1301-1518', fontsize=13)
# Legend sits outside the axes, to the right.
ax.legend(loc='center left', bbox_to_anchor=(1.0, 0.5))
plt.show()

# Totally Different. Will evaluate later 

In [None]:
## We see some changes/patterns using Close price
## Lets plot some in batches of 25 to visually inspect 
NO_OF_GRAPHS = 4
GRAPH_BATCHES = 25

# l_securities may list a code more than once; dict.fromkeys drops repeats
# while keeping first-appearance order.
unique_securities = list(dict.fromkeys(l_securities))

for r_cnt in range(NO_OF_GRAPHS):
    batch = unique_securities[r_cnt*GRAPH_BATCHES:(r_cnt+1)*GRAPH_BATCHES]
    # Only complete batches are plotted.
    if len(batch) < GRAPH_BATCHES: break
    # Select just this batch, so each graph shows exactly the securities named
    # in its title (previously every graph also re-plotted all earlier batches).
    rebuild_stock_prices = df_stock_prices_mdf[df_stock_prices_mdf['SecuritiesCode'].isin(batch)]
    # plot 25 securities
    df = rebuild_stock_prices.pivot( columns='SecuritiesCode', values='Close')
    df.plot( subplots=False, figsize=(24, 14), fontsize=12,legend=False)
    plt.xlabel('Date', fontsize=12)
    plt.ylabel('Close', fontsize=12)
    plt.title(','.join(str(security) for security in batch), fontsize=13)
    plt.show()

#### Let's look at some missing-value metrics here

In [None]:
## There are NaNs in the Close price but not in Target. We need a Data Imputation strategy for this
MISSING_VALUE_COLS = ['Date','SecuritiesCode','Close','Target']

def display_missing_summary(frame):
    """Display non-null counts per column, then the rows whose Close is NaN,
    then the rows whose Target is NaN."""
    display(frame.count())
    display(frame[frame['Close'].isna()])
    display(frame[frame['Target'].isna()])

#### Single securities: 1301, 1332, 2761
for security_code in (1301, 1332, 2761):
    display_missing_summary(df_stock_prices.loc[df_stock_prices.SecuritiesCode==security_code, MISSING_VALUE_COLS])
#### all DF
display_missing_summary(df_stock_prices[MISSING_VALUE_COLS])

Some NaNs show up where I did not expect them. I will expand on this here.

### Prepare data for a CNN here.


#### Split


We will start with predicting 1 Security

The split has to follow the time axis, so the rows are not shuffled or reordered.

In [None]:
### Data split Params
# Rows before VALIDATION_START are training data, rows from VALIDATION_START
# up to (not including) TEST_START are validation, rows from TEST_START on are test.
VALIDATION_START = '2020-11-01'
TEST_START = '2021-08-01'

### Prediction Params
W = 6      # number of previous samples (look-back window) fed to the model
FUTURE = 2 # forecast horizon: d+2

### Model Params
LATENT_SPACE = 7  # number of filters in each Conv1D layer
KERNEL = 2        # Conv1D kernel size
BATCH_SIZE = 32
EPOCHS = 200      # upper bound; training also uses early stopping

In [None]:
# Rows of security 1301 only, restricted to the columns used below.
stock_price_1301 = df_stock_prices.loc[df_stock_prices.SecuritiesCode==1301,['Date','SecuritiesCode','Close','Target']]
## Train / test / validation split: non-null counts in the validation range, then in the test range
# NOTE(review): 'Date' on df_stock_prices has not been converted at this point, so these
# are presumably string comparisons against ISO-formatted dates — confirm.
display(stock_price_1301[(stock_price_1301['Date']>=VALIDATION_START)&(stock_price_1301['Date']<TEST_START)].count())
display(stock_price_1301[stock_price_1301['Date']>=TEST_START].count())



In [None]:
### Lets see the whole thing
# Parse Date, use it as the index, then drop the now-redundant column.
# NOTE: this mutates stock_price_1301 in place; re-running this cell without
# re-creating stock_price_1301 first fails because 'Date' is no longer a column.
stock_price_1301["Date"] = pd.to_datetime(stock_price_1301["Date"])
stock_price_1301.index = stock_price_1301.Date
stock_price_1301.pop('Date')
# stock_price_1301

# Single-security pivot: one Close column (named by the security code) over the Date index.
df = stock_price_1301.pivot( columns='SecuritiesCode', values='Close')
df.plot( subplots=False, figsize=(16, 10), fontsize=12)
plt.xlabel('Date', fontsize=12)
plt.ylabel('Close', fontsize=12)
plt.title('1301', fontsize=14)
plt.show()

In [None]:
## Now lets see it split
# Boolean masks for the two bounded ranges; the test range is taken by slicing from TEST_START.
is_train_row = stock_price_1301.index < VALIDATION_START
is_valid_row = (stock_price_1301.index >=VALIDATION_START) & (stock_price_1301.index < TEST_START)

close_train_part = stock_price_1301[is_train_row][['Close']].rename(columns={'Close':'train'})
close_valid_part = stock_price_1301[is_valid_row][['Close']].rename(columns={'Close':'validation'})
close_test_part = stock_price_1301[TEST_START:][['Close']].rename(columns={'Close':'test'})

# Outer joins put the three ranges side by side as separate columns on one date axis.
split_view = close_train_part.join(close_valid_part, how='outer').join(close_test_part, how='outer')
split_view.plot(y=['train', 'validation', 'test'], figsize=(15, 8), fontsize=12)
plt.xlabel('timestamp', fontsize=12)
plt.ylabel('Close', fontsize=12)
plt.show()

#### Prepare Train Dataset on a +FUTURE range with a -W window

In [None]:
# Training slice: Close prices strictly before VALIDATION_START.
train = stock_price_1301.copy()[stock_price_1301.index < VALIDATION_START][['Close']]
# The scaler is fitted on the training slice here; later cells apply this same
# fitted scaler to the validation and test slices with transform().
scaler = MinMaxScaler()
train['Close'] = scaler.fit_transform(train)
##### Lets see if this worked
# Histogram of the original Close values, then of the scaled ones.
stock_price_1301[stock_price_1301.index < VALIDATION_START][['Close']].rename(columns={'Close':'original Close'}).plot.hist(bins=100, fontsize=12)
train.rename(columns={'Close':'scaled Close'}).plot.hist(bins=100, fontsize=12)
plt.show()

#### A Function for Preparing Data

In [None]:
def get_data_split_transform(single_security_data, scaler, split_type):
    """Select one split of a single security's Close prices and scale it.

    Parameters
    ----------
    single_security_data : DataFrame indexed by date with a 'Close' column
        (other columns are ignored).
    scaler : scaler object exposing fit_transform / transform.
    split_type : str
        'TRAIN'     - rows before VALIDATION_START; the scaler is (re)fitted here.
        'VALID'     - rows from W-2 calendar days before VALIDATION_START up to
                      (not including) TEST_START.
        'TEST'      - rows from W-2 calendar days before TEST_START onwards.
        'VALIDREST' - rows from W-2 calendar days before VALIDATION_START onwards.
        anything else - all rows.

    Returns
    -------
    DataFrame with a single scaled 'Close' column for the requested split.

    Notes
    -----
    Only the TRAIN split fits the scaler. Every other split reuses the already
    fitted scaler, so all splits share one scale and no validation/test
    statistics leak into it. If the scaler has not been fitted yet, it is
    fitted on the requested split (the previous behaviour).
    """
    # Work on the Close column only: the scaler output is written back into a
    # single column, so passing extra columns to it cannot work.
    close = single_security_data[['Close']]

    if split_type == 'TRAIN':
        data = close[close.index < VALIDATION_START].copy()
    elif split_type == 'VALID':
        # Start a few days early so the first validation rows have look-back history.
        look_back_dt = dt.datetime.strptime(VALIDATION_START, '%Y-%m-%d') - dt.timedelta(days=W-2)
        data = close[(close.index >= look_back_dt) & (close.index < TEST_START)].copy()
    elif split_type == 'TEST':
        # The look-back start was computed but not applied before; apply it as in 'VALID'.
        look_back_dt = dt.datetime.strptime(TEST_START, '%Y-%m-%d') - dt.timedelta(days=W-2)
        data = close[close.index >= look_back_dt].copy()
    elif split_type == 'VALIDREST':
        look_back_dt = dt.datetime.strptime(VALIDATION_START, '%Y-%m-%d') - dt.timedelta(days=W-2)
        data = close[close.index >= look_back_dt].copy()
    else:
        data = close.copy()

    # Fitted scalers such as MinMaxScaler expose `scale_`; use that as the
    # "already fitted" marker.
    scaler_is_fitted = hasattr(scaler, 'scale_')
    if split_type == 'TRAIN' or not scaler_is_fitted:
        data['Close'] = scaler.fit_transform(data)
    else:
        data['Close'] = scaler.transform(data)
    return data

def prepare_data(data, window=None, future=2):
    """Turn a frame with a 'Close' column into supervised-learning arrays.

    Parameters
    ----------
    data : DataFrame with a 'Close' column, ordered in time. Not modified.
    window : int, optional
        Number of look-back samples per input row. Defaults to the notebook
        constant W.
    future : int, default 2
        How many rows ahead the target lies.

    Returns
    -------
    X_ : ndarray of shape (n_samples, window, 1), oldest value first.
    y_ : ndarray of shape (n_samples, 1), the Close `future` rows ahead.
    data_shifted : the intermediate frame after dropping incomplete rows; its
        index identifies the row each sample belongs to.
    """
    if window is None:
        window = W
    target_col = 'y_d+' + str(future)
    # Lags run from window-1 (oldest) down to 0 (the current row).
    lags = list(range(window - 1, -1, -1))
    lag_cols = ['close-' + str(lag) for lag in lags]

    data_shifted = data.copy()
    data_shifted[target_col] = data_shifted['Close'].shift(-future) ## Target: `future` rows ahead

    for lag, lag_col in zip(lags, lag_cols):
        data_shifted[lag_col] = data_shifted['Close'].shift(lag) ## Create lookback
    data_shifted = data_shifted.rename(columns={'Close':'Close_original'})

    # Rows without a full look-back window or without a target are dropped.
    data_shifted = data_shifted.dropna(how='any')

    y_ = data_shifted[[target_col]].values
    X_ = data_shifted[lag_cols].values[..., np.newaxis] ## Add the trailing channel axis

    return X_,y_,data_shifted

def add_rank(df, col_name="pred"):
    """Add a zero-based integer 'Rank' column to `df` (in place) and return it.

    Within each 'Date' group the largest `col_name` value gets rank 0; ties
    are broken by row order.
    """
    scores_by_date = df.groupby("Date")[col_name]
    one_based_rank = scores_by_date.rank(method="first", ascending=False)
    df["Rank"] = one_based_rank - 1
    df["Rank"] = df["Rank"].astype("int")
    return df

In [None]:
# Build the model inputs and targets from the scaled training frame; the third
# return value (the shifted frame) is not needed here.
X_train,y_train,_ = prepare_data(train)
display(X_train.shape)
display(y_train.shape)



#### Prepare Validation Dataset

In [None]:
# Start the validation slice W-2 calendar days before VALIDATION_START so the
# first validation rows have look-back history available.
look_back_dt = dt.datetime.strptime(VALIDATION_START, '%Y-%m-%d') - dt.timedelta(days=W-2) # Go W-2 days back from validation start
valid = stock_price_1301.copy()[(stock_price_1301.index >=look_back_dt) & (stock_price_1301.index < TEST_START)][['Close']]
# transform() only: the scaler keeps the fit obtained on the training slice.
valid['Close'] = scaler.transform(valid)
### Prepare data here
X_valid,y_valid,_ = prepare_data(valid)
display(X_valid.shape)
display(y_valid.shape)


### Model a CNN for 1301

In [None]:
def prepare_model(latent_space, kernel_size, window_size, future_size):
    """Build and compile the 1-D convolutional forecaster.

    Three causal Conv1D layers (dilation rates 1, 2, 4; `latent_space` filters
    each; ReLU) with batch normalisation after the first and dropout after the
    second, followed by Flatten and a linear Dense layer with `future_size`
    outputs. Input shape is (window_size, 1). Compiled with Adam and MSE loss.
    """
    def causal_conv(dilation, **extra):
        # Shared Conv1D configuration; only the dilation and extras vary per layer.
        return Conv1D(latent_space, kernel_size=kernel_size, padding='causal', strides=1,
                      activation='relu', dilation_rate=dilation, **extra)

    layers = [
        causal_conv(1, input_shape=(window_size, 1)),
        BatchNormalization(),
        causal_conv(2, activity_regularizer=regularizers.L2(0.01)),
        Dropout(0.25),
        causal_conv(4, activity_regularizer=regularizers.L2(0.01)),
        Flatten(),
        Dense(future_size, activation='linear'),
    ]
    model = Sequential(layers)

    model.compile(optimizer='Adam', loss='mse') ## Compile

    return model

def build_evaluation_data(predictions, y_ ,security, index):
    """Assemble a per-security frame of actual and predicted Close values.

    `predictions` must have FUTURE columns (named d+1 .. d+FUTURE); only the
    d+FUTURE column is kept. Predictions and `y_` are mapped back to price
    units with the module-level `scaler`.

    Returns a frame with columns
    ['Date', 'SecuritiesCode', 'Close', 'ClosePrediction', 'Prediction'],
    where 'Close' is the inverse-scaled `y_`, 'Prediction' the inverse-scaled
    model output and 'ClosePrediction' that output shifted up by two rows.
    """
    horizon_cols = ['d+'+str(step) for step in range(1, FUTURE+1)]
    kept_col = f'd+{FUTURE}'

    wide = pd.DataFrame(predictions, columns=horizon_cols)
    wide['Date'] = index
    wide['SecuritiesCode'] = security

    # Long format: one row per (Date, SecuritiesCode) with the kept horizon's value.
    melted = pd.melt(
        wide[[kept_col, 'Date', 'SecuritiesCode']].copy(),
        id_vars=['Date', 'SecuritiesCode'],
        value_name='prediction',
        var_name='d',
    )
    melted['actual'] = np.transpose(y_).ravel()
    # Back to price units, using the scaler currently bound at module level.
    melted[['prediction', 'actual']] = scaler.inverse_transform(melted[['prediction', 'actual']])
    melted['prediction+1'] = melted["prediction"].shift(-1)
    melted['prediction+2'] = melted["prediction"].shift(-2)

    result = melted[['Date','SecuritiesCode','actual','prediction+2','prediction']]
    result.columns = ['Date','SecuritiesCode','Close','ClosePrediction','Prediction' ]
    return result

model = prepare_model(LATENT_SPACE, KERNEL,W, FUTURE)
# summary() prints the table itself and returns None, so wrapping it in
# display() only adds a stray "None" to the cell output.
model.summary()


### Train

In [None]:
# Stop once val_loss has not improved for 10 consecutive epochs.
earlystop = EarlyStopping(monitor='val_loss', min_delta=0, patience=10)
# Write a checkpoint only on epochs that improve the monitored loss. The
# deprecated `period=1` argument is dropped: checking every epoch is already
# the default (save_freq='epoch'), and newer Keras releases no longer accept it.
best_val = ModelCheckpoint('model_{epoch:02d}.h5', save_best_only=True, mode='min')
history = model.fit(X_train,
          y_train,
          batch_size=BATCH_SIZE,
          epochs=EPOCHS,
          validation_data=(X_valid, y_valid),
          callbacks=[earlystop, best_val],
          verbose=1)
# Epoch numbers in the checkpoint file names are 1-based, hence the +1.
best_epoch = np.argmin(np.array(history.history['val_loss']))+1
model.load_weights("model_{:02d}.h5".format(best_epoch))

#### Lets see the results .... 

In [None]:
# Training vs. validation loss per epoch, on a logarithmic y axis.
plot_df = pd.DataFrame.from_dict({'train_loss':history.history['loss'], 'val_loss':history.history['val_loss']})
plot_df.plot(logy=True, figsize=(10,10), fontsize=12)
plt.xlabel('epoch', fontsize=12)
plt.ylabel('loss', fontsize=12)
plt.show()

### Evaluate the model for 1301
Not bad. Now let's evaluate.

In [None]:
### Again look back

# Start the test slice W-2 calendar days before TEST_START, exactly as done for
# the validation slice, so the first test rows have look-back history. The
# look-back start used to be computed here but never applied.
look_back_dt = dt.datetime.strptime(TEST_START, '%Y-%m-%d') - dt.timedelta(days=W-2)
test = stock_price_1301[stock_price_1301.index >= look_back_dt][['Close']].copy()
# transform() only: the scaler keeps the fit obtained on the training slice.
test['Close'] = scaler.transform(test)

In [None]:
# Build test inputs/targets; test_shifted is kept because its index supplies
# the dates for the evaluation frame built later.
X_test,y_test,test_shifted = prepare_data(test)
display(X_test.shape)
display(y_test.shape)


#### Predict on test

In [None]:
# One row per test window; the model's final Dense layer has FUTURE outputs.
predictions = model.predict(X_test)
predictions.shape

### Lets see this visually

#### Build eval df

In [None]:
# Actual vs. predicted Close for security 1301, dated by the rows of test_shifted.
eval_df = build_evaluation_data(predictions, y_test ,1301, test_shifted.index)
eval_df


In [None]:
# Predicted (red) vs. actual (blue) Close over the test dates.
# NOTE(review): 'ClosePrediction' is the model output shifted up by two rows inside
# build_evaluation_data, while 'Close' is the unshifted target — confirm the two
# series are aligned as intended.
eval_df.plot(x='Date', y=['ClosePrediction', 'Close'], style=['r', 'b'], figsize=(15, 8))
plt.xlabel('Date', fontsize=12)
plt.ylabel('Close', fontsize=12)
plt.title('ClosePrediction', fontsize=14)
plt.show()

I'll revisit later....

In [None]:
# Mean of |ClosePrediction - Close| / Close. The value is a fraction (not multiplied
# by 100); rows where ClosePrediction is NaN are skipped by mean().
print( f" Mean absolute percentage error:{((eval_df['ClosePrediction'] - eval_df['Close']).abs() / eval_df['Close']).mean()}" )

## Predict Per security
lets put things together

In [None]:
## Predict per securities code
# One entry per security: the raw column repeats each code on every row of its
# time series, which would rebuild the same model once per row.
l_securities=list(df_stock_prices.SecuritiesCode.unique())
l_eval_frames = []
for security in l_securities:
    # Close column only: the split helper writes the scaler output back into a
    # single 'Close' column, so no other column may reach the scaler.
    single_close = df_stock_prices_mdf.loc[df_stock_prices_mdf['SecuritiesCode']==security, ['Close']]

    # Skip securities with no rows on one side of the split; the scaler cannot
    # be applied to an empty frame.
    has_train_rows = (single_close.index < VALIDATION_START).any()
    has_valid_rows = (single_close.index >= VALIDATION_START).any()
    if not (has_train_rows and has_valid_rows):
        continue

    # `scaler` is deliberately rebound at module level for every security:
    # build_evaluation_data reads it to map values back to price units.
    scaler = MinMaxScaler()
    train = get_data_split_transform(single_close,scaler,'TRAIN') ## Split Train Data
    X_train,y_train,_ = prepare_data(train) ## Prepare Train Data
    valid = get_data_split_transform(single_close,scaler,'VALIDREST') ## Get the rest as Validation Data
    X_valid,y_valid,_ = prepare_data(valid) ## Prepare Validation Data
    # Too little history (or too many NaNs) to form a single window: skip.
    if len(X_train) == 0 or len(X_valid) == 0:
        continue

    model = prepare_model(LATENT_SPACE, KERNEL,W, FUTURE)
    # Train before predicting; previously the freshly initialised (untrained)
    # model was used for the predictions.
    model.fit(X_train,
              y_train,
              batch_size=BATCH_SIZE,
              epochs=EPOCHS,
              validation_data=(X_valid, y_valid),
              callbacks=[EarlyStopping(monitor='val_loss', min_delta=0, patience=10, restore_best_weights=True)],
              verbose=0)

    all_data = get_data_split_transform(single_close,scaler,'NONE') ## Just scale all Close Data
    X_all,y_all,shifted_all = prepare_data(all_data) ## Prepare all Data
    if len(X_all) == 0:
        continue
    predictions = model.predict(X_all)
    eval_df = build_evaluation_data(predictions, y_all ,security, shifted_all.index)
    l_eval_frames.append(eval_df)

# Concatenate once instead of growing the frame inside the loop.
rebuild_stock_prices = pd.concat(l_eval_frames, axis=0) if l_eval_frames else pd.DataFrame()

# Displayed only (the sorted copy is not stored).
rebuild_stock_prices.sort_values(["Date", "SecuritiesCode"],ascending = [True, True])

In [None]:
# eval_df = pd.DataFrame(predictions, columns=['d+'+str(t) for t in range(1, FUTURE+1)])
# # display(eval_df.head())
# eval_df['Date'] = test_shifted.index
# # display(eval_df.head())
# eval_df = eval_df[['d+2', 'Date']].copy()
# # display(eval_df.head())
# eval_df = pd.melt(eval_df, id_vars='Date', value_name='prediction', var_name='d')
# # display(eval_df.head())
# eval_df['actual'] = np.transpose(y_test).ravel()
# # display(eval_df.head())
# eval_df[['prediction', 'actual']] = scaler.inverse_transform(eval_df[['prediction', 'actual']])
# # display(eval_df.head())
# eval_df["prediction+1"]=eval_df["prediction"].shift(-1)
# eval_df["prediction+2"]=eval_df["prediction"].shift(-2)
# eval_df

In [None]:
# Make a submission
## One score per security, looked up by SecuritiesCode. The score is the most
## recent predicted return derived from consecutive 'Prediction' rows of
## rebuild_stock_prices: (P[t] - P[t-1]) / P[t-1], mirroring the target formula.
## Previously the scores came from the last security's eval_df and were aligned
## to sample_prediction by row position, which does not correspond to securities.
## NOTE(review): the scores are fixed for all test dates because nothing is
## re-predicted from the prices delivered by the API — confirm this is acceptable.
ordered_predictions = rebuild_stock_prices.sort_values(["SecuritiesCode", "Date"])
previous_prediction = ordered_predictions.groupby("SecuritiesCode")["Prediction"].shift(1)
predicted_return = (ordered_predictions["Prediction"] - previous_prediction) / previous_prediction
# last() takes the latest non-null value per security.
security_score = predicted_return.groupby(ordered_predictions["SecuritiesCode"]).last()
# Securities without a score are placed at the median so they neither lead nor trail.
fallback_score = security_score.median()

env = jpx_tokyo_market_prediction.make_env()   # initialize the environment
iter_test = env.iter_test()    # an iterator which loops over the test files
for (prices, options, financials, trades, secondary_prices, sample_prediction) in iter_test:
    target_pred = sample_prediction["SecuritiesCode"].map(security_score).fillna(fallback_score).fillna(0.0)
    # Rank 0 = highest predicted return; method="first" makes the ranks a
    # permutation of 0..n-1 for whatever number of rows the API delivers
    # (the universe size is no longer hard-coded to 2000).
    sample_prediction['Rank'] = (target_pred.rank(ascending=False, method="first") - 1).astype("int")
    sample_prediction = sample_prediction.sort_values(by="SecuritiesCode", ascending=True)
    display(sample_prediction)
    env.predict(sample_prediction)   # register your predictions

## 