# 1dCNN starter

* To learn, I try using a **1D-CNN model** for this competition,
based on [this notebook](https://www.kaggle.com/code/takamichitoda/ump-train-1dcnn-on-tpu).

* The neural network model in this notebook does not seem to be well trained, but feature engineering may improve its accuracy.

* Note: I am a beginner in machine learning, so if I have missed or misunderstood something, please let me know in the comments.

# Import

In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import os
import random
from sklearn.preprocessing import StandardScaler
import gc
import warnings
warnings.filterwarnings('ignore')

import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras.callbacks import ReduceLROnPlateau
from tensorflow.python.ops import math_ops
from tensorflow.python.keras import backend as K


def seed_everything(seed: int = 42) -> None:
    """Seed every random number generator this notebook relies on.

    Seeds Python's ``random`` module, NumPy's global generator and
    TensorFlow's global generator, and exports ``PYTHONHASHSEED``.

    Args:
        seed: Seed value shared by all generators.
    """
    # NOTE: hash randomisation of the running interpreter is fixed at
    # start-up; this variable only matters for processes started afterwards.
    os.environ["PYTHONHASHSEED"] = str(seed)
    random.seed(seed)
    np.random.seed(seed)
    # Without this, Keras weight initialisation and Dropout masks are drawn
    # from an unseeded generator and runs are not repeatable.
    tf.random.set_seed(seed)
# Seed the random number generators once, before any data or model work.
# Called without arguments, so the function's default seed (42) is used.
# NOTE(review): GCF.SEED is defined further down but is not used here - confirm which seed is intended.
seed_everything()

#TPU
# Resolve the TPU cluster, connect this runtime to it and initialise the TPU system.
# The three calls must run in this order, before the strategy is created.
tpu = tf.distribute.cluster_resolver.TPUClusterResolver()
tf.config.experimental_connect_to_cluster(tpu)
tf.tpu.experimental.initialize_tpu_system(tpu)
# Distribution strategy; the model is later built inside strategy.scope().
# NOTE(review): this is the `experimental` alias of TPUStrategy - confirm it still exists in the TensorFlow version in use.
strategy = tf.distribute.experimental.TPUStrategy(tpu)
print('Running on TPU ', tpu.master())
print("REPLICAS: ", strategy.num_replicas_in_sync)

# config

In [None]:
class GCF:
    """Global configuration constants for training."""

    # Random seed
    SEED = 0
    # Upper bound on the number of training epochs
    N_EPOCHS = 100
    # Number of samples per training batch
    BATCH_SIZE = 30_000
    # EarlyStopping: epochs without improvement tolerated before stopping
    EARLY_STOPPING_PATIENCE = 10
    # EarlyStopping: smallest change in the monitored value that counts as an improvement
    EARLY_STOPPING_MIN_DELTA = 0.001

# Data Load

In [None]:
# Load the stock price table from the competition's training files.
train_df = pd.read_csv("../input/jpx-tokyo-stock-exchange-prediction/train_files/stock_prices.csv")
# Bare expression: shows the frame when run as a notebook cell.
train_df

# preprocess

In [None]:
# Encode Date as an integer YYYYMMDD so it can be compared with plain numbers
train_df['Date'] = pd.to_datetime(train_df['Date'])
train_df['Date'] = train_df['Date'].dt.strftime('%Y%m%d').astype(int)

# Drop the columns that are used neither as features nor for evaluation
train_df = train_df.drop(['RowId','SecuritiesCode','AdjustmentFactor','ExpectedDividend','SupervisionFlag'], axis=1)

# Drop rows that contain missing values
train_df = train_df.dropna()

# Standardise the features.
# The scaler is fitted on the training period only, so that statistics of the
# validation period do not leak into the features; it is then applied to all rows.
# The cut-off date must stay equal to the one used for the train/valid split.
use_cols = ['Open','High','Low','Close','Volume']
ss = StandardScaler()
ss.fit(train_df.loc[train_df['Date'] <= 20201222, use_cols])
train_df[use_cols] = ss.transform(train_df[use_cols])

train_df

# train & valid split

In [None]:
# Chronological hold-out (about 80% train / 20% valid):
# rows dated up to 2020-12-22 are used for training, later rows for validation.
# Date is an integer YYYYMMDD here, so the negated mask is exactly "Date > 20201222".
is_train_period = train_df['Date'] <= 20201222
train = train_df.loc[is_train_period, :]
valid = train_df.loc[~is_train_period, :]

In [None]:
# Share of all rows that ended up in each split, in percent
n_rows = len(train_df)
print('train_rate:', len(train) / n_rows * 100, '%')
print('valid_rate:', len(valid) / n_rows * 100, '%')

In [None]:
# Separate the feature columns (use_cols) from the regression label (Target) for each split
X_train, y_train = train[use_cols], train['Target']
X_valid, y_valid = valid[use_cols], valid['Target']

# model

In [None]:
def create_model():
    """Build and compile the 1D-CNN regression model.

    The 5 input features are expanded by a dense layer to 4096 units, which
    are reshaped to (256, 16) so that a 1D convolution can be applied.
    Dropout, convolution and max pooling follow; the result is flattened and
    reduced by two dense layers to a single linear output.

    Returns:
        The compiled ``keras.Sequential`` model (Adam with learning rate
        1e-4, MSE loss, RMSE metric).
    """
    net = keras.Sequential()
    # input: 5 columns
    net.add(layers.Dense(4096, activation='relu', input_shape=(5,)))
    # 4096 = 256 * 16
    net.add(layers.Reshape((256, 16)))
    net.add(layers.Dropout(0.75))
    net.add(layers.Conv1D(filters=16, kernel_size=5, strides=1, activation='relu'))
    net.add(layers.MaxPooling1D(pool_size=2))
    net.add(layers.Flatten())
    net.add(layers.Dense(16, activation='relu'))
    net.add(layers.Dense(1, activation='linear'))

    adam = tf.optimizers.Adam(1e-4)
    rmse = keras.metrics.RootMeanSquaredError()
    net.compile(optimizer=adam, loss='mse', metrics=[rmse])
    return net

# run

In [None]:
#model
# Build and compile the model inside the distribution strategy's scope.
with strategy.scope():
    model=create_model()

# Stop training when val_loss has not improved by at least min_delta for
# `patience` consecutive epochs, and restore the best weights seen.
# NOTE(review): min_delta (1e-3) is compared against the MSE validation loss;
# if that loss is itself of the order of 1e-3 or smaller, no epoch can count
# as an improvement - confirm against the observed val_loss values.
early_stopping=keras.callbacks.EarlyStopping(
    monitor='val_loss',
    patience=GCF.EARLY_STOPPING_PATIENCE,
    min_delta=GCF.EARLY_STOPPING_MIN_DELTA,
    restore_best_weights=True,
)

# Halve the learning rate after 3 epochs without val_loss improvement,
# but never go below 1e-5.
reduce_lr=ReduceLROnPlateau(
    monitor='val_loss',
    factor=0.5,
    patience=3,
    min_lr=1e-5,
    verbose=1
)

#fit
# Train on the training split and monitor the validation split after each epoch.
history=model.fit(
    X_train.values,y_train.values,
    validation_data=(X_valid.values,y_valid.values),
    batch_size=GCF.BATCH_SIZE,
    epochs=GCF.N_EPOCHS,
    callbacks=[early_stopping,reduce_lr]
)

#predict
# Predictions for the validation rows, in the same row order as X_valid.
# NOTE(review): no batch_size is passed here, so the Keras default is used - confirm this is intended on TPU.
valid_pred=model.predict(X_valid.values)

In [None]:
import matplotlib.pyplot as plt

# Build the history table once instead of once per plotted metric
history_df = pd.DataFrame(history.history)

# Metrics that have a validation counterpart. Only a leading "val_" marks a
# validation metric, and only that prefix is stripped.
cols = [h[len("val_"):] for h in history_df.columns if h.startswith("val_")]

# One figure per metric: training curve and validation curve together
for c in cols:
    history_df[[c, "val_" + c]].plot() #plot
    plt.title(c)
    plt.show()

# Learning-rate curve as recorded in the training history.
# NOTE(review): the key is 'lr' in this notebook; newer Keras releases appear
# to log it as 'learning_rate' - confirm. The plot is skipped if neither exists.
for lr_key in ("lr", "learning_rate"):
    if lr_key in history_df.columns:
        history_df[lr_key].plot()
        plt.title('lr')
        plt.show()
        break

# evaluate
Reference for the evaluation code: https://www.kaggle.com/code/smeitoma/train-demo/notebook

In [None]:
# Evaluation table for the validation rows: Date, model prediction, actual Target.
# valid_pred is assigned by position (same row order as valid);
# y_valid is a Series and is aligned on the index.
result = valid[['Date']].assign(predict=valid_pred, Target=y_valid)


def set_rank(df):
    """Rank the rows of a frame by descending prediction.

    Args:
        df (pd.DataFrame): rows to rank; must contain a "predict" column
    Returns:
        pd.DataFrame: the rows sorted with the highest prediction first and
            a "Rank" column running from 0 to len(df) - 1
    """
    ranked = df.sort_values(by="predict", ascending=False)
    # The position in the sorted order is the rank: 0 for the highest prediction
    n_rows = ranked.shape[0]
    ranked.loc[:, "Rank"] = np.arange(n_rows)
    return ranked

# Rank the predictions within each trading day (Rank 0 = highest prediction of the day)
result = result.groupby("Date").apply(set_rank)

# Remove the Date column, so that the later groupby('Date') in the metric
# refers only to the index level of that name.
# errors="ignore": pandas deprecated passing the grouping columns to
# groupby.apply, so on newer versions the column may already be absent here.
result = result.drop(columns="Date", errors="ignore")

In [None]:
# Show the last rows of the ranked result as a quick sanity check.
result.tail()

In [None]:
def calc_spread_return_sharpe(df: pd.DataFrame, portfolio_size: int = 200, toprank_weight_ratio: float = 2) -> float:
    """
    Args:
        df (pd.DataFrame): predicted results
        portfolio_size (int): # of equities to buy/sell
        toprank_weight_ratio (float): the relative weight of the most highly ranked stock compared to the least.
    Returns:
        (float): sharpe ratio
    """
    def _calc_spread_return_per_day(df, portfolio_size, toprank_weight_ratio):
        """
        Args:
            df (pd.DataFrame): predicted results
            portfolio_size (int): # of equities to buy/sell
            toprank_weight_ratio (float): the relative weight of the most highly ranked stock compared to the least.
        Returns:
            (float): spread return
        """
        assert df['Rank'].min() == 0
        assert df['Rank'].max() == len(df['Rank']) - 1
        weights = np.linspace(start=toprank_weight_ratio, stop=1, num=portfolio_size)
        purchase = (df.sort_values(by='Rank')['Target'][:portfolio_size] * weights).sum() / weights.mean()
        short = (df.sort_values(by='Rank', ascending=False)['Target'][:portfolio_size] * weights).sum() / weights.mean()
        return purchase - short

    buf = df.groupby('Date').apply(_calc_spread_return_per_day, portfolio_size, toprank_weight_ratio)
    sharpe_ratio = buf.mean() / buf.std()
    return sharpe_ratio

In [None]:
# calc spread return sharpe
# Score the ranked validation predictions, using 200 equities per leg each day.
calc_spread_return_sharpe(result, portfolio_size=200)