# Preamble

**I have seen several notebooks that use regular CV methods on time series data, so I wanted to implement a time series CV (TS-CV) technique myself.**

**Unfortunately, I couldn't find a good library implementation of TS-CV, so I wrote the algorithm myself.**

# Config Parameters and Imports

In [None]:
# You can tweak the hyperparameters for different results.
# Best Model Hyperparameters (Optuna) {'num_leaves': 2818, 'n_estimators': 713, 'max_bin': 100, 'learning_rate': 0.6268164565853203} = Score: 0.537631972137107.
class CFG:
    """Notebook-wide settings: run mode, CV layout, ensembling and model hyperparameters."""

    # --- run mode ---
    # True  -> LGBM_Model builds a default LGBMRegressor.
    # False -> LGBM_Model uses the tuned hyperparameters listed at the bottom.
    Debug = False

    # --- cross-validation layout ---
    folds = 5       # number of fold labels handed out by split_group
    val_ratio = 20  # size of each validation slice, in percent of the fold's rows

    # --- ensembling at submission time ---
    # True -> average the per-fold predictions, False -> take their median.
    mean = True

    # Random seed (not referenced by the code visible in this notebook).
    seed = 1889

    # --- tuned LightGBM hyperparameters (used when Debug is False) ---
    LR = 0.6268164565853203
    num_leaves = 2818
    n_estimators = 713
    max_bin = 100

In [None]:
import numpy as np

import pandas as pd 
pd.options.mode.chained_assignment = None 

from lightgbm import LGBMRegressor

#from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
#from sklearn.model_selection import cross_val_score

from decimal import ROUND_HALF_UP, Decimal

#import optuna

!pip install ../input/talib0419/talib_binary-0.4.19-cp37-cp37m-manylinux1_x86_64.whl
import talib

# Import Data and Handle Invalid Values

**Be careful: do not drop NaN values before dropping the ExpectedDividend column — doing so causes a major issue.**

In [None]:
# Load the daily stock price table.
train_df = pd.read_csv('../input/jpx-tokyo-stock-exchange-prediction/train_files/stock_prices.csv')
train_df = train_df.drop(['ExpectedDividend'], axis=1) # trivial imo
# Do NOT call train_df.dropna() at this point (note to myself lol); gaps are interpolated instead.
#
# Interpolate linearly *within each security*. Interpolating the whole frame in one go fills a gap
# from whichever rows happen to be adjacent, i.e. possibly from a different stock, and it also
# tries to interpolate the non-numeric columns.
# NOTE(review): assumes the rows of each security appear in chronological order — confirm.
numeric_cols = train_df.select_dtypes(include='number').columns.difference(['SecuritiesCode'], sort=False)
train_df[numeric_cols] = (
    train_df.groupby('SecuritiesCode')[numeric_cols]
    .transform(lambda values: values.interpolate(method='linear'))
)
# Notebook display: remaining NaN count per column.
train_df.isnull().sum()

# Feature Engineering

**Now, we are going to create new features from existing features.**

**Creating new features by combining existing features will help us to represent our dataset better.**

**Here I add six technical indicators (EMA, SMA, RSI, STDDEV, MACD and Bollinger Bands), which produce ten new columns.**

In [None]:
def features(df):
    """Append talib technical-indicator columns computed from the ``Close`` column.

    Args:
        df (pd.DataFrame): price rows containing a ``Close`` column. The notebook calls this
            once per SecuritiesCode group so that indicators are computed per security.
    Returns:
        pd.DataFrame: the same frame, modified in place, with ten extra columns:
            EMA, SMA, RSI, STDDEV, macd, macdsignal, macdhist, upperband, middleband, lowerband.
    """
    # Every indicator below is derived from the close price only.
    close = df['Close']

    df['EMA'] = talib.EMA(close, timeperiod=30) # Exponential Moving Average
    df['SMA'] = talib.SMA(close, timeperiod=30) # Simple Moving Average
    df['RSI'] = talib.RSI(close, timeperiod=14) # Relative Strength Index
    df['STDDEV'] = talib.STDDEV(close, timeperiod=5, nbdev=1) # Standard Deviation
    # Moving Average Convergence/Divergence (three outputs)
    df['macd'], df['macdsignal'], df['macdhist'] = talib.MACD(
        close, fastperiod=12, slowperiod=26, signalperiod=9
    )
    # Bollinger Bands (three outputs)
    df['upperband'], df['middleband'], df['lowerband'] = talib.BBANDS(
        close, timeperiod=5, nbdevup=2, nbdevdn=2, matype=0
    )
    return df

**Here we adjust the close price using the AdjustmentFactor column. This matters because the raw adjustment factor on its own is not meaningful to our model.**

In [None]:
# https://www.kaggle.com/code/smeitoma/train-demo

def adjust_price(price):
    """
    Add adjusted close prices to a stock price table.

    Args:
        price (pd.DataFrame)  : stock prices with at least the columns Date, SecuritiesCode,
            Close and AdjustmentFactor. The caller's frame is left untouched.
    Returns:
        price DataFrame (pd.DataFrame): new frame sorted by SecuritiesCode then Date and indexed
            by Date, with the added columns CumulativeAdjustmentFactor and AdjustedClose.
    """
    # Build a new frame instead of writing into the argument, and use plain column assignment
    # so that Date reliably ends up with a datetime dtype.
    price = price.assign(Date=pd.to_datetime(price["Date"], format="%Y-%m-%d"))

    def generate_adjusted_close(df):
        """
        Args:
            df (pd.DataFrame)  : stock_price for a single SecuritiesCode
        Returns:
            df (pd.DataFrame): stock_price with AdjustedClose for a single SecuritiesCode
        """
        # newest row first, so the cumulative product runs from the latest date backwards
        df = df.sort_values("Date", ascending=False)
        df["CumulativeAdjustmentFactor"] = df["AdjustmentFactor"].cumprod()
        # adjusted close, rounded half-up to one decimal place
        df["AdjustedClose"] = (
            df["CumulativeAdjustmentFactor"] * df["Close"]
        ).map(lambda x: float(
            Decimal(str(x)).quantize(Decimal('0.1'), rounding=ROUND_HALF_UP)
        ))
        # back to chronological order
        df = df.sort_values("Date")
        # a zero adjusted close is treated as missing and filled from the previous date
        df.loc[df["AdjustedClose"] == 0, "AdjustedClose"] = np.nan
        df["AdjustedClose"] = df["AdjustedClose"].ffill()
        return df

    price = price.sort_values(["SecuritiesCode", "Date"])
    # Iterate over the groups explicitly: each group keeps its SecuritiesCode column regardless
    # of how groupby.apply treats grouping columns in the installed pandas version.
    per_security = [
        generate_adjusted_close(group) for _, group in price.groupby("SecuritiesCode")
    ]
    if per_security:
        price = pd.concat(per_security).reset_index(drop=True)
    else:
        # Empty input: still return the documented columns.
        price = price.assign(
            CumulativeAdjustmentFactor=np.nan, AdjustedClose=np.nan
        ).reset_index(drop=True)

    return price.set_index("Date")



In [None]:
# Add AdjustedClose; the frame comes back indexed by Date and sorted by SecuritiesCode, Date.
train_df = adjust_price(train_df)
# Compute the indicator columns separately for every security. group_keys=False keeps the
# Date index unchanged instead of prepending the group key as an extra index level.
train_df = train_df.groupby('SecuritiesCode', group_keys=False).apply(features)
# Drop rows that still contain NaN (presumably the indicator warm-up rows) and turn the Date
# index back into a regular column: tscv() sorts by "Date", so the dates must not be discarded.
train_df = train_df.dropna(axis=0).reset_index()
train_df

Nice.

# LGBM Model Implementation and Time Series Cross Validation Implementation

**My loss metric is based on MSE (Mean Squared Error) — the code below reports its square root (RMSE). The choice of metric is up to you.**

**Optuna Results:**

Phase 1 = {'num_leaves': 2818, 'n_estimators': 713, 'max_bin': 100, 'learning_rate': 0.6268164565853203} = 0.537631972137107.

Phase 2 = {'num_leaves': 2818, 'n_estimators': 713, 'max_bin': 100, 'learning_rate': 0.6268164565853203} = 0.537631972137107.

In [None]:
def split_group(df):
    """Label the rows of ``df`` with consecutive fold numbers, by row position.

    The frame is cut into ``CFG.folds`` contiguous chunks of ``len(df) // CFG.folds`` rows,
    numbered 1..CFG.folds in row order. Rows left over after the integer division join the
    last fold.

    Args:
        df (pd.DataFrame): frame to label; a ``fold`` column is added in place.
    Returns:
        pd.DataFrame: the same frame, now carrying the integer ``fold`` column.
    """
    chunk = len(df) // CFG.folds
    if chunk == 0:
        # Fewer rows than folds: no even split is possible, so every row lands in the last fold.
        df['fold'] = CFG.folds
        return df

    # One vectorised assignment instead of chained ``df['fold'].iloc[...] = ...`` writes, which
    # pandas does not guarantee to propagate back to ``df``.
    positions = np.arange(len(df))
    df['fold'] = np.minimum(positions // chunk + 1, CFG.folds)
    return df

def LGBM_Model(X_train, y_train, X_val, y_val, i):
    """Fit one LightGBM regressor and report its error on the validation data.

    Args:
        X_train, y_train: training features and targets.
        X_val, y_val: validation features and targets.
        i: fold number, used only in the printed message.
    Returns:
        tuple: ``(model, score)`` where ``model`` is the fitted LGBMRegressor and ``score`` is
            the root mean squared error on the validation data.
    """
    if CFG.Debug:
        # Debug mode: library defaults.
        model = LGBMRegressor()
    else:
        model = LGBMRegressor(
            num_leaves=CFG.num_leaves,
            learning_rate=CFG.LR,
            n_estimators=CFG.n_estimators,
            max_bin=CFG.max_bin,
        )

    model.fit(X_train, y_train)
    preds = model.predict(X_val)
    # Square root of the MSE, so the reported number is an RMSE and is labelled as such.
    score = np.sqrt(mean_squared_error(y_val, preds))
    print(f'{i}. Fold RMSE: {score:.5f}')
    return model, score

def perc(num, percent):
    """Return ``percent`` percent of ``num``, rounded to an integer with built-in ``round``."""
    one_percent = float(num / 100)
    share = one_percent * percent
    return round(share)

def tscv(raw_df):
    """Train one model per fold with an expanding-window time series cross validation.

    Rows are put in chronological order and labelled with fold numbers 1..CFG.folds by
    ``split_group``. For fold ``i`` the last ``CFG.val_ratio`` percent of that fold's rows are
    held out for validation, and the model is trained on every row that precedes them
    (folds 1..i without the held-out tail).

    Args:
        raw_df (pd.DataFrame): training data providing ``Date``, ``Target`` and the feature
            columns Open, High, Low, AdjustedClose and Volume. Not modified.
    Returns:
        list: the fitted models, one per fold, in fold order.
    Raises:
        ValueError: if a fold is too small to yield both a training and a validation part.
    """
    feature_cols = ['Open', 'High', 'Low', 'AdjustedClose', 'Volume'] # we include only these parameters.

    # Sort chronologically BEFORE handing out fold labels. split_group labels rows by position,
    # so on a frame ordered by security the folds would be groups of securities rather than
    # time periods, and the validation tail of a fold would overlap in time with training rows.
    # sort_values returns a new frame, so raw_df stays untouched.
    df = raw_df.sort_values("Date", kind="mergesort")
    df = split_group(df)

    models = []
    errs = []
    for i in range(1, CFG.folds + 1):
        fold_df = df.loc[df['fold'] == i]   # rows of the current fold
        seen_df = df.loc[df['fold'] <= i]   # everything up to and including the current fold

        n_val = perc(len(fold_df), CFG.val_ratio)
        if n_val < 1 or n_val >= len(seen_df):
            # A slice like [:-0] would silently produce an empty training set.
            raise ValueError(
                f'fold {i}: {len(fold_df)} rows with val_ratio={CFG.val_ratio} '
                f'give {n_val} validation rows; cannot build both train and validation sets'
            )

        # df is already chronological, so both tails are the most recent rows of the fold.
        train_part = seen_df.iloc[:-n_val]
        val_part = fold_df.iloc[-n_val:]  # includes the final row of the fold

        X_train = train_part[feature_cols]
        y_train = train_part[['Target']]
        X_val = val_part[feature_cols]
        y_val = val_part[['Target']]

        model, err = LGBM_Model(X_train, y_train, X_val, y_val, i)
        errs.append(err)
        models.append(model)

    if errs:
        print('\n')
        print('-'*30)
        # LGBM_Model returns the square root of the MSE, hence RMSE.
        print(f'\nAverage RMSE is: {np.mean(errs):.5f}')

    return models

            
# Train one model per fold on the prepared training frame; the resulting list of models is
# what the submission loop ensembles over.
models = tscv(train_df)

# Submit by Competition API

In [None]:
import jpx_tokyo_market_prediction
# Create the competition environment and the iterator that the submission loop consumes.
env = jpx_tokyo_market_prediction.make_env()
iter_test = env.iter_test()

In [None]:
for (stock_prices, options, financials, trades, secondary_prices, sample_prediction) in iter_test:
    # Same price adjustment as for training; adjust_price returns the rows sorted by SecuritiesCode.
    stock_prices = adjust_price(stock_prices)
    X_test = stock_prices[['Open', 'High', 'Low', 'AdjustedClose', 'Volume']]

    # Ensemble: one prediction vector per fold model, combined element-wise.
    fold_preds = [model.predict(X_test) for model in models]
    preds = np.mean(fold_preds, axis=0) if CFG.mean else np.median(fold_preds, axis=0)

    # Attach predictions by SecuritiesCode rather than by row position, because adjust_price
    # re-sorts the rows and sample_prediction is not guaranteed to share that order.
    # NOTE(review): assumes one row per SecuritiesCode in each batch — confirm.
    pred_by_code = pd.Series(preds, index=stock_prices["SecuritiesCode"].to_numpy())
    sample_prediction["Prediction"] = sample_prediction["SecuritiesCode"].map(pred_by_code)

    # Rank 0 goes to the highest prediction; the number of ranks follows the batch size
    # instead of a hard-coded 2000.
    sample_prediction = sample_prediction.sort_values(by="Prediction", ascending=False)
    sample_prediction["Rank"] = np.arange(len(sample_prediction))
    sample_prediction = sample_prediction.sort_values(by="SecuritiesCode", ascending=True)

    # Only these three columns are submitted, so the helper Prediction column is simply left out.
    submission = sample_prediction[["Date", "SecuritiesCode", "Rank"]]
    env.predict(submission)