In [None]:
import sys
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
import warnings
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import mean_absolute_error, mean_squared_error
from decimal import ROUND_HALF_UP, Decimal
from tqdm import tqdm
from lightgbm import LGBMRegressor


# Import dataframe

In [None]:
# The encoders live at module level so that the label mappings fitted inside
# load_data() stay reachable after the call.
sectionProducts_encoder = LabelEncoder()
newMarketSegment_encoder = LabelEncoder()


def load_data(folder: str = '../input/jpx-tokyo-stock-exchange-prediction') -> pd.DataFrame:
    """
    Read the stock price history and the stock list, and join them.

    Args:
        folder (str): directory containing `train_files/stock_prices.csv` and
            `stock_list.csv`. Defaults to the path the notebook originally
            hard-coded.
    Returns:
        pd.DataFrame: every row of stock_prices.csv, left-joined on
        SecuritiesCode with Name, SectorCode_17, SectorCode_33,
        SectionProductsCode and NewMarketSegmentCode, sorted by Date and then
        SecuritiesCode (the index is not reset).

    Side effects:
        Re-fits the module-level `sectionProducts_encoder` and
        `newMarketSegment_encoder` on the stock list.
    """
    # Import data
    df_stock_price = pd.read_csv(os.path.join(folder, 'train_files', 'stock_prices.csv'), parse_dates=['Date'])
    df_stock_list = pd.read_csv(os.path.join(folder, 'stock_list.csv'))

    # Normalise free-text columns: strip trailing whitespace, then "Capitalised" casing.
    # The .str accessor leaves missing values as NaN instead of raising.
    for source_col, clean_col in (('Name', 'Name'),
                                  ('17SectorName', 'SectorName_17'),
                                  ('33SectorName', 'SectorName_33')):
        df_stock_list[clean_col] = df_stock_list[source_col].str.rstrip().str.lower().str.capitalize()

    # Sector codes: '-' and missing values both become '0' so the column can be cast to int later.
    for source_col, clean_col in (('17SectorCode', 'SectorCode_17'),
                                  ('33SectorCode', 'SectorCode_33')):
        df_stock_list[clean_col] = df_stock_list[source_col].replace('-', '0').fillna('0')

    # Label-encode the two categorical columns, with 'Unknown' standing in for missing values.
    df_stock_list['SectionProducts'] = df_stock_list['Section/Products'].fillna('Unknown')
    df_stock_list['SectionProductsCode'] = sectionProducts_encoder.fit_transform(df_stock_list.SectionProducts.values)
    df_stock_list['NewMarketSegment'] = df_stock_list['NewMarketSegment'].fillna('Unknown')
    df_stock_list['NewMarketSegmentCode'] = newMarketSegment_encoder.fit_transform(df_stock_list.NewMarketSegment.values)

    # Merge data
    list_columns = ['SecuritiesCode', 'Name', 'SectorCode_17', 'SectorCode_33', 'SectionProductsCode', 'NewMarketSegmentCode']
    df_stock_price = df_stock_price.merge(df_stock_list[list_columns], on='SecuritiesCode', how='left')
    # Securities absent from the stock list get sector code 0 after the left join.
    sector_columns = ['SectorCode_17', 'SectorCode_33']
    df_stock_price[sector_columns] = df_stock_price[sector_columns].fillna('0').astype(int)

    return df_stock_price.sort_values(['Date', 'SecuritiesCode'])

# Feature Engineering
- Adjusted Close
- Return
- Moving Average
- Volatility
- Market Impact

The Adjusted Close computation is taken from the host competition team's [Train Demo](https://www.kaggle.com/code/smeitoma/train-demo?scriptVersionId=92137850&cellId=6).
The new features are based on [this work](https://www.kaggle.com/code/wannabebotter/jpx-stock-market-analysis-prediction-with-lgbm?scriptVersionId=97874906&cellId=5).

In [None]:
def adjust_price(price: pd.DataFrame) -> pd.DataFrame:
    """
    Add CumulativeAdjustmentFactor and AdjustedClose columns, per SecuritiesCode.

    Args:
        price (pd.DataFrame)  : pd.DataFrame include stock_price. The code reads
            the columns SecuritiesCode, Date, AdjustmentFactor, Close, Open,
            High, Low and Volume.
    Returns:
        price DataFrame (pd.DataFrame): stock_price with generated AdjustedClose,
        ordered by SecuritiesCode then Date, with a fresh 0..n-1 index. Open,
        High, Low and Volume are forward filled within each security.
    """

    def round_half_up(value: float) -> float:
        """Round to one decimal place, halves away from zero (via Decimal)."""
        return float(Decimal(str(value)).quantize(Decimal('0.1'), rounding=ROUND_HALF_UP))

    def generate_adjusted_close(df: pd.DataFrame) -> pd.DataFrame:
        """
        Args:
            df (pd.DataFrame)  : stock_price for a single SecuritiesCode
        Returns:
            df (pd.DataFrame): stock_price with AdjustedClose for a single SecuritiesCode
        """
        # Latest date first, so the cumulative product runs backwards in time;
        # sort_values returns a new frame, so the caller's data is never modified.
        df = df.sort_values("Date", ascending=False)
        df["CumulativeAdjustmentFactor"] = df["AdjustmentFactor"].cumprod()
        df["AdjustedClose"] = (df["CumulativeAdjustmentFactor"] * df["Close"]).map(round_half_up)
        # back to chronological order
        df = df.sort_values("Date")
        # an AdjustedClose of 0 is treated as missing so that it gets forward filled
        df.loc[df["AdjustedClose"] == 0, "AdjustedClose"] = np.nan
        # forward fill AdjustedClose, Open, High, Low and Volume
        for column in ("AdjustedClose", "Open", "High", "Low", "Volume"):
            df[column] = df[column].ffill()
        return df

    price = price.sort_values(["SecuritiesCode", "Date"])
    # Iterate over the groups explicitly instead of groupby(...).apply(...):
    # recent pandas versions stop handing the grouping column to apply(), which
    # would drop SecuritiesCode from the result.
    per_security = [generate_adjusted_close(group) for _, group in price.groupby("SecuritiesCode")]
    if not per_security:
        # No rows (or no non-null codes): return the same schema with empty new columns.
        return price.assign(CumulativeAdjustmentFactor=np.nan, AdjustedClose=np.nan).reset_index(drop=True)

    return pd.concat(per_security).reset_index(drop=True)

In [None]:
def average_true_range(pivots: tuple, window: int, category: str = 'Security') -> pd.DataFrame:
    """
    Rolling mean of the true range divided by the previous close, in long format.

    Args:
        pivots: 5-tuple (closes, opens, highs, lows, volumes) of wide frames that
            share the same index and columns (create_features builds them with
            Date as index and one column per category value).
        window: rolling window length, in rows.
        category: label embedded in the name of the value column.
    Returns:
        pd.DataFrame: the melted frame with its index turned back into a column,
        rows containing NaN removed, and the value column named
        `atr_{category}_{window}`.
    """
    closes, _opens, highs, lows, _volumes = pivots
    previous_close = closes.shift(1)
    high_low = highs - lows
    high_prev = (highs - previous_close).abs()
    low_prev = (lows - previous_close).abs()
    # Element-wise maximum of the three candidate ranges; NaN propagates, so the
    # first row (no previous close) stays NaN and is dropped below.
    true_range = np.max([high_low, high_prev, low_prev], axis=0)
    relative_range = pd.DataFrame(true_range / previous_close, index=high_low.index, columns=high_low.columns)
    long_form = pd.melt(relative_range.rolling(window).mean(), ignore_index=False).reset_index().dropna()
    return long_form.rename(columns={"value": f"atr_{category}_{window}"})

def volatility(pivots: tuple, window: int, category: str = 'Security') -> pd.DataFrame:
    """
    Rolling standard deviation of the one-row relative change of the closes.

    Args:
        pivots: 5-tuple (closes, opens, highs, lows, volumes) of wide frames;
            only `closes` is used.
        window: rolling window length, in rows.
        category: label embedded in the name of the value column.
    Returns:
        pd.DataFrame: long-format frame (index reset, NaN rows dropped) whose
        value column is named `volatility_{category}_{window}`.
    """
    closes, _opens, _highs, _lows, _volumes = pivots
    relative_change = closes.diff() / closes.shift(1)
    rolling_std = relative_change.rolling(window).std()
    long_form = pd.melt(rolling_std, ignore_index=False).reset_index().dropna()
    return long_form.rename(columns={"value": f"volatility_{category}_{window}"})

def moving_average(pivots: tuple, window: int, category: str = 'Security') -> pd.DataFrame:
    """
    Rolling mean of the closes, in long format.

    Args:
        pivots: 5-tuple (closes, opens, highs, lows, volumes) of wide frames;
            only `closes` is used.
        window: rolling window length, in rows.
        category: label embedded in the name of the value column.
    Returns:
        pd.DataFrame: long-format frame (index reset, NaN rows dropped) whose
        value column is named `movingnAverage_{category}_{window}` (the spelling
        is kept as-is because it is the feature's column name).
    """
    closes, _opens, _highs, _lows, _volumes = pivots
    rolling_mean = closes.rolling(window).mean()
    long_form = pd.melt(rolling_mean, ignore_index=False).reset_index().dropna()
    return long_form.rename(columns={"value": f"movingnAverage_{category}_{window}"})

def moving_average_vol(pivots: tuple, window: int, category: str = 'Security') -> pd.DataFrame:
    """
    Rolling mean of the volumes, in long format.

    Args:
        pivots: 5-tuple (closes, opens, highs, lows, volumes) of wide frames;
            only `volumes` is used.
        window: rolling window length, in rows.
        category: label embedded in the name of the value column.
    Returns:
        pd.DataFrame: long-format frame (index reset, NaN rows dropped) whose
        value column is named `movingnAverageVolume_{category}_{window}` (the
        spelling is kept as-is because it is the feature's column name).
    """
    _closes, _opens, _highs, _lows, volumes = pivots
    rolling_mean = volumes.rolling(window).mean()
    long_form = pd.melt(rolling_mean, ignore_index=False).reset_index().dropna()
    return long_form.rename(columns={"value": f"movingnAverageVolume_{category}_{window}"})

def moving_average_gap(pivots: tuple, window: int, category: str = 'Security') -> pd.DataFrame:
    """
    Relative distance between each close and its rolling mean, in long format.

    Args:
        pivots: 5-tuple (closes, opens, highs, lows, volumes) of wide frames;
            only `closes` is used.
        window: rolling window length, in rows.
        category: label embedded in the name of the value column.
    Returns:
        pd.DataFrame: long-format frame (index reset, NaN rows dropped) whose
        value column, `movingAverageGap_{category}_{window}`, holds
        (close - rolling mean) / rolling mean.
    """
    closes, _opens, _highs, _lows, _volumes = pivots
    # Compute the rolling mean once; it is both subtracted and used as divisor.
    rolling_mean = closes.rolling(window).mean()
    gap = (closes - rolling_mean) / rolling_mean
    long_form = pd.melt(gap, ignore_index=False).reset_index().dropna()
    return long_form.rename(columns={"value": f"movingAverageGap_{category}_{window}"})

def rate_of_return(pivots: tuple, window: int, category: str = 'Security') -> pd.DataFrame:
    """
    Percentage change of the closes over `window` rows, in long format.

    Args:
        pivots: 5-tuple (closes, opens, highs, lows, volumes) of wide frames;
            only `closes` is used.
        window: number of rows between the two closes being compared.
        category: label embedded in the name of the value column.
    Returns:
        pd.DataFrame: long-format frame (index reset, NaN rows dropped) whose
        value column is named `return_{category}_{window}`.
    """
    closes, _opens, _highs, _lows, _volumes = pivots
    returns = closes.pct_change(window)
    long_form = pd.melt(returns, ignore_index=False).reset_index().dropna()
    return long_form.rename(columns={"value": f"return_{category}_{window}"})

def market_impact(pivots: tuple, window: int, category: str = 'Security') -> pd.DataFrame:
    """
    Rolling mean of (one-row close difference / volume), in long format.

    Args:
        pivots: 5-tuple (closes, opens, highs, lows, volumes) of wide frames;
            `closes` and `volumes` are used.
        window: rolling window length, in rows.
        category: label embedded in the name of the value column.
    Returns:
        pd.DataFrame: long-format frame (index reset, NaN rows dropped) whose
        value column is named `marketImpact_{category}_{window}`.
    """
    closes, _opens, _highs, _lows, volumes = pivots
    move_per_volume = closes.diff() / volumes
    rolling_mean = move_per_volume.rolling(window).mean()
    long_form = pd.melt(rolling_mean, ignore_index=False).reset_index().dropna()
    return long_form.rename(columns={"value": f"marketImpact_{category}_{window}"})

def create_features(df: pd.DataFrame, with_new=False) -> pd.DataFrame:
    """
    Attach rolling features to every (Date, SecuritiesCode) row.

    For each security the following are merged in as new columns:
        - volatility, moving average and moving-average gap for windows 2, 5, 10, 20, 40, 60
        - market impact, rate of return and average true range for windows 1, 5, 10, 20

    Args:
        df (pd.DataFrame): price rows with Date, SecuritiesCode, AdjustedClose,
            Open, High, Low and Volume columns. The input is copied, not modified.
        with_new: accepted for the callers' sake; the body never reads it.
    Returns:
        pd.DataFrame: the enriched rows sorted by Date then SecuritiesCode, with
        every row that has a missing value in ANY column removed.
    """
    features = df.copy()

    long_windows = [2, 5, 10, 20, 40, 60]
    short_windows = [1, 5, 10, 20]
    # Which feature builders run with which window list (order is preserved).
    feature_plan = (
        ((volatility, moving_average, moving_average_gap), long_windows),
        ((market_impact, rate_of_return, average_true_range), short_windows),
    )
    # (closes, opens, highs, lows, volumes) -- the order the feature builders unpack.
    pivot_sources = ("AdjustedClose", "Open", "High", "Low", "Volume")

    for category in ['SecuritiesCode']:
        # Wide Date x category tables, forward filled, built once before any merge.
        pivots = tuple(
            pd.pivot_table(features, values=source, index="Date", columns=category).ffill()
            for source in pivot_sources
        )

        for builders, spans in feature_plan:
            for func in builders:
                for window in tqdm(spans):
                    features = pd.merge(
                        features,
                        func(pivots, window, category),
                        on=["Date", category],
                        how="left",
                    )

    return features.sort_values(['Date', 'SecuritiesCode']).dropna(axis=0)

In [None]:
# Build the modelling frame: load -> adjusted close -> drop raw columns -> rolling features.
df = adjust_price(load_data())
# Raw/intermediate columns that must not reach the model.
df = df.drop(columns=["RowId", "AdjustmentFactor", "CumulativeAdjustmentFactor", "ExpectedDividend", "SupervisionFlag", "Close"])
df = create_features(df, with_new=True)

In [None]:
print('Training and making predictions')

# Keyword arguments handed to LGBMRegressor(**params) by run_train_serie.
params = dict(
    n_estimators=500,
    num_leaves=100,
    learning_rate=0.1,
    colsample_bytree=0.9,
    subsample=0.8,
    reg_alpha=0.4,
    metric='mae',
    random_state=42,
    verbosity=1,
)

def run_train_serie(df: pd.DataFrame, code: int = 1377, target: str = 'Target', split_date: str = '2020-12-10'):
    """
    Fit an LGBMRegressor on one security's rows and predict its hold-out period.

    Args:
        df (pd.DataFrame): feature frame containing at least SecuritiesCode,
            Date and AdjustedClose; rows are expected in chronological order
            within a security, because the Diff targets are built with shift().
        code (int): SecuritiesCode of the series to model.
        target (str): column to predict. 'Diff_T1' / 'Diff_T2' are created here
            as AdjustedClose(t+1) - AdjustedClose(t) and
            AdjustedClose(t+2) - AdjustedClose(t); the last 1 / 2 rows, which
            have no such future value, are discarded. Any other value must
            already be a column of `df`.
        split_date (str): rows with Date < split_date are used for training,
            rows with Date >= split_date for testing.
    Returns:
        tuple: (y_test, y_pred, close) where y_test is a one-column DataFrame
        holding the test-period target, y_pred the model's predictions as an
        array, and close the test-period AdjustedClose Series.
    """
    # Filter first, then copy: only this security's rows are duplicated, and the
    # column assignments below never touch the caller's frame.
    df_serie = df[df['SecuritiesCode'] == code].copy()

    horizons = {'Diff_T1': 1, 'Diff_T2': 2}
    if target in horizons:
        horizon = horizons[target]
        df_serie[target] = df_serie['AdjustedClose'].shift(-horizon) - df_serie['AdjustedClose']
        # the trailing rows have no future close, hence no target
        df_serie = df_serie[:-horizon]

    # Keep the frame's own column order: selecting with a set made the feature
    # order depend on string hashing and is rejected by recent pandas versions.
    col_to_drop = {'Target', 'Name', 'SecuritiesCode', target}
    feature_cols = [col for col in df_serie.columns if col not in col_to_drop]

    is_train = df_serie['Date'] < split_date
    is_test = df_serie['Date'] >= split_date

    # Fit with training date for submission
    X_train = df_serie.loc[is_train, feature_cols]
    y_train = df_serie.loc[is_train, ['Date', target]]
    X_test = df_serie.loc[is_test, feature_cols]
    y_test = df_serie.loc[is_test, ['Date', target]]

    print(f"Train Date range: {X_train.Date.min()} to {X_train.Date.max()}")
    print(f"Test Date range: {X_test.Date.min()} to {X_test.Date.max()}")
    print(f"Train size: {X_train.shape[0]} Test size {X_test.shape[0]}")

    # Date is only needed for the split and the log lines above.
    X_train_nodate = X_train.drop('Date', axis=1)
    X_test_nodate = X_test.drop('Date', axis=1)
    y_train_nodate = y_train.drop('Date', axis=1)
    y_test_nodate = y_test.drop('Date', axis=1)

    # NOTE(review): `verbose` as a fit() keyword is not accepted by every
    # LightGBM release -- confirm against the installed version.
    gbm = LGBMRegressor(**params).fit(X_train_nodate, y_train_nodate, verbose=0)

    # Run against test period
    y_pred = gbm.predict(X_test_nodate)

    return y_test_nodate, y_pred, df_serie.loc[is_test, 'AdjustedClose']

In [None]:
# Score table: one row per approach (label, RMSE, MAE); later cells add their rows to it.
results = pd.DataFrame(columns = ['Prediction', 'RMSE', 'MAE'])

# Explicit prediction

## Predict C(t+1)-C(t)

In [None]:
# Train on security 1377 with the target C(t+1) - C(t); `close` is the test-period AdjustedClose.
target = 'Diff_T1'
y_test, y_pred, close = run_train_serie(df, 1377, target)

In [None]:
# Turn predicted / actual differences back into price levels: C(t) + diff.
y_pred_1 = close.values + y_pred
y_true_1 = close.values + y_test[target].values

# RMSE as sqrt(MSE): works on every scikit-learn version, unlike the
# `squared=False` keyword, which recent releases deprecate and then remove.
rmse = np.sqrt(mean_squared_error(y_true_1, y_pred_1))
mae = mean_absolute_error(y_true_1, y_pred_1)

# DataFrame.append no longer exists in pandas 2.x; add the row by label instead.
results.loc[len(results)] = [target, rmse, mae]

In [None]:
# Actual vs. predicted price level over the test period (x axis = position in the test set).
fig, ax = plt.subplots(figsize=(12, 5), facecolor='white')
ax.plot(close.values + y_test[target].values, label='y_true')
ax.plot(close.values + y_pred, label='y_pred')
ax.set_title(f'Adjusted close rebuilt from {target}: actual vs. LGBM prediction')
ax.set_xlabel('Test-period row')
ax.set_ylabel('Adjusted close')
ax.legend()
plt.show()

## Predict C(t+2)-C(t)

In [None]:
# Same experiment with the target C(t+2) - C(t); note that target, y_test, y_pred and close are overwritten.
target = 'Diff_T2'
y_test, y_pred, close = run_train_serie(df, 1377, target)

In [None]:
# Turn predicted / actual differences back into price levels: C(t) + diff.
y_pred_2 = close.values + y_pred
y_true_2 = close.values + y_test[target].values

# RMSE as sqrt(MSE): works on every scikit-learn version, unlike the
# `squared=False` keyword, which recent releases deprecate and then remove.
rmse = np.sqrt(mean_squared_error(y_true_2, y_pred_2))
mae = mean_absolute_error(y_true_2, y_pred_2)

# Actual vs. predicted price level over the test period (x axis = position in the test set).
fig, ax = plt.subplots(figsize=(12, 5), facecolor='white')
ax.plot(y_true_2, label='y_true')
ax.plot(y_pred_2, label='y_pred')
ax.set_title(f'Adjusted close rebuilt from {target}: actual vs. LGBM prediction')
ax.set_xlabel('Test-period row')
ax.set_ylabel('Adjusted close')
ax.legend()
plt.show()
print(f'RMSE: {rmse} - MAE: {mae}')

In [None]:
# Score the C(t+2) prediction against the actual C(t+2). The previous version
# compared the prediction with the current close, which only measured the size
# of the predicted move and was not comparable with the Diff_T1 row.
rmse = np.sqrt(mean_squared_error(y_true_2, y_pred_2))
mae = mean_absolute_error(y_true_2, y_pred_2)
# DataFrame.append no longer exists in pandas 2.x; add the row by label instead.
results.loc[len(results)] = [target, rmse, mae]

## Predict explicit r(t) from C(t+1) and C(t+2)

In [None]:
# r(t) = (C(t+2) - C(t+1)) / C(t+1). The Diff_T2 series is one row shorter than
# the Diff_T1 series, hence the [:-1] on the C(t+1) arrays.
next_close_true = y_true_1[:-1]
next_close_pred = y_pred_1[:-1]
roc_true = (y_true_2 - next_close_true) / next_close_true
roc_pred = (y_pred_2 - next_close_pred) / next_close_pred

# RMSE as sqrt(MSE): works on every scikit-learn version, unlike the
# `squared=False` keyword, which recent releases deprecate and then remove.
rmse = np.sqrt(mean_squared_error(roc_true, roc_pred))
mae = mean_absolute_error(roc_true, roc_pred)
# DataFrame.append no longer exists in pandas 2.x; add the row by label instead.
results.loc[len(results)] = ['Explicit', rmse, mae]

In [None]:
# Sanity check: number of test rows for which an explicit r(t) could be computed.
roc_pred.shape

In [None]:
# Explicit r(t) against the actual one, with the all-zero naive forecast as a reference line.
fig, ax = plt.subplots(figsize=(12, 5), facecolor='white')
ax.plot(roc_true, label='Target')
ax.plot(roc_pred, label='LGBM')
ax.plot(np.zeros(roc_true.shape), label='Naive', linestyle='--', c='k')
ax.set_title('Explicit r(t): actual vs. LGBM vs. naive')
ax.set_xlabel('Test-period row')
ax.set_ylabel('r(t)')
ax.legend()
plt.show()

# Predict implicit r(t)

In [None]:
# Predict the provided Target column directly (default target of run_train_serie).
y_test, y_pred, _ = run_train_serie(df)

# The last two rows are left out so the score covers the same rows as the explicit approach.
target_true = y_test[:-2]['Target'].values
target_pred = y_pred[:-2]

# RMSE as sqrt(MSE): works on every scikit-learn version, unlike the
# `squared=False` keyword, which recent releases deprecate and then remove.
rmse = np.sqrt(mean_squared_error(target_true, target_pred))
mae = mean_absolute_error(target_true, target_pred)

# DataFrame.append no longer exists in pandas 2.x; add the row by label instead.
results.loc[len(results)] = ['Implicit', rmse, mae]

In [None]:
# Actual Target vs. the model's direct prediction over the test period.
fig, ax = plt.subplots(figsize=(12, 5), facecolor='white')
ax.plot(y_test['Target'].values, label='y_true')
ax.plot(y_pred, label='y_pred')
ax.set_title('Implicit r(t): actual Target vs. LGBM prediction')
ax.set_xlabel('Test-period row')
ax.set_ylabel('Target')
ax.legend()
plt.show()

# Naïve

In [None]:
# Naive baseline: assume C(t+1) = C(t+2) = C(t), i.e. always predict r(t) = 0.
y_naive = np.zeros(y_pred.shape)

# Scored on the same rows as the implicit approach (last two rows left out).
target_true = y_test[:-2]['Target'].values

# RMSE as sqrt(MSE): works on every scikit-learn version, unlike the
# `squared=False` keyword, which recent releases deprecate and then remove.
rmse = np.sqrt(mean_squared_error(target_true, y_naive[:-2]))
mae = mean_absolute_error(target_true, y_naive[:-2])

# DataFrame.append no longer exists in pandas 2.x; add the row by label instead.
results.loc[len(results)] = ['Naïve', rmse, mae]

# Results

In [None]:
# Final comparison table: one row per approach with its RMSE and MAE.
results

<p>
    This work has been carried out on a single security only. Although the implicit method gives better results, they remain comparable to those of the naïve approach.
</p>