In [None]:
import pandas as pd
import plotly.graph_objects as go
import numpy as np

# Load and prepare data

In [None]:
# Root directory of the Kaggle input dataset that holds the pickled files.
# Note: the value already ends with "/"; the loading cell below adds its own
# separator before "train_files".
data_path = "/kaggle/input/jpx-csv-to-pickle/"

In [None]:
# Load the training price table from the pickled dataset.
# `data_path` may or may not end with "/", so strip it before joining to avoid
# a doubled separator ("...//train_files/...") in the resulting path.
# NOTE(review): unpickling can execute arbitrary code; only point this at
# dataset files you trust.
stock_prices = pd.read_pickle(f"{data_path.rstrip('/')}/train_files/stock_prices.p")
stock_prices.head()

In [None]:
# Reshape from long format to wide format: one row per Date and one column per
# SecuritiesCode, for the Close price and for the Target respectively.
# The arguments are passed by keyword because DataFrame.pivot no longer accepts
# them positionally in pandas >= 2.0 (it raises a TypeError there).
stock_prices_close = stock_prices.pivot(
    index="Date", columns="SecuritiesCode", values="Close"
)
stock_prices_target = stock_prices.pivot(
    index="Date", columns="SecuritiesCode", values="Target"
)

# Each row of the wide table is one date, so its length is the number of timesteps.
print(f"\nThere are {len(stock_prices_target)} timesteps.\n\n")
stock_prices_target.head()

# Check target computation

In [None]:
# Use one fixed, arbitrarily chosen security for the check.
stock = 1301

# One-day percentage change of the Close price, plotted unshifted.
# (Appending `.shift(-2)` to this series is the toggle the original cell kept
# around for lining it up with Target.)
close_pct_change_line = go.Scatter(
    x=stock_prices_close.index,
    y=stock_prices_close[stock].pct_change(1),
    name="Close pct change (1)",
)

# Target values shipped with the dataset for the same security.
target_line = go.Scatter(
    x=stock_prices_target.index,
    y=stock_prices_target[stock],
    name="Target",
)

trace = [close_pct_change_line, target_line]
fig = go.Figure(data=trace, layout={"title": f"stock {stock}"})
fig.show()

Target is the Close percentage change shifted by two days, as expected.

# Visualize a bunch of stocks

In [None]:
# Plot the Close price for a handful of randomly drawn stocks.
# The seed is fixed so that the same stocks are drawn on every run.
np.random.seed(123)

n = 10
# NOTE(review): np.random.choice samples with replacement by default, so the
# same security code can in principle be drawn more than once.
sel_stocks = np.random.choice(stock_prices_close.columns, size=n)

# One line per selected stock, all sharing the date index as x-axis.
trace = []
for stock in sel_stocks:
    close_line = go.Scatter(
        x=stock_prices_close.index,
        y=stock_prices_close[stock],
        name=f"Close {stock}",
    )
    trace.append(close_line)

fig = go.Figure(
    data=trace,
    layout={"title": f"Close price for a sample of {n} stocks"},
)
fig.show()

Seems like adjustment is quite relevant. Let's adjust the prices.

# Check AdjustmentFactor

In [None]:
# Wide table of the AdjustmentFactor: one row per Date, one column per
# SecuritiesCode. Keyword arguments are required by DataFrame.pivot in
# pandas >= 2.0, where positional arguments raise a TypeError.
stock_prices_adjfactor = stock_prices.pivot(
    index="Date", columns="SecuritiesCode", values="AdjustmentFactor"
)

In [None]:
# Hand-picked stocks whose price series need adjustment.
sel_stocks = [8928, 1973]

# One figure per stock, overlaying the adjustment factor and the Close price.
for stock in sel_stocks:
    factor_line = go.Scatter(
        x=stock_prices_adjfactor.index,
        y=stock_prices_adjfactor[stock],
        name="Adjustment factor",
    )
    # Close is divided by 1000, presumably to bring it to a scale comparable
    # with the adjustment factor on the shared y-axis.
    scaled_close_line = go.Scatter(
        x=stock_prices_close.index,
        y=stock_prices_close[stock] / 1000,
        name="Close price / 1000",
    )

    trace = [factor_line, scaled_close_line]
    fig = go.Figure(
        data=trace,
        layout={"title": f"check adjustment stock {stock}"},
    )
    fig.show()

We see that the adjustment factor is recorded only on the days when a change occurs. We need to take this into account when building lagged features!

# Missing data

Note that, since we switched to wide format (one column per stock), missing values were generated for date/stock combinations whose rows simply don't exist in the original table.

In [None]:
# For every date, the share of stocks whose Target is null
# (row-wise mean of the boolean null mask, i.e. a value between 0 and 1).
missing_data = stock_prices_target.isnull().mean(axis=1)

# Single line: missing share over time.
missing_line = go.Scatter(
    x=stock_prices_target.index,
    y=missing_data,
)
trace = [missing_line]

fig = go.Figure(
    data=trace,
    layout={"title": f"Percentage missing stocks in time"},
)
fig.show()

- 6.8% of stocks are missing at the first timestep.
- All stocks are available in the last 232 timesteps.