## Introduction

In this notebook, we first run anomaly detection on trading volume.

Then we check how the price moves afterwards using an "aftermath" plot!

## Exploratory Analysis

In [None]:
import os # accessing directory structure

import matplotlib.pyplot as plt # plotting
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from scipy import sparse, stats
from scipy.sparse.linalg import spsolve # sparse linear solver used by hp_filter
from tqdm.auto import tqdm # progress bar for the walk-forward loop

There is 1 csv file in the current version of the dataset:


In [None]:
# Print the full path of every file found below the Kaggle input directory.
for root, _, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(root, name))

### Let's check the first file: /kaggle/input/all_japanese_stocks.csv

In [None]:
# Read the raw table, then rename its first column to "code" and parse
# the "Date" column into datetimes.
df1 = pd.read_csv('/kaggle/input/all_japanese_stocks.csv', delimiter=',')
df1.dataframeName = 'all_japanese_stocks.csv'
columns = ["code", *df1.columns[1:]]
df1.columns = columns
df1["Date"] = pd.to_datetime(df1["Date"])
nRow, nCol = df1.shape
print(f'There are {nRow} rows and {nCol} columns')

Let's take a quick look at what the data looks like:

In [None]:
# Preview the first rows (head() shows 5 rows by default).
df1.head()

In [None]:
# Index by (code, Date) and sort, so each stock's rows are contiguous and in
# date order before any rolling statistic is computed.
df = df1.set_index(["code", "Date"]).sort_index()

# 10-row rolling mean of Close, computed separately for every stock code.
# Rolling over the whole stacked column would let the window straddle two
# different stocks at each code boundary.
close_ma10 = df.groupby(level="code")["Close"].transform(
    lambda s: s.rolling(10).mean()
)

# Keep rows that traded (Volume > 0) and whose Close is below twice that
# rolling mean. Rows where the rolling mean is still NaN (the first 9 rows
# of each stock) fail the comparison and are dropped.
# .copy() makes the result an independent frame so the column assignment
# below does not write into a slice of the unfiltered data.
df = df[(df["Volume"] > 0) & (df["Close"] < close_ma10*2)].copy()
df["TurnOver"] = df["Close"]*df["Volume"]
print(df.shape)
df.head(5)

In [None]:
# Wide turnover table: rows are dates, columns are stock codes.
turnover = (
    df[["TurnOver"]]
    .unstack()                    # Date moves to the columns: (TurnOver, Date)
    .T
    .droplevel(0)                 # keep only the Date level on the row index
    .sort_index()
    .dropna(thresh=2000, axis=1)  # keep codes with at least 2000 non-NA rows
    .ffill()
    .dropna()                     # drop dates that still contain a gap
)
turnover.head(5)

In [None]:
# Wide volume table: rows are dates, columns are stock codes.
volume = (
    df[["Volume"]]
    .unstack()                    # Date moves to the columns: (Volume, Date)
    .T
    .droplevel(0)                 # keep only the Date level on the row index
    .sort_index()
    .dropna(thresh=2000, axis=1)  # keep codes with at least 2000 non-NA rows
    .ffill()
    .dropna()                     # drop dates that still contain a gap
)
volume.head(5)

In [None]:
# Average turnover per code, then the 1000 codes with the largest average
# (ties at the cut-off are resolved in favour of the last occurrence).
mean_turnover = turnover.mean()
target_stocks = mean_turnover.nlargest(n=1000, keep='last')
target_stocks.index

In [None]:
# Wide close-price table: rows are dates, columns are stock codes.
close_price = (
    df[["Close"]]
    .unstack()                    # Date moves to the columns: (Close, Date)
    .T
    .droplevel(0)                 # keep only the Date level on the row index
    .sort_index()
    .dropna(thresh=2000, axis=1)  # keep codes with at least 2000 non-NA rows
    .ffill()
    .dropna()                     # drop dates that still contain a gap
)
first_date = close_price.index[0]
# Restrict the columns to the selected target stocks.
close_price = close_price[target_stocks.index]
close_price.head(5)

In [None]:
# Wide open-price table: rows are dates, columns are stock codes.
open_price = (
    df[["Open"]]
    .unstack()                    # Date moves to the columns: (Open, Date)
    .T
    .droplevel(0)                 # keep only the Date level on the row index
    .sort_index()
    .dropna(thresh=2000, axis=1)  # keep codes with at least 2000 non-NA rows
    .ffill()
    .dropna()                     # drop dates that still contain a gap
)
first_date = open_price.index[0]
# Restrict the columns to the selected target stocks.
open_price = open_price[target_stocks.index]
open_price.head(5)

# Anomaly Detection

In this notebook, let's try anomaly detection, referring to https://www.kaggle.com/liubenyuan/time-series-and-anomaly-detection

Anomaly detection (AD) using the Hodrick-Prescott (HP) filter and a MAD (mean absolute deviation) test

In [None]:
# Hodrick Prescott filter
def hp_filter(x, lamb=5000):
    """
    Return the Hodrick-Prescott trend of a 1-D series.

    Solves (I + lamb * D'D) * trend = x, where D is the (w-2) x w
    second-difference operator with rows [1, -2, 1]. A larger `lamb`
    penalises curvature more strongly and gives a smoother trend.

    x      : 1-D array-like (np.ndarray, pd.Series, list) of length w
    lamb   : penalty weight on the second differences of the trend
    return : np.ndarray of length w with the trend component
    """
    values = np.asarray(x, dtype=float)
    w = len(values)
    if w < 3:
        # No second difference exists, so the penalty term vanishes and the
        # system reduces to trend = x.
        return values.copy()
    b = [[1]*w, [-2]*w, [1]*w]
    D = sparse.spdiags(b, [0, 1, 2], w-2, w)
    I = sparse.eye(w)
    # spsolve expects CSC/CSR input; convert once instead of relying on an
    # implicit conversion inside the solver.
    B = (I + lamb*(D.transpose() @ D)).tocsc()
    return spsolve(B, values)


def mad(data, axis=None):
    """
    Mean absolute deviation about the mean.

    data   : array-like
    axis   : axis along which to reduce; None reduces over all elements
    return : scalar when axis is None, otherwise an array with `axis` removed
    """
    data = np.asarray(data)
    # keepdims keeps the reduced axis as length 1 so the mean broadcasts
    # against `data` for any axis, not only the last one.
    center = np.mean(data, axis=axis, keepdims=True)
    return np.mean(np.abs(data - center), axis=axis)


def AnomalyDetection(x, alpha=0.2, lamb=5000):
    """
    Flag points whose detrended residual is unusually large.

    x         : pd.Series or 1-D np.ndarray
    alpha     : The level of statistical significance with which to
                accept or reject anomalies. (expon distribution)
    lamb      : penalize parameter for hp filter
    return r  : pd.Index holding the labels of the anomalous points
                (integer positions when x is a plain array)
    """
    # calculate residual: the series minus its HP-filter trend
    xhat = hp_filter(x, lamb=lamb)
    resid = x - xhat

    # drop NA values
    ds = pd.Series(resid).dropna()

    # Score each point by its distance from the residual median, scaled by
    # the mean absolute deviation of the residuals. Both statistics are
    # unchanged by adding a constant, so no extra centring step is applied.
    ares = (ds - ds.median()).abs()
    # Computed explicitly because Series.mad() no longer exists in pandas 2.x;
    # the small constant avoids a division by zero on flat residuals.
    data_sigma = (ds - ds.mean()).abs().mean() + 1e-12
    ares = ares/data_sigma

    # compute significance: upper end of the central (1 - alpha) interval of
    # an exponential distribution located/scaled by the score mean and std
    p = 1. - alpha
    R = stats.expon.interval(p, loc=ares.mean(), scale=ares.std())
    threshold = R[1]

    # labels of every point whose score exceeds the threshold
    r_id = ares.index[ares > threshold]

    return r_id

Sample usage of this method

In [None]:
np.random.seed(42)

# sample signals: one sine period plus small Gaussian noise
N = 1024  # number of sample points
t = np.linspace(0, 2*np.pi, N)
noise = 0.02*np.random.randn(N)
y = np.sin(t) + noise

# outliers are assumed to be step/jump events at sampling points
M = 3  # number of outliers
jump_starts = np.random.rand(M)*N   # drawn first, as positions in [0, N)
jump_sizes = np.random.randn(M)     # drawn second, as step heights
for start, size in zip(jump_starts, jump_sizes):
    # shift everything from the jump position onwards
    y[int(start):] += size

# detect anomaly
r_idx = AnomalyDetection(y, alpha=0.1)

# plot the result: signal in blue, detected points in red
plt.figure()
plt.plot(y, 'b-')
plt.plot(r_idx, y[r_idx], 'ro')

So let's apply this anomaly detection method to the volume data.

In this notebook, we use a walk-forward approach to detect anomalies step by step.

In [None]:
# Walk-forward scan: at each position n only the previous `num_data` rows
# are passed to the detector, and the last row of that window is recorded
# when the detector flags it.
y = volume[target_stocks.index[0]]

N = len(y)
num_data = 100  # look-back window length, in rows
step = 1#N//10

r_idx = []

# The scan also stops `num_data` rows before the end of the series.
for n in tqdm(range(num_data, N-num_data)):
    # n >= num_data, so the window always holds exactly num_data rows;
    # .iloc makes the positional slicing explicit.
    window = y.iloc[n-num_data:n]
    _r_idx = AnomalyDetection(window, alpha=0.1)
    if len(_r_idx) > 0 and _r_idx[-1] == window.index[-1]:
        r_idx.append(_r_idx[-1])

fig, ax = plt.subplots(figsize=(15, 5))

ax.plot(y, color='b',linestyle='-', label="volume --code : {} --step : {}days".format(target_stocks.index[0], step))
ax.scatter(r_idx, y.loc[r_idx], color='r', marker='o')
ax.grid(True)
ax.legend()

In [None]:
price = open_price[target_stocks.index[0]]

# For every detected date: slice the open prices from that date onwards by
# label, skip the first row of the slice and keep up to the next 30 rows.
aftermath = pd.DataFrame(
    [price[index:].iloc[1:31].values for index in r_idx]
).T
# Express every path relative to its own first value.
aftermath = aftermath / aftermath.iloc[0]
print(aftermath.shape)
aftermath.head(5)

In [None]:
# One line per detected anomaly, each normalised to start at 1.
aftermath.plot.line(figsize=(15, 5), grid=True)

In [None]:
# Average normalised path across all detected anomalies.
mean_path = aftermath.mean(axis="columns")
mean_path.plot.line(figsize=(15, 5), grid=True)

In [None]:
_open_price = open_price[target_stocks.index[0]]
_close_price = close_price[target_stocks.index[0]]

# Keep only anomalies on days that closed above their open. For each one,
# slice the open prices from that date onwards by label, skip the first row
# of the slice and keep up to the next 30 rows.
# The paths are taken from this cell's own _open_price, so the cell does not
# depend on a variable left behind by another cell.
up_aftermath = [
    _open_price[index:].iloc[1:31].values
    for index in r_idx
    if _close_price[index] > _open_price[index]
]

up_aftermath = pd.DataFrame(up_aftermath).T
# Express every path relative to its own first value.
up_aftermath = up_aftermath / up_aftermath.iloc[0]
up_aftermath.plot(figsize=(15, 2), grid=True, alpha=0.5, legend=False)
plt.show()
# legend=True so the "average" label is actually drawn.
up_aftermath.mean(axis=1).plot(figsize=(15, 2), grid=True, label="average", color="black", legend=True)

In [None]:
_open_price = open_price[target_stocks.index[0]]
_close_price = close_price[target_stocks.index[0]]

# Keep only anomalies on days that closed below their open. For each one,
# slice the open prices from that date onwards by label, skip the first row
# of the slice and keep up to the next 30 rows.
# The paths are taken from this cell's own _open_price, so the cell does not
# depend on a variable left behind by another cell.
down_aftermath = [
    _open_price[index:].iloc[1:31].values
    for index in r_idx
    if _close_price[index] < _open_price[index]
]

down_aftermath = pd.DataFrame(down_aftermath).T
# Express every path relative to its own first value.
down_aftermath = down_aftermath / down_aftermath.iloc[0]
down_aftermath.plot(figsize=(15, 2), grid=True, alpha=0.5, legend=False)
plt.show()
# legend=True so the "average" label is actually drawn.
down_aftermath.mean(axis=1).plot(figsize=(15, 2), grid=True, label="average", color="black", legend=True)

## Conclusion

Anomaly detection on volume might be helpful for extracting alpha.

As we see in the last graph, the price (the plotted series are open prices) tends to go up during the 30 days after an anomaly in volume.

Good Luck!