# Data Manipulator

Manipulates the raw data and stores the results as intermediate files.

## Stock Pool

In [14]:
import pandas as pd

klines = pd.read_parquet('kline_daily/kline_daily.parquet')

# Flag bars that touched a price limit: the high is within 0.01 of the
# limit-up price, or the low is within 0.01 of the limit-down price.
# Comparisons against NaN evaluate to False, so such rows are not flagged.
upstop = (klines['limit_up'] - klines['high']).lt(1e-2)
downstop = (klines['low'] - klines['limit_down']).lt(1e-2)
klines = klines.loc[~(upstop | downstop)]

# Among the remaining bars, drop those whose `money` is below 3e7
# (presumably the traded amount -- confirm the unit against the data source).
low_liquidity = klines['money'].lt(3e7)
klines = klines.loc[~low_liquidity]

# Persist only the (date, stock_code) pairs that survived both filters.
pool = klines.reset_index()
pool[['date', 'stock_code']].to_parquet('stock_pool/stock_liquid_updown.parquet')

## Forward Return

Calculate the forward return to use as the label later.

### Close to Close Return

This type of return is not robust and may severely affect model performance; use with caution.

In [None]:
import pandas as pd

data = pd.read_parquet('./kline_daily/', columns=['close', 'adjfactor'])

# Remove repeated index entries BEFORE shifting. The shifts below are
# positional within each group, so a duplicated row would displace the
# t+1 / t+2 offsets and silently corrupt the labels of neighbouring rows.
data = data.loc[~data.index.duplicated(keep='first')]

# Price adjusted by the adjustment factor.
close = data['close'] * data['adjfactor']

# Label at row t is close[t+2] / close[t+1] - 1, shifted within each
# level-1 group (presumably one group per stock, rows ordered by date --
# confirm the input is sorted). The last two rows of every group have no
# forward prices and are dropped.
by_stock = close.groupby(level=1)
ret = (by_stock.shift(-2) / by_stock.shift(-1) - 1).dropna()

# Z-score within each level-0 group (presumably one group per date).
# `transform` returns a result aligned to the original index, whereas
# `groupby.apply` prepends the group key as an extra index level on
# pandas >= 2.0.
by_date = ret.groupby(level=0)
nret = (ret - by_date.transform('mean')) / by_date.transform('std')

# NOTE(review): these output paths differ from the '../data/intermediate/forward_return/'
# layout used by the VWAP and open-return cells -- confirm which one is intended.
ret.to_frame(name='label').to_parquet('./intermediate/forward_return_1d_close_close.parquet')
nret.to_frame(name='label').to_parquet('./intermediate/forward_return_1d_close_close_normalized.parquet')

### VWAP Return

This type of return is more plausible and closer to the true market environment.

In [3]:
import pandas as pd

data = pd.read_parquet('./derivative_indicators/vwap.parquet')

# Label at row t is vwap[t+2] / vwap[t+1] - 1, shifted within each level-1
# group (presumably one group per stock, rows ordered by date -- confirm).
# Rows containing any NaN (e.g. the last two rows of every group) are dropped.
by_stock = data.groupby(level=1)
vret = (by_stock.shift(-2) / by_stock.shift(-1) - 1).dropna()
vret.to_parquet('../data/intermediate/forward_return/1d_vwap_vwap.parquet')

# Z-score within each level-0 group (presumably one group per date).
# `transform` returns a result aligned to the original index, whereas
# `groupby.apply` prepends the group key as an extra index level on
# pandas >= 2.0.
by_date = vret.groupby(level=0)
nvret = (vret - by_date.transform('mean')) / by_date.transform('std')
nvret.to_parquet('../data/intermediate/forward_return/1d_vwap_vwap_normalized.parquet')

### Open Return

This type of return performs better, and it matches how we actually trade: orders are placed at the open.

In [4]:
import pandas as pd

data = pd.read_parquet('./kline_daily/', columns=['open', 'adjfactor'])

# Remove repeated index entries BEFORE shifting. The shifts below are
# positional within each group, so a duplicated row would displace the
# t+1 / t+2 offsets and silently corrupt the labels of neighbouring rows.
data = data.loc[~data.index.duplicated(keep='first')]

# Price adjusted by the adjustment factor.
popen = data['open'] * data['adjfactor']

# Label at row t is open[t+2] / open[t+1] - 1, shifted within each level-1
# group (presumably one group per stock, rows ordered by date -- confirm the
# input is sorted). The last two rows of every group have no forward prices
# and are dropped.
by_stock = popen.groupby(level=1)
oret = (by_stock.shift(-2) / by_stock.shift(-1) - 1).dropna()
oret.to_frame(name='label').to_parquet('../data/intermediate/forward_return/1d_open_open.parquet')

# Z-score within each level-0 group (presumably one group per date).
# `transform` returns a result aligned to the original index, whereas
# `groupby.apply` prepends the group key as an extra index level on
# pandas >= 2.0.
# From here on `oret` holds the NORMALIZED series (the name is rebound).
by_date = oret.groupby(level=0)
oret = (oret - by_date.transform('mean')) / by_date.transform('std')
oret.to_frame(name='label').to_parquet('../data/intermediate/forward_return/1d_open_open_normalized.parquet')