In [1]:
import os
# Work from the parent of the notebook's directory — presumably so that the
# `feature_utils` import below resolves; confirm against the project layout.
# The starting directory is recorded only once per kernel, so re-running this
# cell always lands in the same place instead of climbing one level higher on
# every execution (which a bare relative chdir would do).
if 'NOTEBOOK_DIR' not in globals():
    NOTEBOOK_DIR = os.getcwd()
os.chdir(os.path.join(NOTEBOOK_DIR, os.pardir))
print('working dir:', os.getcwd())

import sys
import pandas as pd
import numpy as np
import polars as pl
import datetime

from feature_utils import ema, p_change, zscore, las, featclip, sanitise

# Input location of the raw bars and root directory for the generated datasets.
raw_data_path = '/mnt/datassd2/crypto/15minbar'
data_save_path = '/mnt/datassd2/crypto/dl_data/'

# Build the lazy scan first, then materialise it into an in-memory DataFrame.
df = pl.scan_parquet(raw_data_path)
df = df.collect()

# Live trading: recently active symbols, selected with Currency_picking.ipynb.
used_cols = [
    'DOTUSDT', 'SEIUSDT', 'TRXUSDT', 'AUCTIONUSDT', 'ACHUSDT',
    'INJUSDT', 'ARBUSDT', 'XRPUSDT', 'APTUSDT', '1000BONKUSDT',
    'ARKUSDT', 'DOGEUSDT', 'AAVEUSDT', 'CAKEUSDT', 'UNIUSDT',
    'GALAUSDT', 'ARKMUSDT', '1000SHIBUSDT', 'JTOUSDT', 'NEARUSDT',
    'ETCUSDT', 'CRVUSDT', 'XLMUSDT', 'RUNEUSDT', 'ETHUSDT',
    'SANDUSDT', 'FILUSDT', 'APEUSDT', 'LDOUSDT', 'AVAXUSDT',
    'ZENUSDT', 'FETUSDT', 'BAKEUSDT', 'EOSUSDT', 'BNBUSDT',
    'ORDIUSDT', 'MKRUSDT', 'SUIUSDT', '1000SATSUSDT', 'BTCUSDT',
    'BCHUSDT', 'ALGOUSDT', 'HBARUSDT', 'OPUSDT', '1000FLOKIUSDT',
    'ADAUSDT', 'ATOMUSDT', 'WLDUSDT', 'TIAUSDT', '1000PEPEUSDT',
]

working dir: /home/yuheng/mydata_mgmt/nerv_ml/tools


In [2]:
# Live trading only needs recent data: keep rows strictly after 2024-01-01,
# ordered by timestamp.
df = (
    df
    .filter(pl.col('datetime') > datetime.datetime(2024, 1, 1))
    .sort('datetime')
)

In [None]:
# Number of distinct symbols left after the date cut.
df['symbol'].n_unique()

401

In [4]:
# Per-symbol features built from the close price and the traded amount.
# `.over('symbol')` evaluates every expression within its own symbol only.
# NOTE(review): features are computed before duplicate (symbol, datetime) rows
# are removed further down, so any duplicates take part in these expressions —
# confirm that this is intended.
raw_pldf = df.with_columns(
    ema('close', 100).over('symbol').alias('ema'),
    p_change('close', 1).over('symbol').alias('p_change'),
    zscore('close', 100).over('symbol').alias('zscore'),
    ema('amount', 100).over('symbol').alias('ema_amt'),
    p_change('amount', 1).over('symbol').alias('p_change_amt'),
    zscore('amount', 100).over('symbol').alias('zscore_amt'),
)

# Keep only bars whose amount is strictly positive.
pldf = raw_pldf.filter(pl.col('amount') > 0)

feats = ['ema', 'p_change', 'zscore', 'ema_amt', 'p_change_amt', 'zscore_amt']
# `las` is imported from feature_utils — presumably the look-ahead label
# columns; TODO confirm.
cols = feats + las + ['symbol', 'datetime', 'date', 'close']
pldf = pldf.select(cols)

# One row per (symbol, datetime).  keep="first" pins which duplicate survives:
# the polars default (keep="any") gives no guarantee, so the saved dataset could
# differ between runs.  This also matches the pandas drop_duplicates(keep='first')
# used for the cross-sectional dataset later in this notebook.
pldf = pldf.unique(subset=["symbol", "datetime"], keep="first", maintain_order=True)

In [5]:
# Restrict to the selected symbols, drop every row that still has a null in any
# selected column, and pack the feature columns into one list column "feats".
pldf = (
    pldf
    .filter(pl.col('symbol').is_in(used_cols))
    .drop_nulls()
    .with_columns(pl.concat_list(feats).alias("feats"))
)

# Wide layout: one row per datetime, one column per symbol, each cell holding
# that symbol's feature list (null where the symbol has no row at that time).
pivot_feats = pldf.pivot(
    on="symbol",
    index="datetime",
    values="feats",
    maintain_order=True,
)

## 保存数据

1. eod_data(cs_data)
- 处理 eod_data (S,T,F) = (n_stock, n_datetime, n_feats)  
- 维护一个字典，映射 S,T,F 到对应的股票代码/日期/特征，后续预测值通过这个字典可以反向映射回原dataframe

2. mask_data
- 把eod == null的处理为mask

3. gt_data(return_data)
- 取一个 la 标签列存起来即可。注意：下方代码实际 pivot 的是 `la60`，但保存的文件名和打印标签仍写作 la15（`la15_data.pkl`），需确认期望使用的是哪一个标签
4. extra_data 其他数据
- symbols 顺序
- dt 顺序

In [6]:
# Symbol order = every pivot column except the datetime index column.
exp_symbols = [name for name in pivot_feats.columns if name != 'datetime']
# Timestamp order = the pivot's datetime column.
exp_dt = pivot_feats['datetime']

# Lookup tables for mapping array positions back to symbols and timestamps.
extra_data = dict(symbols=exp_symbols, dt=exp_dt.to_list())

# From here on pivot_feats holds only the per-symbol feature columns.
# Note: this rebinding means the cell cannot be re-run without first
# re-running the cell that builds pivot_feats.
pivot_feats = pivot_feats.drop('datetime')

In [7]:
# Task 2: build mask_data — True where a symbol has a feature list at a
# timestamp, False where the pivot cell is null.  Must run before the nulls in
# pivot_feats are filled.  Transposed so that symbols come first.
mask_data = (
    pivot_feats
    .with_columns([~pl.col(sym).is_null() for sym in exp_symbols])
    .to_numpy()
    .T
)

In [8]:
# Fill missing cells with an all-zero feature list of length len(feats).
pivot_feats = pivot_feats.with_columns(
    [pl.col(sym).fill_null([0.] * len(feats)) for sym in exp_symbols]
)

# Task 1: build the cross-sectional feature tensor.
# to_numpy() yields one row per timestamp; stacking a row's per-symbol feature
# lists along axis=1 and collecting all rows gives the timestamp-major array,
# which is then reordered with transpose(2, 0, 1).
feats_numpy = pivot_feats.to_numpy()
feats_array = np.array(
    [np.stack(per_symbol, axis=1) for per_symbol in feats_numpy]
).transpose(2, 0, 1)  # shape: (n_symbols, n_T, n_feats)

In [9]:
# Sanity check: expected layout is (n_symbols, n_T, n_feats).
feats_array.shape

(50, 39308, 6)

In [10]:
# Task 3: build the label array from the "la60" column, using the same
# datetime-by-symbol pivot as the features.
# NOTE(review): the notebook text, the print label and the output file name all
# refer to la15, while the column pivoted here is la60 — confirm which one is
# intended.
la_data = pldf.pivot(
    on="symbol",
    index="datetime",
    values="la60",
    maintain_order=True,
).drop('datetime')

# Missing labels become 0.0; transpose so that symbols come first.
la_array = la_data.fill_null(0.).to_numpy().T

In [11]:
# Report the shapes of the three arrays that are about to be saved.
print(f'feats_array.shape: {feats_array.shape}')
print(f'mask_data.shape: {mask_data.shape}')
print(f'la15_array.shape: {la_array.shape}')

feats_array.shape: (50, 39308, 6)
mask_data.shape: (50, 39308)
la15_array.shape: (50, 39308)


In [12]:
# Output directory for the (symbol, time, feature) dataset.
# os.path.join gives the right path whether or not data_save_path ends with a
# separator; plain string concatenation silently relied on the trailing slash.
dataset_path = os.path.join(data_save_path, 'stock_mix_data')
# exist_ok=True creates the directory only when it is missing, without a
# separate exists() check that could race with another process.
os.makedirs(dataset_path, exist_ok=True)
print('save to:', dataset_path)

save to: /mnt/datassd2/crypto/dl_data/stock_mix_data


In [13]:
import pickle

# Write each artefact to its own pickle file inside dataset_path.
# NOTE(review): "la15_data.pkl" receives la_array, which an earlier cell builds
# from the "la60" column — confirm the intended label.
artifacts = {
    "cs_data.pkl": feats_array,
    "mask_data.pkl": mask_data,
    "la15_data.pkl": la_array,
    "extra_data.pkl": extra_data,
}
for filename, payload in artifacts.items():
    with open(os.path.join(dataset_path, filename), 'wb') as f:
        pickle.dump(payload, f)

In [14]:
# Summarise the lookup tables: available keys and the length of each list.
print("extra_data.keys(): ", extra_data.keys())
for key in ('dt', 'symbols'):
    print(f"len(extra_data['{key}']): ", len(extra_data[key]))

extra_data.keys():  dict_keys(['symbols', 'dt'])
len(extra_data['dt']):  39308
len(extra_data['symbols']):  50


## 预处理MLP用的截面数据

In [15]:
import numpy as np
import os
import pickle
import polars as pl

# Lazily scan the raw bars, keep rows strictly after 2024-01-01, order by time.
# Note: this rebinds `df`, replacing the frame used by the earlier cells.
df = (
    pl.scan_parquet(raw_data_path)
    .filter(pl.col('datetime') > datetime.datetime(2024, 1, 1))
    .sort(by='datetime')
)

# Label columns to pass through featclip.
# Note: this rebinds `las`, shadowing the name imported from feature_utils.
las = ['la1', 'la2', 'la5', 'la10', 'la15', 'la30',
       'la60', 'la120', 'la180', 'la240', 'la300', 'la360']
df = df.with_columns([featclip(pl.col(la)).alias(la) for la in las])
# Single factor for this dataset: ema('close', 100), evaluated within each symbol.
df = df.with_columns(ema('close', 100).over('symbol').alias('ema'))

# Materialise, switch to pandas, and keep the first row of each
# (datetime, symbol) pair so that the pivots below have unique keys.
df = (
    df.collect()
    .to_pandas()
    .drop_duplicates(subset=['datetime', 'symbol'], keep='first')
)

# Wide frames (rows = datetime, columns = symbol); missing cells become 0.
factors_df = df.pivot(index='datetime', columns='symbol', values='ema').fillna(0)
la_df = df.pivot(index='datetime', columns='symbol', values='la240').fillna(0)

# Restrict both frames to the selected symbols, in used_cols order.
factors_df_top50 = factors_df[used_cols].copy()
la_df_top50 = la_df[used_cols].copy()
X, y = factors_df_top50, la_df_top50

# Lookup tables; here they are stored as a pandas Index (symbols) and a NumPy
# array (dt), unlike the plain lists saved with the first dataset.
exp_symbols = X.columns
exp_dt = X.index.values
extra_data = dict(symbols=exp_symbols, dt=exp_dt)

In [16]:
# Output directory for the cross-sectional (MLP) dataset.
# os.path.join gives the right path whether or not data_save_path ends with a
# separator; plain string concatenation silently relied on the trailing slash.
dataset_path = os.path.join(data_save_path, 'cs_dataset')
# exist_ok=True creates the directory only when it is missing, without a
# separate exists() check that could race with another process.
os.makedirs(dataset_path, exist_ok=True)
print('save to:', dataset_path)

save to: /mnt/datassd2/crypto/dl_data/cs_dataset


In [17]:
# Persist the factor and label frames as parquet, plus the lookup tables as a pickle.
for frame, filename in ((X, "X.parquet"), (y, "y.parquet")):
    frame.to_parquet(os.path.join(dataset_path, filename))

extra_data_path = os.path.join(dataset_path, "extra_data.pkl")
with open(extra_data_path, 'wb') as f:
    pickle.dump(extra_data, f)

In [18]:
# Read the saved factors back as a plain NumPy array.
# Note: this rebinds X from a DataFrame to an ndarray, so the save cell above
# cannot be re-run afterwards without rebuilding X.
x_path = os.path.join(dataset_path, "X.parquet")
X = pd.read_parquet(x_path).values

In [19]:
# Shape of the reloaded factor matrix (rows = timestamps, columns = symbols).
X.shape

(39455, 50)

In [20]:
# Shape of the label frame (still the pandas DataFrame built from la240).
y.shape

(39455, 50)