In [1]:
import datetime
import requests
from lxml import etree
import pandas as pd
import akshare as ak
import time
from itertools import combinations

In [2]:
# Trading calendar, restricted to sessions on or after 2015-04-16.
# NOTE(review): depending on the akshare version, 'trade_date' may hold strings
# or datetime.date objects; comparing date objects with a plain string raises
# TypeError, so both sides are normalised to pandas timestamps first.
date = ak.tool_trade_date_hist_sina()
date = date.loc[pd.to_datetime(date['trade_date']) >= pd.Timestamp('2015-04-16')]

In [3]:
import numpy as np
import pandas as pd
import tensorflow as tf
import tensorflow.keras as keras
import matplotlib.pyplot as plt
import shap

from tensorflow.keras import models, layers

In [4]:
df = pd.read_csv("SSE50_2010_2021.csv")

In [5]:
# Keep only the columns used downstream. The explicit .copy() makes the result
# an independent frame, so adding columns to it later (e.g. df['return'])
# does not trigger pandas' SettingWithCopyWarning.
df = df[['amount','volume','low','trade_date','open','change','high','close']].copy()

In [6]:
df['return'] = df['close'].pct_change()

In [7]:
df

Unnamed: 0,amount,volume,low,trade_date,open,change,high,close,return
0,3.646852e+10,2.732744e+09,2514.2370,2010-01-04,2565.1080,-39.1540,2570.1520,2514.6460,
1,5.077912e+10,3.537744e+09,2487.0480,2010-01-05,2526.2910,29.3450,2560.6670,2543.9910,0.011670
2,4.344433e+10,2.964615e+09,2513.3650,2010-01-06,2538.2850,-29.9770,2549.5710,2514.0140,-0.011783
3,4.461141e+10,3.204318e+09,2450.4020,2010-01-07,2516.0570,-50.2560,2527.4270,2463.7580,-0.019990
4,3.233506e+10,2.268341e+09,2432.0130,2010-01-08,2453.0610,2.4070,2470.0240,2466.1650,0.000977
...,...,...,...,...,...,...,...,...,...
2767,9.658566e+10,3.455940e+09,3643.9848,2021-05-26,3656.3403,15.8980,3678.0671,3657.1898,0.004366
2768,8.514700e+10,2.987912e+09,3628.9596,2021-05-27,3645.4823,7.7519,3709.0023,3664.9417,0.002120
2769,8.046883e+10,2.868285e+09,3630.5739,2021-05-28,3663.8753,-5.6460,3689.2479,3659.2957,-0.001541
2770,7.791793e+10,2.632840e+09,3615.4225,2021-05-31,3652.8877,-9.0712,3652.8924,3650.2245,-0.002479


In [8]:
def generate_ma(df,window_size = 10):
    """Add a trailing simple moving average of ``close`` as column ``'{window_size}_MA'``.

    The value stored at row position ``t`` is the plain average of the
    ``window_size`` closes at positions ``t - window_size .. t - 1``; the
    close of row ``t`` itself is not part of the window.  The first
    ``window_size`` rows are NaN.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a numeric ``'close'`` column.  It is not modified.
    window_size : int
        Number of preceding rows that are averaged.

    Returns
    -------
    pandas.DataFrame
        A copy of ``df`` with the extra column.
    """
    new_df = df.copy()
    closes = new_df['close'].to_numpy(dtype=float)
    n_rows = closes.shape[0]
    ma = np.full(n_rows, np.nan)
    # Values are written by row position, so the result does not depend on the
    # frame carrying a default 0..n-1 index, and no per-row index mask is built.
    for end in range(window_size, n_rows):
        ma[end] = np.average(closes[end - window_size:end])
    # The column is always created, even when the frame is shorter than the window.
    new_df['{}_MA'.format(window_size)] = ma
    return new_df
    

In [9]:
def generate_ema(df,window_size = 10):
    """Add a trailing weighted moving average of ``close`` as column ``'{window_size}_EMA'``.

    Despite the column name, the weights are linear (``1, 2, ..., window_size``,
    the most recent close weighted highest), not exponential.  The value stored
    at row position ``t`` uses the closes at positions
    ``t - window_size .. t - 1``; the first ``window_size`` rows are NaN.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a numeric ``'close'`` column.  It is not modified.
    window_size : int
        Number of preceding rows in each window.

    Returns
    -------
    pandas.DataFrame
        A copy of ``df`` with the extra column.
    """
    new_df = df.copy()
    weights = np.arange(window_size) + 1
    closes = new_df['close'].to_numpy(dtype=float)
    n_rows = closes.shape[0]
    ema = np.full(n_rows, np.nan)
    # Plain ndarrays are used on purpose: a NaN close propagates into the result
    # instead of being silently dropped from the numerator by pandas' skipna sum.
    # Values are written by row position, independent of the frame's index labels.
    for end in range(window_size, n_rows):
        window = closes[end - window_size:end]
        ema[end] = np.sum(weights * window) / np.sum(weights)
    new_df['{}_EMA'.format(window_size)] = ema
    return new_df

In [10]:
def generate_sma(df, coef, window_size = 10):
    """Add a recursively smoothed average of ``close`` as column ``'{window_size}_SMA'``.

    For every window of ``window_size`` consecutive closes the recursion
    ``Y = (coef * price + (window_size - coef) * Y) / window_size`` is seeded
    with the first close of the window and applied to the remaining ones.
    The result is stored at the row position right after the window, so the
    first ``window_size`` rows are NaN.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a numeric ``'close'`` column.  It is not modified.
    coef : float
        Weight given to the newest price in each recursion step.
    window_size : int
        Number of preceding rows in each window (also the divisor of the recursion).

    Returns
    -------
    pandas.DataFrame
        A copy of ``df`` with the extra column.
    """
    new_df = df.copy()
    closes = new_df['close'].to_numpy(dtype=float)
    n_rows = closes.shape[0]
    sma = np.full(n_rows, np.nan)
    # Values are written by row position, independent of the frame's index labels.
    for end in range(window_size, n_rows):
        window = closes[end - window_size:end]
        smoothed = window[0]
        for price in window[1:]:
            smoothed = (price * coef + (window_size - coef) * smoothed) / window_size
        sma[end] = smoothed
    new_df['{}_SMA'.format(window_size)] = sma
    return new_df

In [11]:
def generate_data(df,window_size,features,rolling_step,return_lagging):
    """Cut ``df`` into rolling feature windows with one label per window.

    Each sample is the ``features`` columns of ``window_size`` consecutive
    rows, transposed so that its shape is ``(len(features), window_size)``.
    Its label is the ``'IndexLogDailyReturn'`` value of the row located
    ``window_size + return_lagging - 1`` positions after the window start.
    Consecutive windows start ``rolling_step`` rows apart.

    Returns
    -------
    (X, y) : tuple of numpy.ndarray
        Stacked samples and their labels.
    """
    n_rows = df.shape[0]
    # Distance (in rows) between the start of a window and its label row.
    label_offset = window_size + return_lagging - 1
    windows, labels = [], []
    start = 0
    while start + label_offset < n_rows:
        window_values = df.iloc[start:start + window_size][features].T.values
        label = df.iloc[start + label_offset]['IndexLogDailyReturn']
        windows.append(window_values)
        labels.append(label)
        start += rolling_step
    return np.array(windows), np.array(labels)

In [12]:
def generate_rsi(df, window_size = 14):
    """Add an RSI-style indicator of ``close`` as column ``'{window_size}_RSI'``.

    For every window of ``window_size`` consecutive closes, the day-to-day
    differences are smoothed twice with the recursion
    ``Y = (x + (window_size - 1) * Y) / window_size``: once for the positive
    part of the differences and once for their absolute values.  The
    indicator is ``100 * smoothed_positive / smoothed_absolute`` and is stored
    at the row position right after the window, so the first ``window_size``
    rows are NaN.

    A window whose smoothed absolute change is zero (a completely flat window)
    has no defined ratio; it yields NaN explicitly instead of a 0/0 division
    with a RuntimeWarning.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a numeric ``'close'`` column.  It is not modified.
    window_size : int
        Number of preceding rows in each window; needs to be at least 2 so
        that the window contains one price difference.

    Returns
    -------
    pandas.DataFrame
        A copy of ``df`` with the extra column.
    """
    new_df = df.copy()
    closes = new_df['close'].to_numpy(dtype=float)
    n_rows = closes.shape[0]
    rsi_values = np.full(n_rows, np.nan)
    # Values are written by row position, independent of the frame's index labels.
    for end in range(window_size, n_rows):
        diff = np.diff(closes[end - window_size:end])
        abs_diff = np.abs(diff)
        # Separate array for the gains (negative moves clipped to 0); NaN stays NaN.
        pos_diff = np.where(diff < 0, 0.0, diff)
        smoothed_abs = abs_diff[0]
        smoothed_pos = pos_diff[0]
        for j in range(1, window_size - 1):
            smoothed_abs = (abs_diff[j] * 1 + (window_size - 1) * smoothed_abs) / window_size
            smoothed_pos = (pos_diff[j] * 1 + (window_size - 1) * smoothed_pos) / window_size
        if smoothed_abs != 0:
            rsi_values[end] = smoothed_pos / smoothed_abs * 100
    new_df['{}_RSI'.format(window_size)] = rsi_values
    return new_df
        
        

In [13]:
def generate_vwap(df):
    """Append a cumulative volume-weighted average price column named ``'VWAP'``.

    The per-row price is the mean of ``open``, ``high`` and ``low``
    (NOTE(review): ``close`` is not used - confirm this is intended).
    Both the price*volume products and the volumes are accumulated from the
    first row of ``df``, and their ratio is the VWAP of each row.

    Returns a new frame: ``df`` with the ``'VWAP'`` column concatenated on the right.
    """
    bar_price = (df['open'] + df['high'] + df['low'])/3
    volume = df['volume']
    running_volume = np.cumsum(volume)
    running_turnover = np.cumsum(bar_price * volume)
    vwap = pd.Series(running_turnover / running_volume).rename('VWAP')
    return pd.concat((df, vwap),axis = 1)

In [14]:
df.iloc[10:100]

Unnamed: 0,amount,volume,low,trade_date,open,change,high,close,return
10,4.186666e+10,3.093634e+09,2398.778,2010-01-18,2414.893,-6.619,2421.890,2421.188,-0.002726
11,3.754464e+10,2.782908e+09,2424.161,2010-01-19,2424.469,11.525,2450.281,2432.713,0.004760
12,4.572057e+10,3.474848e+09,2356.161,2010-01-20,2436.647,-73.228,2439.468,2359.485,-0.030101
13,3.780439e+10,2.919615e+09,2346.148,2010-01-21,2362.597,18.268,2391.508,2377.753,0.007742
14,4.904208e+10,4.063927e+09,2317.481,2010-01-22,2344.973,2.302,2398.989,2380.055,0.000968
...,...,...,...,...,...,...,...,...,...
95,1.434550e+10,1.385330e+09,1948.403,2010-05-26,1962.926,-0.515,1974.787,1957.943,-0.000263
96,2.013428e+10,1.899476e+09,1927.671,2010-05-27,1952.966,25.009,1987.882,1982.952,0.012773
97,1.943545e+10,1.859205e+09,1964.408,2010-05-28,2001.195,-9.330,2005.398,1973.622,-0.004705
98,1.744814e+10,1.757968e+09,1924.893,2010-05-31,1961.699,-47.513,1975.587,1926.109,-0.024074


In [15]:
df = generate_vwap(df)

In [16]:
df = generate_ma(df,10)

In [17]:
df = generate_ema(df,10)

In [18]:
df = generate_sma(df,3,10)

In [19]:
df = generate_rsi(df,10)

In [21]:
# Drop the first 10 rows (the indicators above were generated with a window of
# 10, so those rows carry no indicator values), index the frame by trade date
# and transpose it so that rows are features and columns are dates.
# One chained expression instead of set_index(..., inplace=True) on a slice:
# that in-place call on the result of .iloc can trigger SettingWithCopyWarning.
df = df.iloc[10:].set_index('trade_date').T

In [22]:
# Rescale the two largest-magnitude rows to millions.
# NOTE: this cell is not idempotent - running it twice divides the rows twice.
df.loc['amount'] = df.loc['amount'] / 1_000_000
df.loc['volume'] = df.loc['volume'] / 1_000_000

In [23]:
df.columns

Index(['2010-01-18', '2010-01-19', '2010-01-20', '2010-01-21', '2010-01-22',
       '2010-01-25', '2010-01-26', '2010-01-27', '2010-01-28', '2010-01-29',
       ...
       '2021-05-19', '2021-05-20', '2021-05-21', '2021-05-24', '2021-05-25',
       '2021-05-26', '2021-05-27', '2021-05-28', '2021-05-31', '2021-06-01'],
      dtype='object', name='trade_date', length=2762)

In [24]:
# rtdays=5 means the target value would be the 5-day return
def generate_samples(data, rtdays=5, width=30, strides=1, price_row=6):
    """Slice a feature-by-date frame into fixed-width samples with forward-return targets.

    Windows are anchored from the right: the last window ends at column
    ``data.shape[1] - 1 - rtdays`` and earlier windows end ``strides`` columns
    apart.  The three returned lists are in chronological order.

    Parameters
    ----------
    data : pandas.DataFrame
        Rows are features, columns are dates.
    rtdays : int
        Horizon of the target: the target of a window ending at column ``i``
        is ``(price[i + rtdays] - price[i]) / price[i]``.
    width : int
        Number of columns in every sample.
    strides : int
        Distance in columns between the ends of consecutive windows; must be >= 1.
    price_row : int or row label
        Row of ``data`` holding the price used for the target.  An integer is
        a row position (the default ``6`` keeps the previous hard-coded
        behaviour); any other value is looked up as a row label, e.g. ``'close'``.

    Returns
    -------
    (dates, samples, targets) : tuple of lists
        ``dates[k]`` is the column label of the last column of ``samples[k]``,
        ``samples[k]`` is a ``width``-column slice of ``data`` and
        ``targets[k]`` is the matching forward return.

    Raises
    ------
    ValueError
        If ``strides`` is smaller than 1 (the window anchor would never move).
    """
    if strides < 1:
        raise ValueError("strides must be a positive integer")

    if isinstance(price_row, (int, np.integer)):
        prices = data.iloc[price_row].to_numpy()
    else:
        prices = data.loc[price_row].to_numpy()

    dates = []
    samples = []
    targets = []

    # Last usable window end: rtdays columns must remain after it for the target.
    last_end = data.shape[1] - 1 - rtdays

    # Window ends go last_end, last_end - strides, ... while a full window
    # still fits; reversed() yields them in chronological order.
    for i in reversed(range(last_end, width - 2, -strides)):
        rt = (prices[i + rtdays] - prices[i]) / prices[i]

        dates.append(data.columns[i])
        samples.append(data.iloc[:, i - width + 1:i + 1])
        targets.append(rt)

    return dates, samples, targets

In [25]:
dates, samples, targets = generate_samples(df, strides=2)

In [27]:
def ts_corr(X, Y, d=2, s=1):
    """Rolling Pearson correlation between ``X`` and ``Y``.

    Windows of length ``d`` are taken from the end of the series towards the
    start, ``s`` elements apart, so the first output value belongs to the most
    recent window.  A NaN coefficient (e.g. one window is constant) becomes 0.

    Returns a float32 array with one coefficient per window.
    """
    coefficients = []
    start = len(X) - d

    while start >= 0:
        stop = start + d
        coef = np.corrcoef(X[start:stop], Y[start:stop])[0, 1]
        coefficients.append(0 if np.isnan(coef) else coef)
        start -= s

    return np.array(coefficients, dtype='float32')
def ts_cov(X, Y, d=2, s=1):
    """Rolling covariance between ``X`` and ``Y`` (off-diagonal entry of ``np.cov``).

    Windows of length ``d`` are taken from the end of the series towards the
    start, ``s`` elements apart, so the first output value belongs to the most
    recent window.

    Returns a float32 array with one covariance per window.
    """
    covariances = []
    start = len(X) - d

    while start >= 0:
        stop = start + d
        covariances.append(np.cov(X[start:stop], Y[start:stop])[0, 1])
        start -= s

    return np.array(covariances, dtype='float32')
def ts_stdev(X, d=2, s=1):
    """Rolling population standard deviation (``np.std``) of ``X``.

    Windows of length ``d`` are taken from the end of the series towards the
    start, ``s`` elements apart, so the first output value belongs to the most
    recent window.

    Returns a float32 array with one value per window.
    """
    deviations = []
    start = len(X) - d

    while start >= 0:
        deviations.append(np.std(X[start:start + d]))
        start -= s

    return np.array(deviations, dtype='float32')
def ts_zscore(X, d=2, s=1):
    """Rolling ratio ``mean / std`` of ``X``.

    Windows of length ``d`` are taken from the end of the series towards the
    start, ``s`` elements apart, so the first output value belongs to the most
    recent window.

    Note: a window whose values are all equal has a standard deviation of 0,
    so the ratio is inf (this happens, for instance, when RSI stays the same
    over the whole rolling window).

    Returns a float32 array with one value per window.
    """
    ratios = []
    start = len(X) - d

    while start >= 0:
        window = X[start:start + d]
        ratios.append(np.mean(window) / np.std(window))
        start -= s

    return np.array(ratios, dtype='float32')
def ts_return(X, d=2, s=1):
    """Rolling relative change ``(last - first) / first`` of ``X``.

    Windows of length ``d`` are taken from the end of the series towards the
    start, ``s`` elements apart, so the first output value belongs to the most
    recent window.

    Note: a window whose first value is 0 divides by zero and yields inf
    (this happens, for instance, once RSI hits 0).

    Returns a float32 array with one value per window.
    """
    changes = []
    start = len(X) - d

    while start >= 0:
        window = X[start:start + d]
        first, last = window[0], window[-1]
        changes.append((last - first) / first)
        start -= s

    return np.array(changes, dtype='float32')
def ts_decaylinear(X, d=2, s=1):
    """Rolling linearly weighted average of ``X``.

    Inside every window the weights are ``1, 2, ..., d`` (normalised by their
    sum), so the newest element of the window counts most.  Windows of length
    ``d`` are taken from the end of the series towards the start, ``s``
    elements apart, so the first output value belongs to the most recent window.

    Returns a float32 array with one value per window.
    """
    # The weights are the same for every window, so they are built once.
    weights = np.arange(1, d + 1)
    averages = []
    start = len(X) - d

    while start >= 0:
        window = X[start:start + d]
        averages.append(np.sum(window * weights / np.sum(weights)))
        start -= s

    return np.array(averages, dtype='float32')

In [28]:
def generate_syn_feats(feats, fn_names):
    syn_feats = []
    
    for fn in fn_names:
        if fn in ['ts_corr', 'ts_cov']:
            for r1, r2 in combinations(range(len(feats)), 2):
                f1 = feats[r1]
                f2 = feats[r2]
                syn_feats.append(f"N({fn}({f1},{f2}))")
        else:
            for feat in feats:
                syn_feats.append(f"N({fn}({feat}))")
    
    return syn_feats

In [29]:
features = df.index.values

In [30]:
features

array(['amount', 'volume', 'low', 'open', 'change', 'high', 'close',
       'return', 'VWAP', '10_MA', '10_EMA', '10_SMA', '10_RSI'],
      dtype=object)

In [31]:
syn_feats = generate_syn_feats(
    features,
    ["ts_corr", "ts_cov", "ts_stdev", "ts_zscore", "ts_return", "ts_decaylinear"]
)

In [32]:
syn_feats

['N(ts_corr(amount,volume))',
 'N(ts_corr(amount,low))',
 'N(ts_corr(amount,open))',
 'N(ts_corr(amount,change))',
 'N(ts_corr(amount,high))',
 'N(ts_corr(amount,close))',
 'N(ts_corr(amount,return))',
 'N(ts_corr(amount,VWAP))',
 'N(ts_corr(amount,10_MA))',
 'N(ts_corr(amount,10_EMA))',
 'N(ts_corr(amount,10_SMA))',
 'N(ts_corr(amount,10_RSI))',
 'N(ts_corr(volume,low))',
 'N(ts_corr(volume,open))',
 'N(ts_corr(volume,change))',
 'N(ts_corr(volume,high))',
 'N(ts_corr(volume,close))',
 'N(ts_corr(volume,return))',
 'N(ts_corr(volume,VWAP))',
 'N(ts_corr(volume,10_MA))',
 'N(ts_corr(volume,10_EMA))',
 'N(ts_corr(volume,10_SMA))',
 'N(ts_corr(volume,10_RSI))',
 'N(ts_corr(low,open))',
 'N(ts_corr(low,change))',
 'N(ts_corr(low,high))',
 'N(ts_corr(low,close))',
 'N(ts_corr(low,return))',
 'N(ts_corr(low,VWAP))',
 'N(ts_corr(low,10_MA))',
 'N(ts_corr(low,10_EMA))',
 'N(ts_corr(low,10_SMA))',
 'N(ts_corr(low,10_RSI))',
 'N(ts_corr(open,change))',
 'N(ts_corr(open,high))',
 'N(ts_corr(open

In [33]:
# The original program may generate NaN values when taking the correlation between RSI and any other feature if RSI is 100 for the whole period.
# This happened around 2018, when the index futures price kept increasing for half a month.
# By the definition of RSI, such a run keeps RSI at 100 for several days.
# If a rolling window falls inside that period, the correlation is computed on an array of all 100s and the result is NaN.

In [34]:
# Chronological split: the first 1000 samples/targets are used for training,
# everything after them for testing. All four arrays are float32.
train_data, test_data = (
    np.array(chunk, dtype='float32') for chunk in (samples[:1000], samples[1000:])
)
train_targets, test_targets = (
    np.array(chunk, dtype='float32') for chunk in (targets[:1000], targets[1000:])
)

In [35]:
# Originally, only samples 957 to 975 contained null values.
# After changing the code, no null values remain.

In [36]:
def clean_array(t1):
    """Replace non-finite entries of every row with that row's largest finite value.

    NaN, +inf and -inf are all replaced by the maximum of the finite values of
    the same row.  A row without any finite value is filled with 0.0 (taking
    the maximum of an empty selection would raise ``ValueError``).

    Parameters
    ----------
    t1 : sequence of numpy.ndarray
        Rows to clean; every row is modified in place.

    Returns
    -------
    The same ``t1`` object, for convenience.
    """
    for row in t1:  # visit every row
        finite_mask = np.isfinite(row)
        if finite_mask.all():
            continue  # nothing to repair in this row
        if finite_mask.any():
            substitute = row[finite_mask].max()
        else:
            substitute = 0.0
        row[~finite_mask] = substitute
    return t1

def _syn_features_for_sample(sample, nfeats, width, strides):
    """Apply every rolling feature function to one sample and return the cleaned rows."""
    feoutput = []

    for fefn in (ts_corr, ts_cov, ts_stdev, ts_zscore, ts_return, ts_decaylinear):
        if fefn in (ts_corr, ts_cov):
            # Two-input functions: one output row per unordered pair of feature rows.
            for r1, r2 in combinations(range(nfeats), 2):
                feoutput.append(fefn(sample[r1], sample[r2], d=width, s=strides))
        else:
            # One-input functions: one output row per feature row.
            for row in sample:
                feoutput.append(fefn(row, d=width, s=strides))

    return clean_array(feoutput)


def generate_syn_samples(train_data, test_data, width=2, strides=1):
    """Build normalised synthetic features for the train and test samples.

    Every sample (rows = features, columns = time steps) is expanded with
    ``ts_corr`` and ``ts_cov`` over all pairs of rows and with ``ts_stdev``,
    ``ts_zscore``, ``ts_return`` and ``ts_decaylinear`` over every row, using
    rolling windows of length ``width`` taken ``strides`` steps apart.
    Non-finite values are repaired per output row by ``clean_array``.

    Each position of the resulting feature map is then standardised with the
    mean and standard deviation computed over the TRAIN samples only; the same
    statistics are applied to the test samples.  The statistics are accumulated
    in float64 and the result is stored as float32.  A position that is
    constant over the training set (standard deviation 0) is only centred,
    not divided, so it cannot introduce NaN or inf.

    Parameters
    ----------
    train_data, test_data : sequences of 2-D arrays
        ``train_data`` must not be empty; ``test_data`` may be empty.
    width : int
        Rolling window length passed to the feature functions as ``d``.
    strides : int
        Step between windows passed to the feature functions as ``s``.

    Returns
    -------
    dict
        ``{'train': ndarray, 'test': ndarray}`` of float32 feature maps.
    """
    feomap = { 'train': [], 'test': [] }
    nfeats = train_data[0].shape[0]

    for sample in train_data:
        feomap['train'].append(_syn_features_for_sample(sample, nfeats, width, strides))
    print('train_done')
    for sample in test_data:
        feomap['test'].append(_syn_features_for_sample(sample, nfeats, width, strides))
    print('test_done')

    feomap['train'] = np.array(feomap['train'], dtype='float32')
    feomap['test'] = np.array(feomap['test'], dtype='float32')

    print(feomap['train'])
    print(feomap['test'])

    # Normalize: per-position statistics over the training samples (axis 0).
    arr_mean = feomap['train'].mean(axis=0, dtype='float64')
    arr_stdev = feomap['train'].std(axis=0, dtype='float64')
    # Guard against constant positions: dividing by 0 would yield NaN/inf.
    arr_stdev[arr_stdev == 0] = 1.0

    feomap['train'] = ((feomap['train'] - arr_mean) / arr_stdev).astype('float32')
    if feomap['test'].size:
        # An empty test set has shape (0,) and cannot be broadcast against the statistics.
        feomap['test'] = ((feomap['test'] - arr_mean) / arr_stdev).astype('float32')

    return feomap

In [37]:
syn_samples_dict = generate_syn_samples(train_data, test_data, width=10, strides=3)

divide by zero encountered in float_scalars
invalid value encountered in true_divide
invalid value encountered in true_divide
divide by zero encountered in float_scalars


train_done
test_done
[[[ 9.22527075e-01  9.32895720e-01  9.44120467e-01 ...  9.93889153e-01
    9.89142299e-01  9.94122088e-01]
  [-2.08740845e-01 -2.29130000e-01 -6.17094943e-03 ... -5.05093396e-01
    1.43848822e-01  4.44946826e-01]
  [-8.94890651e-02 -1.85409501e-01  1.59743756e-01 ... -5.41329645e-02
    4.02445644e-01  4.85285997e-01]
  ...
  [ 2.27476440e+03  2.26222119e+03  2.24979810e+03 ...  2.25074536e+03
    2.28084912e+03  2.32618677e+03]
  [ 2.27778833e+03  2.26606226e+03  2.25210107e+03 ...  2.24700366e+03
    2.27159180e+03  2.30987915e+03]
  [ 5.27983742e+01  6.10389633e+01  4.49969521e+01 ...  2.86707478e+01
    2.48872528e+01  2.65176811e+01]]

 [[ 9.87651825e-01  9.13489521e-01  9.36153650e-01 ...  9.92388248e-01
    9.93395865e-01  9.78216648e-01]
  [-2.46351108e-01 -2.20788136e-01 -1.77538544e-01 ... -4.85538125e-01
   -3.51534903e-01 -1.59451105e-02]
  [-1.49121955e-01 -1.36270344e-01 -1.08581506e-01 ... -2.78065294e-01
    1.86142474e-01  2.67822593e-01]
  ...
  

In [42]:
pd.isnull(syn_samples_dict['train']).any()

False

In [38]:
from tensorflow.keras.utils import plot_model
from keras import models
from keras import layers

In [39]:
# Reset first so that a failed build never leaves a model from an earlier run behind.
modelDNN = None

# Shape of one synthetic feature map; it is flattened by the first layer.
input_shape = syn_samples_dict['train'][0].shape

# Widths of the two pairs of hidden layers.
num_features_1 = 30
num_features_2 = 10

# Fully connected regressor: two hidden blocks (two ReLU layers + dropout each)
# followed by a single linear output unit.
modelDNN = models.Sequential([
    layers.Flatten(input_shape = input_shape),
    layers.Dense(num_features_1, kernel_initializer='normal', activation=tf.nn.relu, name='dense_1'),
    layers.Dense(num_features_1, kernel_initializer='normal', activation=tf.nn.relu, name="dense_2"),
    layers.Dropout(0.2),
    layers.Dense(num_features_2, kernel_initializer='normal', activation=tf.nn.relu, name="dense_3"),
    layers.Dense(num_features_2, kernel_initializer='normal', activation=tf.nn.relu, name="dense_4"),
    layers.Dropout(0.2),
    layers.Dense(1, kernel_initializer='normal', activation='linear', name="dense_head"),
])

modelDNN.summary()



Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
flatten (Flatten)            (None, 1456)              0         
_________________________________________________________________
dense_1 (Dense)              (None, 30)                43710     
_________________________________________________________________
dense_2 (Dense)              (None, 30)                930       
_________________________________________________________________
dropout (Dropout)            (None, 30)                0         
_________________________________________________________________
dense_3 (Dense)              (None, 10)                310       
_________________________________________________________________
dense_4 (Dense)              (None, 10)                110       
_________________________________________________________________
dropout_1 (Dropout)          (None, 10)                0

In [40]:
# Regression set-up: mean squared error loss, mean absolute error reported as metric.
modelDNN.compile(optimizer='rmsprop', loss='mse', metrics=['mae'])
# NOTE: the test split doubles as the validation set during training.
modelDNN.fit(
    syn_samples_dict['train'],
    train_targets,
    batch_size=128,
    epochs=20,
    validation_data=(syn_samples_dict['test'], test_targets),
)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<keras.callbacks.History at 0x7f9fdecec450>

In [43]:
import shap