In [1]:
import pandas as pd
import numpy as np
import tushare as ts
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import auc, roc_curve, confusion_matrix, precision_recall_curve, average_precision_score
from sklearn.svm import SVC
from sklearn.preprocessing import MinMaxScaler
from keras.models import Sequential
from keras.layers import Dense, Dropout, LSTM
from keras.utils import to_categorical

Using TensorFlow backend.


In [2]:
tick_ori = ts.get_k_data('600313', start='2010-01-01', end='2018-01-23')

In [3]:
tick = pd.DataFrame.copy(tick_ori)
tick = tick[tick['high'] != tick['low']].reset_index().drop('index', axis=1)

In [4]:
def get_box(p_open, p_close, p_high, p_low):
    if p_high == p_low:
        p_high = max(p_open, p_close, p_high, p_low)
        p_low = min(p_open, p_close, p_high, p_low)
    max_range = (p_high - p_low)
    move = p_close - p_open
    top = min(p_high - p_open, p_high - p_close) / max_range
    middle = abs(p_open - p_close) / max_range
    bottom = min(p_close - p_low, p_open - p_low) / max_range
    return [move, top, middle, bottom]

In [5]:
tick['move'] = tick.apply(lambda x: get_box(x['open'], x['close'], x['high'], x['low'])[0], axis=1)
tick['top'] = tick.apply(lambda x: get_box(x['open'], x['close'], x['high'], x['low'])[1], axis=1)
tick['middle'] = tick.apply(lambda x: get_box(x['open'], x['close'], x['high'], x['low'])[2], axis=1)
tick['bottom'] = tick.apply(lambda x: get_box(x['open'], x['close'], x['high'], x['low'])[3], axis=1)
tick['median'] = tick.apply(lambda x: (x['high'] + x['close']) / 2, axis=1)

In [6]:
scaler = MinMaxScaler((0, 1))
tick['move_scaled'] = scaler.fit_transform(tick['move'].values.reshape(-1 , 1))
tick['median'] = scaler.fit_transform(tick['median'].values.reshape(-1 , 1))
tick['volume'] = scaler.fit_transform(tick['volume'].values.reshape(-1 , 1))

In [7]:
def get_frame(df, x_cols, y_cols, duration=10, forward=1, thread=0.01):
    tick = pd.DataFrame.copy(df)
    Xs = []
    ys = []
    for i in range(duration, len(tick) - forward + 1):
        h_x = tick.loc[i-duration: i-1, x_cols].as_matrix()
        forward_high = max(tick.loc[i: i + forward - 1, y_cols[0]])
        h_y = np.array(1 if (forward_high - tick.loc[i, y_cols[1]]) / tick.loc[i, y_cols[1]] >= thread else 0)
        Xs.append(h_x)
        ys.append(h_y)
    Xs = np.array(Xs)
    ys = to_categorical(np.array(ys), 2)
    return [Xs, ys]

In [8]:
x_cols = ['median', 'volume', 'move_scaled', 'top', 'middle', 'bottom']
# x_cols = ['move_scaled', 'middle']
# x_cols = ['move_scaled', 'top', 'middle', 'bottom']
y_cols = ['high', 'close']

thread = 0.09
duration = 20
forward = 20

X, y = get_frame(tick, x_cols, y_cols, thread=thread, duration=duration, forward=forward)
sum(y[:, 1]) / len(y)

0.50440081245768453

In [9]:
X_train, X_test, y_train, y_test = train_test_split(X, y)

In [10]:
model = Sequential()
model.add(LSTM(16, input_shape=(duration, len(x_cols)), activation='relu', return_sequences=True))
# model.add(Dropout(0.5))
# model.add(LSTM(16, activation='relu', return_sequences=True))
model.add(Dropout(0.5))
model.add(LSTM(16, activation='relu'))
model.add(Dense(8, activation='relu'))
model.add(Dense(2, activation='sigmoid'))

model.compile(loss='categorical_crossentropy', optimizer='rmsprop', metrics=['accuracy'])

In [11]:
model.fit(X_train, y_train, batch_size=32, epochs=100, validation_split=0.2)

Train on 885 samples, validate on 222 samples
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
E

<keras.callbacks.History at 0x176d8b99588>

In [12]:
y_test_predicted = [np.argmax(i) for i in model.predict(X_test)]
y_test_actual = [np.argmax(i) for i in y_test]
tn, fp, fn, tp = confusion_matrix(y_test_actual, y_test_predicted).ravel()
print(tn, fp, fn, tp)
print("percision: %.4f"%(tp / (tp + fp)))
print("recall: %.4f"%(tp / (tp + fn)))

125 59 61 125
percision: 0.6793
recall: 0.6720
