In [1]:
import pandas as pd
import numpy as np
import tushare as ts
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import auc, roc_curve, confusion_matrix, precision_recall_curve, average_precision_score
from sklearn.svm import SVC
from sklearn.preprocessing import MinMaxScaler
from keras.models import Sequential
from keras.layers import Dense, Dropout, LSTM
from keras.utils import to_categorical

Using TensorFlow backend.


In [87]:
def get_samples(df, x_cols, y_base='close', y_target='high', duration=5, forward=5, \
               threshold=0.01, x_matrix=True, scale_cols=None, show_portion=False):
    Xs = []
    ys = []
    dfo = pd.DataFrame.copy(df)
    if scale_cols != None and type(scale_cols) == list:
        scaler = MinMaxScaler((0, 1))
        for i in scale_cols:
            dfo[i] = scaler.fit_transform(dfo[[i]])
    for i in range(duration, len(dfo)-forward+1):
        X = dfo.loc[dfo.index[i-duration:i], x_cols]
        if x_matrix == True:
            X = X.as_matrix()
        y = 1 if (max(dfo.loc[dfo.index[i: i+forward], y_target]) - dfo.loc[dfo.index[i-1], y_base]) \
            / dfo.loc[dfo.index[i-1], y_base] >= threshold else 0
        Xs.append(X)
        ys.append(y)
    ys = to_categorical(ys, num_classes=2)
    if x_matrix == True:
        Xs = np.array(Xs)
    if show_portion == True:
        sum_positive = sum(ys[:, 1])
        total = ys.shape[0]
        print("Positive: %s/%s, %.2f%%"%(int(sum_positive), total, sum_positive / total * 100))
    return Xs, ys

In [3]:
def get_ma(df, interval, ref='close'):
    dfo = pd.DataFrame.copy(df)
    ref_list = dfo[ref].tolist()
    ma = []
    for i in range(len(ref_list)):
        if i <= interval-1:
            p = 0
            ma.append(p)
        else:
            p = np.average(ref_list[i-interval+1: i+1])
            ma.append(p)
    dfo['ma%s'%interval] = ma
    dfo = dfo[dfo['ma%s'%interval] != 0]
    return dfo

In [34]:
def get_distance(df, base, refs=None, scale=False):
    dfo = pd.DataFrame.copy(df)
    distance_ref = ['open', 'close', 'high', 'low']
    if refs != None and type(refs) == list:
        distance_ref += refs
    distance_ref_cols = ['%s_to_%s'%(i, base) for i in distance_ref]
    for i in range(len(distance_ref)):
        dfo[distance_ref_cols[i]] = dfo[distance_ref[i]] - dfo[base]
    if scale == True:
        scaler = MinMaxScaler((0, 1))
        dfo[distance_ref_cols] = scaler.fit_transform(dfo[distance_ref_cols])
    return dfo

In [78]:
def get_locations(df, cols):
    tick = pd.DataFrame.copy(df)
    scaler = MinMaxScaler((0, 1))
    cols = ['open', 'close', 'high', 'low', 'ma5', 'ma20', 'ma120']
    cols_s = ['p_%s'%i for i in cols]
    for i in tick.index:
        scaled_values = scaler.fit_transform(tick.loc[i, cols].values.reshape(len(cols), -1)).reshape(len(cols))
        for j in range(len(cols_s)):
            tick.loc[i, cols_s[j]] = scaled_values[j]
    return tick

In [95]:
def confusion_max(X_test, y_test, model):
    tn, fp, fn, tp = confusion_matrix([np.argmax(i) for i in y_test], [np.argmax(i) for i in model.predict(X_test)]).ravel()
    print('True Positive: %s'%tp)
    print('False Positive: %s'%fp)
    print('True Negative: %s'%tn)
    print('False Negative: %s'%fn)
    print('Percision: %.4f'%(tp / (tp + fp)))
    print('Recall: %.4f'%(tp / (tp + fn)))

In [216]:
code = '600313'
start = '2006-01-01'
end = '2018-12-31'
test_size = 200

In [217]:
tick_o = ts.get_k_data(code, start=start, end=end)

In [218]:
tick = pd.DataFrame.copy(tick_o)

In [219]:
for i in [5, 20, 120]:
    tick = get_ma(tick, i)

In [220]:
tick = get_distance(tick, 'ma5', ['ma20', 'ma120'], scale=True)

In [221]:
cols = ['open', 'close', 'high', 'low', 'ma5', 'ma20', 'ma120']
tick = get_locations(tick, cols)



In [226]:
x_cols = ['p_open', 'p_close', 'p_high', 'p_low', 'p_ma5', 'p_ma20', 'p_ma120']
X, y = get_samples(tick, x_cols, x_matrix=True, y_base='close', y_target='high', \
                  duration = 20, forward=20, show_portion=True, threshold=0.1)

Positive: 1113/2202, 50.54%


In [227]:
X_train = X[:-test_size]
y_train = y[:-test_size]
X_test = X[-test_size:]
y_test = y[-test_size:]

X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2)

In [228]:
model = Sequential()
model.add(LSTM(8, input_shape=(X_train[0].shape), activation='relu'))
model.add(Dropout(0.5))
model.add(Dense(8, activation='relu'))
model.add(Dense(2, activation='softmax'))

model.compile(loss='categorical_crossentropy', metrics=['accuracy'], optimizer='rmsprop')

In [229]:
model.fit(X_train, y_train, batch_size=32, epochs=200, validation_data=[X_val, y_val], verbose=2)

Train on 1601 samples, validate on 401 samples
Epoch 1/200
 - 2s - loss: 0.7353 - acc: 0.4859 - val_loss: 0.6887 - val_acc: 0.5436
Epoch 2/200
 - 1s - loss: 0.6898 - acc: 0.5434 - val_loss: 0.6856 - val_acc: 0.5411
Epoch 3/200
 - 1s - loss: 0.6865 - acc: 0.5397 - val_loss: 0.6844 - val_acc: 0.5486
Epoch 4/200
 - 1s - loss: 0.6870 - acc: 0.5372 - val_loss: 0.6845 - val_acc: 0.5411
Epoch 5/200
 - 1s - loss: 0.6872 - acc: 0.5415 - val_loss: 0.6843 - val_acc: 0.5561
Epoch 6/200
 - 1s - loss: 0.6849 - acc: 0.5503 - val_loss: 0.6837 - val_acc: 0.5786
Epoch 7/200
 - 1s - loss: 0.6837 - acc: 0.5628 - val_loss: 0.6831 - val_acc: 0.5885
Epoch 8/200
 - 1s - loss: 0.6829 - acc: 0.5703 - val_loss: 0.6810 - val_acc: 0.5835
Epoch 9/200
 - 1s - loss: 0.6838 - acc: 0.5540 - val_loss: 0.6824 - val_acc: 0.5686
Epoch 10/200
 - 1s - loss: 0.6821 - acc: 0.5646 - val_loss: 0.6835 - val_acc: 0.5636
Epoch 11/200
 - 1s - loss: 0.6815 - acc: 0.5659 - val_loss: 0.6825 - val_acc: 0.5711
Epoch 12/200
 - 1s - loss: 

<keras.callbacks.History at 0x25b6aee4d68>

In [230]:
confusion_max(X_test, y_test, model)

True Positive: 24
False Positive: 176
True Negative: 0
False Negative: 0
Percision: 0.1200
Recall: 1.0000


In [231]:
confusion_max(X_val, y_val, model)

True Positive: 206
False Positive: 134
True Negative: 50
False Negative: 11
Percision: 0.6059
Recall: 0.9493
