In [154]:
import pandas as pd
import numpy as np
import tushare as ts
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import auc, roc_curve, confusion_matrix, precision_recall_curve, average_precision_score
from sklearn.svm import SVC
from keras.models import Sequential
from keras.layers import Dense, Dropout
from keras.utils import to_categorical

In [2]:
# Fetch the k-line data for the 'sh' code over the study window.
df_tick = ts.get_k_data(
    'sh',
    start='2010-01-01',
    end='2018-01-19',
)

In [2]:
# Get the shibor data
def get_shibor_data(start, end):
    """Download SHIBOR tables for a range of periods and stack them into one frame.

    Parameters
    ----------
    start, end : int
        Inclusive bounds passed one by one to ``ts.shibor_data``; the caller
        in this notebook passes calendar years.

    Returns
    -------
    pandas.DataFrame
        All per-period tables concatenated in order with a fresh 0..n-1
        index. The ``date`` column is converted to strings so it can be
        merged against string dates.

    Raises
    ------
    ValueError
        If ``start`` is greater than ``end`` (nothing to download).
    """
    if start > end:
        raise ValueError(
            'start ({}) must not be greater than end ({})'.format(start, end))

    yearly_frames = [ts.shibor_data(year) for year in range(start, end + 1)]
    # ignore_index renumbers the rows directly, replacing the former
    # reset_index() + drop('index') round-trip.
    shibor = pd.concat(yearly_frames, ignore_index=True)
    shibor['date'] = shibor['date'].astype(str)
    return shibor

In [126]:
# Build rolling-window features (over `duration` bars) and look-ahead targets (over `forward` bars)
def get_tick(tick, start, end, duration=5, forward=3, ktype='D'):
    """Build rolling-window features and look-ahead targets from k-line bars.

    For every bar ``i`` that has ``duration - 1`` bars before it and
    ``forward`` bars after it, one output row is produced.

    Parameters
    ----------
    tick : str
        Code passed to ``ts.get_k_data``.
    start, end : str
        Date bounds passed through to ``ts.get_k_data``.
    duration : int, default 5
        Number of bars in the look-back window (the window ends at bar ``i``).
    forward : int, default 3
        Number of bars in the look-ahead window (bars ``i+1 .. i+forward``).
    ktype : str, default 'D'
        Bar type passed through to ``ts.get_k_data``.

    Returns
    -------
    pandas.DataFrame
        Columns, in order: ``date`` (last bar of the window), ``open`` (first
        bar's open), ``close`` (last bar's close), ``high``/``low`` (window
        extremes), ``mean_volume``/``std_volume`` (window volume statistics),
        ``f_close`` (close ``forward`` bars ahead), ``f_high``/``f_low``
        (extremes over the look-ahead window). Empty, but with these columns,
        when there are too few bars.

    Raises
    ------
    ValueError
        If ``duration`` or ``forward`` is smaller than 1.
    """
    if duration < 1 or forward < 1:
        raise ValueError('duration and forward must both be >= 1')

    columns = ['date', 'open', 'close', 'high', 'low', 'mean_volume',
               'std_volume', 'f_close', 'f_high', 'f_low']

    # Use the requested code (it used to be hard-coded to 'sh') and renumber
    # the rows so the positional window arithmetic below does not depend on
    # whatever index the data source returns.
    bars = ts.get_k_data(tick, start=start, end=end, ktype=ktype)
    bars = bars.reset_index(drop=True)

    rows = []
    for i in range(duration - 1, len(bars) - forward):
        window = bars.iloc[i - duration + 1: i + 1]   # bars i-duration+1 .. i
        future = bars.iloc[i + 1: i + forward + 1]    # bars i+1 .. i+forward
        rows.append({
            'date': window['date'].iloc[-1],
            'open': window['open'].iloc[0],
            'close': window['close'].iloc[-1],
            'high': window['high'].max(),
            'low': window['low'].min(),
            'mean_volume': window['volume'].mean(),
            'std_volume': window['volume'].std(),
            'f_close': bars['close'].iloc[i + forward],
            'f_high': future['high'].max(),
            'f_low': future['low'].min(),
        })

    # Passing `columns` keeps the schema stable even when `rows` is empty.
    return pd.DataFrame(rows, columns=columns)

In [127]:
# Assemble the modelling table: window features, optional SHIBOR columns, and binary move labels
def prepare(tick, start, end, shibor=False, duration=5, forward=3, ktype='D'):
    """Build the modelling table: window features plus binary move labels.

    Parameters
    ----------
    tick : str
        Code forwarded to ``get_tick``.
    start, end : str
        Date strings forwarded to ``get_tick``. Their first four characters
        are parsed as the year range for SHIBOR when ``shibor`` is set.
    shibor : bool, default False
        When true, left-join the SHIBOR table on ``date``.
    duration, forward : int
        Look-back and look-ahead window sizes, forwarded to ``get_tick``.
    ktype : str, default 'D'
        Bar type forwarded to ``get_tick``.

    Returns
    -------
    pandas.DataFrame
        The feature frame with twelve extra 0.0/1.0 columns named
        ``<move>_<size>``. ``move`` is ``close_up`` (future close above
        current close), ``up`` (future high above current close) or ``down``
        (future low below current close). ``size`` is ``half``/``one``/
        ``two``/``five``, for relative moves of at least 0.5 %, 1 %, 2 %
        and 5 %.
    """
    # duration/forward used to be accepted here but silently dropped.
    df_tick = get_tick(tick, start, end,
                       duration=duration, forward=forward, ktype=ktype)
    if shibor:
        df_shibor = get_shibor_data(int(start[:4]), int(end[:4]))
        df = pd.merge(df_tick, df_shibor, on='date', how='left')
    else:
        df = df_tick.copy()

    thresholds = (('half', 0.005), ('one', 0.01), ('two', 0.02), ('five', 0.05))
    # Relative moves measured against the current close; computed once per
    # family instead of once per row per label.
    moves = (
        ('close_up', (df['f_close'] - df['close']) / df['close']),
        ('up', (df['f_high'] - df['close']) / df['close']),
        ('down', (df['close'] - df['f_low']) / df['close']),
    )
    for prefix, change in moves:
        for suffix, threshold in thresholds:
            # NaN compares as False, so missing values yield 0.0.
            df['{}_{}'.format(prefix, suffix)] = (change >= threshold).astype(float)
    return df

In [171]:
# Full feature/label table for the 'sh' code over the study window.
ticko = prepare(
    'sh',
    start='2010-01-01',
    end='2018-01-19',
)

In [235]:
# Target label and the feature columns used by the models below.
y_col = 'up_one'
feature_cols = ['high', 'low', 'mean_volume', 'std_volume']
cols = feature_cols + [y_col]
tick = ticko[cols]

# A fixed random_state makes the split (and everything trained on it)
# reproducible across kernel restarts.
# NOTE(review): rows come from overlapping time windows, so a shuffled split
# may leak look-ahead information between train and test; consider a
# chronological split (shuffle=False) - confirm with the analysis goals.
X_train, X_test, y_train, y_test = train_test_split(
    tick[feature_cols], tick[y_col], random_state=42)

# Share of positive labels (the label is 0.0/1.0, so the mean is the rate).
ticko[y_col].mean()

0.4823167606355715

In [236]:
# Convert the pandas objects to NumPy arrays for Keras and one-hot encode the
# binary label into two columns. `.values` is used instead of `as_matrix()`,
# which is deprecated and no longer available in newer pandas releases.
X_train_NN = X_train.values
y_train_NN = to_categorical(y_train.values, num_classes=2)
X_test_NN = X_test.values
y_test_NN = to_categorical(y_test.values, num_classes=2)

In [237]:
# Peek at the first one-hot encoded training label (two entries, one per class).
y_train_NN[0]

array([ 1.,  0.])

In [240]:
# Small feed-forward classifier over the window features.
clf = Sequential()
# Input width is taken from the data so the model follows the feature list.
clf.add(Dense(16, input_shape=(X_train_NN.shape[1],), activation='relu'))
clf.add(Dropout(0.5))
# input_shape is only meaningful on the first layer, so it is omitted here.
clf.add(Dense(16, activation='relu'))
# Softmax yields a proper distribution over the two one-hot classes, which is
# what categorical_crossentropy expects (the output used to be sigmoid).
clf.add(Dense(2, activation='softmax'))

# NOTE(review): the features are fed in unscaled; if price and volume columns
# differ by orders of magnitude, standardising them first will likely help
# training - confirm against the data.
clf.compile(optimizer='rmsprop', loss='categorical_crossentropy', metrics=['accuracy'])
clf.fit(X_train_NN, y_train_NN, epochs=5, batch_size=64, validation_split=0.25)


Train on 1097 samples, validate on 366 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x119b97ceef0>

0.69656586365966167

In [156]:
# Encode the first training label and reshape it into a (2, 1) column vector.
# `.values` replaces `as_matrix()`, which newer pandas releases no longer provide.
to_categorical(y_train.values[0], 2).reshape(2, -1)

array([[ 1.],
       [ 0.]])

In [160]:
# Same (2, 1) column-vector encoding, applied to every training label.
# `.values` replaces `as_matrix()`, which newer pandas releases no longer provide.
[to_categorical(label, 2).reshape(2, -1) for label in y_train.values]

[array([[ 1.],
        [ 0.]]), array([[ 0.],
        [ 1.]]), array([[ 1.],
        [ 0.]]), array([[ 0.],
        [ 1.]]), array([[ 0.],
        [ 1.]]), array([[ 0.],
        [ 1.]]), array([[ 1.],
        [ 0.]]), array([[ 0.],
        [ 1.]]), array([[ 1.],
        [ 0.]]), array([[ 0.],
        [ 1.]]), array([[ 0.],
        [ 1.]]), array([[ 0.],
        [ 1.]]), array([[ 0.],
        [ 1.]]), array([[ 1.],
        [ 0.]]), array([[ 0.],
        [ 1.]]), array([[ 1.],
        [ 0.]]), array([[ 1.],
        [ 0.]]), array([[ 1.],
        [ 0.]]), array([[ 0.],
        [ 1.]]), array([[ 1.],
        [ 0.]]), array([[ 0.],
        [ 1.]]), array([[ 0.],
        [ 1.]]), array([[ 0.],
        [ 1.]]), array([[ 0.],
        [ 1.]]), array([[ 1.],
        [ 0.]]), array([[ 0.],
        [ 1.]]), array([[ 0.],
        [ 1.]]), array([[ 0.],
        [ 1.]]), array([[ 0.],
        [ 1.]]), array([[ 0.],
        [ 1.]]), array([[ 0.],
        [ 1.]]), array([[ 0.],
        [ 1.]]), array([

In [191]:
# Dummy feature matrix: 1000 samples x 100 uniform values in [0, 1).
data = np.random.random_sample((1000, 100))

In [214]:
# Dummy integer class labels in 0..9, one per sample, as a column vector.
labels = np.random.randint(low=0, high=10, size=(1000, 1))

In [217]:
# One-hot encode the integer labels into 10 classes.
one_hot_labels = to_categorical(labels, num_classes=10)

In [220]:
# Show the first encoded row to check the one-hot layout.
one_hot_labels[0]

array([ 0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  1.])