In [1]:
%matplotlib inline
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from keras.models import Sequential
from keras.layers import Dense, Flatten, Reshape, Dropout, Activation
from keras.layers import Conv2D, MaxPooling2D

Using TensorFlow backend.


## 我們一樣先讀進來 ETF 資料

In [2]:
data = pd.read_csv('./etf_data/AFK.csv')

# 這一次我們先來把資料做一些前處理

## 如果你有看過資料會發現有些欄位有0值，而這個應該要去除的

In [3]:
data.head(106).tail(3)

Unnamed: 0,Date,Open,High,Low,Close,Volume,Adj Close
103,2016-09-19,20.469999,20.5,20.030001,20.030001,2800,19.493157
104,2016-09-16,20.110001,20.110001,20.110001,20.110001,300,19.571012
105,2016-09-15,20.02,20.02,20.02,20.02,0,19.483424


In [4]:
data = data.loc[(data["Volume"]!=0)] # 保留那些正常的欄位
data.reset_index(drop=True, inplace=True) # 直接 reset index, 丟掉舊的, 直接換過去
data.head(106).tail(3) # 檢查一下去除掉了沒

Unnamed: 0,Date,Open,High,Low,Close,Volume,Adj Close
103,2016-09-19,20.469999,20.5,20.030001,20.030001,2800,19.493157
104,2016-09-16,20.110001,20.110001,20.110001,20.110001,300,19.571012
105,2016-09-14,19.98,20.120001,19.889999,20.02,9600,19.483424


## 簡化處理一下

In [5]:
data.head(3)

Unnamed: 0,Date,Open,High,Low,Close,Volume,Adj Close
0,2017-02-15,21.620001,21.870001,21.52,21.82,11500,21.82
1,2017-02-14,21.93,21.950001,21.469999,21.629999,15000,21.629999
2,2017-02-13,21.76,21.76,21.51,21.67,9800,21.67


In [6]:
del data['Date']
data = data.values
pd.DataFrame(data).head(3)

Unnamed: 0,0,1,2,3,4,5
0,21.620001,21.870001,21.52,21.82,11500.0,21.82
1,21.93,21.950001,21.469999,21.629999,15000.0,21.629999
2,21.76,21.76,21.51,21.67,9800.0,21.67


## 讓我們的資料由舊而新

In [7]:
data = data[::-1]
data

array([[    41.32    ,     41.400002,     41.32    ,     41.400002,
          4000.      ,     33.730652],
       [    42.150002,     43.      ,     41.799999,     42.110001,
          1400.      ,     34.309124],
       [    41.41    ,     41.41    ,     40.400002,     40.400002,
          1500.      ,     32.915902],
       ..., 
       [    21.76    ,     21.76    ,     21.51    ,     21.67    ,
          9800.      ,     21.67    ],
       [    21.93    ,     21.950001,     21.469999,     21.629999,
         15000.      ,     21.629999],
       [    21.620001,     21.870001,     21.52    ,     21.82    ,
         11500.      ,     21.82    ]])

## 分開我們要訓練的欄位

In [8]:
X = data[:, 0:5]
Y = data[:, 5]
print("X的資料是\n", X)
print("Y的資料是\n", Y)

X的資料是
 [[    41.32         41.400002     41.32         41.400002   4000.      ]
 [    42.150002     43.           41.799999     42.110001   1400.      ]
 [    41.41         41.41         40.400002     40.400002   1500.      ]
 ..., 
 [    21.76         21.76         21.51         21.67       9800.      ]
 [    21.93         21.950001     21.469999     21.629999  15000.      ]
 [    21.620001     21.870001     21.52         21.82      11500.      ]]
Y的資料是
 [ 33.730652  34.309124  32.915902 ...,  21.67      21.629999  21.82    ]


## 資料漲跌標記

In [9]:
for i in range(len(Y)):
    if (i+1 < len(Y) and Y[i] - Y[i+1] > 0):
        Y[i] = 1
    else:
        Y[i] = 0
print(Y)

[ 0.  1.  0. ...,  1.  0.  0.]


## 每多少天資料丟一次

In [10]:
day = 5
Y = Y[day:] 
XX = []
for i in range(day,len(X)):
    tmp = []
    for j in range(day-1, -1, -1):
        tmp.append(X[i-j])
    XX.append(tmp)
X = np.array(XX)
print("X的資料是\n", X)
print("Y的資料是\n", Y)

X的資料是
 [[[    42.150002     43.           41.799999     42.110001   1400.      ]
  [    41.41         41.41         40.400002     40.400002   1500.      ]
  [    41.490002     42.27         41.490002     41.880001   6900.      ]
  [    41.5          41.5          40.           41.         8100.      ]
  [    41.650002     41.650002     40.950001     41.099998   3400.      ]]

 [[    41.41         41.41         40.400002     40.400002   1500.      ]
  [    41.490002     42.27         41.490002     41.880001   6900.      ]
  [    41.5          41.5          40.           41.         8100.      ]
  [    41.650002     41.650002     40.950001     41.099998   3400.      ]
  [    41.68         41.68         40.900002     41.029999   5900.      ]]

 [[    41.490002     42.27         41.490002     41.880001   6900.      ]
  [    41.5          41.5          40.           41.         8100.      ]
  [    41.650002     41.650002     40.950001     41.099998   3400.      ]
  [    41.68         41.68 

# 來試試ETF吧

In [11]:
print("First  return array with shape ", X.shape)
print("Second return array with shape ", Y.shape)
X = X.reshape((-1, 5, 5, 1))
Y = Y.reshape((-1, 1))
print("First  return array with shape ", X.shape)
print("Second return array with shape ", Y.shape)

First  return array with shape  (2155, 5, 5)
Second return array with shape  (2155,)
First  return array with shape  (2155, 5, 5, 1)
Second return array with shape  (2155, 1)


## 建立CNN模型

In [12]:
model = Sequential()
# Block 1
model.add(Conv2D(64, (3, 3), padding='same', activation='relu', input_shape=(5,5,1)))
model.add(Conv2D(64, (3, 3), activation='relu', padding='same'))
model.add(Conv2D(64, (3, 3), activation='relu', padding='same'))
model.add(MaxPooling2D((2,2), strides=(2,2), padding='same'))

# Block 2
model.add(Conv2D(128, (3, 3), activation='relu', padding='same'))
model.add(Conv2D(128, (3, 3), activation='relu', padding='same'))
model.add(Conv2D(128, (3, 3), activation='relu', padding='same'))
model.add(MaxPooling2D((2,2), strides=(2,2), padding='same'))

# Block 3
model.add(Conv2D(256, (3, 3), activation='relu', padding='same'))
model.add(Conv2D(256, (3, 3), activation='relu', padding='same'))
model.add(Conv2D(256, (3, 3), activation='relu', padding='same'))
model.add(MaxPooling2D((2,2), strides=(2,2), padding='same'))

# Block 4
model.add(Conv2D(512, (3, 3), activation='relu', padding='same'))
model.add(Conv2D(512, (3, 3), activation='relu', padding='same'))
model.add(Conv2D(512, (3, 3), activation='relu', padding='same'))
model.add(MaxPooling2D((2,2), strides=(2,2), padding='same'))

model.add(Flatten())
model.add(Dense(40))
model.add(Activation('relu'))
model.add(Dropout(0.5))
model.add(Dense(1))
model.add(Activation('sigmoid'))


## 分割資料為訓練集與測試集
### 我們把前$\ \frac{4}{5}\ $組作為訓練，後$\ \frac{1}{5}\ $組作為測試

In [13]:
# setting training data and testing data
seperate_num = X.shape[0] // 5 * 4
X_train, Y_train = X[:seperate_num], Y[:seperate_num]
X_train = X_train.reshape((-1,5, 5, 1))
Y_train = Y_train.reshape((-1, 1))
print(X_train.shape, Y_train.shape)
X_test, Y_test = X[seperate_num:], Y[seperate_num:]
X_test = X_test.reshape((-1,5, 5, 1))
Y_test = Y_test.reshape((-1,1))
print(X_test.shape, Y_test.shape)

(1724, 5, 5, 1) (1724, 1)
(431, 5, 5, 1) (431, 1)


In [14]:
model.compile(loss='binary_crossentropy', optimizer='adam',metrics=['accuracy'])
model.fit(X_train, Y_train, epochs=4, batch_size=128)

Epoch 1/4
Epoch 2/4
Epoch 3/4
Epoch 4/4


<keras.callbacks.History at 0x115d01c88>

## 訓練模型

In [15]:
score = model.evaluate(X_test, Y_test, verbose=0)
print("Total Loss on Testing Set : ", score[0]) # 越低越好
print("Accuracy of Testing Set : ", score[1]) # 越高越好

Total Loss on Testing Set :  7.96555510032
Accuracy of Testing Set :  0.505800464867


# 想要更準怎麼辦
- 調整各項參數
    - 每層神經元數量
    - 激發函數
    - 優化器

# 課堂練習：嘗試使用少一點的 conv 和 pool
收斂會不會比較快？

In [16]:
model = Sequential()
# Block 1
model.add(Conv2D(64, (3, 3), padding='same', activation='relu', input_shape=(5,5,1)))
model.add(Conv2D(64, (3, 3), activation='relu', padding='same'))
model.add(Conv2D(64, (3, 3), activation='relu', padding='same'))
model.add(MaxPooling2D((2,2), strides=(2,2), padding='same'))


model.add(Flatten())
model.add(Dense(40))
model.add(Activation('relu'))
model.add(Dropout(0.5))
model.add(Dense(1))
model.add(Activation('sigmoid'))


In [17]:
model.compile(loss='binary_crossentropy', optimizer='adam',metrics=['accuracy'])
model.fit(X_train, Y_train, epochs=4, batch_size=128)

Epoch 1/4
Epoch 2/4
Epoch 3/4
Epoch 4/4


<keras.callbacks.History at 0x11cf98e10>

# 回家作業：我們可以改變我們的filter等參數試試看！