# 通过CNN分析价格曲线，预测涨跌  
 
......


In [None]:
import os

import datetime
from tqdm import tqdm

import csv
import numpy as np
import pandas as pd
import matplotlib. pyplot as plt 

import baostock as bs   # 股票宝，获取股票数据

from sklearn.model_selection import train_test_split
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense,Flatten,Reshape,Dropout,Activation
from tensorflow.keras.layers import Conv2D,MaxPooling2D
from tensorflow.keras.layers import Conv1D,MaxPooling1D
from tensorflow.keras.optimizers import SGD
from tensorflow.keras.callbacks import EarlyStopping

In [None]:
def download():
    # 从股票宝下载股票数据
    bs.login()
    for stock_code in tqdm(stock_code_list):
        stock_info_path = "stock_info/" + stock_code + ".csv"
        if not os.path.exists(stock_info_path) or re_download:
            rs = bs.query_history_k_data(stock_code, "date, open, close, volume, amount, turn, pctChg", start_date=start_date, end_date=to_date, frequency="d", adjustflag="3")
            # volume 成交量
            # amount 成交额
            # turn 换手率

            data_list = []
            while (rs.error_code == '0') & rs.next():  # 获取一条记录，将记录合并在一起
                data_list.append(rs.get_row_data())
            result = pd.DataFrame(data_list, columns=rs.fields)
            result.to_csv(stock_info_path, index=False)
    bs.logout()

In [149]:
def preprocess():
    stock_info_path = "stock_info/" + stock_code + ".csv"       # 文件路径
    # 读取csv文件
    stock = pd.read_csv(stock_info_path, parse_dates=['date'])
    
    if i == 0:
        pass
    else:
        stock = stock[:-i]

    # 准备数据
    stock['close_nomalized'] = (stock['close']-stock['close'].min())/(stock['close'].max()-stock['close'].min())        # 收盘价 归一化
    stock['volume_nomalized'] = (stock['volume']-stock['volume'].min())/(stock['volume'].max()-stock['volume'].min())   # 交易量 归一化
    stock['avg_price'] = stock['close'].rolling(predict_period).mean()                                                  # 最近周期内的平均股价
    stock = stock[predict_period-1:]
    stock['future_price'] = stock['close'].rolling(predict_period).mean().shift(-predict_period)                        # 未来股价均值(不包含当日收盘价)
    # stock = stock.dropna(axis=0)                                                                                      # 去除空值

    def flat_or_not(x):
        if x >= threshold_flat:
            return 2       # 涨
        elif x <= -threshold_flat:
            return 1       # 跌
        elif np.isnan(x):
            return np.nan
        else:
            return 0       # 持平

    stock['label'] = ((stock['future_price'] - stock['avg_price']) / stock['avg_price']).apply(flat_or_not)

    n = len(stock)

    if not cnn_3d_flag:
        x = np.array([stock['close_nomalized'][i:i+history_period] for i in range(n-history_period-predict_period+1)]).reshape(-1, 20, 20) # 输入 400天 （0:400）~（n-400-predict~n）
        x = x[:, :, :, np.newaxis]
    else:
        x = np.array([stock[['close_nomalized', 'volume_nomalized']][i:i+history_period] for i in range(n-history_period+1)]).reshape(-1, 20, 20, 2) # 输入 400天 + 交易量
        x = x[:, :, :, :, np.newaxis]

        x, x_valid = np.split(x, [-predict_period])

    y = stock['label'][history_period-1:].values[:-predict_period]                                               # 标签 
    # print(pd.DataFrame(y)[0].value_counts())    # 打印三种类别样本的个数。
    return stock, x, y, x_valid

In [95]:
def train():
        model = keras.models.Sequential()
        model.add(keras.layers.Conv2D(32, 3, activation='relu', input_shape=(20, 20, 1)))         # 卷积核的个数 => 输出的维度
        model.add(keras.layers.MaxPooling2D((2, 2)))
        model.add(keras.layers.Conv2D(64, (3, 3), activation='relu'))
        model.add(keras.layers.MaxPooling2D((2, 2)))
        model.add(keras.layers.Conv2D(64, (3, 3), activation='relu'))

        model.add(keras.layers.Flatten())
        model.add(keras.layers.Dense(64, activation='relu'))
        model.add(keras.layers.Dense(3))
        model.compile(optimizer='adam', 
                loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
                metrics=['accuracy'])
        # model = tf.keras.models.load_model('saved_model.h5')
        monitor = EarlyStopping(monitor='val_loss', min_delta=1e-3, patience=6, verbose=1, mode='auto')

        model.fit(x_train, y_train, epochs=epoch, validation_data=(x_test, y_test), callbacks = [monitor])
        # tf.saved_model.save(model, 'saved_model/')
        model.save('saved_model.h5')

In [None]:
def train_3d():
        model = keras.models.Sequential()
        model.add(keras.layers.Conv3D(32, (3, 3, 1), activation='relu', input_shape=(20, 20, 2, 1)))         # 卷积核的个数 => 输出的维度
        model.add(keras.layers.MaxPool3D((2, 2, 1)))
        model.add(keras.layers.Conv3D(64, (3, 3, 1), activation='relu'))
        model.add(keras.layers.MaxPool3D((2, 2, 1)))
        model.add(keras.layers.Conv3D(64, (3, 3, 1), activation='relu'))

        model.add(keras.layers.Flatten())
        model.add(keras.layers.Dense(64, activation='relu'))
        model.add(keras.layers.Dense(3))
        model.compile(optimizer='adam', 
                loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
                metrics=['accuracy'])
        # model = tf.keras.models.load_model('saved_model.h5')
        monitor = EarlyStopping(monitor='val_loss', min_delta=1e-3, patience=6, verbose=1, mode='auto')

        model.fit(x_train, y_train, epochs=epoch, validation_data=(x_test, y_test), callbacks = [monitor])
        # tf.saved_model.save(model, 'saved_model/')
        model.save('saved_model.h5')

In [None]:
def predict():
    """
        preprocess中已经根据i，缩短了stock从而x,y都无需额外处理
    """
    # 读取模型
    model = tf.keras.models.load_model('saved_model.h5')
    # model = tf.saved_model.load('saved_model/')
    xi = tf.convert_to_tensor(x[[-1]], tf.float32, name='inputs')
    predictions = model(xi)
    score = tf.nn.softmax(predictions[0])
    class_names = {
        0: "持平",
        1: "跌",
        2: "涨"
    }
    # print("Price: {}".format(stock['close'].values[-1]))
    # print(
    #     "Stock {} most likely {} with a {:.2f} percent confidence."
    #     .format(stock_code, class_names[np.argmax(score)], 100 * np.max(score))
    # )
    return stock['close'].values[-1], np.argmax(score), 100 * np.max(score)


In [148]:
def predict_v2():
    """
        不重新训练模型，一次性预测所有天数
    """

    # 读取模型
    model = tf.keras.models.load_model('saved_model.h5')
    score = tf.nn.softmax(model(x_valid))
    a = pd.Series([np.argmax(item) for item in score])
    n = len(stock)
    a.index=stock.index[-predict_period:]
    stock['label_valid'] = a

In [None]:
to_date = datetime.datetime.now().strftime("%Y-%m-%d")      # 今日日期
re_download = False              # 重新下载数据

# 超参
re_train = False                 # 重新训练
history_period = 400             # 分析天数
predict_period = 6               # 预测天数
epoch = 200                      # 训练最大圈数
start_date = '2010-01-01'        # 最早数据
threshold_flat = 0.007           # 判定股价持平的阈值
threshold_prob = 70              # 买卖时概率的阈值
stock_code_list = pd.read_csv('stock_codes.csv')['code']    # 需要预测的股票代码
cnn_3d_flag = True               # 3维CNN

# 验证设置
verify_period = 20                # 验证周期
simulation = {
    'bought': False,
    'price': 0,
    'asset': 1
}

# 下载数据
download()

df_verification = pd.Series(dtype=np.float64)
for i in tqdm(range(verify_period, -1, -1)):              # 验证天数， n ~ 0
    for stock_code in (stock_code_list):        # 股票代码
        stock, x, y, x_valid = preprocess()                  # 预处理
    
        try:
            x_train, x_test, y_train, y_test = train_test_split(x, y,test_size=0.2,shuffle=True)        # 分割数据集
            if re_train:
                if not cnn_3d_flag:
                    train()
                else:
                    train_3d()
            predict()                # 预测
        except Exception as e:
            with open("logs/log.csv", 'a') as f:
                csv.writer(f).writerow([stock_code, e, e.__traceback__.tb_lineno])

# df_verification = df_verification.reindex(index=stock.index[-verify_period-1:])
df_verification.index = stock.index[-verify_period-1:]
stock['label_predict'] = df_verification
stock.to_csv('prediction_results/'+stock_code+'.csv')
print(stock_code, simulation['asset'])

        

In [150]:
stock, x, y, x_valid = preprocess()
predict_v2()

In [151]:
x, x_valid = np.split(x, [-predict_period+1])

Unnamed: 0,date,open,close,volume,amount,turn,pctChg,close_nomalized,volume_nomalized,avg_price,future_price,label,label_valid
2735,2021-04-07,5141.6556,5103.7428,14576374800,246271200000.0,0.525246,-0.711995,0.810797,0.186389,5109.918017,4994.089617,1.0,
2736,2021-04-08,5078.2627,5112.2086,14177104700,228422000000.0,0.510867,0.165874,0.813073,0.180381,5112.831267,4969.7517,1.0,
2737,2021-04-09,5100.0422,5035.3374,12841650900,221519800000.0,0.462744,-1.503679,0.792412,0.160285,5110.660717,4978.364883,1.0,
2738,2021-04-12,5026.9791,4947.7459,15264431400,267218200000.0,0.550005,-1.739536,0.768871,0.196743,5083.4889,5000.968133,1.0,
2739,2021-04-13,4949.8074,4939.6438,13291935800,225753200000.0,0.478854,-0.163753,0.766694,0.167061,5046.503383,,,1.0
2740,2021-04-14,4945.6857,4980.6279,12086083800,221036300000.0,0.435412,0.829697,0.777709,0.148915,5019.8844,,,1.0
2741,2021-04-15,4969.9099,4948.9741,10882938400,213820900000.0,0.392067,-0.635538,0.769201,0.13081,4994.089617,,,1.0
2742,2021-04-16,4966.8999,4966.1811,11123992400,213389200000.0,0.400745,0.347688,0.773826,0.134437,4969.7517,,,1.0
2743,2021-04-19,4966.409,5087.0165,15476982400,307899800000.0,0.55756,2.433165,0.806302,0.199941,4978.364883,,,1.0
2744,2021-04-20,5065.7757,5083.3654,13654170000,276640600000.0,0.491852,-0.071773,0.805321,0.172512,5000.968133,,,1.0


In [137]:
stock[-10:]

In [140]:
stock.shape

(2740, 12)

In [139]:
y.shape

(2335,)

In [None]:
df_verification = df_verification.append(pd.Series(trend), ignore_index=True)
# 计算盈亏
if not simulation['bought'] and trend == 1 and prob >= 70:
    simulation['bought'] = True
    simulation['price'] = close
elif simulation['bought'] and trend == 2 and prob >= 70:
    simulation['bought'] = False
    simulation['asset'] = simulation['asset']/simulation['price']*close

In [158]:
stock_info_path = "stock_info/" + stock_code + ".csv"       # 文件路径
# 读取csv文件
stock = pd.read_csv(stock_info_path, parse_dates=['date'])

# 准备数据
stock['close_nomalized'] = (stock['close']-stock['close'].min())/(stock['close'].max()-stock['close'].min())        # 收盘价 归一化
stock['volume_nomalized'] = (stock['volume']-stock['volume'].min())/(stock['volume'].max()-stock['volume'].min())   # 交易量 归一化
stock['avg_price'] = stock['close'].rolling(predict_period).mean()                                                  # 最近周期内的平均股价
stock = stock[predict_period-1:]
stock['future_price'] = stock['close'].rolling(predict_period).mean().shift(-predict_period)                        # 未来股价均值(不包含当日收盘价)
# stock = stock.dropna(axis=0)                                                                                      # 去除空值

def flat_or_not(x):
    if x >= threshold_flat:
        return 2       # 涨
    elif x <= -threshold_flat:
        return 1       # 跌
    elif np.isnan(x):
        return np.nan
    else:
        return 0       # 持平

stock['label'] = ((stock['future_price'] - stock['avg_price']) / stock['avg_price']).apply(flat_or_not)

n = len(stock)

x = np.array([stock[['close_nomalized', 'volume_nomalized']][i:i+history_period] for i in range(n-history_period+1)]).reshape(-1, 20, 20, 2) # 输入 400天 + 交易量
x = x[:, :, :, :, np.newaxis]

x_valid = x[-40:]                                         # 标签 

In [159]:
x_valid.shape

(40, 20, 20, 2, 1)

In [162]:
model = tf.keras.models.load_model('saved_model.h5')
score = tf.nn.softmax(model(x))
a = pd.Series([np.argmax(item) for item in score])
a.index=stock.index[history_period-1: -predict_period]
stock['label_predict'] = a
# stock.to_csv('tmp.csv')

ValueError: Length mismatch: Expected axis has 2341 elements, new values have 2335 elements

In [161]:
model = tf.keras.models.load_model('saved_model.h5')
score = tf.nn.softmax(model(x_valid))
a = pd.Series([np.argmax(item) for item in score])
a.index=stock.index[-40:]
stock['label_valid'] = a
stock.to_csv('tmp.csv')