## 使用自定义 RNN 模型进行股价预测

在这个 Notebook 中，我们将使用白酒行业的几支股票数据（开盘价、收盘价、最高价、最低价）来训练模型用于对贵州茅台（600519）股价进行预测。通常的假设是，同一个行业板块中的几支股票价格往往具有一定的正相关性，同时使用这些关联数据进行模型训练往往可以获得更好的预测效果。首先我们会使用一个传统的 RNN 算法进行单支股票的训练和预测。

开始之前，请选择 conda_python3 内核。

SageMaker 支持使用自定义的容器和算法（BYOC）进行模型训练。在这个演示中，我们将使用最近的1000个交易日进行模型训练和预测。这里使用的是来自于 Keras 的 GRU layer（详情可以参照 train 脚本中的相关代码）。

### 定义参数

开始之前我们先要定义此次实验会用到的一些参数。这包括用到的股票代码、一些机器学习模型的超参等：

In [1]:
import boto3

# Account ID of the calling identity (via STS); used later to build the ECR registry host name.
aws_account_id = boto3.client('sts').get_caller_identity().get('Account')
my_name = 'peng'  # suffix that personalizes the S3 bucket name
aws_region = 'us-east-1'
bucket = 'algo-trading-workshop-{}'.format(my_name)
# Reused as the local Docker build directory, the ECR repository name, the S3 key prefix and the training job base name.
repository_name = 'sagemaker-custom-rnn'

# Create the local Docker build directory; -p keeps the cell re-runnable when the directory already exists.
!mkdir -p {repository_name}

In [2]:
# --- What to predict -------------------------------------------------------
target_stock = "600519"  # Kweichow Moutai
# Gujing Gongjiu, Luzhou Laojiao, Wuliangye, Swellfun, Yanghe
covariate_stocks = "000596,000568,000858,600779,002304"
target_column = "close"  # value to be predicted
covariate_columns = "open,low,high"

# --- Time-series framing ---------------------------------------------------
interval = 'D'  # predict on daily bars
lag = 10  # length of the history window looked back at prediction time
horizon = 5  # number of future steps to predict
dateformat = '%Y-%m-%d'

# --- Training settings -----------------------------------------------------
num_epochs = 1000
percent_train = 85.0
num_units = 256
batch_size = 4096
dropout_ratio = 0.1

# Every hyperparameter value is passed as a string; values that are already
# strings are unchanged by str().
hyperparameters = {
    name: str(value)
    for name, value in (
        ("interval", interval),
        ("lag", lag),
        ("horizon", horizon),
        ("num_epochs", num_epochs),
        ("batch_size", batch_size),
        ("percent_train", percent_train),
        ("num_units", num_units),
        ("target_stock", target_stock),
        ("covariate_stocks", covariate_stocks),
        ("target_column", target_column),
        ("covariate_columns", covariate_columns),
        ("dropout_ratio", dropout_ratio),
    )
}

### 获取数据

定义一个函数，通过 Athena 获取行情数据：

In [3]:
# !pip install awswrangler

import awswrangler as wr

# Ask awswrangler for the Athena results bucket.
# NOTE(review): presumably this creates the default bucket when it is missing and returns its S3 path — confirm in the awswrangler docs.
s3_output = wr.athena.create_athena_bucket()

def execute_query(database, sql):
    """Run an Athena query to completion and return where its result is stored.

    Args:
        database: Name of the Athena database the query runs against.
        sql: SQL text to execute.

    Returns:
        The ``OutputLocation`` value found under ``ResultConfiguration`` in
        the query-execution record.
    """
    query_execution_id = wr.athena.start_query_execution(database=database, sql=sql)

    # Read the execution record that wait_query hands back once the query has
    # finished, instead of a record fetched before waiting: this avoids an
    # extra API call and a stale pre-completion snapshot.
    # NOTE(review): wait_query is expected to raise when the query fails or is
    # cancelled — confirm against the installed awswrangler version.
    response = wr.athena.wait_query(query_execution_id=query_execution_id)

    return response['ResultConfiguration']['OutputLocation']

首先，我们先获取目标股（贵州茅台）的历史数据：

In [4]:
# Query settings for the target stock: the `limit` most recent rows, newest first.
database, table = 'stock-data', 'stock_day'
fields = '*'
ticker = target_stock
orderby, sort = 'tradedate', 'DESC'
limit = '1000'

sql = f'''
SELECT {fields}
FROM "{database}"."{table}"
WHERE ticker='{ticker}'
ORDER BY {orderby}
{sort}
LIMIT {limit}
'''

# Run the query; the returned location is what the next cell reads back as CSV.
output_location = execute_query(database, sql)

In [5]:
# Load the query result for the target stock.
df = wr.s3.read_csv(path=[output_location])

# Restore the 6-character ticker code by left-padding with zeros.
df['ticker'] = df['ticker'].astype(str).str.zfill(6)

# Rescale the four price columns by the cumulative adjustment factor,
# relative to the factor in the last row of the query result.
last_factor = df['accumadjfactor'].iloc[-1]
for price_column in ('openprice', 'closeprice', 'highestprice', 'lowestprice'):
    df[price_column] = df[price_column] * df['accumadjfactor'] / last_factor

# Keep rows flagged as open; .copy() gives an independent frame so the
# following edits do not operate on a slice of the original.
df = df[df['isopen'] == True].copy()

# Keyword arguments (columns=/axis=) are required here: pandas 2.x no longer
# accepts the axis positionally for drop() and sort_index().
df = df.drop(columns=['isopen', 'accumadjfactor', 'secid'])
df = df.set_index('tradedate').sort_index(axis=0)

df = df.rename(columns={
    'openprice': 'open',
    'closeprice': 'close',
    'highestprice': 'high',
    'lowestprice': 'low',
    'turnovervol': 'volume',
    'turnovervalue': 'value',
})
df.head()

Unnamed: 0_level_0,ticker,open,high,low,close,volume,value
tradedate,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2017-05-26,600519,450.82,455.6,448.0,451.92,2738664.0,1237323000.0
2017-05-31,600519,450.0,450.5,440.11,442.94,4431225.0,1968887000.0
2017-06-01,600519,442.5,449.95,441.01,449.28,4060478.0,1808306000.0
2017-06-02,600519,450.0,450.95,445.6,447.31,2178526.0,974893300.0
2017-06-05,600519,448.04,449.0,442.35,444.41,1924120.0,854876900.0


In [6]:
import pandas as pd

# First and last index values of the target stock's frame; start_date also
# bounds the covariate queries issued later.
start_date, end_date = df.index[0], df.index[-1]
print('Target stock:', ticker, start_date, '-', end_date)

# Seed the combined frame with the target stock's rows.
stock_data = pd.concat([pd.DataFrame([]), df])

Target stock: 600519 2017-05-26 - 2021-07-05


然后我们获取与贵州茅台相关联的股票历史数据：

In [16]:
# Count the non-null values of every column per ticker in the combined frame.
stock_data.groupby('ticker').count()

Unnamed: 0_level_0,open,high,low,close,volume,value
ticker,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
568,999,999,999,999,999,999
596,999,999,999,999,999,999
858,1000,1000,1000,1000,1000,1000
2304,999,999,999,999,999,999
600519,999,999,999,999,999,999
600779,997,997,997,997,997,997


In [12]:
covariates = covariate_stocks.split(',')
for ticker in covariates:

    # Same query as for the target stock, but bounded by the target's first
    # trade date (start_date) instead of a row limit.
    sql = f'''
    SELECT {fields}
    FROM "{database}"."{table}"
    WHERE ticker='{ticker}'
    AND tradedate>='{start_date}'
    ORDER BY {orderby}
    {sort}
    '''

    output_location = execute_query(database, sql)
    df = wr.s3.read_csv(path=[output_location])

    # Restore the 6-character ticker code by left-padding with zeros.
    df['ticker'] = df['ticker'].astype(str).str.zfill(6)

    # Rescale the four price columns by the cumulative adjustment factor,
    # relative to the factor in the last row of the query result.
    last_factor = df['accumadjfactor'].iloc[-1]
    for price_column in ('openprice', 'closeprice', 'highestprice', 'lowestprice'):
        df[price_column] = df[price_column] * df['accumadjfactor'] / last_factor

    # Keep rows flagged as open; .copy() gives an independent frame so the
    # following edits do not operate on a slice of the original.
    df = df[df['isopen'] == True].copy()

    # Keyword arguments (columns=/axis=) are required here: pandas 2.x no
    # longer accepts the axis positionally for drop() and sort_index().
    df = df.drop(columns=['isopen', 'accumadjfactor', 'secid'])
    df = df.set_index('tradedate').sort_index(axis=0)

    df = df.rename(columns={
        'openprice': 'open',
        'closeprice': 'close',
        'highestprice': 'high',
        'lowestprice': 'low',
        'turnovervol': 'volume',
        'turnovervalue': 'value',
    })

    print(ticker, df.shape, df.index[0], df.index[-1])

    # Append this covariate's rows below everything collected so far.
    stock_data = pd.concat([stock_data, df])

000596 (999, 7) 2017-05-26 2021-07-05
000568 (999, 7) 2017-05-26 2021-07-05
000858 (1000, 7) 2017-05-26 2021-07-05
600779 (997, 7) 2017-05-26 2021-07-05
002304 (999, 7) 2017-05-26 2021-07-05


最后，将数据保存到 S3 指定路径：

In [17]:
# Write the combined frame as one CSV object; the same S3 path is later passed to rnn.fit().
wr.s3.to_csv(df=stock_data, path='s3://{}/{}/data/stockdata.csv'.format(bucket, repository_name))

{'paths': ['s3://algo-trading-workshop-peng/sagemaker-custom-rnn/data/stockdata.csv'],
 'partitions_values': {}}

### 构建容器镜像


首先在 ECR 中创建相应的镜像仓库：

In [18]:
# Create the ECR repository that will receive the training image.
ecr = boto3.client('ecr', region_name=aws_region)
ecr.create_repository(repositoryName=repository_name)

{'repository': {'repositoryArn': 'arn:aws:ecr:us-east-1:364198545638:repository/sagemaker-custom-rnn',
  'registryId': '364198545638',
  'repositoryName': 'sagemaker-custom-rnn',
  'repositoryUri': '364198545638.dkr.ecr.us-east-1.amazonaws.com/sagemaker-custom-rnn',
  'createdAt': datetime.datetime(2021, 12, 9, 8, 49, 10, tzinfo=tzlocal()),
  'imageTagMutability': 'MUTABLE',
  'imageScanningConfiguration': {'scanOnPush': False},
  'encryptionConfiguration': {'encryptionType': 'AES256'}},
 'ResponseMetadata': {'RequestId': '8c841f45-c1cb-4fec-a69e-95681a75df9a',
  'HTTPStatusCode': 200,
  'HTTPHeaders': {'x-amzn-requestid': '8c841f45-c1cb-4fec-a69e-95681a75df9a',
   'date': 'Thu, 09 Dec 2021 08:49:10 GMT',
   'content-type': 'application/x-amz-json-1.1',
   'content-length': '413'},
  'RetryAttempts': 0}}

接下来定义 Dockerfile 并构建镜像：

In [19]:
%%writefile {repository_name}/Dockerfile
# Image for the custom RNN algorithm, built on the TensorFlow 2.3.4 base image.
FROM tensorflow/tensorflow:2.3.4

# OS packages; the apt lists are removed afterwards to keep the layer small.
RUN apt-get -y update && apt-get install -y --no-install-recommends \
        apt-utils \
        ca-certificates \
        curl \
        git \
        rsync \
        nginx \
        software-properties-common \
        vim \
        wget \
    && apt-get clean \
    && rm -rf /var/lib/apt/lists/*

# Python libraries for the model code. scikit-learn is installed under its
# real distribution name: the "sklearn" name on PyPI is a deprecated
# placeholder and installing it fails.
RUN pip --no-cache-dir install \
        keras==2.4.3 \
        matplotlib \
        pandas \
        pytz \
        pyyaml \
        s3fs \
        scikit-learn

# Python web-serving libraries.
RUN pip --no-cache-dir install \
        flask \
        gevent \
        gunicorn

ENV PYTHONUNBUFFERED=TRUE
ENV PYTHONDONTWRITEBYTECODE=TRUE
# Put the code directory on PATH so scripts copied there can be run by name.
ENV PATH="/opt/ml/code:${PATH}"

RUN mkdir -p /opt/ml/code
RUN mkdir -p /opt/ml/failure
RUN mkdir -p /opt/ml/model
RUN mkdir -p /opt/ml/output

# Copy the local "model" directory from the build context into the image.
COPY model /opt/ml/code
RUN chmod -R 777 /opt/ml/code
WORKDIR /opt/ml/code

Writing sagemaker-custom-rnn/Dockerfile


In [20]:
# Build the image using the repository directory as build context and tag it with the repository name.
!docker build {repository_name} -t {repository_name}

Sending build context to Docker daemon  87.55kB
Step 1/14 : FROM tensorflow/tensorflow:2.3.4
2.3.4: Pulling from tensorflow/tensorflow

[1B53061382: Pulling fs layer 
[1Bf1f52889: Pulling fs layer 
[1B083dd30a: Pulling fs layer 
[1B3f47c17b: Pulling fs layer 
[1Be8609f80: Pulling fs layer 
[1B215676e7: Pulling fs layer 
[1B5c752173: Pulling fs layer 
[1Bd7f83f62: Pull complete 084kB/1.084kBB[8A[2K[7A[2K[8A[2K[6A[2K[5A[2K[7A[2K[8A[2K[8A[2K[7A[2K[5A[2K[4A[2K[6A[2K[2A[2K[3A[2K[6A[2K[3A[2K[8A[2K[3A[2K[6A[2K[3A[2K[8A[2K[3A[2K[6A[2K[3A[2K[6A[2K[8A[2K[3A[2K[3A[2K[8A[2K[6A[2K[8A[2K[3A[2K[6A[2K[8A[2K[6A[2K[3A[2K[6A[2K[3A[2K[6A[2K[8A[2K[3A[2K[6A[2K[8A[2K[6A[2K[6A[2K[6A[2K[8A[2K[6A[2K[6A[2K[7A[2K[3A[2K[7A[2K[7A[2K[3A[2K[3A[2K[3A[2K[7A[2K[3A[2K[7A[2K[3A[2K[7A[2K[3A[2K[7A[2K[3A[2K[7A[2K[3A[2K[7A[2K[3A[2K[7A[2K[3A[2K[7A[2K[7A[2K[3A[2K[7A[2K[3A[

In [21]:
# Add a second tag that points at the ECR registry of this account and region.
!docker tag {repository_name} {aws_account_id}.dkr.ecr.{aws_region}.amazonaws.com/{repository_name}

In [22]:
# List local images to check that both tags exist.
!docker images

REPOSITORY                                                             TAG       IMAGE ID       CREATED             SIZE
364198545638.dkr.ecr.us-east-1.amazonaws.com/sagemaker-custom-rnn      latest    be842643bd29   1 second ago        1.92GB
sagemaker-custom-rnn                                                   latest    be842643bd29   1 second ago        1.92GB
364198545638.dkr.ecr.us-east-1.amazonaws.com/sagemaker-hpo             latest    7479592629f6   42 minutes ago      1.08GB
sagemaker-hpo                                                          latest    7479592629f6   42 minutes ago      1.08GB
364198545638.dkr.ecr.us-east-1.amazonaws.com/ecs-gridsearch            latest    198b80057241   58 minutes ago      1.23GB
ecs-gridsearch                                                         latest    198b80057241   58 minutes ago      1.23GB
364198545638.dkr.ecr.us-east-1.amazonaws.com/ecs-demo-php-simple-app   latest    b7100a00e52a   About an hour ago   529MB
ecs-demo-php-simple

In [23]:
# Log Docker in to the ECR registry. The token is requested for the same region
# as the registry host name so the login does not depend on the CLI's default region.
!aws ecr get-login-password --region {aws_region} | docker login --username AWS --password-stdin {aws_account_id}.dkr.ecr.{aws_region}.amazonaws.com

https://docs.docker.com/engine/reference/commandline/login/#credentials-store

Login Succeeded


In [24]:
# Push the tagged image to the ECR repository.
!docker push {aws_account_id}.dkr.ecr.{aws_region}.amazonaws.com/{repository_name}

Using default tag: latest
The push refers to repository [364198545638.dkr.ecr.us-east-1.amazonaws.com/sagemaker-custom-rnn]

[1B1f6e39e8: Preparing 
[1Ba75c9e6d: Preparing 
[1Bd7e89141: Preparing 
[1B84a46018: Preparing 
[1B709ce4c9: Preparing 
[1B45ec0480: Preparing 
[1B747fac0e: Preparing 
[1B4270867f: Preparing 
[1Bfc87d570: Preparing 
[1B0c1cfac5: Preparing 
[1Bf44780c7: Preparing 
[1B25925860: Preparing 
[1B801490de: Preparing 
[1Bedb7deb4: Preparing 
[1B33137d72: Preparing 
[1B41e3ef47: Preparing 
[6B25925860: Pushed   981.8MB/970.5MB[14A[2K[17A[2K[12A[2K[11A[2K[10A[2K[11A[2K[10A[2K[11A[2K[10A[2K[11A[2K[10A[2K[11A[2K[10A[2K[11A[2K[10A[2K[11A[2K[10A[2K[11A[2K[12A[2K[10A[2K[9A[2K[10A[2K[11A[2K[8A[2K[9A[2K[11A[2K[9A[2K[7A[2K[11A[2K[10A[2K[10A[2K[11A[2K[9A[2K[10A[2K[9A[2K[11A[2K[9A[2K[11A[2K[9A[2K[11A[2K[9A[2K[6A[2K[9A[2K[10A[2K[9A[2K[10A[2K[9A[2K[10A[2K[7A[2K[10A[2K[

In [25]:
# Image URI in ECR; passed to the SageMaker Estimator below.
image_url = '{}.dkr.ecr.{}.amazonaws.com/{}'.format(aws_account_id, aws_region, repository_name)
print('镜像：', image_url)

镜像： 364198545638.dkr.ecr.us-east-1.amazonaws.com/sagemaker-custom-rnn


### 训练自定义算法模型

要在 Amazon SageMaker 中训练模型，您需要创建训练任务。在以下的示例中，我们将使用 SageMaker 的 Estimator API 进行操作。训练完成后，SageMaker 会将容器中 /opt/ml/model 路径下的所有文件打包成名为 model.tar.gz 的压缩文件，并上传至 S3 的指定位置。

在算法中，我们自动将数据集进行了 training 和 testing 的划分。同时也记录了模型训练的 loss history，可以在最终上传至 S3 的 tar 文件中进行查看。

接下来我们将通过以下代码开始模型的训练，训练过程将经历 1000 个 epoch：

In [26]:
# !pip uninstall -y sagemaker
# !pip install sagemaker==1.72.0
import sagemaker
from sagemaker.estimator import Estimator

# Execution role and SageMaker session; both are passed to the Estimator below.
role = sagemaker.get_execution_role()
session = sagemaker.Session()

In [27]:
# 创建容器镜像和训练实例
rnn = Estimator(image_url, role, 1, 'ml.m5.2xlarge',
                output_path='s3://{}/{}/output'.format(bucket, repository_name),
                base_job_name=repository_name,
                sagemaker_session=session)

rnn.set_hyperparameters(**hyperparameters)

# 模型训练
rnn.fit('s3://{}/{}/data/stockdata.csv'.format(bucket, repository_name))

ResourceLimitExceeded: An error occurred (ResourceLimitExceeded) when calling the CreateTrainingJob operation: The account-level service limit 'Number of instances across all training jobs' is 4 Instances, with current utilization of 4 Instances and a request delta of 1 Instances. Please contact AWS support to request an increase for this limit.

训练完成之后，可以通过 SageMaker SDK 获取最新的训练结果：

In [None]:
# 定义路径
estimator_job = rnn.latest_training_job.job_name
model_archive = "{}/output/{}/output/model.tar.gz".format(repository_name, estimator_job)
print("Estimator created at completion of training job {}".format(estimator_job))
print('模型路径：', model_archive)

这个模型在训练的时候将 loss 信息存放在了 output 的 csv 文件中。通常在训练完成后我们可以观察训练过程 loss history 的变化。可以看到，loss 随着 epoch 增加呈稳定下降的趋势：

In [None]:
import io
import matplotlib
from matplotlib import pyplot
import pandas as pd
import tarfile


def load_csv_from_s3_tarfile(s3, bucket, key, filename, outputpath = None, index = None, sep=","):
    """Read one CSV member out of a tar archive stored in S3.

    Args:
        s3: S3 client; ``get_object`` is used for the download and
            ``put_object`` for the optional upload.
        bucket: Bucket holding the archive (and receiving the optional copy).
        key: Object key of the tar archive.
        filename: Name of the CSV member inside the archive.
        outputpath: When a string, the parsed data is written back to
            ``<outputpath>/<filename>`` in the same bucket as CSV.
        index: When an int, the position of the column to use as the index.
        sep: Field delimiter of the CSV.

    Returns:
        The parsed ``pandas.DataFrame``.
    """
    archive_bytes = io.BytesIO(s3.get_object(Bucket=bucket, Key=key)['Body'].read())

    # Only a plain int selects an index column (bool is deliberately excluded,
    # matching the original exact-type check).
    index_col = index if isinstance(index, int) and not isinstance(index, bool) else None

    # The context manager closes the archive once the member has been parsed.
    with tarfile.open(fileobj=archive_bytes) as archive:
        member = archive.extractfile(filename)
        # sep must be passed by keyword: pandas 2.x accepts only the
        # file/buffer argument positionally.
        data = pd.read_csv(member, sep=sep, index_col=index_col)

    if isinstance(outputpath, str):
        outputprefix = "{}/{}".format(outputpath, filename)
        csv_buffer = io.StringIO()
        data.to_csv(csv_buffer)
        # Upload through the client that was passed in rather than building a
        # second S3 handle from a module-level boto3.
        s3.put_object(Body=csv_buffer.getvalue(), Bucket=bucket, Key=outputprefix)

    return data

    
def plot_loss(s3, bucket, key, filename):
    """Plot the loss curves stored as a CSV inside a tar archive in S3.

    Args:
        s3: S3 client, forwarded to ``load_csv_from_s3_tarfile``.
        bucket: Bucket holding the archive.
        key: Object key of the tar archive.
        filename: Name of the loss-history CSV inside the archive; it must
            provide the columns ``combo_out_loss``, ``main_out_loss`` and
            ``loss``.
    """
    matplotlib.rcParams['figure.figsize'] = (15, 10) # use bigger graphs

    # Load from the archive named by `key`; the previous code ignored this
    # parameter and always read the notebook-level `model_archive`.
    lossdata = load_csv_from_s3_tarfile(s3, bucket, key, filename)

    # plot history
    pyplot.plot(lossdata.combo_out_loss, label='Combo Loss')
    pyplot.plot(lossdata.main_out_loss, label='Main Loss')
    pyplot.plot(lossdata.loss, label='Loss')
    pyplot.legend()
    pyplot.show()

In [None]:
import boto3

# S3 client used to download the model archive.
s3 = boto3.client('s3')

# Plot the curves recorded in loss_history.csv inside the model archive.
plot_loss(s3, bucket, model_archive, "loss_history.csv")

### 模型部署

模型训练成功后，可以选择将模型部署到实例进行使用，这个操作会需要一些时间。部署时需要选择适当的实例大小，以下示例将通过 SageMaker SDK 来完成，也可以选择在 SageMaker Studio 或 SageMaker 控制台中手动操作完成：

In [None]:
from sagemaker.predictor import csv_serializer

# Deploy the trained model to one ml.m5.large instance; requests are serialized as CSV.
predictor = rnn.deploy(1, 'ml.m5.large', serializer=csv_serializer)

### 使用模型进行推理

如果上述的部署顺利完成，就可以创建 predictor 进行推理了。在训练代码中，我们已经将数据集分成了 training 和 test。我们首先将从训练结果的 artifact 中加载这些数据集，并且对预测的结果和真实的历史数据进行比较。

首先，我们创建一个用于推理的 predictor 类：

In [None]:
from sagemaker.predictor import RealTimePredictor

# Rebinds `predictor` (previously the object returned by rnn.deploy) to a client that sends text/csv payloads.
# NOTE(review): the first argument is the training job name — presumably the endpoint carries the same name; confirm.
predictor = RealTimePredictor(estimator_job, content_type = "text/csv")

然后从 output 的压缩包中解压出 test 数据集：

In [None]:
import io
import pandas as pd
import re
import tarfile


def load_csv_from_s3_tarfile(s3, bucket, key, filename, outputpath = None, index = None, sep=","):
    """Read one CSV member out of a tar archive stored in S3.

    Args:
        s3: S3 client; ``get_object`` is used for the download and
            ``put_object`` for the optional upload.
        bucket: Bucket holding the archive (and receiving the optional copy).
        key: Object key of the tar archive.
        filename: Name of the CSV member inside the archive.
        outputpath: When a string, the parsed data is written back to
            ``<outputpath>/<filename>`` in the same bucket as CSV.
        index: When an int, the position of the column to use as the index.
        sep: Field delimiter of the CSV.

    Returns:
        The parsed ``pandas.DataFrame``.
    """
    archive_bytes = io.BytesIO(s3.get_object(Bucket=bucket, Key=key)['Body'].read())

    # Only a plain int selects an index column (bool is deliberately excluded,
    # matching the original exact-type check).
    index_col = index if isinstance(index, int) and not isinstance(index, bool) else None

    # The context manager closes the archive once the member has been parsed.
    with tarfile.open(fileobj=archive_bytes) as archive:
        member = archive.extractfile(filename)
        # sep must be passed by keyword: pandas 2.x accepts only the
        # file/buffer argument positionally.
        data = pd.read_csv(member, sep=sep, index_col=index_col)

    if isinstance(outputpath, str):
        outputprefix = "{}/{}".format(outputpath, filename)
        csv_buffer = io.StringIO()
        data.to_csv(csv_buffer)
        # Upload through the client that was passed in, the same way
        # extract_matching_csv_files_from_s3_tarfile does.
        s3.put_object(Body=csv_buffer.getvalue(), Bucket=bucket, Key=outputprefix)

    return data


def extract_matching_csv_files_from_s3_tarfile(s3, bucket, key, namepattern, outputpath = None, index = None, sep=",", parse_dates = True):
    """Parse every CSV member of an S3 tar archive whose name matches a pattern.

    Args:
        s3: S3 client; ``get_object`` is used for the download and
            ``put_object`` for the optional uploads.
        bucket: Bucket holding the archive (and receiving the optional copies).
        key: Object key of the tar archive.
        namepattern: Regular expression applied with ``re.match`` (anchored at
            the start of the member name only).
        outputpath: When a string, each parsed file is written back to
            ``<outputpath>/<member name>`` in the same bucket.
        index: When an int, the position of the column to use as the index.
        sep: Field delimiter of the CSV files.
        parse_dates: Forwarded to ``pandas.read_csv``.

    Returns:
        A list with one entry per matching member, in archive order: the
        ``s3://`` path of the uploaded copy when ``outputpath`` is a string,
        otherwise the member name.
    """
    # Download the archive once and reuse it for every member; the previous
    # code fetched the whole object again for each matching file.
    archive_bytes = io.BytesIO(s3.get_object(Bucket=bucket, Key=key)['Body'].read())
    matcher = re.compile(namepattern)  # compiled once instead of once per name
    use_index = isinstance(index, int) and not isinstance(index, bool)
    upload = isinstance(outputpath, str)
    filepaths = []

    with tarfile.open(fileobj=archive_bytes) as archive:
        for filename in archive.getnames():
            if not matcher.match(filename):
                continue

            extractedfile = archive.extractfile(filename)

            # Every matching member is parsed even when nothing is uploaded,
            # so malformed files still fail loudly as before.
            if use_index:
                data = pd.read_csv(extractedfile, index_col=index, delimiter=sep, parse_dates=parse_dates)
            else:
                data = pd.read_csv(extractedfile, delimiter=sep, parse_dates=parse_dates)

            if upload:
                outputprefix = "{}/{}".format(outputpath, filename)
                csv_buffer = io.StringIO()
                data.to_csv(csv_buffer)
                s3.put_object(Body=csv_buffer.getvalue(), Bucket=bucket, Key=outputprefix)
                filepaths.append("s3://{}/{}".format(bucket, outputprefix))
            else:
                filepaths.append(filename)

    return filepaths

In [None]:
import boto3

# S3 client used to read the model archive and to write the extracted CSV files.
s3 = boto3.client('s3')

# Extract every test<N>.csv member and upload it next to the archive
# (the output prefix is the archive key up to "/model").
filepaths = extract_matching_csv_files_from_s3_tarfile(s3, bucket, model_archive, "test[0-9]+.csv", model_archive[:model_archive.find("/model")], 0)
print('数据路径：')
for f in filepaths:
    print(f)

# Load testdata.csv from the same archive, using the first column as the index.
testfilename = "testdata.csv"
testdata = load_csv_from_s3_tarfile(s3, bucket, model_archive, testfilename, model_archive[:model_archive.find("/model")], 0)
# Restore the 6-character ticker code by left-padding with zeros.
testdata['ticker'] = testdata['ticker'].apply(lambda x: str(x))
testdata['ticker'] = testdata['ticker'].apply(lambda x: '0'*(6-len(x)) + x)
print(testdata.shape)
testdata.head()

接下来将 test 数据集传递给 predictor 类进行时序预测，然后简单比较一下预测值和实际值的差距：

In [None]:
from matplotlib import pyplot

def plot_sample_predictions(predictor, filepaths, target_stock, target_column, lag):
    """Plot given versus predicted values for each sample file, three plots per row.

    Args:
        predictor: Object whose ``predict(filepath)`` returns bytes; the
            decoded text is split on whitespace and the part of each token
            after its first comma is parsed as a float.
        filepaths: Paths of the sample CSV files; each is both read with
            pandas and passed as-is to ``predictor.predict``.
        target_stock: Ticker whose rows are plotted.
        target_column: Column holding the plotted values.
        lag: Number of leading given values that precede the predictions in
            the "Predicted" curve.
    """
    _, axs = pyplot.subplots(int((len(filepaths)-1)/3)+1, 3, figsize=(30, 10))
    axx = axs.ravel()

    for k, filepath in enumerate(filepaths):

        data = pd.read_csv(filepath, index_col=0)
        # Restore the 6-character ticker code by left-padding with zeros.
        data['ticker'] = data['ticker'].astype(str).str.zfill(6)
        test_main = data[data['ticker'] == target_stock].copy()

        given = test_main[target_column].iloc[:-1]
        given.plot(ax=axx[k], use_index=True, legend=True, label="Given")

        response = predictor.predict(filepath).decode("utf-8").split()
        preds = [float(row[row.find(',')+1:]) for row in response]

        # pd.concat replaces Series.append, which was removed in pandas 2.0.
        predicted = pd.concat([test_main[target_column].iloc[:lag], pd.Series(preds)])
        predicted.index = given.index
        predicted = predicted.reset_index()
        predicted[0].plot(ax=axx[k], legend=True, label="Predicted")

In [None]:
# Plot given vs. predicted values for every extracted sample file.
plot_sample_predictions(predictor, filepaths, target_stock, target_column, lag)

In [None]:
from matplotlib import pyplot


def test_sample_from_testdata(testdata, target_stock, covariate_stocks, lag, horizon, inc):
    
    target_stock_data = testdata[testdata['ticker'] == target_stock].copy()
    covariate_stock_data = []
    covariate_stock_list = covariate_stocks.split(",")
    
    for covariate_stock in covariate_stock_list:
        covariate_stock_data.append(testdata[testdata['ticker'] == covariate_stock.strip()])
        
    test_size = target_stock_data.shape[0] 
    span = lag + horizon + 1 
    testinputs = []
    num_test_samples = int(test_size/span)
    
    for i in range(0, test_size - span, inc):  
        
        test_input = target_stock_data.iloc[i:i+span]
        for cov in covariate_stock_data:
            test_input = test_input.append(cov.iloc[i:i+span])
        testinputs.append(test_input)  
        
    return testinputs    


def plot_sample_test_performance(predictor, testdata, target_stock, covariate_stock, target_column, lag, horizon, inc):
    """Plot given versus predicted values for each test window, three plots per row.

    Args:
        predictor: Object whose ``predict(csv_text)`` returns bytes; the
            decoded text is split on whitespace and the part of each token
            after its first comma is parsed as a float.
        testdata: DataFrame with a ``ticker`` column, forwarded to
            ``test_sample_from_testdata``.
        target_stock: Ticker whose rows are plotted.
        covariate_stock: Comma-separated covariate tickers.
        target_column: Column holding the plotted values.
        lag: Number of history rows per window.
        horizon: Number of future rows per window.
        inc: Step between consecutive windows.
    """
    testinputs = test_sample_from_testdata(testdata, target_stock, covariate_stock, lag, horizon, inc)

    _, axs = pyplot.subplots(int((len(testinputs)-1)/3)+1, 3, figsize=(30, 30))
    axx = axs.ravel()

    for k, testinput in enumerate(testinputs):
        test_main = testinput[testinput['ticker'] == target_stock].copy()
        given = test_main[target_column].iloc[0:-1]
        given.plot(ax=axx[k], use_index=True, legend=True, label="Given")

        # The whole window (target plus covariates) is sent as CSV text.
        response = predictor.predict(testinput.to_csv()).decode("utf-8").split()
        preds = [float(row[row.find(',')+1:]) for row in response]

        # pd.concat replaces Series.append, which was removed in pandas 2.0.
        predicted = pd.concat([test_main[target_column].iloc[:lag], pd.Series(preds)])
        predicted.index = given.index
        predicted = predicted.reset_index()
        predicted[0].plot(ax=axx[k], legend=True, label="Predicted")

In [None]:
# Use `horizon` as the window step.
plot_sample_test_performance(predictor, testdata, target_stock, covariate_stocks, target_column, lag, horizon, horizon)

In [None]:
import matplotlib


def test_sample_from_testdata(testdata, target_stock, covariate_stocks, lag, horizon, inc):
    
    target_stock_data = testdata[testdata['ticker'] == target_stock].copy()
    covariate_stock_data = []
    covariate_stock_list = covariate_stocks.split(",")
    
    for covariate_stock in covariate_stock_list:
        covariate_stock_data.append(testdata[testdata['ticker'] == covariate_stock.strip()])
        
    test_size = target_stock_data.shape[0] 
    span = lag + horizon + 1 
    testinputs = []
    num_test_samples = int(test_size/span)
    
    for i in range(0, test_size - span, inc):  
        
        test_input = target_stock_data.iloc[i:i+span]
        for cov in covariate_stock_data:
            test_input = test_input.append(cov.iloc[i:i+span])
        testinputs.append(test_input)  
        
    return testinputs


def plot_overall_test_performance(predictor, testdata, target_stock, covariate_stock, target_column, lag, horizon, inc):
    """Plot the target stock's full test series with every window's prediction overlaid.

    Args:
        predictor: Object whose ``predict(csv_text)`` returns bytes; the
            decoded text is split on whitespace and the part of each token
            after its first comma is parsed as a float.
        testdata: DataFrame with a ``ticker`` column, forwarded to
            ``test_sample_from_testdata``.
        target_stock: Ticker whose rows are plotted.
        covariate_stock: Comma-separated covariate tickers.
        target_column: Column holding the plotted values.
        lag: Number of history rows per window.
        horizon: Number of future rows per window.
        inc: Step between consecutive windows.
    """
    testinputs = test_sample_from_testdata(testdata, target_stock, covariate_stock, lag, horizon, inc)

    matplotlib.rcParams['figure.figsize'] = (25, 17)
    target_stock_data = testdata[testdata['ticker'] == target_stock].copy()

    # The first plot call creates the axes that all predictions are drawn on.
    given = target_stock_data[target_column]
    ax = given.plot(ax=None, legend=True, label="Given")

    for k, testinput in enumerate(testinputs):
        # The whole window (target plus covariates) is sent as CSV text.
        response = predictor.predict(testinput.to_csv()).decode("utf-8").split()
        preds = [float(row[row.find(',')+1:]) for row in response]

        # Given values up to this window's history, followed by the
        # predictions. pd.concat replaces Series.append (removed in pandas 2.0).
        history = target_stock_data[target_column].iloc[:k*inc+1+lag]
        predicted = pd.concat([history, pd.Series(preds)])
        predicted = predicted.reset_index()
        predicted[0].plot(ax=ax, legend=True, label="Predicted on {}".format(testinput.index[testinput.shape[0] - 1]))

In [None]:
# Use `horizon` as the window step.
plot_overall_test_performance(predictor, testdata, target_stock, covariate_stocks, target_column, lag, horizon, horizon)

在终端节点不再使用后，一定要将其删除，否则会一直产生费用：

In [None]:
# predictor.delete_endpoint()