<a href="https://colab.research.google.com/github/renxiaowei/saved_code_analysis/blob/main/TensorFlow_time_series_prediction.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

This document describes the structure of the TensorFlow-based code.
The original code is from 'https://github.com/jsyoon0823/Time-series-prediction'
The main goal is to make predictions from the variation of a sequence over time.
The original code is released as a Python command-line script, not as a Jupyter notebook.

Here, I illustrate it based on the Jupyter format.

The first step is running the code.
原先的仅仅是一条python command：

!python3 main_time_series_prediction.py --train_rate 1 --seq_len 7 --task regression --model_type lstm --h_dim 10 --n_layer 3 --batch_size 2 --epoch 2 --learning_rate 0.01 --metric_name mae

但是这里我们把main_time_series_prediction.py进行了拆分，在jupyter。


In [1]:
!git clone https://github.com/jsyoon0823/Time-series-prediction.git

Cloning into 'Time-series-prediction'...
remote: Enumerating objects: 3, done.[K
remote: Counting objects: 100% (3/3), done.[K
remote: Compressing objects: 100% (3/3), done.[K
remote: Total 39 (delta 0), reused 3 (delta 0), pack-reused 36[K
Unpacking objects: 100% (39/39), done.


In [2]:
!ls
%cd Time-series-prediction/

sample_data  Time-series-prediction
/content/Time-series-prediction


In [None]:
# 加载主函数中必要的package
# Necessary packages
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import argparse
import warnings
warnings.filterwarnings("ignore")

from data_loader import data_loader
from basic_rnn_lstm_gru import GeneralRNN
from basic_attention import Attention
from utils import performance

In [None]:
# 定义主函数，包括：
# 加载training 和 testing dataset
# 加载预定义的参数值
# 定义model
# 做training
# 做testing
# 做性能计算

def main(args):
  """Time-series prediction main function.

  Loads the train / test split, builds the requested model, trains it,
  predicts on the test set and prints the chosen performance metric.

  Args:
    - args: object exposing the following attributes
      - train_rate: training data ratio
      - seq_len: sequence length
      - task: classification or regression
      - model_type: rnn, lstm, gru, or attention
      - h_dim: hidden state dimensions
      - n_layer: number of layers
      - batch_size: the number of samples in each mini-batch
      - epoch: the number of iterations
      - learning_rate: learning rates
      - metric_name: mse or mae

  Raises:
    - ValueError: if args.model_type is not one of the supported models.
  """
  # Validate the model type before doing any work. Without this check an
  # unknown type would skip both branches below and only fail later with an
  # unbound 'test_y_hat', after the data had already been loaded.
  rnn_types = ('rnn', 'lstm', 'gru')
  if args.model_type not in rnn_types and args.model_type != 'attention':
    raise ValueError(
        'Unsupported model_type: ' + repr(args.model_type) +
        " (expected one of 'rnn', 'lstm', 'gru', 'attention')")

  # Load data
  train_x, train_y, test_x, test_y = data_loader(args.train_rate,
                                                 args.seq_len)

  # Model training / testing
  model_parameters = {'task': args.task,
                      'model_type': args.model_type,
                      'h_dim': args.h_dim,
                      'n_layer': args.n_layer,
                      'batch_size': args.batch_size,
                      'epoch': args.epoch,
                      'learning_rate': args.learning_rate}

  # Both model classes are used through the same fit / predict calls, so only
  # the construction differs between the two families.
  if args.model_type in rnn_types:
    model = GeneralRNN(model_parameters)
  else:
    model = Attention(model_parameters)

  model.fit(train_x, train_y)
  test_y_hat = model.predict(test_x)

  # Evaluation
  result = performance(test_y, test_y_hat, args.metric_name)
  print('Performance (' + args.metric_name + '): ' + str(result))
  

In [None]:
# 因为加载的参数比较多而杂，为了便于管理，定义一个类Para
# 基于这个类，可以简化参数结构。
# 同时这个类，实际上是跟source code中的，parser = argparse.ArgumentParser()，相匹配的。
class Para:
  """Plain container of default hyper-parameters for main().

  The attribute names are the ones main() reads from its 'args' argument, so
  an instance can be passed to main() in place of the namespace produced by
  argparse. Values are class attributes; a notebook cell may override them
  per instance before calling main().
  """
  train_rate = 0.8  # training data ratio
  seq_len = 7  # sequence length
  task = 'regression'  # classification or regression
  model_type = 'lstm'  # rnn, lstm, gru, or attention
  h_dim = 10  # hidden state dimensions
  n_layer = 3  # number of layers
  batch_size = 2  # number of samples in each mini-batch
  epoch = 2  # number of training iterations
  learning_rate = 0.01  # learning rate
  metric_name = 'mae'  # mse or mae

In [None]:
# 对参数进行赋值
# 运行主函数
# Start from the defaults declared on Para, then apply this experiment's
# settings on the instance.
args = Para()

experiment_settings = {
    # Data
    'train_rate': 0.8,
    'seq_len': 7,
    # Model / optimisation
    'task': 'regression',
    'model_type': 'lstm',
    'h_dim': 10,
    'n_layer': 3,
    'batch_size': 64,
    'epoch': 100,
    'learning_rate': 0.01,
    # Evaluation
    'metric_name': 'mae',
}
for option_name, option_value in experiment_settings.items():
  setattr(args, option_name, option_value)

# Call main function
main(args)

# 备份的code

In [None]:
# 这个是source code中的关于参数传入的结构，主要是基于parser = argparse.ArgumentParser()
# 如果我们想要添加或者删减一些参数，可以添加parser.add_argument(**）的模块就可以
# 基于这个结构，我们就可以直接一条命令，就可以直接运行函数。
##  
if __name__ == '__main__':

  # Inputs for the main function.
  # Every option has a default, so the script can run without arguments.
  parser = argparse.ArgumentParser()
  # train_rate must be parsed as a float: it is multiplied by the number of
  # samples when the data is split, which breaks if the command-line value
  # stays a string.
  parser.add_argument(
      '--train_rate',
      help='training data ratio',
      default=0.8,
      type=float)
  parser.add_argument(
      '--seq_len',
      help='sequence length',
      default=7,
      type=int)
  parser.add_argument(
      '--model_type',
      help='model architecture',
      choices=['rnn','gru','lstm','attention'],
      default='attention',
      type=str)
  parser.add_argument(
      '--h_dim',
      help='hidden state dimensions',
      default=10,
      type=int)
  parser.add_argument(
      '--n_layer',
      help='number of layers',
      default=3,
      type=int)
  parser.add_argument(
      '--batch_size',
      help='the number of samples in each mini-batch',
      default=32,
      type=int)
  parser.add_argument(
      '--epoch',
      help='the number of iterations',
      default=100,
      type=int)
  parser.add_argument(
      '--learning_rate',
      help='learning rate',
      default=0.01,
      type=float)
  parser.add_argument(
      '--task',
      help='prediction task',
      choices=['classification','regression'],
      default='regression',
      type=str)
  parser.add_argument(
      '--metric_name',
      help='evaluation metric',
      choices=['mse','mae'],
      default='mae',
      type=str)

  args = parser.parse_args()

  # Call main function
  main(args)

In [None]:
# 这个就是python based command
!python3 main_time_series_prediction.py --train_rate 0.8 --seq_len 7 --task regression --model_type lstm --h_dim 10 --n_layer 3 --batch_size 32 --epoch 100 --learning_rate 0.01 --metric_name mae

# 代码分析

In [None]:
# 对应着数据集加载，data_loader.py
# Necessary Packages
import numpy as np
from utils import MinMaxScaler


def data_loader(train_rate = 0.8, seq_len = 7):
  """Loads Google stock data and builds shuffled sliding-window samples.

  Each sample uses seq_len consecutive rows as features and the last column
  of the following row as the label. Samples are shuffled before being split
  into training and testing sets.

  Args:
    - train_rate: the ratio between training and testing sets
    - seq_len: sequence length

  Returns:
    - train_x: training feature
    - train_y: training labels
    - test_x: testing features
    - test_y: testing labels

  Raises:
    - ValueError: if the file has too few rows to build a single sample.
  """

  # Load data
  # NOTE: the whole CSV is read into one numpy array. For a data set too big
  # for memory, one option is to split it randomly into k parts per epoch and
  # load / train on one part at a time. k trades loading overhead (large k)
  # against memory use (small k) and should be derived from the largest chunk
  # that fits in memory. Reading one big file is also preferable to reading
  # many small files, because frequent reads are slow.
  ori_data = np.loadtxt('data/google.csv', delimiter=',', skiprows = 1)

  # Reverse the time order
  # presumably the file is stored newest-first (daily prices), so reversing
  # puts it in chronological order -- TODO confirm against the data file
  reverse_data = ori_data[::-1]

  # Normalization
  # presumably scales each column into [0, 1]; see utils.MinMaxScaler
  norm_data = MinMaxScaler(reverse_data)

  # Build dataset
  # Sliding window over the start index i: rows [i, i + seq_len) are the
  # features and row i + seq_len provides the label, so the last seq_len rows
  # cannot start a window.
  n_samples = len(norm_data) - seq_len
  if n_samples <= 0:
    raise ValueError(
        'Not enough rows (' + str(len(norm_data)) +
        ') to build a sample with seq_len = ' + str(seq_len))

  # Comprehensions avoid the quadratic cost of growing a list with
  # 'list = list + [item]' on every iteration.
  # Previous seq_len data as features: each window is (seq_len, n_columns)
  data_x = np.asarray([norm_data[i:i + seq_len, :]
                       for i in range(n_samples)])
  # Values at next time point as labels: last column only, kept as length 1
  data_y = np.asarray([norm_data[i + seq_len, [-1]]
                       for i in range(n_samples)])

  # Train / test Division
  # Shuffle by permuting the sample indices, then take the first
  # train_rate share for training and the remainder for testing.
  idx = np.random.permutation(len(data_x))
  n_train = int(train_rate * len(data_x))
  train_idx = idx[:n_train]
  test_idx = idx[n_train:]

  train_x, test_x = data_x[train_idx, :, :], data_x[test_idx, :, :]
  train_y, test_y = data_y[train_idx, :], data_y[test_idx, :]

  return train_x, train_y, test_x, test_y

  # This function is a load -> window -> shuffle -> split pipeline.
  # It separates the (training + validation) data from the test data.
  # It is meant to run once, outside the epoch loop (or offline).


In [None]:
# Mount Google Drive in the Colab runtime at /content/drive.
from google.colab import drive
drive.mount('/content/drive')

In [7]:
  # Quick experiment to check what the order reversal does
  
  import numpy as np
  # Load data
  ori_data = np.loadtxt('data/google.csv', delimiter=',', skiprows = 1)
  print(ori_data[0,:])
  # Reverse the time order
  # Python slicing has the form list[<start>:<stop>:<step>], so [::-1] walks
  # the array backwards, and here only along the first dimension (rows).
  # The same syntax works on other axes of a multi-dimensional array,
  # e.g. b[:, ::-1].
  reverse_data = ori_data[::-1]
  print(reverse_data[0,:])

[8.28659973e+02 8.33450012e+02 8.28349976e+02 1.24770000e+06
 8.31659973e+02]
[  568.00257    568.00257    552.922516 13100.         558.462551]


# 代码分析

In [None]:
#这个代码来自basic_rnn_lstm_gru.py
#主要作用就是定义一个类，这个类里边，定义各种model
# Necessary packages
import os
import tensorflow as tf
import numpy as np
from datetime import datetime
from tensorflow.keras import layers
from keras.callbacks import ModelCheckpoint
from utils import binary_cross_entropy_loss, mse_loss, rnn_sequential


class GeneralRNN():
  """RNN predictive model to time-series.

  Builds, trains and applies a stacked recurrent Keras model. The best
  weights seen during training (lowest validation loss) are kept through a
  temporary checkpoint file.

  Attributes:
    - model_parameters:
      - task: classification or regression
      - model_type: 'rnn', 'lstm', or 'gru'
      - h_dim: hidden dimensions
      - n_layer: the number of layers
      - batch_size: the number of samples in each batch
      - epoch: the number of iteration epochs
      - learning_rate: the learning rate of model training
  """

  def __init__(self, model_parameters):
    """Store the hyper-parameters and prepare the checkpoint location.

    Args:
      - model_parameters: dictionary with the keys listed in the class
        docstring

    Raises:
      - AssertionError: if model_type is not 'rnn', 'lstm' or 'gru'
    """
    # Hyper-parameters passed in by the caller
    self.task = model_parameters['task']
    self.model_type = model_parameters['model_type']
    self.h_dim = model_parameters['h_dim']
    self.n_layer = model_parameters['n_layer']
    self.batch_size = model_parameters['batch_size']
    self.epoch = model_parameters['epoch']
    self.learning_rate = model_parameters['learning_rate']

    # Check that the requested model type is supported by this class
    assert self.model_type in ['rnn', 'lstm', 'gru'], \
        'model_type must be one of rnn, lstm, gru'

    # Predictor model define
    # The Keras model is only built in fit(), once the data shape is known.
    self.predictor_model = None

    # Set path for model saving
    model_path = 'tmp'
    os.makedirs(model_path, exist_ok=True)
    # Join with a path separator so the checkpoint lands inside model_path.
    # Plain string concatenation produced 'tmpHHMMSS.hdf5' in the working
    # directory and left the directory created above unused.
    self.save_file_name = os.path.join(
        model_path, datetime.now().strftime('%H%M%S') + '.hdf5')


  def _build_model(self, x, y):
    """Construct the model using feature and label statistics.

    Args:
      - x: features, indexed as [sample, time step, feature]
      - y: labels; the size of its last axis sets the output width

    Returns:
      - model: predictor model
    """
    # Parameters
    h_dim = self.h_dim
    n_layer = self.n_layer
    dim = len(x[0, 0, :])           # number of features per time step
    max_seq_len = len(x[0, :, 0])   # number of time steps per sample

    model = tf.keras.Sequential()
    model.add(layers.Masking(mask_value=0., input_shape=(max_seq_len, dim)))

    # rnn_sequential comes from utils; presumably it appends one recurrent
    # layer of the requested type to the model. All layers but the last
    # return the full sequence so the next recurrent layer can consume it.
    for _ in range(n_layer - 1):
      model = rnn_sequential(model, self.model_type, h_dim, return_seq=True)

    model = rnn_sequential(model, self.model_type, h_dim,
                           return_seq=False)
    adam = tf.keras.optimizers.Adam(learning_rate=self.learning_rate,
                                    beta_1=0.9, beta_2=0.999, amsgrad=False)

    if self.task == 'classification':
      model.add(layers.Dense(y.shape[-1], activation='sigmoid'))
      model.compile(loss=binary_cross_entropy_loss, optimizer=adam)

    elif self.task == 'regression':
      model.add(layers.Dense(y.shape[-1], activation='linear'))
      model.compile(loss=mse_loss, optimizer=adam, metrics=['mse'])

    return model


  def fit(self, x, y):
    """Fit the predictor model.

    Args:
      - x: training features
      - y: training labels

    Returns:
      - self.predictor_model: trained predictor model
    """
    # Randomly split the given training set into 80% training and 20%
    # validation samples.
    idx = np.random.permutation(len(x))
    n_train = int(len(idx)*0.8)
    train_idx = idx[:n_train]
    valid_idx = idx[n_train:]

    train_x, train_y = x[train_idx], y[train_idx]
    valid_x, valid_y = x[valid_idx], y[valid_idx]

    self.predictor_model = self._build_model(train_x, train_y)

    # Callback for the best model saving
    save_best = ModelCheckpoint(self.save_file_name, monitor='val_loss',
                                mode='min', verbose=False,
                                save_best_only=True)

    # Train the model
    self.predictor_model.fit(train_x, train_y,
                             batch_size=self.batch_size, epochs=self.epoch,
                             validation_data=(valid_x, valid_y),
                             callbacks=[save_best], verbose=True)

    # Restore the checkpointed best weights, then delete the temporary file.
    self.predictor_model.load_weights(self.save_file_name)
    os.remove(self.save_file_name)

    return self.predictor_model


  def predict(self, test_x):
    """Return the model predictions for the given features.

    Args:
      - test_x: testing features

    Returns:
      - test_y_hat: predictions on testing set
    """
    test_y_hat = self.predictor_model.predict(test_x)
    return test_y_hat
