In [1]:
import numpy as np
import pandas as pd

In [57]:
def search_data(sequence_length, num_of_batches, label_strat_idx, num_of_predict,
                units, points_per_hour):
    '''
    Locate the historical windows that precede a prediction target.

    Window ``i`` (counting from 1) starts ``points_per_hour * units * i`` points
    before ``label_strat_idx`` and is ``num_of_predict`` points long.

    :param sequence_length: total length of the data sequence, int
    :param num_of_batches: number of historical windows to look back, int
    :param label_strat_idx: start index of the prediction target, int
    :param num_of_predict: number of points in the target and in each window, int
    :param units: number of hours in one period, int
    :param points_per_hour: number of data points per hour, must be positive, int
    :return: list of ``(start_idx, end_idx)`` tuples ordered from the oldest
             window to the most recent one, or None when the target runs past
             the end of the sequence or any window would start before index 0
    :raises ValueError: if ``points_per_hour`` is not strictly positive
    '''
    # Zero must be rejected as well: with points_per_hour == 0 every window
    # would start at label_strat_idx, i.e. the "history" would be the target.
    if points_per_hour <= 0:
        raise ValueError("points_per_hour should be greater than 0!")
    if label_strat_idx + num_of_predict > sequence_length:
        return None

    points_per_period = points_per_hour * units
    x_idx = []
    # Count down so the result is already ordered oldest-first.
    for i in range(num_of_batches, 0, -1):
        start_idx = label_strat_idx - points_per_period * i
        if start_idx < 0:
            # Not enough history for this window: the whole sample is unusable.
            return None
        x_idx.append((start_idx, start_idx + num_of_predict))
    return x_idx
# Function that generates the sample sequences
def get_sample_indices(data_sequence,num_of_weeks,num_of_days,num_of_hours,
                       label_start_idx,num_of_predict,points_per_hour=12):
    '''
    Assemble one sample: weekly, daily and recent history plus the target.

    :param data_sequence: data sequence, (sequence_length, num_of_vertices, num_of_features)
    :param num_of_weeks: number of weekly periods to look back, int
    :param num_of_days: number of daily periods to look back, int
    :param num_of_hours: number of recent (hourly) periods to look back, int
    :param label_start_idx: start index of the prediction target, int
    :param num_of_predict: number of points predicted per sample, int
    :param points_per_hour: number of points per hour, int, default 12
    :return: tuple (week_sample, day_sample, hour_sample, target); each history
             array is its windows concatenated along axis 0. None when any of
             the three window lookups comes back empty or None.
    '''
    total_length = data_sequence.shape[0]

    # (number of periods, hours per period) for the week / day / hour components.
    lookups = (
        (num_of_weeks, 7 * 24),
        (num_of_days, 24),
        (num_of_hours, 1),
    )

    windows_per_component = []
    for period_count, hours_per_period in lookups:
        windows = search_data(total_length, period_count, label_start_idx,
                              num_of_predict, hours_per_period, points_per_hour)
        if not windows:
            # Missing history for this component (or zero periods requested).
            return None
        windows_per_component.append(windows)

    week_sample, day_sample, hour_sample = (
        np.concatenate([data_sequence[start:stop] for start, stop in windows],
                       axis=0)
        for windows in windows_per_component
    )

    target_stop = label_start_idx + num_of_predict
    target = data_sequence[label_start_idx:target_stop]

    return week_sample, day_sample, hour_sample, target

In [62]:
# Read the header-less flow CSV, keep the first 8640 rows and convert to float32.
# NOTE(review): the path is an absolute local path; consider a configurable data directory.
X = (
    pd.read_csv('D:/pyprojects/TreeCN_GCNN-master/data_set/SmallScaleAggregation/V_flow_50.csv',
                header=None)
    .head(8640)
    .to_numpy(np.float32)
)

# Append a trailing axis of length 1: (rows, columns) -> (rows, columns, 1).
X = np.expand_dims(X, axis=-1)
print(X.shape)


(8640, 50, 1)


In [95]:
# Sampling configuration: number of weekly / daily / recent periods per sample,
# prediction horizon and sampling rate.
points_per_hour = 12
num_of_predict = 12
num_of_weeks = 1
num_of_days = 1
num_of_hours = 2

# Try every index of X as a label start; keep the ones that yield a sample.
all_sample = []
for idx in range(X.shape[0]):
    sample = get_sample_indices(X, num_of_weeks, num_of_days, num_of_hours,
                                idx, num_of_predict, points_per_hour)
    if sample:
        week_sample, day_sample, hour_sample, target = sample
        # Weekly, daily and hourly data get a new leading axis so samples can be
        # concatenated along it; the original axis 0 is moved last. The target
        # additionally keeps only index 0 of its third axis.
        all_sample.append((
            np.transpose(np.expand_dims(week_sample, 0), (0, 2, 3, 1)),
            np.transpose(np.expand_dims(day_sample, 0), (0, 2, 3, 1)),
            np.transpose(np.expand_dims(hour_sample, 0), (0, 2, 3, 1)),
            np.transpose(np.expand_dims(target, 0), (0, 2, 3, 1))[:, :, 0, :],
        ))

# Data split lines: first 60% / next 20% / last 20% of the samples, in order.
split_line1 = int(len(all_sample) * 0.6)
split_line2 = int(len(all_sample) * 0.8)

# Build the training, validation and test sets: regroup the per-sample tuples
# by component and stack each component along the sample axis.
training_set = [np.concatenate(parts, axis=0)
                for parts in zip(*all_sample[:split_line1])]
validation_set = [np.concatenate(parts, axis=0)
                  for parts in zip(*all_sample[split_line1:split_line2])]
testing_set = [np.concatenate(parts, axis=0)
               for parts in zip(*all_sample[split_line2:])]

train_week, train_day, train_hour, train_target = training_set
val_week, val_day, val_hour, val_target = validation_set
test_week, test_day, test_hour, test_target = testing_set

print('training data: week: {}, day: {}, recent: {}, target: {}'.format(
    *[part.shape for part in training_set]))
print('validation data: week: {}, day: {}, recent: {}, target: {}'.format(
    *[part.shape for part in validation_set]))
print('testing data: week: {}, day: {}, recent: {}, target: {}'.format(
    *[part.shape for part in testing_set]))



training data: week: (3967, 50, 1, 12), day: (3967, 50, 1, 12), recent: (3967, 50, 1, 24), target: (3967, 50, 12)
validation data: week: (1323, 50, 1, 12), day: (1323, 50, 1, 12), recent: (1323, 50, 1, 24), target: (1323, 50, 12)
testing data: week: (1323, 50, 1, 12), day: (1323, 50, 1, 12), recent: (1323, 50, 1, 24), target: (1323, 50, 12)


In [97]:
def normalization(train, val, test):
    """
    Standardize three arrays with the statistics of the training array.

    The mean and standard deviation are computed over axis 0 of ``train``
    and applied to ``train``, ``val`` and ``test`` alike.

    Parameters
    ----------
    train, val, test: np.ndarray
        Arrays whose shapes agree on every axis except axis 0.

    Returns
    ----------
    stats: dict, two keys: mean and std
        The statistics of ``train`` along axis 0 (with ``keepdims=True``).
        ``std`` is returned as computed, so it may contain zeros.

    train_norm, val_norm, test_norm: np.ndarray,
                                     shape is the same as original

    Raises
    ----------
    AssertionError
        If the trailing shapes of the three arrays differ.
    """

    assert train.shape[1:] == val.shape[1:] and val.shape[1:] == test.shape[1:]

    mean = train.mean(axis=0, keepdims=True)
    std = train.std(axis=0, keepdims=True)

    # A position that is constant across the training set has std == 0; dividing
    # by it would give NaN/inf. Divide by 1 there instead, so such positions are
    # only mean-centred.
    divisor = std.copy()
    divisor[divisor == 0] = 1

    def normalize(x):
        return (x - mean) / divisor

    train_norm = normalize(train)
    val_norm = normalize(val)
    test_norm = normalize(test)

    return {'mean': mean, 'std': std}, train_norm, val_norm, test_norm

In [99]:
# Normalize each history component with the statistics of its own training split.
week_stats, train_week_norm, val_week_norm, test_week_norm = normalization(
    train_week, val_week, test_week)

day_stats, train_day_norm, val_day_norm, test_day_norm = normalization(
    train_day, val_day, test_day)

recent_stats, train_recent_norm, val_recent_norm, test_recent_norm = normalization(
    train_hour, val_hour, test_hour)

In [73]:
# Debug output: the number of collected samples, then the full sample list.
print(len(all_sample), all_sample, sep='\n')

1
[(array([[[[ 22.,  23.,  17.,  20.,  14.,  14.,   4.,  30.,   8.,  20.,
           24.,  25.]],

        [[ 23.,  22.,  19.,  22.,  14.,  15.,   4.,  27.,   7.,  19.,
           23.,  25.]],

        [[ 23.,  20.,  19.,  24.,  12.,  15.,   3.,  29.,   8.,  18.,
           24.,  24.]],

        [[ 21.,  22.,  18.,  19.,  14.,  13.,   4.,  32.,   7.,  19.,
           21.,  25.]],

        [[ 23.,  22.,  19.,  19.,  13.,  16.,   4.,  32.,   7.,  17.,
           23.,  26.]],

        [[ 23.,  21.,  19.,  23.,  14.,  15.,   4.,  29.,   8.,  19.,
           22.,  27.]],

        [[ 23.,  22.,  21.,  23.,  16.,  13.,   4.,  32.,   8.,  20.,
           25.,  27.]],

        [[ 22.,  22.,  20.,  18.,  13.,  13.,   4.,  30.,   7.,  19.,
           22.,  26.]],

        [[ 24.,  22.,  15.,  21.,  16.,  13.,   4.,  29.,   7.,  20.,
           24.,  27.]],

        [[ 22.,  23.,  17.,  22.,  14.,  14.,   4.,  28.,   7.,  17.,
           25.,  23.]],

        [[ 41.,  38.,  37.,  38.,  25.,  29., 