In [47]:
import pandas as pd
import numpy as np
import os
from datetime import datetime
import pyecharts
from pyecharts import options as opts

## Read original Data

In [2]:
original = pd.read_csv('./data.csv', encoding='gbk')
original.columns.values

  interactivity=interactivity, compiler=compiler, result=result)


array(['ID', '批次', '牌号', '时间', '生产班次', '生产班别', '设备状态', '烘前叶丝流量设定值',
       '烘前叶丝流量', '烘前叶丝流量累积量', 'SIROX蒸汽流量', '热风温度', '筒壁1区温度设定值',
       '筒壁1区温度实际值', '筒壁2区温度设定值', '筒壁2区温度实际值', '脱水量', '排潮风门开度', '罩压力',
       '热风速度设定值', '热风速度实际值', '出口温度', 'SIROX水分增加', '入口水分', '出口水分设定值',
       '出口水分', '冷凝水温度1区', '冷凝水温度2区', '滚筒转速', '蒸汽压力', '区域1预热阶段滚筒温度额定值',
       '区域2预热阶段滚筒温度额定值', '工作点脱水', '区域1滚筒温度标准工作点', '区域2滚筒温度标准工作点',
       '区域1筒壁蒸汽压力', '区域2筒壁蒸汽压力', '罩压力设定值'], dtype=object)

In [None]:
columns = [
 '时间', '牌号', '设备状态', \
 '入口水分', '出口水分', '出口水分设定值', \
 '热风速度设定值',  '热风速度实际值', \
 '烘前叶丝流量设定值', '烘前叶丝流量', \
 '筒壁1区温度设定值', '筒壁1区温度实际值',   \
 '筒壁2区温度实际值', '筒壁2区温度设定值'
]

data = original[columns]
data['出口水分差值'] = data['出口水分'] - data['出口水分设定值']

# 对总的数据进行预处理

In [5]:
# drop nan
data = data.dropna()

# 牌号莫名奇妙的存储时间，drop这些行
index = data[[isinstance(item, str) and item.startswith('2019') for item in data['牌号']]].index
data = data.drop(index, axis=0)

# 烘前叶丝流量 == 0，表示设备没有运行
# index = data[data['烘前叶丝流量'] == 0].index
# data = data.drop(index, axis=0)

# Formate Time
# def format_time(time_str):
#     try:
#         return datetime.strptime(time_str, '%Y-%m-%d %H:%M:%S')
#     except ValueError:
#         return datetime.strptime(time_str, '%Y-%m-%d')
# data['时间'] = data['时间'].map(lambda x: format_time(x))

# 按照时间进行排序
data = data.sort_values(by=['时间'], ascending=True)


In [100]:
# 查看设备状态的种类
for index, status in enumerate(data['设备状态'].unique()):
    print(status, ' ', data[data['设备状态'] == status].shape[0])

# 查看牌号的种类
for index, status in enumerate(data['牌号'].unique()):
    print(status, ' ', data[data['牌号'] == status].shape[0])

生产   1151341
收尾   114163
准备   426681
启动   170064
2   8762
预热   2942
256   10702
1   92843
-128   601
264   1
512   164
2048   513
4096   2383
256   924
1   994
HSX###   204675
KPH###   118479
TH####A   479419
Txy###   211854
TG####A   640205
DQMr##   23464
HsxY##   299791
ThQD##A   5191


# 取得Sample Data + 预处理

In [63]:
sample = data[np.logical_and(data['牌号'] == 'HSX###', data['设备状态'] == '生产')]
sample.shape

(135736, 15)

In [64]:
# 将流量不正常的行过滤
FLOW_BIAS_DROP_RATE = 0.001
flow_mean = sample['烘前叶丝流量设定值'].mean()
flow_mask = np.abs(sample['烘前叶丝流量'] - flow_mean) < FLOW_BIAS_DROP_RATE * flow_mean
sample = sample[flow_mask]

sample = sample.reset_index(drop=True) 
sample.shape

(103635, 15)

In [8]:
sample[['入口水分', '出口水分', '热风速度实际值', '烘前叶丝流量', '筒壁1区温度实际值', '筒壁2区温度实际值']].describe()

Unnamed: 0,入口水分,出口水分,热风速度实际值,烘前叶丝流量,筒壁1区温度实际值,筒壁2区温度实际值
count,103635.0,103635.0,103635.0,103635.0,103635.0,103635.0
mean,21.746824,13.803369,0.389861,4799.984507,136.699793,136.703925
std,0.356351,0.07805,0.033487,2.451384,2.383627,2.383455
min,20.17758,13.29,0.296547,4795.201,130.4426,130.6654
25%,21.47749,13.75,0.361464,4798.074,134.8711,134.8698
50%,21.64373,13.8,0.39841,4799.986,135.8768,135.8789
75%,22.07072,13.85,0.410972,4801.904,139.0506,139.05835
max,22.7035,14.16,0.467012,4804.799,143.1458,143.224


In [1126]:
sample.head()

Unnamed: 0,时间,牌号,设备状态,入口水分,出口水分,出口水分设定值,热风速度设定值,热风速度实际值,烘前叶丝流量设定值,烘前叶丝流量,筒壁1区温度设定值,筒壁1区温度实际值,筒壁2区温度实际值,筒壁2区温度设定值,出口水分差值
0,2019-10-10 10:00:14,HSX###,生产,21.58618,13.77,13.8,0.35,0.35173,4800,4799.62,134.0374,133.9196,133.9923,134.171,-0.03
1,2019-10-10 10:00:16,HSX###,生产,21.57996,13.74,13.8,0.35,0.351971,4800,4797.677,134.0319,133.9027,133.9809,134.1575,-0.06
2,2019-10-10 10:00:26,HSX###,生产,21.57745,13.77,13.8,0.35,0.349748,4800,4796.059,133.8468,133.8324,133.9454,133.9916,-0.03
3,2019-10-10 10:00:32,HSX###,生产,21.56749,13.81,13.8,0.35,0.350375,4800,4797.835,133.9349,133.7832,133.924,134.0574,0.01
4,2019-10-10 10:00:34,HSX###,生产,21.5576,13.78,13.8,0.35,0.350615,4800,4796.674,133.9847,133.7702,133.918,134.0972,-0.02


# 按时间进行分割

In [65]:
def format_time(time_str):
    try:
        return datetime.strptime(time_str, '%Y-%m-%d %H:%M:%S')
    except ValueError:
        return datetime.strptime(time_str, '%Y-%m-%d')
sample['时间'] = sample['时间'].map(lambda x: format_time(x))

In [66]:
# 如果两个点相差 SPLIT_INTERVAL 秒，进行分割
SPLIT_INTERVAL = 300
sample_time_diff = (sample['时间'][1:] - sample['时间'].shift(1)[1:]).map(lambda x: x.seconds)

split_point = sample_time_diff[sample_time_diff > SPLIT_INTERVAL].index
split_point = split_point.insert(0, 0)
split_point = split_point.insert(len(split_point), len(sample))
split_data = []
for i in range(1, len(split_point)):
    split_data.append(sample[split_point[i-1]: split_point[i]])

# 画图

In [51]:
from pyecharts.charts import Line, Page

def line_with_constant_benchmark(data, title, benchmark=None, ratio=None, axis_offset=0, zoom=True, range_=None) -> Line:
    marklines = []
    if benchmark and ratio:
        marklines += [
            opts.MarkLineItem(y=benchmark, name="Benchmark"),
            opts.MarkLineItem(y=benchmark * (1 + ratio), name="Upper"),
            opts.MarkLineItem(y=benchmark * (1 - ratio), name="Lower"),
        ]
    if benchmark and not ratio:
        marklines += [
            opts.MarkLineItem(y=benchmark, name="Benchmark"),
        ]
    if range_ is not None:
        marklines += [
            opts.MarkLineItem(x=range_[0], name="Start"),
            opts.MarkLineItem(x=range_[1], name="End"),
        ]
    line_ = (
        Line(
            init_opts=opts.InitOpts(
                animation_opts=opts.AnimationOpts(
                    animation=False
                )
            ))
        .add_xaxis(range(len(data)))
        .add_yaxis(
            title,
            data[title].values,
            label_opts=opts.LabelOpts(is_show=False)
        ).set_global_opts(
            datazoom_opts=opts.DataZoomOpts() if zoom else None,
            yaxis_opts=opts.AxisOpts(
                min_=min(data[title].values) - axis_offset,
                max_=max(data[title].values) + axis_offset
            ),
        ).set_series_opts(
            label_opts=opts.LabelOpts(is_show=False),
            markline_opts=opts.MarkLineOpts(
                data=marklines
            ),
        )
    )
    return line_

def line_with_variated_benchmark(data, title, benchmark_title, axis_offset=0, zoom=True, range_=None) -> Line:
    marklines = []
    if range_ is not None:
        marklines += [
            opts.MarkLineItem(x=range_[0], name="Start"),
            opts.MarkLineItem(x=range_[1], name="End"),
        ]
    line_ = (
        Line(
            init_opts=opts.InitOpts(
                animation_opts=opts.AnimationOpts(
                    animation=False
                )
            ))
        .add_xaxis(range(len(data)))
        .add_yaxis(
            title,
            data[title].values,
            label_opts=opts.LabelOpts(is_show=False)
        )
        .add_yaxis(
            benchmark_title,
            data[benchmark_title].values,
            label_opts=opts.LabelOpts(is_show=False)
        ).set_global_opts(
            datazoom_opts=opts.DataZoomOpts() if zoom else None,
            yaxis_opts=opts.AxisOpts(
                min_=min(data[title].values) - axis_offset,
                max_=max(data[title].values) + axis_offset
            ),
        ).set_series_opts(
            markline_opts=opts.MarkLineOpts(
                data=marklines
            )
        )
    )
    return line_
    
    
def draw_split_data(split_data_item, title, zoom=True, range_=None, dir_=None):
    # split_data_item['入口水分'] = split_data_item['入口水分'].shift(-100, fill_value=0)
    page = Page()
    page.add(line_with_constant_benchmark(
        split_data_item, 
        "烘前叶丝流量",
        np.mean(sample['烘前叶丝流量设定值']), 
        axis_offset=5,
        zoom=zoom
    ))
    page.add(line_with_constant_benchmark(
        split_data_item, 
        "出口水分", 
        np.mean(sample['出口水分设定值']), 
        BIAS_ERROR_RATIO,
        zoom=zoom,
        range_=range_
    ))
    page.add(line_with_constant_benchmark(
        split_data_item, 
        "入口水分",
        zoom=zoom
    ))
    page.add(line_with_variated_benchmark(
        split_data_item, 
        "热风速度实际值", 
        '热风速度设定值',
        0.01,
        zoom=zoom,
        range_=range_
    ))
    page.add(line_with_variated_benchmark(
        split_data_item, 
        "筒壁1区温度实际值", 
        '筒壁1区温度设定值',
        0.5,
        zoom=zoom,
        range_=range_
    ))
    page.add(line_with_variated_benchmark(
        split_data_item, 
        "筒壁2区温度实际值",
        "筒壁2区温度设定值",
        0.5,
        zoom=zoom,
        range_=range_
    ))
    if dir_:
        make_dir('./plot/' + dir_ )
        page.render('./plot/'+ dir_ + '/'+ title + '.html')
    else:
        page.render('./plot/' + title + '.html')

def make_dir(path):
    if not os.path.exists(path):
        os.makedirs(path) 


In [1558]:
draw_split_data(split_data[20], '20')

# 抽取训练数据
选取出口水分在STABLE_WINDOWS_SIZE内都是稳定的一段时间，然后向前找到 T-TIME_LAG_1_START 到 T-TIME_LAG_1_END 时间内的所有特征，来预测 T-TIME_LAG_1_END 时刻需要调整的参数值

Label即 T-TIME_LAG_1_END 到 T 时刻的各种参数的差值

In [129]:
feature_column = ['出口水分差值', '热风速度实际值', '筒壁1区温度实际值', '筒壁2区温度实际值']
label_column = ['热风速度设定值', '筒壁1区温度设定值', '筒壁2区温度设定值']

TIME_LAG_1_END = 2
TIME_LAG_1_START = 35
TIME_LAG_2 = 130
STABLE_WINDOWS_SIZE = 6
BIAS_ERROR_RATIO = 0.002 # 0.0015
LABLE_WINDOWS_SIZE = 2

In [130]:
def calc_feature(item_,
                 stable_start: int,
                 time_lag_2=TIME_LAG_2,
                 time_lag_1_start=TIME_LAG_1_START,
                 time_lag_1_end=TIME_LAG_1_END) -> []:
    input_humidity = item_['入口水分'].iloc[stable_start - time_lag_2 - time_lag_1_start: stable_start - time_lag_2 - time_lag_1_end]
    feature_slice = item_[feature_column].iloc[stable_start - time_lag_1_start: stable_start - time_lag_1_end]

    return np.concatenate([
        [
            calc_integral(input_humidity.values),
            input_humidity.std(),
            input_humidity.skew(),
            input_humidity.kurtosis(),
        ],
        calc_integral(feature_slice.values),
        feature_slice.mean().values,
        feature_slice.std().values,
        feature_slice.skew().values,
        feature_slice.kurtosis().values,
    ])
    

def calc_integral(data):
    if len(data) <= 1:
        return 0
    sum_ = sum(data)
    return sum_ - (data[0] + data[len(data) - 1]) / 2

def calc_lable(item_, end: int) -> []:
    real_start = end
    real_end = end + STABLE_WINDOWS_SIZE
    current_start = end - TIME_LAG_1_END - LABLE_WINDOWS_SIZE
    current_end = end - TIME_LAG_1_END
    
    real_ = np.mean(item_[label_column].iloc[real_start: real_end].values, axis=0)
    current_ = np.mean(item_[label_column].iloc[current_start: current_end].values, axis=0)
    return real_ - current_

In [131]:
def generate_data(split_data):
    setting = np.mean(sample['出口水分设定值'])
    sample_train_dataset = []
    sample_train_label = []
    sample_data_windows = [] 
    for index, item in enumerate(split_data):
        length = len(item)
        humidity = item['出口水分']
        wind_speed = item['热风速度设定值']

        # 保证出口水分在一定时间的恒定值内
        for stable_start in range(TIME_LAG_2 + TIME_LAG_1_START, length - STABLE_WINDOWS_SIZE):
            stable_end = stable_start + STABLE_WINDOWS_SIZE
            
            if np.all(np.abs(humidity[stable_start: stable_end] - setting) < setting * BIAS_ERROR_RATIO):
                # 向前取 [35, 6], [30, 6], [25, 6] 的特征做训练
                for time_lag_1_start_step in range(TIME_LAG_1_START, TIME_LAG_1_END + 15, -5):
                    sample_data_windows.append([index, stable_start])
                    sample_train_dataset.append(calc_feature(item, stable_start, TIME_LAG_2, time_lag_1_start_step, TIME_LAG_1_END))
                    sample_train_label.append(calc_lable(item, stable_start))

        # 保证风速调整后，出口水分是正常的
#         for adjust_start in range(TIME_LAG_2 + TIME_LAG_1_START, length - TIME_LAG_1_END):
#             if np.abs(wind_speed.iloc[adjust_start - 1] - wind_speed.iloc[adjust_start]) > 0 \
#                     and np.abs(item['出口水分'] - setting).iloc[TIME_LAG_1_END + adjust_start] < setting * BIAS_ERROR_RATIO:

#                 for time_lag_1_start_step in range(TIME_LAG_1_START, TIME_LAG_1_END + 15, -5):
#                     sample_data_windows.append([index, adjust_start + TIME_LAG_1_END])
#                     sample_train_dataset.append(calc_feature(item, adjust_start + TIME_LAG_1_END, TIME_LAG_2, time_lag_1_start_step, TIME_LAG_1_END))
#                     sample_train_label.append(calc_lable(item, adjust_start + TIME_LAG_1_END))
    
    sample_train_dataset = np.array(sample_train_dataset)
    sample_train_label = np.array(sample_train_label)
    sample_data_windows = np.array(sample_data_windows)
    return sample_train_dataset, sample_train_label, sample_data_windows

sample_train_dataset, sample_train_label, sample_data_windows = generate_data(split_data)

In [132]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test, index_train, index_test = train_test_split(sample_train_dataset, sample_train_label, sample_data_windows, test_size=0.2, random_state=42)

print('Train set: ', len(X_train))
print('Test set: ', len(X_test))

Train set:  15283
Test set:  3821


In [1686]:
# PCA 
from sklearn.decomposition import PCA
pca = PCA(n_components=0.95, svd_solver='full')

X_train_pca = pca.fit_transform(X_train)
X_test_pca = pca.transform(X_test)
print(pca.explained_variance_ratio_) 
print(pca.singular_values_)  

In [97]:
pd.DataFrame(sample_train_dataset).corr()
# ['入口水分', '出口水分差值', '热风速度实际值', '筒壁1区温度实际值', '筒壁2区温度实际值']
# ['积分'，'均值', '方差', ‘Skew’, 'Kurtosis']

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,14,15,16,17,18,19,20,21,22,23
0,1.0,0.169438,-0.013232,0.011484,0.030079,0.956283,0.999011,0.999008,0.014759,0.032725,...,0.219278,0.231967,-0.001669,-0.02522,0.008469,0.010967,-0.018785,0.018713,-0.020364,-0.019669
1,0.169438,1.0,-0.040474,-0.187041,0.07947,0.161314,0.175888,0.176004,0.076301,0.005398,...,0.059203,0.076088,-0.023158,-0.0156,-0.018536,-0.017929,0.012616,-0.027965,0.02851,0.050185
2,-0.013232,-0.040474,1.0,0.065739,0.040722,-0.025479,-0.010208,-0.00988,0.039698,-0.049245,...,-0.006598,-0.000568,-0.022341,0.012475,0.007659,-0.001698,0.004045,0.01029,0.018594,0.002674
3,0.011484,-0.187041,0.065739,1.0,0.008188,0.022335,0.010572,0.010555,0.007377,0.023664,...,0.024441,0.004892,-0.008729,0.003325,-0.01088,-0.005214,-0.011749,-0.02212,-0.010351,-0.00723
4,0.030079,0.07947,0.040722,0.008188,1.0,0.014236,0.03932,0.039408,0.975354,-0.037324,...,0.049685,0.055553,-0.202583,-0.032494,-0.106834,-0.105128,-0.054746,-0.000968,0.007479,0.031012
5,0.956283,0.161314,-0.025479,0.022335,0.014236,1.0,0.951404,0.951346,0.000108,0.309589,...,0.172038,0.193158,0.010182,-0.029495,-0.006546,-0.000857,-0.0117,0.004337,-0.017073,-0.025122
6,0.999011,0.175888,-0.010208,0.010572,0.03932,0.951404,1.0,0.999986,0.0238,0.017905,...,0.219491,0.233865,-0.0033,-0.02631,0.001866,0.005461,-0.01893,0.018078,-0.019415,-0.017517
7,0.999008,0.176004,-0.00988,0.010555,0.039408,0.951346,0.999986,1.0,0.023892,0.017709,...,0.219158,0.23373,-0.003481,-0.026142,0.002378,0.004465,-0.018874,0.018083,-0.019368,-0.017415
8,0.014759,0.076301,0.039698,0.007377,0.975354,0.000108,0.0238,0.023892,1.0,-0.038174,...,0.044621,0.046367,-0.197777,-0.033415,-0.111235,-0.108748,-0.055215,-0.008483,0.011715,0.029249
9,0.032725,0.005398,-0.049245,0.023664,-0.037324,0.309589,0.017905,0.017709,-0.038174,1.0,...,-0.13939,-0.117047,0.04698,-0.00965,-0.042086,-0.031795,0.010713,-0.053608,-0.000252,-0.021627


# 定义模型

In [133]:
from sklearn.linear_model import LinearRegression

clf = LinearRegression(normalize=True)
clf.fit(X_train, y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=True)

# 验证集阶段

In [134]:
def compute_rmse(y_true, y_pred):
    return np.sqrt(np.mean(np.square(y_pred - y_true))) 

def compute_mse(y_true, y_pred):
    return np.mean(np.square(y_pred - y_true))

def compute_mae(y_true, y_pred):
    return np.mean(np.abs(y_pred - y_true))

def compute_mae_percent(y_true, y_pred):
    error_ = np.abs(y_pred - y_true)
    percent_ = np.abs(error_ / y_true)
    return np.mean(percent_) * 100
    
def compute_r2(y_true, y_pred):
    SS_res =  np.sum(np.square( y_true - y_pred ))
    SS_tot = np.sum(np.square( y_true - np.mean(y_true) ) )
    return (1 - SS_res / (SS_tot + 1e-10))

def compute_adjust_r2(y_true, y_pred):
    """
    R2在特征数量增加的情况下，是一定会增大的。
    Adjust R2就是为了适应特征数量和样本数量的变化而引入的，表示R2的绝对变化。只有当真正拟合了样本的时候Adjust R2才会增大
    """
    r2 = compute_r2(y_true, y_pred)
    n = y_true.shape[0]
    p = y_true.shape[1]
    return 1 - ((1 - r2) * (n - 1)) / (n - p - 1)


In [135]:
pred = clf.predict(X_test)

print('mse: ', round(compute_mse(pred, y_test), 5))
print('mae: ', round(compute_mae(pred, y_test), 5))
print('percentage mae% : ', round(compute_mae_percent(pred, y_test), 5))
print('adjust r2: ', round(compute_adjust_r2(pred, y_test), 5))

mse:  0.0049
mae:  0.04224
percentage mae% :  837.31174
adjust r2:  0.2337


In [127]:
np.set_printoptions(suppress=True)

def print_result(range_=40, dir_=None, save_result=False):
    str_ = []
    for i in range(range_):
        item = split_data[index_test[i][0]]
        item = item[index_test[i][1] - TIME_LAG_2: index_test[i][1] + STABLE_WINDOWS_SIZE]
        
        draw_split_data(item, 
                        str(index_test[i][0]) + '-' + str(index_test[i][1]), 
                        zoom=False, 
                        range_=[TIME_LAG_2 - TIME_LAG_1_START, TIME_LAG_2 - TIME_LAG_1_END],
                        dir_=dir_
                       )
        if not save_result:
            print(index_test[i][0], '-', index_test[i][1], ': \t' , np.round(pred[i], 4), '\t', np.round(y_test[i], 4))
        else:
            str_.append(str(index_test[i][0]) + '-' + str(index_test[i][1]) + ': [' + ' '.join([str(x) for x in np.round(pred[i], 4)]) + '], [' + ' '.join([str(x) for x in np.round(y_test[i], 4)]) + ']')
    
    if save_result:
        fw = open('./plot/' + dir_ + '/result.txt', 'w')
        for line in str_:
            fw.write(line)
            fw.write("\n") 

print_result(dir_='val')


51 - 1437 : 	 [-0.      0.0362  0.0181] 	 [ 0.      0.0227 -0.0698]
75 - 716 : 	 [ 0.     -0.0156 -0.0435] 	 [0.     0.0417 0.0081]
42 - 274 : 	 [ 0.0001 -0.1056 -0.0651] 	 [ 0.     -0.2038 -0.1529]
14 - 1040 : 	 [-0.0001  0.0344  0.0297] 	 [ 0.     -0.1051 -0.1254]
21 - 796 : 	 [0.0002 0.0831 0.0725] 	 [0.0015 0.1798 0.1436]
24 - 453 : 	 [ 0.0001 -0.1508 -0.146 ] 	 [ 0.0005 -0.0107 -0.0498]
61 - 926 : 	 [-0.0001  0.0232  0.029 ] 	 [0.     0.1024 0.0266]
26 - 1515 : 	 [-0.0001 -0.019   0.0478] 	 [0.     0.1189 0.107 ]
18 - 1565 : 	 [-0.      0.0929  0.0849] 	 [-0.      0.1864  0.1523]
44 - 329 : 	 [ 0.0001 -0.1043 -0.0945] 	 [ 0.     -0.1554 -0.1757]
44 - 225 : 	 [-0.     -0.1273 -0.1322] 	 [ 0.     -0.0011 -0.01  ]
45 - 1085 : 	 [-0.      0.1464  0.1234] 	 [0.     0.4082 0.3461]
64 - 772 : 	 [0.     0.0361 0.0524] 	 [0.     0.0059 0.1287]
64 - 807 : 	 [ 0.     -0.0471 -0.068 ] 	 [ 0.     -0.8211 -0.5161]
46 - 925 : 	 [ 0.     -0.1461 -0.1826] 	 [-0.     -0.2829 -0.3114]
30 - 1063 : 	 

# 测试集合测试
滑动窗口不断滑动，然后取数据进行测试

In [128]:
def generate_real_test_data(item):
    length = len(item)
    final_X_test_ = []
    final_X_index_ = []
    
    for item_index in range(TIME_LAG_2 + TIME_LAG_1_START, length - STABLE_WINDOWS_SIZE, 5):
        final_X_test_.append(calc_feature(item, item_index))
        final_X_index_.append(item_index)
        
    return np.array(final_X_test_), np.array(final_X_index_)

final_test_data = split_data[index_test[30, 0]]
final_X_test, final_X_index = generate_real_test_data(final_test_data)
pred = clf.predict(final_X_test)


# Plot
for index_, item_ in enumerate(final_X_index):
    draw_split_data(final_test_data[item_- TIME_LAG_2: item_ + 20],
                    title=str(str(final_X_index[index_])), 
                    zoom=False, 
                    range_=[TIME_LAG_2 - TIME_LAG_1_START, TIME_LAG_2 - TIME_LAG_1_END],
                    dir_='final'
                   )
for index_, item_ in enumerate(final_X_index):
    print(item_, np.round(pred[index_], 4))

165 [ 0.0001 -0.0039 -0.0395]
170 [ 0.0001 -0.0746 -0.1054]
175 [ 0.0001 -0.1177 -0.1379]
180 [ 0.0001 -0.176  -0.1949]
185 [ 0.0001 -0.2241 -0.242 ]
190 [ 0.0001 -0.2258 -0.2438]
195 [ 0.0001 -0.2038 -0.2197]
200 [ 0.0001 -0.1812 -0.1963]
205 [ 0.0001 -0.1464 -0.1613]
210 [ 0.0001 -0.1218 -0.1336]
215 [ 0.0001 -0.118  -0.1273]
220 [ 0.     -0.09   -0.0942]
225 [ 0.     -0.0745 -0.073 ]
230 [ 0.     -0.051  -0.0435]
235 [ 0.     -0.0206 -0.0052]
240 [ 0.     -0.0093  0.0118]
245 [ 0.     -0.0095  0.0217]
250 [ 0.     -0.0335  0.0055]
255 [ 0.     -0.0609 -0.0153]
260 [ 0.     -0.0431  0.003 ]
265 [ 0.     -0.0462  0.0001]
270 [ 0.0001 -0.073  -0.0337]
275 [ 0.0001 -0.0855 -0.06  ]
280 [ 0.0001 -0.1093 -0.0911]
285 [ 0.0001 -0.1043 -0.103 ]
290 [ 0.0001 -0.1079 -0.1207]
295 [ 0.0001 -0.1149 -0.1353]
300 [ 0.0001 -0.0971 -0.1278]
305 [ 0.0001 -0.067  -0.0984]
310 [ 0.0002 -0.0634 -0.0905]
315 [ 0.0001 -0.0716 -0.0951]
320 [ 0.0001 -0.0747 -0.0929]
325 [ 0.0001 -0.0771 -0.0887]
330 [ 0.00

# 参数寻找

In [None]:
# 0.0015   5   35   5
TIME_LAG_2 = 130
LABLE_WINDOWS_SIZE = 1

time_lag_1_end_set = [3, 4, 5, 6, 7]
time_lag_1_start_set = [35, 40, 45, 50, 55]
stable_windows_size_set = [5, 6, 7]

for _, TIME_LAG_1_END in enumerate(time_lag_1_end_set):
    for _, TIME_LAG_1_START in enumerate(time_lag_1_start_set):
        for _, STABLE_WINDOWS_SIZE in enumerate(stable_windows_size_set):
            sample_train_dataset, sample_train_label, sample_data_windows = generate_data(split_data)
            X_train, X_test, y_train, y_test, index_train, index_test = train_test_split(sample_train_dataset, sample_train_label, sample_data_windows, test_size=0.2, random_state=42)
            clf.fit(X_train, y_train)
            pred = clf.predict(X_test)

            print_result(range_=20, 
                         dir_=str(BIAS_ERROR_RATIO) + '-' + str(TIME_LAG_1_END) + '-' + str(TIME_LAG_1_START) + '-' + str(STABLE_WINDOWS_SIZE), 
                         save_result=True
                        )

            print(TIME_LAG_1_END, ' ', TIME_LAG_1_START, ' ', STABLE_WINDOWS_SIZE)
            print('mae: ', round(compute_mae(pred, y_test), 5))
            print('mse: ', round(compute_mse(pred, y_test), 5))
            print('adjust r2: ', round(compute_adjust_r2(pred, y_test), 5))
            print('----------------------------')        