In [16]:
import pandas as pd
import os
import numpy as np
from datetime import timedelta
import time
import glob

from sklearn.ensemble import RandomForestRegressor, ExtraTreesRegressor

import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

in_dir = 'D:\\Toppan\\2017-11-20 全データ\\処理済(総量)\\vectorized_keikaku_shibata'
files = [f for f in os.listdir(in_dir)]

voc_dir_base = 'D:\\Toppan\\2017-11-20 全データ\\データ\\'


base_out_dir = 'D:\\Toppan\\2017-11-20 全データ\\解析結果(総量)\\年間モデル\\追加学習なし'

# train and test data
energies = ['蒸気', '冷水', '電力']
month = [ '16年11月', '16年12月', '17年1月', '17年2月', '17年3月', '17年4月',
         '17年5月', '17年6月', '17年7月', '17年8月', '17年9月', '17年10月']

month_file = [  '201611010800.xlsx', '201612010800.xlsx', 
                '201701010800.xlsx', '201702010800.xlsx', '201703010800.xlsx',
                '201704010800.xlsx', '201705010800.xlsx', '201706010800.xlsx',
                '201707010800.xlsx', '201708010800.xlsx', '201709010800.xlsx',
                '201710010800.xlsx']

In [17]:
def read_df(file, month, sheet_name):
    
    # data
    df = pd.read_excel(os.path.join(in_dir, file), sheet_name=sheet_name)
    
    # April special case
    if month == '17年4月':
        df = df.iloc[:358]
    
    # voc
    voc_dir = os.path.join(voc_dir_base, month, 'VOC再利用追加')
    voc_f = glob.glob(voc_dir + '/*.csv')[0]
    df_voc = pd.read_csv(voc_f, index_col=0, 
                         encoding='shift-jis', 
                         engine='python', 
                         parse_dates=True).fillna(0)
    
    df.columns = [
        '総稼動時間-3', '計画投入数(Ｒ)-3', '外気温度-3', '外気湿度-3', '計画色数-3', '段取-3', 
        '計画停止-3', '計画開始完了数-3', sheet_name + '-3',
        
        '総稼動時間-2', '計画投入数(Ｒ)-2', '外気温度-2', '外気湿度-2', '計画色数-2', '段取-2', 
        '計画停止-2', '計画開始完了数-2', sheet_name + '-2',
        
        '総稼動時間-1', '計画投入数(Ｒ)-1', '外気温度-1', '外気湿度-1', '計画色数-1', '段取-1', 
        '計画停止-1', '計画開始完了数-1', sheet_name + '-1',
        
        'target']
    
    # add voc to data
    df['VOC燃料生成量-3'] = pd.Series(data=df_voc['VOC燃料生成量'].iloc[:-2].values, index=df.index)
    df['VOC燃料生成量-2'] = pd.Series(data=df_voc['VOC燃料生成量'].iloc[1:-1].values, index=df.index)
    df['VOC燃料生成量-1'] = pd.Series(data=df_voc['VOC燃料生成量'].iloc[2:].values, index=df.index)
    
    df['VOC再利用生成量-3'] = pd.Series(data=df_voc['VOC再利用生成量'].iloc[:-2].values, index=df.index)
    df['VOC再利用生成量-2'] = pd.Series(data=df_voc['VOC再利用生成量'].iloc[1:-1].values, index=df.index)
    df['VOC再利用生成量-1'] = pd.Series(data=df_voc['VOC再利用生成量'].iloc[2:].values, index=df.index)
    
    return df

In [18]:
holiday_path = 'D:\\Toppan\\2017-11-20 全データ\\データ\\切り離し全休日\\全休日.xlsx'

def mask_out(X, y, month):
    
    try:
        df_filter = pd.read_excel(holiday_path, sheet_name=month, index_col=0).iloc[2:]
    except Exception as e:
        print(e, month)
        return X, y
    
    seisan = True if '生産\n有無' in df_filter else False
    
    def isBusy(idx):
        row = df_filter.loc[idx]
        if row.loc['切離\n有無'] == '切離' or row.loc['全休\n判定'] == '全休' \
            or row.loc['異常判定'] == '※異常稼動' or (seisan and row.loc['生産\n有無'] == '無'):
            return False
        else:
            return True
        
    x_busy_idx = []
    y_busy_idx = []
    for x_idx, y_idx in zip (X.index, y.index):
        if isBusy(x_idx) and isBusy(y_idx):
            x_busy_idx.append(x_idx)
            y_busy_idx.append(y_idx)
    
    return X.loc[x_busy_idx], y.loc[y_busy_idx]

In [19]:
def get_importance_figure(model, name, features):
    
    indices = np.argsort(model.feature_importances_)[::-1]
    
    # save csv
    s = pd.Series(data=model.feature_importances_[indices], 
              index=features[indices])
    s.to_csv(os.path.join(out_dir, name + '_寄与度.csv'), encoding='shift-jis')

In [20]:
def split_day_night(acc_abs):
    acc_abs_days, acc_abs_nights = [], []
    for i, acc in acc_abs.iteritems():
        if 7 < i.hour < 22:
            acc_abs_days.append(acc)
        else:
            acc_abs_nights.append(acc)

    return acc_abs_days, acc_abs_nights

def get_output(res, output, sname, month):
    res = res[res['target'] != 0]
    
    if len(res) == 0:
        return None
    
    y_pred, y_true = res['preds'], res['target']
    '''calculate abs accuracy'''
    acc_abs = abs(y_pred - y_true) / y_true
    '''aplit days and nights'''
    acc_abs_days, acc_abs_nights = split_day_night(acc_abs)
    len_days, len_nights = len(acc_abs_days), len(acc_abs_nights)

    sname2acc = {'蒸気': [0.2, 0.15], '電力': [0.09, 0.15], '冷水': [0.15, 0.1]}

    '''acc stats'''
    len_acc_days = len(list(filter(lambda x: x <= sname2acc[sname][0], acc_abs_days)))
    len_acc_nights = len(list(filter(lambda x: x <= sname2acc[sname][0], acc_abs_nights)))
    acc_stats_days = len_acc_days / len_days
    acc_stats_nights = len_acc_nights / len_nights

    output['設備名'].append(month + '_' + sname)
    output['平日昼・総'].append(len_days)
    output['平日夜・総'].append(len_nights)
    output['平日昼・基準内'].append(len_acc_days)
    output['平日夜・基準内'].append(len_acc_nights)
    output['平日昼基準率'].append(acc_stats_days)
    output['平日夜基準率'].append(acc_stats_nights)

    return output

### 月データを学習

In [21]:
def get_model(mtype='et'):
    if mtype == 'rf':
        model = RandomForestRegressor(n_estimators=700, 
                                          n_jobs=-1, 
                                          max_depth=11, 
                                          max_features='auto', 
                                          criterion='mae', 
                                          random_state=700, 
                                          warm_start=True)
    else:
        model = ExtraTreesRegressor(n_estimators=700, 
                                          n_jobs=-1, 
                                          max_depth=11, 
                                          max_features='auto', 
                                          criterion='mae', 
                                          random_state=700, 
                                          warm_start=True)
    return model

In [33]:
total_acc = []

for i, (m, f) in enumerate(zip(month, month_file)):
    print(m)
    
    # the last month
    if m == '17年10月':
        continue
    
    # create output dir
    out_dir = os.path.join(base_out_dir, m)
    if not os.path.exists(out_dir):
        os.makedirs(out_dir)
    
    # set train, trans, test files
    train_month, train_month_file = m, f
    test_month, test_month_file = month[i + 1], month_file[i + 1]
    trans_month, trans_month_file = [x for x in month if x not in [train_month, test_month]], \
                    [x for x in month_file if x not in [train_month_file, test_month_file]]
    
    print('train: ', train_month, train_month_file)
    print('test: ', test_month, test_month_file)
    print('trans: ', trans_month, trans_month_file)
    
    # base train model
    for energy in energies:

        df_train = read_df(train_month_file, train_month, energy)
                
        x_train, y_train = mask_out(df_train.drop(columns=['target']).iloc[:-1], 
                            df_train['target'].iloc[1:], train_month)
        
        # base model
        base_model = get_model()
        base_model.fit(x_train, y_train)

        # database preparation
        x_trans_all, y_trans_all = [], []
        for trans_m, trans_f in zip(trans_month, trans_month_file):
            df_trans = read_df(trans_f, trans_m, energy)
            x_trans, y_trans = mask_out(df_trans.drop(columns=['target']).iloc[:-1], 
                                        df_trans['target'].iloc[1:], trans_m)

            x_trans_all.append(x_trans)
            y_trans_all.append(y_trans)

        x_trans_all, y_trans_all = pd.concat(x_trans_all), pd.concat(y_trans_all)
        
        # pick up transfer data from data base
        start = time.time()
        x_picked, y_picked = [], []
        for idx, row in x_trans_all.iterrows():
            pred = base_model.predict(row.values.reshape(1, -1))[0]
            true = y_trans_all.loc[idx + timedelta(hours=1)]
            
            if true != 0 and np.abs(pred - true) / true <= 0.025:
                x_picked.append(row)
                y_picked.append(true)

        elapsed_time = time.time() - start
        print(train_month, energy, 'transfer elapsed time: ', elapsed_time, 's')
        print('    number of data picked: ', len(x_picked))
      
        # concat train and picked data
        x_train_trans = np.vstack((x_train, x_picked))
        y_train_trans = np.append(y_train, y_picked)
        # dump
        tt_data = np.hstack((x_train_trans, y_train_trans.reshape(-1, 1)))
        np.savetxt(os.path.join(out_dir, energy + '_学習_転移.csv'), 
                   tt_data, delimiter=',')
        
        print(len(x_train_trans))
        
        # test model
        model = get_model()
                                     
        # learn 1 hour later target
        start = time.time()
        model.fit(x_train_trans, y_train_trans)
        elapsed_time = time.time() - start

        print(train_month, energy, 'train elapsed time: ', elapsed_time, 's')

        # feature importance
        get_importance_figure(model, energy, x_train.columns)
                                     
        # test file
        df_test = read_df(test_month_file, test_month, energy)
        x_test, y_test = mask_out(df_test.drop(columns=['target']).iloc[:-1], 
                                  df_test['target'].iloc[1:], test_month)
                                     
        # test with online learning
        preds = []
        for idx, row in x_test.iterrows():

            # predict
            preds.append(model.predict(row.values.reshape(1, -1))[0])

            # online learning
            '''
            model.n_estimators += 50

            X_learn = X_learn.append(row)
            Y_learn = Y_learn.append(pd.Series(data=Y_test.loc[idx + timedelta(hours=1)], 
                      index=[idx + timedelta(hours=1)]))

            model.fit(X_learn, Y_learn)
            '''

        # save preds and test
        preds = pd.Series(data=preds, index=y_test.index, name='preds')
        result = pd.concat([preds, y_test], axis=1)
        result.to_csv(os.path.join(out_dir, energy + '.csv'), encoding='shift-jis')

        # accuracy
        output = {'設備名': [], 
                  '平日昼・総': [], '平日夜・総': [], 
                  '平日昼・基準内': [], '平日夜・基準内': [], 
                  '平日昼基準率': [], '平日夜基準率': []}
        
        output = get_output(result, output, energy, test_month)
        
        print(test_month, energy, output)

        # save accuracy
        accs = pd.DataFrame(output)
        accs.to_csv(os.path.join(out_dir, energy + '_acc.csv'), index=False, encoding='shift-jis')
        
        total_acc.append(accs)
        
total_acc = pd.concat(total_acc)
total_acc.to_csv(os.path.join(base_out_dir, 'total_acc.csv'), index=False, encoding='shift-jis')

16年11月
train:  16年11月 201611010800.xlsx
test:  16年12月 201612010800.xlsx
trans:  ['17年1月', '17年2月', '17年3月', '17年4月', '17年5月', '17年6月', '17年7月', '17年8月', '17年9月', '17年10月'] ['201701010800.xlsx', '201702010800.xlsx', '201703010800.xlsx', '201704010800.xlsx', '201705010800.xlsx', '201706010800.xlsx', '201707010800.xlsx', '201708010800.xlsx', '201709010800.xlsx', '201710010800.xlsx']
16年11月 蒸気 transfer elapsed time:  930.6326749324799 s
    number of data picked:  713
1316
16年11月 蒸気 train elapsed time:  10.251039981842041 s
16年12月 蒸気 {'設備名': ['16年12月_蒸気'], '平日昼・総': [152], '平日夜・総': [243], '平日昼・基準内': [139], '平日夜・基準内': [225], '平日昼基準率': [0.9144736842105263], '平日夜基準率': [0.9259259259259259]}
16年11月 冷水 transfer elapsed time:  923.7733235359192 s
    number of data picked:  674
1277
16年11月 冷水 train elapsed time:  10.13408374786377 s
16年12月 冷水 {'設備名': ['16年12月_冷水'], '平日昼・総': [149], '平日夜・総': [243], '平日昼・基準内': [139], '平日夜・基準内': [240], '平日昼基準率': [0.9328859060402684], '平日夜基準率': [0.9876543209876543]}
16