In [51]:
# D:\Toppan\jupyter\per machine

import pandas as pd
import os
import numpy as np
from datetime import timedelta
import time
import glob

from sklearn.ensemble import RandomForestRegressor, ExtraTreesRegressor

import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

in_dir = 'D:\\Toppan\\2017-11-20 全データ\\処理済(機械ごと)\\vapor\\with_voc_vectorized'
base_out_dir = 'D:\\Toppan\\2017-11-20 全データ\\解析結果(機械ごと)\\年間モデル_追加学習なし_特徴量カット'

# train and test months
month = [ '16年11月', '16年12月', '17年1月', '17年2月', '17年3月', '17年4月',
         '17年5月', '17年6月', '17年7月', '17年8月', '17年9月', '17年10月', 
         '17年11月', '17年12月', '18年1月', '18年2月']

month_file = [f for f in os.listdir(in_dir)]

In [52]:
holiday_path = 'D:\\Toppan\\2017-11-20 全データ\\データ\\切り離し全休日\\全休日.xlsx'

def mask_out(X, y, month):
    
    try:
        df_filter = pd.read_excel(holiday_path, sheet_name=month, index_col=0).iloc[2:]
        seisan = True if '生産\n有無' in df_filter else False
        
    except Exception as e:
        print(e, month)
        return X, y
    
    def isBusy(idx):
        row = df_filter.loc[idx]

        if row.loc['切離\n有無'] == '切離' or row.loc['全休\n判定'] == '全休' \
            or row.loc['異常判定'] == '※異常稼動' or row.loc['異常判定'] == '※データ異常' \
            or (seisan and row.loc['生産\n有無'] == '無'):
            return False
        else:
            return True

    x_busy_idx = []
    y_busy_idx = []
    for x_idx, y_idx in zip (X.index, y.index):
        if isBusy(x_idx) and isBusy(y_idx):
            x_busy_idx.append(x_idx)
            y_busy_idx.append(y_idx)

    return X.loc[x_busy_idx], y.loc[y_busy_idx]

In [53]:
def get_importance_figure(model, name, features):
    
    indices = np.argsort(model.feature_importances_)[::-1]
    
    # save csv
    s = pd.Series(data=model.feature_importances_[indices], 
              index=features[indices])
    s.to_csv(os.path.join(out_dir, name + '_寄与度.csv'), encoding='shift-jis')

In [54]:
def split_day_night(acc_abs):
    acc_abs_days, acc_abs_nights = [], []
    for i, acc in acc_abs.iteritems():
        if 7 < i.hour < 22:
            acc_abs_days.append(acc)
        else:
            acc_abs_nights.append(acc)

    return acc_abs_days, acc_abs_nights

def get_output(res, output, sname, month):
    res = res[res['target'] != 0]
    
    if len(res) == 0:
        return None
    
    y_pred, y_true = res['preds'], res['target']
    '''calculate abs accuracy'''
    acc_abs = abs(y_pred - y_true) / y_true
    '''aplit days and nights'''
    acc_abs_days, acc_abs_nights = split_day_night(acc_abs)
    len_days, len_nights = len(acc_abs_days), len(acc_abs_nights)

    #sname2acc = {'蒸気': [0.2, 0.15], '電力': [0.09, 0.15], '冷水': [0.15, 0.1]}

    '''acc stats'''
    #len_acc_days = len(list(filter(lambda x: x <= sname2acc[sname][0], acc_abs_days)))
    #len_acc_nights = len(list(filter(lambda x: x <= sname2acc[sname][0], acc_abs_nights)))
    len_acc_days = len(list(filter(lambda x: x <= 0.2, acc_abs_days)))
    len_acc_nights = len(list(filter(lambda x: x <= 0.15, acc_abs_nights)))
    acc_stats_days = len_acc_days / len_days
    acc_stats_nights = len_acc_nights / len_nights

    output['設備名'].append(month + '_' + sname)
    output['平日昼・総'].append(len_days)
    output['平日夜・総'].append(len_nights)
    output['平日昼・基準内'].append(len_acc_days)
    output['平日夜・基準内'].append(len_acc_nights)
    output['平日昼基準率'].append(acc_stats_days)
    output['平日夜基準率'].append(acc_stats_nights)

    return output

### 年間データを Leave-One-Out で学習・予測

In [55]:
total_acc = []

unimportant_features = ['調整-1', '調整-2', '調整-3', 
                        '計画停止-1', '計画停止-2', '計画停止-3']

for m, f in zip(month, month_file):
    
    print(m, f)
    
    # create output dir
    out_dir = os.path.join(base_out_dir, m)
    if not os.path.exists(out_dir):
        os.makedirs(out_dir)

    # set train and test files
    test_month, test_month_file = m, f
    train_month, train_month_file = [x for x in month if x != m], \
                                            [x for x in month_file if x != f]
    
    print('  train files: ', train_month_file)
    print('  test file: ', test_month_file)
    
    exl_test = pd.ExcelFile(os.path.join(in_dir, f))
    for energy in exl_test.sheet_names:
        
        print('    now processing: ', energy)
        
        # learning files
        X_learn, Y_learn = [], []
        for f, m in zip(train_month_file, train_month):
            
            df_train = pd.read_excel(os.path.join(in_dir, f), 
                          sheet_name=energy, 
                          index_col=0, 
                          parse_dates=True, 
                          encoding='shift-jis')
            
            # 寄与度が低い特徴量をカット
            df_train.drop(columns=unimportant_features, inplace=True)
            
            x, y = mask_out(df_train.drop(columns=['target']).iloc[:-1], 
                            df_train['target'].iloc[1:], m)

            X_learn.append(x)
            Y_learn.append(y)

        X_learn, Y_learn = pd.concat(X_learn), pd.concat(Y_learn)
        print('      X_learn, Y_leran shapes:', X_learn.shape, Y_learn.shape)
        
        # test file
        df_test = exl_test.parse(sheet_name=energy, 
                            index_col=0, 
                            parse_dates=True, 
                            encoding='shift-jis')
        
        # 寄与度が低い特徴量をカット
        df_test.drop(columns=unimportant_features, inplace=True)
        
        X_test, Y_test = mask_out(df_test.drop(columns=['target']).iloc[:-1], 
                                  df_test['target'].iloc[1:], test_month)
        print('      X_test, Y_test shapes:', X_test.shape, Y_test.shape)
        
        # fill out nan
        X_learn_nan = pd.isnull(X_learn).any(1).nonzero()[0]
        X_test_nan = pd.isnull(X_test).any(1).nonzero()[0]
        print('          number of nan in train and test data: ', 
              len(X_learn_nan), len(X_test_nan))
        
        X_learn, Y_learn = X_learn.fillna(0), Y_learn.fillna(0)
        X_test, Y_test = X_test.fillna(0), Y_test.fillna(0)
        
        # model
        model = ExtraTreesRegressor(n_estimators=700, 
                                      n_jobs=-1, 
                                      max_depth=11, 
                                      max_features='auto', 
                                      criterion='mae', 
                                      random_state=700, 
                                      warm_start=True)
        
        # learn 1 hour later target
        start = time.time()
        model.fit(X_learn, Y_learn)
        elapsed_time = time.time() - start

        print(test_month, energy, 'learn elapsed time: ', elapsed_time, 's')

        # feature importance
        get_importance_figure(model, energy, X_test.columns)
        
        # test with online learning
        preds = []
        for idx, row in X_test.iterrows():

            # predict
            preds.append(model.predict(row.values.reshape(1, -1))[0])

            # online learning
            '''
            model.n_estimators += 50

            X_learn = X_learn.append(row)
            Y_learn = Y_learn.append(pd.Series(data=Y_test.loc[idx + timedelta(hours=1)], 
                      index=[idx + timedelta(hours=1)]))

            model.fit(X_learn, Y_learn)
            '''

        # save preds and test
        preds = pd.Series(data=preds, index=Y_test.index, name='preds')
        result = pd.concat([preds, Y_test], axis=1)
        result.to_csv(os.path.join(out_dir, energy + '.csv'), encoding='shift-jis')

        # accuracy
        output = {'設備名': [], 
                  '平日昼・総': [], '平日夜・総': [], 
                  '平日昼・基準内': [], '平日夜・基準内': [], 
                  '平日昼基準率': [], '平日夜基準率': []}
        
        output = get_output(result, output, energy, test_month)
        
        print(test_month, energy, output)

        # save accuracy
        if output:
            accs = pd.DataFrame(output)
            accs.to_csv(os.path.join(out_dir, energy + '_acc.csv'), index=False, encoding='shift-jis')
            total_acc.append(accs)

total_acc = pd.concat(total_acc)
total_acc.to_csv(os.path.join(base_out_dir, 'total_acc.csv'), index=False, encoding='shift-jis')

16年11月 201611010800_vapor_per_machine.xlsx
  train files:  ['201612010800_vapor_per_machine.xlsx', '201701010800_vapor_per_machine.xlsx', '201702010800_vapor_per_machine.xlsx', '201703010800_vapor_per_machine.xlsx', '201704010800_vapor_per_machine.xlsx', '201705010800_vapor_per_machine.xlsx', '201706010800_vapor_per_machine.xlsx', '201707010800_vapor_per_machine.xlsx', '201708010800_vapor_per_machine.xlsx', '201709010800_vapor_per_machine.xlsx', '201710010800_vapor_per_machine.xlsx', '201711010800_vapor_per_machine.xlsx', '201712010800_vapor_per_machine.xlsx', '201801010800_vapor_per_machine.xlsx', '201802010800_vapor_per_machine.xlsx', 'old']
  test file:  201611010800_vapor_per_machine.xlsx
    now processing:  GDNA
      X_learn, Y_leran shapes: (8485, 30) (8485,)
      X_test, Y_test shapes: (603, 30) (603,)
          number of nan in train and test data:  12 0
16年11月 GDNA learn elapsed time:  294.7741320133209 s
16年11月 GDNA None
    now processing:  GDNB
      X_learn, Y_leran sha

ValueError: labels ['調整-1' '調整-2' '調整-3' '計画停止-1' '計画停止-2' '計画停止-3'] not contained in axis

### Test

In [None]:
any(X_learn.isnull())

In [None]:
any(Y_learn.isnull())

In [None]:
X_learn

In [None]:
X_learn.to_csv(os.path.join(out_dir, 'x_learn.csv'), encoding='shift-jis')

In [None]:
X_learn=X_learn.fillna(0)
pd.isnull(X_learn).any(1).nonzero()[0]