In [1]:
import pandas as pd
import os
import numpy as np
from datetime import timedelta

from sklearn.ensemble import RandomForestRegressor

import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [2]:
# Folder that holds the per-machine input workbooks read by the learning cell.
# NOTE(review): absolute, machine-specific Windows path - adjust before running elsewhere.
in_dir = 'D:\\Toppan\\2017-11-20 全データ\\処理済(機械ごと)\\vectorized'
# Every CSV produced by this notebook is written next to the inputs.
out_dir = in_dir

In [3]:
# Workbook with one sheet per month that flags the timestamps to exclude.
holiday_path = 'D:\\Toppan\\2017-11-20 全データ\\データ\\切り離し全休日\\全休日.xlsx'


def mask_out(X, y, month):
    """Drop (feature, target) pairs that fall on flagged timestamps.

    The sheet named ``month`` of the workbook at ``holiday_path`` is loaded
    with its first column as the index, and its first two data rows are
    discarded. ``X.index`` and ``y.index`` are walked in lock-step; a pair is
    kept only when neither label is flagged in that sheet.

    A label counts as flagged when its row has '切離' in column '切離\\n有無',
    '全休' in column '全休\\n判定', '※異常稼動' in column '異常判定', or - only
    if the sheet has a '生産\\n有無' column - '無' in that column.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature rows.
    y : pandas.Series
        Targets, paired with ``X`` by position.
    month : str
        Sheet name inside the holiday workbook.

    Returns
    -------
    tuple
        ``(X_kept, y_kept)``. If the sheet cannot be read, the error and the
        month are printed and ``(X, y)`` is returned unfiltered.
    """
    try:
        calendar = pd.read_excel(holiday_path, sheet_name=month, index_col=0).iloc[2:]
    except Exception as err:
        # Best effort: without a readable calendar nothing is filtered.
        print(err, month)
        return X, y

    # The production column is not present in every monthly sheet.
    has_production_col = '生産\n有無' in calendar

    def is_flagged(stamp):
        """Return True when the calendar marks ``stamp`` for exclusion."""
        entry = calendar.loc[stamp]
        if entry.loc['切離\n有無'] == '切離':
            return True
        if entry.loc['全休\n判定'] == '全休':
            return True
        if entry.loc['異常判定'] == '※異常稼動':
            return True
        return bool(has_production_col and entry.loc['生産\n有無'] == '無')

    # Keep a pair only when both its feature label and its target label are clear.
    kept_pairs = [
        (x_label, y_label)
        for x_label, y_label in zip(X.index, y.index)
        if not is_flagged(x_label) and not is_flagged(y_label)
    ]
    x_kept = [pair[0] for pair in kept_pairs]
    y_kept = [pair[1] for pair in kept_pairs]

    return X.loc[x_kept], y.loc[y_kept]

In [4]:
def get_importance_figure(model, name, features):
    """Save the model's feature importances, largest first, as a CSV file.

    Despite the name, no figure is drawn: the importances are written to
    ``<out_dir>/<name>_寄与度.csv`` in Shift-JIS encoding.

    Parameters
    ----------
    model : object
        Fitted estimator exposing ``feature_importances_``.
    name : str
        Prefix of the output file name.
    features : array-like
        Feature labels; must support indexing with an integer array
        (e.g. a NumPy array or a pandas Index).
    """
    importances = model.feature_importances_
    # argsort is ascending, so reverse it to rank from most to least important.
    ranking = np.argsort(importances)[::-1]

    ranked = pd.Series(data=importances[ranking], index=features[ranking])

    csv_path = os.path.join(out_dir, name + '_寄与度.csv')
    ranked.to_csv(csv_path, encoding='shift-jis')

In [5]:
def parse_data(exl, sheet):
    """Read one sheet of an opened workbook, using its first column as the index.

    Parameters
    ----------
    exl : object
        Workbook handle exposing ``parse`` (a ``pandas.ExcelFile`` in this notebook).
    sheet : str
        Name of the sheet to load.

    Returns
    -------
    Whatever ``exl.parse`` returns for that sheet.
    """
    return exl.parse(sheet_name=sheet, index_col=0)


In [6]:
def split_day_night(acc_abs):
    """Split the values of a time-indexed Series into day and night lists.

    Parameters
    ----------
    acc_abs : pandas.Series
        Values whose index labels expose an ``hour`` attribute
        (e.g. a DatetimeIndex).

    Returns
    -------
    tuple of list
        ``(days, nights)``. A value goes to ``days`` when its hour is
        strictly between 7 and 22 (i.e. 8 to 21 inclusive), otherwise to
        ``nights``. Original order is preserved within each list.
    """
    acc_abs_days, acc_abs_nights = [], []
    # Series.items() is used because iteritems() was removed in pandas 2.0;
    # items() is available in older pandas releases as well.
    for stamp, acc in acc_abs.items():
        if 7 < stamp.hour < 22:
            acc_abs_days.append(acc)
        else:
            acc_abs_nights.append(acc)

    return acc_abs_days, acc_abs_nights

def get_output(res, output, sname, day_tol=0.2, night_tol=0.15):
    """Append day/night accuracy statistics for one machine to ``output``.

    Rows whose target is 0 are ignored (the relative error is undefined
    there). For the remaining rows the relative absolute error
    ``|pred - target| / target`` is computed, split into day and night
    buckets with ``split_day_night``, and counted against the tolerances.

    Parameters
    ----------
    res : pandas.DataFrame
        Must contain the columns 'preds' and 'target'; its index is passed
        on to ``split_day_night``.
    output : dict of list
        Accumulator with the keys '設備名', '平日昼・総', '平日夜・総',
        '平日昼・基準内', '平日夜・基準内', '平日昼基準率', '平日夜基準率'.
        One value is appended to each list.
    sname : str
        Machine (sheet) name recorded under '設備名'.
    day_tol : float, optional
        Largest relative error counted as within tolerance during the day.
    night_tol : float, optional
        Largest relative error counted as within tolerance at night.

    Returns
    -------
    dict or None
        The same ``output`` object, or ``None`` (with ``output`` untouched)
        when no row has a non-zero target.
    """
    res = res[res['target'] != 0]

    if len(res) == 0:
        return None

    y_pred, y_true = res['preds'], res['target']
    # Relative absolute error per timestamp.
    # NOTE(review): a negative target would yield a negative "error" that
    # always passes the tolerance check - confirm targets are non-negative.
    acc_abs = abs(y_pred - y_true) / y_true
    # Split into day and night buckets.
    acc_abs_days, acc_abs_nights = split_day_night(acc_abs)
    len_days, len_nights = len(acc_abs_days), len(acc_abs_nights)

    #sname2acc = {'蒸気': [0.2, 0.15], '電力': [0.09, 0.15], '冷水': [0.15, 0.1]}

    # Count the samples that fall within tolerance in each bucket.
    len_acc_days = sum(1 for err in acc_abs_days if err <= day_tol)
    len_acc_nights = sum(1 for err in acc_abs_nights if err <= night_tol)
    # An empty bucket has no meaningful rate; report NaN instead of raising
    # ZeroDivisionError.
    acc_stats_days = len_acc_days / len_days if len_days else float('nan')
    acc_stats_nights = len_acc_nights / len_nights if len_nights else float('nan')

    output['設備名'].append(sname)
    output['平日昼・総'].append(len_days)
    output['平日夜・総'].append(len_nights)
    output['平日昼・基準内'].append(len_acc_days)
    output['平日夜・基準内'].append(len_acc_nights)
    output['平日昼基準率'].append(acc_stats_days)
    output['平日夜基準率'].append(acc_stats_nights)

    return output

### Learning

In [7]:
# Train one model per machine on the first workbook, then walk through the
# second workbook row by row: predict first, then refit with the newly
# observed target ("online learning").
learn_path = os.path.join(in_dir, '201709010800_vapor_per_machine.xlsx')
test_path = os.path.join(in_dir, '201710010800_vapor_per_machine.xlsx')

accs = []
# The context managers close both workbooks even if a sheet fails part-way.
with pd.ExcelFile(learn_path) as exl_learn, pd.ExcelFile(test_path) as exl_test:
    for sheet in exl_learn.sheet_names:

        # This sheet is excluded from the evaluation.
        if sheet == 'GDNA': continue

        # data
        df_learn = parse_data(exl_learn, sheet)
        df_test = parse_data(exl_test, sheet)

        # filter out holidays
        # Features of row t are paired with the last column of row t+1, so the
        # model learns the target one row ahead.
        X_learn, y_learn = mask_out(df_learn.iloc[:-1, :-1], df_learn.iloc[1:, -1], '17年9月')
        X_test, y_test = mask_out(df_test.iloc[:-1, :-1], df_test.iloc[1:, -1], '17年10月')

        # Work on plain arrays throughout so the initial fit, every refit and
        # every prediction see the same kind of input.
        X_learn_arr, y_learn_arr = X_learn.values, y_learn.values
        X_test_arr, y_test_arr = X_test.values, y_test.values

        # base learner
        # NOTE(review): scikit-learn >= 1.2 no longer accepts criterion='mae'
        # (use 'absolute_error'), and >= 1.3 no longer accepts
        # max_features='auto' (use 1.0 or None) - update when upgrading.
        model = RandomForestRegressor(n_estimators=700,
                                      n_jobs=-1,
                                      max_depth=11,
                                      max_features='auto',
                                      criterion='mae',
                                      random_state=700,
                                      warm_start=True)

        # learn 1 hour later target
        model.fit(X_learn_arr, y_learn_arr)

        # get feature importance figures
        #get_importance_figure(model, sheet, df_learn.columns[:-1])

        # test with online learning
        preds = []
        for pos in range(len(X_test_arr)):

            # predict before the true target of this row is revealed
            preds.append(model.predict(X_test_arr[pos].reshape(1, -1))[0])

            # online learning: with warm_start the existing trees are kept and
            # 50 more are grown on all data seen so far.
            model.n_estimators += 50

            # X_test and y_test are already paired by position (see mask_out),
            # so slicing replaces the former timestamp + 1h label lookup, which
            # raised KeyError on any gap in the index, and the row-by-row
            # DataFrame.append that pandas 2.0 removed.
            model.fit(np.concatenate([X_learn_arr, X_test_arr[:pos + 1]]),
                      np.concatenate([y_learn_arr, y_test_arr[:pos + 1]]))

        # save preds and test
        preds = pd.Series(data=preds, index=y_test.index, name='preds')
        result = pd.concat([preds, y_test], axis=1)
        result.to_csv(os.path.join(out_dir, sheet + '.csv'))

        # accuracy
        output = {'設備名': [],
                  '平日昼・総': [], '平日夜・総': [],
                  '平日昼・基準内': [], '平日夜・基準内': [],
                  '平日昼基準率': [], '平日夜基準率': []}
        output = get_output(result, output, sheet)

        print(sheet, output)

        # get_output returns None when the sheet has no non-zero target
        if output:
            accs.append(pd.DataFrame(output))

# save accuracy
if accs:
    accs = pd.concat(accs)
    accs.to_csv(os.path.join(out_dir, 'acc.csv'), index=False, encoding='shift-jis')
else:
    # pd.concat raises on an empty list; report instead of failing at the very end.
    print('no accuracy results to save')
    

GDNB {'設備名': ['GDNB'], '平日昼・総': [290], '平日夜・総': [202], '平日昼・基準内': [107], '平日夜・基準内': [56], '平日昼基準率': [0.3689655172413793], '平日夜基準率': [0.27722772277227725]}
GE07 {'設備名': ['GE07'], '平日昼・総': [292], '平日夜・総': [227], '平日昼・基準内': [125], '平日夜・基準内': [53], '平日昼基準率': [0.4280821917808219], '平日夜基準率': [0.23348017621145375]}
GE51 {'設備名': ['GE51'], '平日昼・総': [299], '平日夜・総': [234], '平日昼・基準内': [125], '平日夜・基準内': [67], '平日昼基準率': [0.4180602006688963], '平日夜基準率': [0.2863247863247863]}
GE52 {'設備名': ['GE52'], '平日昼・総': [124], '平日夜・総': [124], '平日昼・基準内': [56], '平日夜・基準内': [42], '平日昼基準率': [0.45161290322580644], '平日夜基準率': [0.3387096774193548]}
GE53 {'設備名': ['GE53'], '平日昼・総': [294], '平日夜・総': [215], '平日昼・基準内': [189], '平日夜・基準内': [137], '平日昼基準率': [0.6428571428571429], '平日夜基準率': [0.6372093023255814]}
GL15 {'設備名': ['GL15'], '平日昼・総': [341], '平日夜・総': [266], '平日昼・基準内': [304], '平日夜・基準内': [242], '平日昼基準率': [0.8914956011730205], '平日夜基準率': [0.9097744360902256]}
GL51 {'設備名': ['GL51'], '平日昼・総': [340], '平日夜・総': [266], '平日昼・基準内': [328],

In [8]:
# Completion marker: shows that every cell above ran to the end.
print('-------------over-----------')

-------------over-----------
