In [1]:
import sys
import notebookutil as nbu
sys.meta_path.append(nbu.NotebookFinder())
import datasets
import math
import os
import pandas as pd
import numpy as np
import roc_auc
import estimator_kmeans as kmeans
import estimator_knn as knn
import estimator_nn as nn
import estimator_lof as lof
import estimator_rssibased as rssie
from datetime import datetime
import json
from IPython.display import display, HTML

[ 0.70710678  0.        ]


In [2]:
# Re-execute the already-imported estimator_nn module so that edits made to it
# take effect without restarting the kernel.
import importlib
importlib.reload(nn)

<module 'estimator_nn' from 'estimator_nn.ipynb'>

In [3]:
# data loader: the path pattern is handed to datasets.load(); the returned
# object is kept in the module-level `ds`, which eval_estimator() passes on to
# datasets.get_data().
ds = datasets.load('data/raw/0[12]_[01][123]_0[1234]*_*')

path = data/raw/0[12]_[01][123]_0[1234]*_*
data/raw/01_01_01_4F実験室_XperiaZ3_胸ポケット_裏上_正常_まっすぐ帰宅
..............................
data/raw/01_01_02_4F実験室_XperiaZ3_カバン_裏上_正常_まっすぐ帰宅
...x..........................
data/raw/01_01_03_4F実験室_XperiaZ3_胸ポケット_裏上_正常_5秒後まっすぐ帰宅
.........................x....
data/raw/01_01_04_4F実験室_XperiaZ3_カバン_裏上_正常_5秒後まっすぐ帰宅
.....................x........
data/raw/01_02_01_4F実験室_XperiaZ3_胸ポケット_裏上_異常_まっすぐ外出
..............................
data/raw/01_02_02_4F実験室_XperiaZ3_ズボン_裏上_異常_まっすぐ外出
..............................
data/raw/01_02_03_4F実験室_XperiaZ3_カバン_裏上_異常_まっすぐ外出
..............................
data/raw/01_03_01_4F実験室_XperiaZ3_胸ポケット_裏上_異常_まっすぐ帰宅
..............................
data/raw/01_03_02_4F実験室_XperiaZ3_ズボン_裏上_異常_まっすぐ帰宅
..............................
data/raw/01_03_03_4F実験室_XperiaZ3_カバン_裏上_異常_まっすぐ帰宅
..............................
data/raw/01_11_01_エネマネハウス_XperiaZ3_胸ポケット_裏上_正常_まっすぐ帰宅
..............................
data/raw/01_11_02_エネマネハウス_XperiaZ3_カバン_裏上_正常_まっす

In [4]:
# general estimator test
def eval_estimator(
    model,
    sensor_type = ['rssi.a','rssi.b', ['linear_accel[0]','linear_accel[1]','linear_accel[2]']],
    n_record = 3,
    base = '01_11_01',
    normal = '^01_11',
    anomaly = '^01_1[23]',
    ms_interval = 20,
    ma_window = 3,
    n_train = 3,
    normalize = True):
    """Fit ``model`` on the base recordings and return the AUC of normal vs. anomalous ones.

    Data is read from the module-level ``ds`` through the ``datasets`` helper
    module, smoothed with a moving average, optionally normalized against the
    training recordings, flattened to one 1-D vector per recording, and scored
    with ``model.decision_function``.

    Parameters
    ----------
    model : object
        Estimator exposing ``fit(list_of_1d_arrays)`` and
        ``decision_function(list_of_1d_arrays)``.
    sensor_type : list
        Column specification forwarded to ``datasets.get_data`` and
        ``datasets.normalize_by_base_data``. This function itself never
        modifies the (shared) default list.
    n_record : int
        Forwarded to ``datasets.get_data`` as ``before``.
    base, normal, anomaly : str
        Title patterns selecting the training, normal and anomalous recordings.
    ms_interval : int
        Converted to the ``drop_interval`` given to ``datasets.get_data`` as
        ``int(ms_interval / 20)``.
    ma_window : int
        Used as both ``window`` and ``min_periods`` of the moving average.
    n_train : int
        The first ``n_train + 1`` base recordings are used for fitting.
    normalize : bool
        When true, every group is normalized against the training recordings;
        when false the smoothed data is used as-is.

    Returns
    -------
    Whatever ``roc_auc.get_auc_from_normal_outlier`` returns for the two score lists.
    """
    # recalc input
    drop_interval = int(ms_interval / 20)

    def _load(title):
        # Identical query for every group; only the title pattern differs.
        return datasets.get_data(ds, title=title, before=n_record,
                                 column=sensor_type, drop_interval=drop_interval)

    def _smooth(frames):
        return datasets.moving_average(frames, window=ma_window, min_periods=ma_window)

    def _flatten(frames):
        # `.values` replaces DataFrame.as_matrix(), which newer pandas no longer provides.
        return [frame.values.ravel() for frame in frames]

    # get data (int() guards against a float/numpy n_train coming from a DataFrame row)
    dfl_t = _load(base)[:(int(n_train) + 1)]
    dfl_n = _load(normal)
    dfl_o = _load(anomaly)

    # moving average
    dfl_t = _smooth(dfl_t)
    dfl_n = _smooth(dfl_n)
    dfl_o = _smooth(dfl_o)

    # normalize
    if normalize:
        dfl_t_n = datasets.normalize_by_base_data(dfl_t, dfl_t, sensor_type)
        dfl_n_n = datasets.normalize_by_base_data(dfl_t, dfl_n, sensor_type)
        dfl_o_n = datasets.normalize_by_base_data(dfl_t, dfl_o, sensor_type)
    else:
        # Without this branch the *_n names were unbound and the call crashed.
        dfl_t_n, dfl_n_n, dfl_o_n = dfl_t, dfl_n, dfl_o

    # one flat numpy vector per recording
    data_t = _flatten(dfl_t_n)
    data_n = _flatten(dfl_n_n)
    data_o = _flatten(dfl_o_n)

    # get auc score
    model.fit(data_t)
    score_n = model.decision_function(data_n)
    score_o = model.decision_function(data_o)
    auc = roc_auc.get_auc_from_normal_outlier(score_n, score_o, graph=False)

    return auc

#eval_estimator()

In [5]:
def read_csv(fname):
    """Load a CSV file into a DataFrame, using its first column as the index."""
    return pd.read_csv(fname, index_col=0)
#read_csv()

In [9]:
# grid search implementation (under test)
def _get_estimator_models():
    model_params = []
    
    # k-means based estimator
    range_n_clusters = np.arange(1, 10, 2)
    range_max_iter = np.array([3])
    mesh_data = np.meshgrid(range_n_clusters, range_max_iter)
    for n_clusters, max_iter in zip(mesh_data[0].ravel(), mesh_data[1].ravel()):
        model_params.append({'n_clusters': n_clusters, 'max_iter': max_iter, 'type': 'k-means'})
    
    # kNN based estimator
    range_n_neighbors = np.arange(1, 10, 2)
    range_algorithm = np.array(['ball_tree'])
    mesh_data = np.meshgrid(range_n_neighbors, range(len(range_algorithm)))
    for n_neighbors, algorithm_idx in zip(mesh_data[0].ravel(), mesh_data[1].ravel()):
        model_params.append({'n_neighbors': n_neighbors, 'algorithm': range_algorithm[algorithm_idx], 'type': 'kNN'})
    
    # LOF based estimator
    range_n_neighbors = np.arange(1, 10, 2)
    range_algorithm = np.array(['ball_tree'])
    mesh_data = np.meshgrid(range_n_neighbors, range(len(range_algorithm)))
    for n_neighbors, algorithm_idx in zip(mesh_data[0].ravel(), mesh_data[1].ravel()):
        model_params.append({'n_neighbors': n_neighbors, 'algorithm': range_algorithm[algorithm_idx], 'type': 'LOF'})
    
    # NN based estimator
    range_num_of_training_epochs = np.array([100])
    range_num_of_hidden_nodes = np.array([4, 8, 16, 32])
    range_size_of_mini_batch = np.array([10])
    range_size_of_test_batch = np.array([10])
    range_learning_rate = np.array([0.02])
    mesh_data = np.meshgrid(range_num_of_training_epochs, range_num_of_hidden_nodes, range_size_of_mini_batch,
                           range_size_of_test_batch, range_learning_rate)
    for num_of_training_epochs, num_of_hidden_nodes, size_of_mini_batch, size_of_test_batch, learning_rate in zip(mesh_data[0].ravel(), mesh_data[1].ravel(), mesh_data[2].ravel(), mesh_data[3].ravel(), mesh_data[4].ravel()):
        model_params.append({'num_of_training_epochs': num_of_training_epochs,
                             'num_of_hidden_nodes': num_of_hidden_nodes,
                             'size_of_mini_batch': size_of_mini_batch,
                             'size_of_test_batch': size_of_test_batch,
                             'learning_rate': learning_rate,
                             'type': 'NN'
                            })
    
    # rssi based estimator
    #models.append(rssie.EstimatorRssiBased())
    model_params.append({'type': 'rssi_based'})
    
    return model_params

def _get_grid_test_case(model_params):
    # parameters other than algorithm specific
    sensor_master_ios = [
        ['rssi.a', 'rssi.b'],
        ['rssi.a', 'rssi.b', ['acceleration.x', 'acceleration.y', 'acceleration.z']],
        ['rssi.a', 'rssi.b', ['gyro.rotationRate.x', 'gyro.rotationRate.y', 'gyro.rotationRate.z']],
        ['rssi.a', 'rssi.b', ['magneticField.x', 'magneticField.y', 'magneticField.z']],
        ['rssi.a', 'rssi.b', ['attitude.roll', 'attitude.pitch', 'attitude.yaw']],
        ['rssi.a', 'rssi.b', ['rotationRate.x', 'rotationRate.y', 'rotationRate.z']],
        ['rssi.a', 'rssi.b', ['gravity.x', 'gravity.y', 'gravity.z']],
        ['rssi.a', 'rssi.b', ['userAcceleration.x', 'userAcceleration.y', 'userAcceleration.z']],
    ]
    sensor_master_s = [json.dumps(s) for s in sensor_master_ios] # to be combinationable

    range_n_record = np.array([1, 5, 20])
    range_n_train = np.array([1, 5, 10, 20, 100])
    range_ms_interval = np.array([20, 60, 100])
    range_ma_window = np.arange(1, 4, 2)
    
    mesh_data = np.meshgrid(range(len(model_params)), range_n_record, range_n_train,
                            range_ms_interval, range_ma_window, range(len(sensor_master_s)))

    # create grid test case
    test_case = []
    for model_param_idx, n_record, n_train, ms_interval, ma_window, sensor_idx in zip(mesh_data[0].ravel(), mesh_data[1].ravel(), mesh_data[2].ravel(),
               mesh_data[3].ravel(), mesh_data[4].ravel(), mesh_data[5].ravel()):
        # save test case and result
        _t = {'n_record': n_record, 'n_train': n_train,
              'ms_interval': ms_interval, 'ma_window': ma_window, 'sensor_type': sensor_master_s[sensor_idx]}
        _t.update(model_params[model_param_idx])
        test_case.append(_t)
    
    # create df for test case
    df = pd.DataFrame(test_case)
    return df

def _remove_unavailable_test_case(df):
    # remove unavailable test case
    df = df[~(df['n_train'] < df['n_neighbors'])]
    df = df[~(df['n_train'] < df['n_clusters'])]
    df = df[~((df['type'] == 'rssi_based') & (df['sensor_type'].apply(lambda x: len(json.loads(x)) > 2)))]
    df = df[~(df['n_record'] - df['ma_window'] < 0)]
    return df

def _build_estimator(params):
    """Instantiate the estimator described by one test-case row; return None for an unknown type."""
    kind = params['type']
    # Model parameters are cast with int() because their columns hold NaN for
    # the other estimator families and are therefore stored as floats.
    if kind == 'k-means':
        return kmeans.EstimatorKmeans(n_clusters=int(params['n_clusters']),
                                      max_iter=int(params['max_iter']))
    if kind == 'kNN':
        return knn.EstimatorKNN(n_neighbors=int(params['n_neighbors']),
                                algorithm=params['algorithm'])
    if kind == 'LOF':
        return lof.EstimatorLOF(n_neighbors=int(params['n_neighbors']),
                                algorithm=params['algorithm'])
    if kind == 'NN':
        return nn.EstimatorNN(num_of_hidden_nodes=int(params['num_of_hidden_nodes']),
                              num_of_training_epochs=int(params['num_of_training_epochs']),
                              size_of_mini_batch=int(params['size_of_mini_batch']),
                              size_of_test_batch=int(params['size_of_test_batch']),
                              learning_rate=params['learning_rate'])
    if kind == 'rssi_based':
        return rssie.EstimatorRssiBased()
    return None


def _run_test(df):
    """Evaluate every test case that has no AUC yet and persist the results as CSV.

    ``df`` holds one test case per row (see ``_get_grid_test_case``). If a
    result file for the hard-coded data titles already exists it is outer-merged
    with ``df`` on all of ``df``'s columns, so earlier results are reused and
    only rows whose ``auc`` is missing are evaluated. The frame is written to
    the CSV file every 10 evaluated cases and once more when the function
    leaves the loop for any reason (normal end, unknown model type, exception
    or interrupt).

    Returns the frame with the ``auc`` column filled in, or ``None`` when a row
    names an estimator type that is not defined. The caller's frame is not
    modified.
    """
    # set the data title
    base = '02_11_01'
    normal = '^02_11'
    anomaly = '^02_1[23]'

    csv_fname = 'test_record_%s__%s__%s.csv' % (base, normal, anomaly)

    # if auc is already computed in some test case, merge the result.
    if os.path.exists(csv_fname):
        df_past = read_csv(csv_fname)
        keys = list(df.columns.values)
        df = pd.merge(df, df_past, on=keys, how='outer')
    else:
        # assign() builds a new frame; writing the column in place would target
        # the filtered slice handed in by the caller (SettingWithCopyWarning).
        df = df.assign(auc=np.nan)

    # show test size
    pending = df[df['auc'].isnull()]
    n_pending = len(pending.index)
    print('total test case: %d' % (len(df.index)))
    print('imcomplete test case: %d' %(n_pending))

    # run test
    try:
        for i, (k, t) in enumerate(pending.iterrows()):
            if i % 10 == 0:
                print("%d / %d at %s"%(i+1, n_pending, datetime.now().strftime("%Y/%m/%d %H:%M:%S")))
                df.to_csv(csv_fname)  # periodic checkpoint

            # generate model from parameter
            model = _build_estimator(t)
            if model is None:
                print('! "%s" is not defined'%(t['type']))
                return None

            # run a test; data parameters are cast to int like the model parameters
            auc = eval_estimator(model, base=base, normal=normal, anomaly=anomaly,
                                 n_train=int(t['n_train']),
                                 sensor_type=json.loads(t['sensor_type']),
                                 n_record=int(t['n_record']),
                                 ms_interval=int(t['ms_interval']),
                                 ma_window=int(t['ma_window']))
            df.loc[k, 'auc'] = auc

            # deallocate memory space for the model
            del model
    finally:
        # Keep whatever has been computed so far, even if the run was cut short.
        df.to_csv(csv_fname)

    return df

def test():
    """Build the grid of test cases, drop the unavailable ones, run them and return the result of _run_test."""
    return _run_test(
        _remove_unavailable_test_case(
            _get_grid_test_case(
                _get_estimator_models())))

# Run the whole grid search; the value returned by test() becomes the cell output.
test()

total test case: 9315
imcomplete test case: 2265
1 / 2265 at 2017/04/28 02:03:29
11 / 2265 at 2017/04/28 02:03:51
21 / 2265 at 2017/04/28 02:04:14
31 / 2265 at 2017/04/28 02:04:36
41 / 2265 at 2017/04/28 02:04:59
51 / 2265 at 2017/04/28 02:05:22
61 / 2265 at 2017/04/28 02:05:45
71 / 2265 at 2017/04/28 02:06:09
81 / 2265 at 2017/04/28 02:06:33
91 / 2265 at 2017/04/28 02:06:56
101 / 2265 at 2017/04/28 02:07:20
111 / 2265 at 2017/04/28 02:07:42
121 / 2265 at 2017/04/28 02:08:05
131 / 2265 at 2017/04/28 02:08:27
141 / 2265 at 2017/04/28 02:08:50
151 / 2265 at 2017/04/28 02:09:13
161 / 2265 at 2017/04/28 02:09:36
171 / 2265 at 2017/04/28 02:09:59
181 / 2265 at 2017/04/28 02:10:23
191 / 2265 at 2017/04/28 02:10:46
201 / 2265 at 2017/04/28 02:11:09
211 / 2265 at 2017/04/28 02:11:31
221 / 2265 at 2017/04/28 02:11:54
231 / 2265 at 2017/04/28 02:12:17
241 / 2265 at 2017/04/28 02:12:41
251 / 2265 at 2017/04/28 02:13:04
261 / 2265 at 2017/04/28 02:13:28
271 / 2265 at 2017/04/28 02:13:51
281 / 2265

Unnamed: 0,algorithm,learning_rate,ma_window,max_iter,ms_interval,n_clusters,n_neighbors,n_record,n_train,num_of_hidden_nodes,num_of_training_epochs,sensor_type,size_of_mini_batch,size_of_test_batch,type,auc
0,,,1,3.0,20,1.0,,1,1,,,"[""rssi.a"", ""rssi.b""]",,,k-means,0.777778
1,,,1,3.0,20,1.0,,1,1,,,"[""rssi.a"", ""rssi.b"", [""acceleration.x"", ""accel...",,,k-means,0.499028
2,,,1,3.0,20,1.0,,1,1,,,"[""rssi.a"", ""rssi.b"", [""gyro.rotationRate.x"", ""...",,,k-means,0.880787
3,,,1,3.0,20,1.0,,1,1,,,"[""rssi.a"", ""rssi.b"", [""magneticField.x"", ""magn...",,,k-means,0.482176
4,,,1,3.0,20,1.0,,1,1,,,"[""rssi.a"", ""rssi.b"", [""attitude.roll"", ""attitu...",,,k-means,0.657454
5,,,1,3.0,20,1.0,,1,1,,,"[""rssi.a"", ""rssi.b"", [""rotationRate.x"", ""rotat...",,,k-means,0.886991
6,,,1,3.0,20,1.0,,1,1,,,"[""rssi.a"", ""rssi.b"", [""gravity.x"", ""gravity.y""...",,,k-means,0.414491
7,,,1,3.0,20,1.0,,1,1,,,"[""rssi.a"", ""rssi.b"", [""userAcceleration.x"", ""u...",,,k-means,0.986296
8,,,1,3.0,60,1.0,,1,1,,,"[""rssi.a"", ""rssi.b""]",,,k-means,0.766991
9,,,1,3.0,60,1.0,,1,1,,,"[""rssi.a"", ""rssi.b"", [""acceleration.x"", ""accel...",,,k-means,0.511528


In [10]:
# Force a full garbage-collection pass; the cell output is gc.collect()'s
# return value (the number of unreachable objects it found).
import gc
gc.collect()

59884