In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
import warnings
warnings.filterwarnings("ignore")

## Preparing data

In [2]:
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import normalize

def load_data(path):
    """Load the raw sensor readings and their labels from a directory.

    Parameters
    ----------
    path : str
        Directory that contains ``full_data.npy`` and ``pollution_data.npy``.
        A trailing path separator is optional.

    Returns
    -------
    X, Y
        The arrays stored in ``full_data.npy`` and ``pollution_data.npy``,
        in that order.
    """
    # Local import so this cell does not depend on a later cell having run.
    import os

    # os.path.join works with or without a trailing separator, whereas plain
    # string concatenation silently built "<dir>full_data.npy" without one.
    X = np.load(os.path.join(path, 'full_data.npy'))
    Y = np.load(os.path.join(path, 'pollution_data.npy'))
    return X, Y

def iscontaminated(e, n_steps):
    """Return the first step at which any row of ``e`` holds a positive value.

    Only steps that still leave room for a full window of ``n_steps``
    columns are examined, i.e. steps ``0 .. len(e[0]) - n_steps``.

    Parameters
    ----------
    e : 2-D sequence indexed as ``e[row][step]``
    n_steps : int
        Length of the window that must fit after the returned step.

    Returns
    -------
    int
        The earliest qualifying step, or ``-1`` when no examined step
        contains a value greater than zero.
    """
    n_candidates = len(e[0]) - n_steps + 1
    for step in range(n_candidates):
        if any(row[step] > 0 for row in e):
            return step
    return -1

def split_data(X, Y, n_sensors=10, n_steps=10, random_state=0, test_size = 0.2):
    """Keep detected events, crop them to a fixed window and split train/test.

    Parameters
    ----------
    X : array indexed as ``X[event, node, step]``
        Raw readings.
    Y : array indexed as ``Y[event, ...]``
        Labels, one row per event (described as one-hot by the notebook).
    n_sensors : int
        Number of sensors; sensor ``k`` is read from node ``10 * k``.
    n_steps : int
        Number of report steps kept per event, starting at the step where
        ``iscontaminated`` first sees a positive reading.
    random_state : int
        Seed forwarded to ``train_test_split``.
    test_size : float
        Fraction of the detected events placed in the test set.

    Returns
    -------
    x_train, x_test, y_train, y_test
        ``x_*`` have shape ``(events, n_sensors, n_steps)``; ``y_*`` keep the
        trailing shape and dtype of ``Y``.
    """
    X = np.asarray(X)
    Y = np.asarray(Y)

    # Extract the sensor nodes with a flat index list. This keeps X's dtype:
    # copying into an integer buffer would truncate fractional readings to 0
    # and make weak contaminations look undetected.
    sensor_index = [10 * i for i in range(n_sensors)]
    sensor_data = X[:, sensor_index, :]

    # First usable detection step per event; -1 marks an undetected event.
    # (Builtin ``int`` replaces the ``np.int`` alias removed in NumPy 1.24.)
    start_index = np.array(
        [iscontaminated(event, n_steps) for event in sensor_data], dtype=int)
    detected = np.flatnonzero(start_index >= 0)

    # Crop every detected event to the n_steps window that starts at detection.
    # iscontaminated only returns steps that leave room for a full window.
    limited_sensor_data = np.empty(
        (len(detected), n_sensors, n_steps), dtype=sensor_data.dtype)
    for j, i in enumerate(detected):
        begin = start_index[i]
        limited_sensor_data[j] = sensor_data[i, :, begin:begin + n_steps]

    # Label width follows Y instead of a hard-coded class count.
    limited_pollution_data = Y[detected]

    print(limited_sensor_data.shape)

    # split data into train set and test set
    x_train, x_test, y_train, y_test = train_test_split(
        limited_sensor_data, limited_pollution_data,
        random_state=random_state, test_size=test_size)

    return x_train, x_test, y_train, y_test

In [3]:
def generate_data(x, y, batch_size):
    """Yield ``(batch_x, batch_y)`` pairs forever, cycling over ``x`` and ``y``.

    ``batch_x`` is the current slice of ``x`` reshaped from
    ``(n, rows, cols)`` to ``(n, rows, cols, 1)``; ``batch_y`` is the matching
    slice of ``y``. When fewer than ``batch_size`` samples remain, the
    generator restarts from the beginning, so a trailing partial batch is
    never produced (unless ``x`` itself is shorter than ``batch_size``).
    """
    offset = 0
    while True:
        # Wrap around once a full batch no longer fits.
        if offset + batch_size > len(x):
            offset = 0
        stop = offset + batch_size

        chunk = x[offset:stop]
        samples, rows, cols = chunk.shape[0], chunk.shape[1], chunk.shape[2]
        batch_x = np.reshape(chunk, (samples, rows, cols, 1))
        batch_y = y[offset:stop]

        yield batch_x, batch_y
        offset = stop

# CNN training with categorical_crossentropy loss

In [4]:
import os
# Restrict the process to one GPU. These variables are set before Keras is
# imported -- presumably because the backend reads them when it initialises;
# keep this order. NOTE(review): device '2' is machine-specific, confirm it
# exists on the host running the notebook.
os.environ['CUDA_DEVICE_ORDER'] = 'PCI_BUS_ID'
os.environ['CUDA_VISIBLE_DEVICES'] = '2'

# Keras building blocks for the CNN defined in the training cell.
# NOTE(review): MaxPooling2D, EarlyStopping, f1_score and normalize are not
# used in the cells visible here.
import keras
from keras.models import Sequential
from keras.layers import Dense, Dropout, Flatten
from keras.layers import Conv2D, MaxPooling2D
from keras.callbacks import EarlyStopping, ModelCheckpoint 
from keras import backend as K
from keras.models import load_model
from sklearn.metrics import f1_score
from sklearn.preprocessing import normalize

# Load the raw readings (x) and labels (y) once; every (N_S, N_T)
# configuration in the training cell re-slices these arrays via split_data.
x, y = load_data("./wds_data/data_10000/")

In [5]:
def _build_cnn(input_shape, num_classes):
    """Build and compile the CNN classifier used for every configuration.

    Four ``(1, 2)`` convolutions (16/32/64/128 filters, ``same`` padding) with
    dropout, followed by a 256-unit dense layer and a softmax over
    ``num_classes``. Compiled with categorical cross-entropy and Adam.
    """
    model = Sequential()
    model.add(Conv2D(16, kernel_size=(1, 2),
                     activation='relu', padding='same',
                     input_shape=input_shape))
    model.add(Conv2D(32, (1, 2), activation='relu', padding='same'))
    model.add(Dropout(0.25))
    model.add(Conv2D(64, (1, 2), activation='relu', padding='same'))
    model.add(Dropout(0.25))
    model.add(Conv2D(128, (1, 2), activation='relu', padding='same'))
    model.add(Dropout(0.25))

    model.add(Flatten())
    model.add(Dense(256, activation='relu'))
    model.add(Dropout(0.5))
    model.add(Dense(num_classes, activation='softmax'))

    model.compile(loss=keras.losses.categorical_crossentropy,
                  optimizer=keras.optimizers.Adam(),
                  metrics=['accuracy'])
    return model


def _rank_hit_counts(predictions, labels, n=5):
    """Count how many samples have their true class at rank 1, 2, ..., n.

    ``predictions`` holds per-class scores, ``labels`` the one-hot targets.
    Element ``k`` of the returned float array is the number of samples whose
    true class is the ``(k + 1)``-th highest-scoring prediction.
    """
    true_class = np.argmax(labels, axis=1)
    # Classes ordered best-first per sample; the stable sort makes ties
    # deterministic.
    ranked = np.argsort(-predictions, axis=1, kind='stable')[:, :n]
    hits = ranked == true_class[:, None]
    return hits.sum(axis=0).astype(float)


results = {}
for N_S in [4,6]:
    for N_T in [6,12]:
        # Release the graph/state of the previous configuration's model so
        # memory does not accumulate over the four training runs.
        K.clear_session()

        x_train_valid, x_test, y_train_valid, y_test = split_data(x, y, N_S, N_T)

        x_train, x_valid, y_train, y_valid = train_test_split(x_train_valid, y_train_valid, random_state=0, test_size=0.25)
        print(x_train.shape, x_valid.shape, x_test.shape, y_train.shape, y_valid.shape, y_test.shape)

        # ===========================

        batch_size = 128
        epochs = 120
        # Keras expects integer step counts. generate_data only yields full
        # batches, so one pass over a set is len // batch_size steps
        # (at least 1, since a short set is yielded as a single batch).
        steps_per_epoch = max(1, len(x_train) // batch_size)
        validation_steps = max(1, len(x_valid) // batch_size)
        test_steps = max(1, len(x_test) // batch_size)

        # input data dimensions
        rows, cols = len(x_train[0]), len(x_train[0][0])

        input_shape = (rows, cols, 1)
        print('input_shape: ', input_shape)

        num_classes = y_train.shape[1]
        print('num_classes: ', num_classes)

        train_generator = generate_data(x_train, y_train, batch_size)
        validate_generator = generate_data(x_valid, y_valid, batch_size)
        evaluate_generator = generate_data(x_test, y_test, batch_size)

        model = _build_cnn(input_shape, num_classes)

        # checkpoint: keep only the weights with the best validation accuracy
        filepath='./trained_models/CSI_cnn_best_'+str(N_S)+'-'+str(N_T)+'.hdf5'
        # The checkpoint cannot be written if the target directory is missing.
        os.makedirs(os.path.dirname(filepath), exist_ok=True)
        checkpoint = ModelCheckpoint(filepath, monitor='val_accuracy', verbose=1, save_best_only=True, mode='max')
        callbacks_list = [checkpoint]

        model.fit_generator(train_generator,
                    epochs=epochs,
                    steps_per_epoch=steps_per_epoch,
                    verbose=1,
                    validation_data=validate_generator,
                    callbacks = callbacks_list,
                    validation_steps=validation_steps)

        # Evaluate the checkpoint written by this run: reload from the same
        # path ModelCheckpoint saved to, not from a different directory.
        model = load_model(filepath)
        loss, accuracy = model.evaluate_generator(evaluate_generator, test_steps)
        print ("Accuracy on test set is", accuracy)

        # A single batch of len(y_test) samples yields the whole test set in
        # its original order, so predictions line up with y_test.
        evaluator = generate_data(x_test, y_test, len(y_test))
        predictions = model.predict_generator(evaluator, 1)

        n = 5
        cntn = _rank_hit_counts(predictions, y_test[:len(predictions)], n)
        print(cntn / len(predictions))
        results[(N_S, N_T, 1)] = cntn[0] / len(predictions)
        print(sum(cntn) / len(predictions))
        results[(N_S, N_T, 5)] = sum(cntn) / len(predictions)
        print('results: ', results)