# **DevNet**

## **라이브러리 및 데이터 불러오기**

In [1]:
import pandas as pd
import numpy as np
np.random.seed(42)
import tensorflow as tf
tf.random.set_seed(42)

from keras import regularizers
from keras import backend as K
from keras.models import Model, load_model
from keras.layers import Input, Dense
from keras.optimizers import RMSprop
from keras.callbacks import ModelCheckpoint, TensorBoard

import argparse
import numpy as np
import matplotlib.pyplot as plt
import sys
from scipy.sparse import vstack, csc_matrix
from sklearn.model_selection import train_test_split
from sklearn.metrics import auc, roc_curve, precision_recall_curve, average_precision_score, roc_auc_score
from sklearn import preprocessing
from sklearn.datasets import load_svmlight_file
from sklearn.preprocessing import LabelEncoder
import time

MAX_INT = np.iinfo(np.int32).max
data_format = 0

In [2]:
tf.config.run_functions_eagerly(True)

In [3]:
train=pd.read_csv('../train.csv')
test=pd.read_csv('../test.csv')
train=train.drop('FRST_RCV_DT', axis=1)
test=test.drop('FRST_RCV_DT', axis=1)
target='FDS_CATH'

In [4]:
nunique = train.nunique()
types = train.dtypes

for col in train.columns:
    if types[col] == 'object' or nunique[col] <200:
        l_enc= LabelEncoder()
        train[col]= train[col].fillna("VV_likely")
        train[col]= l_enc.fit_transform(train[col].values)
        test[col]=l_enc.transform(test[col].values)
    else:
        train.fillna(train.loc[train_indices, col].mean(), inplace=True)
        

In [6]:
train_x= np.array(train.iloc[:, :-1], np.float32)
train_y= np.array(train.iloc[:,-1], np.float32)
test_x = np.array(test.iloc[:, :-1], np.float32)
test_y = np.array(test.iloc[:, -1], np.float32)

## **DevNet 관련 함수 정의**

In [7]:
def aucPerformance(mse, labels):
    roc_auc = roc_auc_score(labels, mse)
    ap = average_precision_score(labels, mse)
    return roc_auc, ap

In [8]:
def dev_network_d(input_shape):
    '''
    deeper network architecture with three hidden layers
    '''
    x_input = Input(shape=input_shape)
    intermediate = Dense(1000, activation='relu',
                kernel_regularizer=regularizers.l2(0.01), name = 'hl1')(x_input)
    intermediate = Dense(250, activation='relu',
                kernel_regularizer=regularizers.l2(0.01), name = 'hl2')(intermediate)
    intermediate = Dense(20, activation='relu',
                kernel_regularizer=regularizers.l2(0.01), name = 'hl3')(intermediate)
    intermediate = Dense(1, activation='linear', name = 'score')(intermediate)
    return Model(x_input, intermediate)


def dev_network_s(input_shape):
    '''
    network architecture with one hidden layer
    '''
    x_input = Input(shape=input_shape)
    intermediate = Dense(20, activation='relu', 
                kernel_regularizer=regularizers.l2(0.01), name = 'hl1')(x_input)
    intermediate = Dense(1, activation='linear',  name = 'score')(intermediate)    
    return Model(x_input, intermediate)

def dev_network_linear(input_shape):
    '''
    network architecture with no hidden layer, equivalent to linear mapping from
    raw inputs to anomaly scores
    '''    
    x_input = Input(shape=input_shape)
    intermediate = Dense(1, activation='linear',  name = 'score')(x_input)
    return Model(x_input, intermediate)


def deviation_loss(y_true, y_pred, ref=None):
    '''
    z-score-based deviation loss
    '''    
    confidence_margin = 5.     
    ## size=5000 is the setting of l in algorithm 1 in the paper
    if ref is None:
        ref = K.variable(np.random.normal(loc = 0., scale= 1.0, size = 5000), dtype='float32')
    #ref= np.random.normal(loc = 0., scale= 1.0, size = 5000)
    dev = (y_pred - K.mean(ref)) / K.std(ref)
    inlier_loss = K.abs(dev) 
    outlier_loss = K.abs(K.maximum(confidence_margin - dev, 0.))
    return K.mean((1 - y_true) * inlier_loss + y_true * outlier_loss)


def deviation_network(input_shape, network_depth):
    '''
    construct the deviation network-based detection model
    '''
    if network_depth == 4:
        model = dev_network_d(input_shape)
    elif network_depth == 2:
        model = dev_network_s(input_shape)
    elif network_depth == 1:
        model = dev_network_linear(input_shape)
    else:
        sys.exit("The network depth is not set properly")
    rms = RMSprop(clipnorm=1.)
    model.compile(loss=deviation_loss, optimizer=rms)
    return model

In [9]:
def input_batch_generation_sup(x_train, outlier_indices, inlier_indices, batch_size, rng):
    '''
    batchs of samples. This is for csv data.
    Alternates between positive and negative pairs.
    '''      
    dim = x_train.shape[1]
    ref = np.empty((batch_size, dim))    
    training_labels = []
    n_inliers = len(inlier_indices)
    n_outliers = len(outlier_indices)
    for i in range(batch_size):    
        if(i % 2 == 0):
            sid = rng.choice(n_inliers, 1)
            ref[i] = x_train[inlier_indices[sid]]
            training_labels += [0]
        else:
            sid = rng.choice(n_outliers, 1)
            ref[i] = x_train[outlier_indices[sid]]
            training_labels += [1]
    return np.array(ref, dtype='float32'), np.array(training_labels, dtype='float32')


def batch_generator_sup(x, outlier_indices, inlier_indices, batch_size, nb_batch, rng):
    """batch generator
    """
    rng = np.random.RandomState(rng.randint(MAX_INT, size = 1))
    counter = 0
    while 1:                
        
        ref, training_labels = input_batch_generation_sup(x, outlier_indices, inlier_indices, batch_size, rng)
        counter += 1
        yield(ref, training_labels)
        if (counter > nb_batch):
            counter = 0

In [10]:
def load_model_weight_predict(model_name, input_shape, network_depth, x_test):
    '''
    load the saved weights to make predictions
    '''
    model = deviation_network(input_shape, network_depth)
    model.load_weights(model_name)
    scoring_network = Model(inputs=model.input, outputs=model.output)    
    scores = scoring_network.predict(x_test)

    return scores

In [11]:
def inject_noise(seed, n_out, random_seed):   
    '''
    add anomalies to training data to replicate anomaly contaminated data sets.
    we randomly swape 5% features of anomalies to avoid duplicate contaminated anomalies.
    this is for dense data
    '''  
    rng = np.random.RandomState(random_seed) 
    n_sample, dim = seed.shape
    swap_ratio = 0.05
    n_swap_feat = int(swap_ratio * dim)
    noise = np.empty((n_out, dim))
    for i in np.arange(n_out):
        outlier_idx = rng.choice(n_sample, 2, replace = False)
        o1 = seed[outlier_idx[0]]
        o2 = seed[outlier_idx[1]]
        swap_feats = rng.choice(dim, n_swap_feat, replace = False)
        noise[i] = o1.copy()
        noise[i, swap_feats] = o2[swap_feats]
    return noise

## **Modeling not noise**

In [None]:
network_depth = 2
random_seed =42

runs = 5#args.runs
rauc = np.zeros(runs)
ap = np.zeros(runs)
global data_format
data_format = int(0)
x = train_x
labels = train_y
outlier_indices = np.where(labels==1)[0]
outliers = x[outlier_indices]
n_outliers_org = outliers.shape[0]

train_time=0
test_time=0
for i in np.arange(runs):
    x_train, x_test, y_train, y_test = train_test_split(x, labels, test_size=0.2, random_state=42, stratify=labels)
    outlier_indices = np.where(y_train ==1)[0]
    inlier_indices = np.where(y_train ==0)[0]
    n_outliers = len(outlier_indices)

    cont_rate =0.02
    known_outliers =30
    n_noise  = len(np.where(y_train == 0)[0]) * cont_rate / (1. - cont_rate)
    n_noise = int(n_noise)
    rng = np.random.RandomState(random_seed)
    
    print(y_train.shape[0], outlier_indices.shape[0], inlier_indices.shape[0], n_noise)
    input_shape = x_train.shape[1:]
    n_samples_trn = x_train.shape[0]
    n_outliers = len(outlier_indices)
    
    start_time = time.time()
    input_shape = x_train.shape[1:]
    epochs=20
    batch_size =1024
    nb_batch = 128
    model = deviation_network(input_shape, network_depth)
    print(model.summary())
    #np.float32
    

    model_name = "./model/devnet_"+ str(cont_rate) + "cr_"  + str(batch_size) +"bs_" + str(known_outliers) + "ko_" + str(network_depth) +"d.h5"
    checkpointer = ModelCheckpoint(model_name, monitor='loss', verbose=0, save_best_only=True, save_weights_only=True)
    
    model.fit_generator(batch_generator_sup(x_train, outlier_indices, inlier_indices, batch_size, nb_batch, rng), 
                       steps_per_epoch = nb_batch, epochs=epochs, callbacks=[checkpointer])
    
    train_time +=time.time() - start_time
    
    start_time = time.time()
    scores = load_model_weight_predict(model_name, input_shape, network_depth, x_test)
    test_time += time.time()-start_time
    rauc[i], ap[i] = aucPerformance(scores, y_test)

mean_auc=np.mean(rauc)
mean_aucpr=np.mean(ap)

print("average AUC-ROC: %.4f, average AUC-PR: %.4f" % (mean_auc, mean_aucpr))   

## **결과 도출**

In [209]:
model = deviation_network(input_shape, network_depth)
model.load_weights(model_name)
scoring_network = Model(inputs=model.input, outputs=model.output)    
scores = scoring_network.predict(x_test)

  "Even though the tf.config.experimental_run_functions_eagerly "


In [210]:
test_scores = scoring_network.predict(test_x)
test_scores = test_scores.reshape(-1)
train_scores = scoring_network.predict(train_x)
train_scores = train_scores.reshape(-1)

In [211]:
def get_result_devnet(labels, scores, quantile_num=0.97):
    
    prediction=np.where(scores>np.quantile(scores, quantile_num), 1,0)
    score = pd.DataFrame({'labels':labels, 'scores':scores, 'prediction':prediction})
    score.sort_values('scores', ascending=False, inplace=True)
    score=score.reset_index().drop('index', axis=1).copy()

    n_group=10
    group=['G01','G02','G03','G04','G05','G06','G07','G08','G09','G10']
    bins=round(len(score)/n_group)
    bins_1=len(score)-bins*(n_group-1)

    group_list=[]
    for i in range(n_group):
        if i <n_group-1:
            group_list.append(np.repeat(group[i], bins).tolist())
        else:
            group_list.append(np.repeat(group[i], bins_1).tolist())
    
    import itertools
    group_list=list(itertools.chain(*group_list))
    score['group']=pd.Series(group_list)

    function_list=['count','sum','mean']
    table=score.groupby(['group']).agg(function_list).reset_index()
    
    return score, table

In [212]:
test_score, test_table = get_result_devnet(test_y, test_scores)
train_score, train_table = get_result_devnet(train_y, train_scores)

In [213]:
test_table

Unnamed: 0_level_0,group,labels,labels,labels,scores,scores,scores,prediction,prediction,prediction
Unnamed: 0_level_1,Unnamed: 1_level_1,count,sum,mean,count,sum,mean,count,sum,mean
0,G01,1766,102.0,0.057758,1766,8910.638672,5.045662,1766,528,0.298981
1,G02,1766,44.0,0.024915,1766,7506.61377,4.250631,1766,0,0.0
2,G03,1766,20.0,0.011325,1766,5038.678711,2.853159,1766,0,0.0
3,G04,1766,10.0,0.005663,1766,1263.39502,0.715399,1766,0,0.0
4,G05,1766,12.0,0.006795,1766,762.505249,0.43177,1766,0,0.0
5,G06,1766,12.0,0.006795,1766,422.158295,0.239048,1766,0,0.0
6,G07,1766,12.0,0.006795,1766,154.978287,0.087757,1766,0,0.0
7,G08,1766,8.0,0.00453,1766,17.722429,0.010035,1766,0,0.0
8,G09,1766,4.0,0.002265,1766,-67.78923,-0.038386,1766,0,0.0
9,G10,1767,5.0,0.00283,1767,-184.328552,-0.104317,1767,0,0.0


In [214]:
train_table

Unnamed: 0_level_0,group,labels,labels,labels,scores,scores,scores,prediction,prediction,prediction
Unnamed: 0_level_1,Unnamed: 1_level_1,count,sum,mean,count,sum,mean,count,sum,mean
0,G01,11889,1191.0,0.100177,11889,60257.660156,5.068354,11889,3564,0.299773
1,G02,11889,560.0,0.047102,11889,50523.625,4.249611,11889,0,0.0
2,G03,11889,445.0,0.03743,11889,31556.234375,2.654238,11889,0,0.0
3,G04,11889,307.0,0.025822,11889,8161.632812,0.686486,11889,0,0.0
4,G05,11889,281.0,0.023635,11889,5115.375977,0.430261,11889,0,0.0
5,G06,11889,272.0,0.022878,11889,2956.444824,0.248671,11889,0,0.0
6,G07,11889,229.0,0.019262,11889,1182.906982,0.099496,11889,0,0.0
7,G08,11889,158.0,0.01329,11889,114.551361,0.009635,11889,0,0.0
8,G09,11889,85.0,0.007149,11889,-482.399323,-0.040575,11889,0,0.0
9,G10,11887,61.0,0.005132,11887,-1223.88623,-0.10296,11887,0,0.0


## **Modeling with noise**

In [215]:
print(train_x.shape, train_y.shape, test_x.shape, test_y.shape)

(118888, 21) (118888,) (17661, 21) (17661,)


In [None]:
network_depth = 2
random_seed =42

runs = 5 #args.runs
rauc = np.zeros(runs)
ap = np.zeros(runs)
global data_format
data_format = int(0)

x = train_x
labels = train_y
outlier_indices = np.where(labels==1)[0]
outliers = x[outlier_indices]
n_outliers_org = outliers.shape[0] #outlier 갯수

train_time=0
test_time=0
for i in np.arange(runs):
    x_train, x_test, y_train, y_test = train_test_split(x, labels, test_size=0.2, random_state=42, stratify=labels)
    outlier_indices = np.where(y_train ==1)[0]
    inlier_indices = np.where(y_train ==0)[0]
    n_outliers = len(outlier_indices)
    print("Original_trainig size :%d, No. outliers: %d" %(x_train.shape[0], n_outliers))
    
    cont_rate =0.02
    known_outliers =3000 #known outlier의 의미를 알아야 함
    #n_noise cont_rate에 따라 달라짐
    n_noise  = len(np.where(y_train == 0)[0]) * cont_rate / (1. - cont_rate)
    n_noise = int(n_noise)
    rng = np.random.RandomState(random_seed)
    
    #n_outliers가 known_outliers 보다 크면 제거 => known_outlier에 맞춤
    
    if n_outliers > known_outliers:
        mn = n_outliers -known_outliers
        remove_idx = rng.choice(outlier_indices, mn, replace=False)
        x_train = np.delete(x_train, remove_idx, axis=0)
        y_train = np.delete(y_train, remove_idx, axis=0)
    
    noises = inject_noise(outliers, n_noise, random_seed)
    x_train = np.append(x_train, noises, axis=0)
    y_train = np.append(y_train, np.zeros((noises.shape[0], 1)))
    
    outlier_indices = np.where(y_train==1)[0]
    inlier_indices = np.where(y_train ==0)[0]
    print("y_train 갯수 :%d, outlier 갯수 :%d, inlier 갯수:%d, n_noise : %d"%(y_train.shape[0], outlier_indices.shape[0], inlier_indices.shape[0], n_noise))
    input_shape = x_train.shape[1:]
    n_samples_trn = x_train.shape[0]
    n_outliers = len(outlier_indices)
    print("Training data size : %d, No. outliers : %d"%(x_train.shape[0], n_outliers))
    
    
    ###validation data를 만들어보자
    outlier_indices_test = np.where(y_test==1)[0]
    inlier_indices_test = np.where(y_test==0)[0]
    
    input_shape = x_train.shape[1:]
    epochs=20
    batch_size =1024
    nb_batch = 128
    model = deviation_network(input_shape, network_depth)
    print(model.summary())
    #np.float32
    

    model_name = "./model/devnet_noise"+ str(cont_rate) + "cr_"  + str(batch_size) +"bs_" + str(known_outliers) + "ko_" + str(network_depth) +"d.h5"
    checkpointer = ModelCheckpoint(model_name, monitor='val_loss', verbose=0, save_best_only=True, save_weights_only=True)
    
    model.fit_generator(batch_generator_sup(x_train, outlier_indices, inlier_indices, batch_size, nb_batch, rng),
                        steps_per_epoch = nb_batch,
                        validation_data=batch_generator_sup(x_test, outlier_indices_test, inlier_indices_test, batch_size, nb_batch, rng),
                        validation_steps = nb_batch, 
                        epochs=epochs, 
                        callbacks=[checkpointer])
    


In [228]:
model = deviation_network(input_shape, network_depth)
model.load_weights(model_name)
scoring_network = Model(inputs=model.input, outputs=model.output)    
scores = scoring_network.predict(x_test)

In [229]:
model_name

'./model/devnet_noise0.02cr_1024bs_3000ko_2d.h5'

In [230]:
test_scores = scoring_network.predict(test_x)
test_scores = test_scores.reshape(-1)
train_scores = scoring_network.predict(train_x)
train_scores = train_scores.reshape(-1)

In [231]:
test_score, test_table = get_result_devnet(test_y, test_scores)
train_score, train_table = get_result_devnet(train_y, train_scores)

In [232]:
model_name

'./model/devnet_noise0.02cr_1024bs_3000ko_2d.h5'

In [233]:
test_table

Unnamed: 0_level_0,group,labels,labels,labels,scores,scores,scores,prediction,prediction,prediction
Unnamed: 0_level_1,Unnamed: 1_level_1,count,sum,mean,count,sum,mean,count,sum,mean
0,G01,1766,104.0,0.05889,1766,8928.342773,5.055686,1766,528,0.298981
1,G02,1766,38.0,0.021518,1766,7569.84375,4.286435,1766,0,0.0
2,G03,1766,24.0,0.01359,1766,5234.384766,2.963978,1766,0,0.0
3,G04,1766,13.0,0.007361,1766,1506.973022,0.853326,1766,0,0.0
4,G05,1766,10.0,0.005663,1766,943.907288,0.534489,1766,0,0.0
5,G06,1766,15.0,0.008494,1766,531.368286,0.300888,1766,0,0.0
6,G07,1766,5.0,0.002831,1766,215.712112,0.122147,1766,0,0.0
7,G08,1766,11.0,0.006229,1766,36.181412,0.020488,1766,0,0.0
8,G09,1766,7.0,0.003964,1766,-79.894852,-0.045241,1766,0,0.0
9,G10,1767,2.0,0.001132,1767,-238.821518,-0.135156,1767,0,0.0


In [234]:
train_table

Unnamed: 0_level_0,group,labels,labels,labels,scores,scores,scores,prediction,prediction,prediction
Unnamed: 0_level_1,Unnamed: 1_level_1,count,sum,mean,count,sum,mean,count,sum,mean
0,G01,11889,1146.0,0.096392,11889,60393.867188,5.079811,11889,3513,0.295483
1,G02,11889,572.0,0.048112,11889,50806.269531,4.273385,11889,0,0.0
2,G03,11889,473.0,0.039785,11889,32972.636719,2.773373,11889,0,0.0
3,G04,11889,290.0,0.024392,11889,9849.304688,0.828438,11889,0,0.0
4,G05,11889,288.0,0.024224,11889,6442.107422,0.541854,11889,0,0.0
5,G06,11889,244.0,0.020523,11889,3794.081543,0.319125,11889,0,0.0
6,G07,11889,248.0,0.02086,11889,1635.181641,0.137537,11889,0,0.0
7,G08,11889,180.0,0.01514,11889,274.813873,0.023115,11889,0,0.0
8,G09,11889,97.0,0.008159,11889,-521.444214,-0.043859,11889,0,0.0
9,G10,11887,51.0,0.00429,11887,-1521.612915,-0.128006,11887,0,0.0


In [235]:
# 충격 validation을 쓰면 성능이 더 안좋음