In [1]:
from random import *
from matplotlib import pyplot as plt
import numpy as np
import GPy

# ---------------------------------------------------------------------------
# Experiment configuration. Prefix "l_" = StarLightCurves, "m_" = MALLAT.
# ---------------------------------------------------------------------------

# Share of every generated test set that is made up of outliers.
# The normal share is the complement: normal share + outlier_prec = 1.
outlier_prec = 0.1

# StarLightCurves
l_test_set_path = "../StarLightCurves/StarLightCurves_TEST"
l_model_class_names = [1, 3]     # classes behind the trained HGP model
l_outlier_class_names = [2]      # class used as the outlier class
l_sample_size_perc = 0.5         # perc of testing data used for HGP sample size

# MALLAT
m_test_set_path = "../MALLAT/MALLAT_TEST"
m_model_class_names = [3, 6]     # classes behind the trained HGP model
m_outlier_class_names = [7]      # class used as the outlier class
m_sample_size_perc = 0.5         # perc of testing data used for HGP sample size


(Rebbapragada, 2008) note that the shapes of CEPH and RRL are very similar, so they treated CEPH and RRL as the normal classes and regarded EB as the anomaly in their known-anomalies test. In the MALLAT dataset, classes 3 and 6 are the normal classes, and classes 7 and 2 are the abnormal ones. In this way, they compared the performance of PCAD (their method) with other methods (K-means etc.).

- S1: train two HGP models (model 1 uses CEPH and RRL as training dataset; model 2 uses MALLAT class 3,6 as training dataset)
- S2: build two testing datasets, each containing 90% normal objects and 10% outliers, for OGLE and MALLAT respectively.
- S3: apply model 1 and model 2 to the testing datasets, and measure precision

In [2]:
# datapre picks the dataset: 'l' -> light_curve; any other value -> MALLAT
def importModel(datapre, model_class_names, sample_size):
    """Rebuild a saved hierarchical GP regression model from its .npy files.

    The training inputs (X_*), training targets (Y_*) and the flat parameter
    vector (model_save*) are read from the dataset's model directory. File
    names are formed from ``str(model_class_names)`` followed by
    ``str(sample_size)``.

    Parameters
    ----------
    datapre : str
        'l' reads from ./light_curve_model_files; anything else reads from
        ./mallat_model_files.
    model_class_names : list
        Class names the model was saved under (used only in the file names).
    sample_size : float
        Sample size the model was saved under (used only in the file names).

    Returns
    -------
    tuple
        ``(model, kern_class)``: the GPy regression model with the stored
        parameters assigned, and the 'class' kernel of its hierarchy.
    """
    model_dir = "./light_curve_model_files" if datapre == 'l' else "./mallat_model_files"
    file_tag = str(model_class_names) + str(sample_size) + ".npy"

    X_load = np.load(model_dir + "/X_" + file_tag)
    Y_load = np.load(model_dir + "/Y_" + file_tag)
    model_path = model_dir + "/model_save" + file_tag

    # Kernels are built with fixed starting values; every model parameter is
    # overwritten below by the stored parameter vector.
    kern_class = GPy.kern.Matern32(
        input_dim=1, variance=1.5, lengthscale=2.5, active_dims=[0], name='class')
    kern_replicate = GPy.kern.Matern32(
        input_dim=1, variance=.1, lengthscale=2.5, active_dims=[0], name='replicate')
    k_hierarchy = GPy.kern.Hierarchical(kernels=[kern_class, kern_replicate])

    restored = GPy.models.GPRegression(X_load, Y_load, initialize=False, kernel=k_hierarchy)
    restored.update_model(False)      # skip the expensive algebra while loading
    restored.initialize_parameter()   # connect the parameters up
    restored[:] = np.load(model_path) # assign the stored parameter values
    restored.update_model(True)       # run the algebra exactly once
    return restored, kern_class

In [3]:
def selectTopNormalIndex(model_class_names, top_rate):
    """Collect the leading share of indices from each class's sorted result file.

    For every class name ``c`` the file ``sorted_result_model-<c>class-<c>.csv``
    is read from the working directory, and the first ``int(rows * top_rate)``
    values of its first column are kept.

    Parameters
    ----------
    model_class_names : iterable
        Class names whose result files are read.
    top_rate : float
        Fraction of each file's rows to keep, counted from the top.

    Returns
    -------
    list of float
        The kept first-column values of all classes, concatenated in the order
        of ``model_class_names``. Values are floats as parsed by ``np.loadtxt``;
        callers needing integer indices must cast them.
    """
    top_index = []
    for m_c in model_class_names:
        result_file = "sorted_result_model-" + str(m_c) + "class-" + str(m_c) + ".csv"
        # np.loadtxt collapses a one-row file to a 0-d array, which has no
        # len(); atleast_1d keeps that case working.
        m_c_index_load = np.atleast_1d(
            np.loadtxt(result_file, delimiter=',', usecols=[0]))
        m_c_top_rows_num = int(len(m_c_index_load) * top_rate)
        top_index.extend(m_c_index_load[:m_c_top_rows_num].tolist())

    return top_index


def processTestData(test_set_path):
    """Load a comma-separated test set and z-normalise every series.

    Column 0 of each row is the class name; all remaining columns form the
    series. The file is parsed once, and the series length is taken from the
    file rather than fixed at 1024 columns.

    Parameters
    ----------
    test_set_path : str
        Path to the comma-separated test file.

    Returns
    -------
    tuple
        ``(class_names_test, test_data)``: a 1-D float array of class names
        and a 2-D float array with one row per series, each row shifted to
        zero mean and scaled to unit standard deviation. A constant series
        becomes all zeros.
    """
    # ndmin=2 keeps a single-row file two-dimensional so the row-wise
    # statistics below still work.
    raw = np.loadtxt(test_set_path, delimiter=',', ndmin=2)
    class_names_test = raw[:, 0].copy()

    centered = raw[:, 1:] - raw[:, 1:].mean(axis=1, keepdims=True)
    spread = centered.std(axis=1, keepdims=True)
    # A constant series has zero spread; dividing by 1 leaves it at zero
    # instead of producing NaN that would poison later likelihood sorting.
    spread[spread == 0] = 1.0
    test_data = centered / spread

    return class_names_test, test_data


# sample_prec is accepted for compatibility but currently has no effect
def generateTestData(class_test_names, sample_prec, model_class_names, outlier_class_names, normal_index_range=[], total_test_num=500, outlier_rate=None):
    """Draw a random test set with a fixed share of outliers.

    Parameters
    ----------
    class_test_names : sequence
        Class name of every row of the test data.
    sample_prec : float
        Unused; kept so existing calls keep working.
    model_class_names : list
        Class names treated as normal.
    outlier_class_names : list
        Class names treated as outliers.
    normal_index_range : sequence of int, optional
        When non-empty, normal rows are drawn from these indices instead of
        from every row whose class is in ``model_class_names``. The default
        list is never mutated.
    total_test_num : int, optional
        Total number of rows to draw.
    outlier_rate : float, optional
        Share of outliers in the drawn set. When omitted, the notebook-level
        ``outlier_prec`` setting is used.

    Returns
    -------
    tuple
        ``(test_array, normal_num, abnormal_num)`` where ``test_array`` is a
        float array of shape ``(total_test_num, 2)``: column 0 holds the
        sampled row index, column 1 the indicator (normal -> 0, abnormal -> 1).
        Normal rows come first.

    Raises
    ------
    ValueError
        If ``outlier_rate`` is outside [0, 1] or there are not enough normal
        or outlier rows to draw from.
    """
    if outlier_rate is None:
        outlier_rate = outlier_prec
    if not 0 <= outlier_rate <= 1:
        raise ValueError("outlier_rate must be within [0, 1], got %r" % (outlier_rate,))

    if normal_index_range is not None and len(normal_index_range) > 0:
        normal_indices = list(normal_index_range)
    else:
        normal_indices = [i for i, cn in enumerate(class_test_names) if cn in model_class_names]
    abnormal_indices = [i for i, cn in enumerate(class_test_names) if cn in outlier_class_names]

    # Round instead of truncating: a float product such as 0.29 * 100
    # evaluates to 28.999..., and int() would silently drop a sample.
    # Deriving the normal count as the remainder makes both counts add up
    # to total_test_num.
    abnormal_num = int(round(outlier_rate * total_test_num))
    normal_num = total_test_num - abnormal_num

    if normal_num > len(normal_indices):
        raise ValueError("need %d normal samples but only %d are available"
                         % (normal_num, len(normal_indices)))
    if abnormal_num > len(abnormal_indices):
        raise ValueError("need %d outlier samples but only %d are available"
                         % (abnormal_num, len(abnormal_indices)))

    # Sampling without replacement; normal rows are drawn first.
    sample_normal_indices = sample(normal_indices, normal_num)
    sample_abnormal_indices = sample(abnormal_indices, abnormal_num)

    sampled_rows = np.asarray(sample_normal_indices + sample_abnormal_indices, dtype=float)
    # normal->0; abnormal->1
    indicator = np.concatenate((np.zeros(normal_num), np.ones(abnormal_num)))
    conbine_array = np.column_stack((sampled_rows, indicator))

    return conbine_array, normal_num, abnormal_num

def sortLikelihood(test_array, likelihood):
    """Append the likelihood column to ``test_array`` and sort rows by it.

    Parameters
    ----------
    test_array : numpy.ndarray
        Array of shape (n, 2) holding [index, indicator] per row.
    likelihood : array-like
        One likelihood value per row of ``test_array``; any shape that
        flattens to n values.

    Returns
    -------
    numpy.ndarray
        Float array of shape (n, 3) with columns [index, indicator,
        likelihood], sorted ascending by likelihood. Ties are broken by
        index, then by indicator.
    """
    likelihood = np.asarray(likelihood, dtype=float).reshape(-1, 1)
    combine_result = np.concatenate((test_array, likelihood), axis=1)

    # np.lexsort treats the LAST key as the primary one. This replaces the
    # former structured-view sort, which relied on the np.float alias that
    # NumPy 1.24 removed.
    row_order = np.lexsort(
        (combine_result[:, 1], combine_result[:, 0], combine_result[:, 2]))
    sorted_result = combine_result[row_order]

    return sorted_result

def calLikelihood(test_data, indices, m_load, kern_class):
    """Score selected test series by their average log predictive density.

    The model's prediction is computed once on the integer grid
    ``1..series_length`` and reused for every selected row.

    Parameters
    ----------
    test_data : numpy.ndarray
        2-D array with one series per row.
    indices : array-like
        Row numbers of ``test_data`` to score; float values are truncated to
        integers.
    m_load : object
        Model providing ``predict_noiseless(x, kern=...)`` and
        ``likelihood.log_predictive_density(y, mu, var)``.
    kern_class : object
        Kernel passed to ``predict_noiseless`` as ``kern``.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(len(indices), 1)`` with the mean log predictive
        density of each selected row, in the order of ``indices``.
    """
    row_ids = np.asarray(indices).astype(int).ravel()

    # Use the actual series length instead of a fixed 1024 points, so the
    # grid always lines up with the rows of test_data.
    series_len = test_data.shape[1]
    x_test = np.arange(1, series_len + 1)[:, None]
    mu_star, var_star = m_load.predict_noiseless(x_test, kern=kern_class)

    log_pre_density_result = np.empty((len(row_ids), 1))
    for pos, row in enumerate(row_ids):
        y_test = test_data[row, :].reshape(-1, 1)
        log_pre_density_result[pos] = np.average(
            m_load.likelihood.log_predictive_density(y_test, mu_star, var_star))

    return log_pre_density_result


def calPrecision(sorted_result_array, abnormal_num):
    """Share of true outliers among the first ``abnormal_num`` rows.

    Parameters
    ----------
    sorted_result_array : numpy.ndarray
        2-D array whose column 1 is the indicator (normal -> 0,
        abnormal -> 1); the first ``abnormal_num`` rows are the ones treated
        as detected outliers.
    abnormal_num : int
        Number of leading rows to inspect; also the denominator.

    Returns
    -------
    float
        Sum of the indicator column over the inspected rows, divided by
        ``abnormal_num``.
    """
    flagged_indicators = sorted_result_array[:abnormal_num, 1]
    hits = flagged_indicators.sum()
    return hits / abnormal_num

In [5]:
# Light-curve experiment.
# S1: the HGP models were trained beforehand and saved to files; they are
#     imported inside the loop, one per sample size.
total_test_num = 5000
top_likelihood_rows_rate = 0.8

# S2: build the testing dataset. Neither the test data nor the candidate
#     normal indices depend on sample_size, so both are loaded once up front.
l_class_names_test, l_test_data = processTestData(l_test_set_path)
l_normal_index_range = [
    int(i) for i in selectTopNormalIndex(l_model_class_names, top_likelihood_rows_rate)
]

# for sample_size in [0.01,0.02,0.05,0.07,0.08,0.1]:
for sample_size in [0.08]:
    l_m, l_kern_class = importModel('l', l_model_class_names, sample_size)
    print(l_m)

    # S3: apply the HGP model to a freshly sampled test set
    l_test_array, l_normal_num, l_abnormal_num = generateTestData(l_class_names_test, l_sample_size_perc, l_model_class_names, l_outlier_class_names, l_normal_index_range,total_test_num)
    l_likelihood = calLikelihood(l_test_data, l_test_array[:,0], l_m, l_kern_class)
    l_sorted_result_array = sortLikelihood(l_test_array, l_likelihood)
    # NOTE: the same file name is written on every iteration, so only the
    # last sample size's result remains on disk.
    np.savetxt("l_sorted_result_array.csv", l_sorted_result_array, delimiter=",",fmt='%d,%d,%1.9f')
    l_precision = calPrecision(l_sorted_result_array, l_abnormal_num)
    print('sample_size=',sample_size,' total_test_num=',total_test_num,' normal_num=',l_normal_num,' abnormal_num=',l_abnormal_num,' precision=',l_precision)







Name : GP regression
Objective : -13152.052365915548
Number of Parameters : 5
Number of Optimization Parameters : 5
Updates : True
Parameters:
  [1mGP_regression.                 [0;0m  |              value  |  constraints  |  priors
  [1mhierarchy.class.variance       [0;0m  |  8.19095859429e-09  |      +ve      |        
  [1mhierarchy.class.lengthscale    [0;0m  |      21.9860116073  |      +ve      |        
  [1mhierarchy.replicate.variance   [0;0m  |      2.61135544218  |      +ve      |        
  [1mhierarchy.replicate.lengthscale[0;0m  |      295.626067312  |      +ve      |        
  [1mGaussian_noise.variance        [0;0m  |  1.8018422551e-120  |      +ve      |        
sample_size= 0.08  total_test_num= 5000  normal_num= 4500  abnormal_num= 500  precision= 0.662


In [14]:
# MALLAT experiment: evaluate the saved HGP model for every sample size.
total_test_num = 600

# The test set does not depend on sample_size, so it is loaded once instead
# of being re-parsed on every iteration.
m_class_names_test, m_test_data = processTestData(m_test_set_path)

# for sample_size in [0.01,0.02,0.05,0.07,0.08,0.1]:
for sample_size in [0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9,1]:
    m_m, m_kern_class = importModel('m', m_model_class_names,sample_size)

    m_test_array, m_normal_num, m_abnormal_num = generateTestData(m_class_names_test, m_sample_size_perc, m_model_class_names, m_outlier_class_names,[],total_test_num)
    m_likelihood = calLikelihood(m_test_data, m_test_array[:,0], m_m, m_kern_class)
    m_sorted_result_array = sortLikelihood(m_test_array, m_likelihood)
    # NOTE: the same file name is written on every iteration, so only the
    # last sample size's result remains on disk.
    np.savetxt("m_sorted_result_array.csv", m_sorted_result_array, delimiter=",",fmt='%d,%d,%1.9f')
    m_precision = calPrecision(m_sorted_result_array, m_abnormal_num)
    print('sample_size=',sample_size,' total_test_num=',total_test_num,' normal_num=',m_normal_num,' abnormal_num=',m_abnormal_num,' precision=',m_precision)



sample_size= 0.1  total_test_num= 600  normal_num= 540  abnormal_num= 60  precision= 0.966666666667
sample_size= 0.2  total_test_num= 600  normal_num= 540  abnormal_num= 60  precision= 1.0
sample_size= 0.3  total_test_num= 600  normal_num= 540  abnormal_num= 60  precision= 1.0
sample_size= 0.4  total_test_num= 600  normal_num= 540  abnormal_num= 60  precision= 1.0
sample_size= 0.5  total_test_num= 600  normal_num= 540  abnormal_num= 60  precision= 1.0
sample_size= 0.6  total_test_num= 600  normal_num= 540  abnormal_num= 60  precision= 1.0
sample_size= 0.7  total_test_num= 600  normal_num= 540  abnormal_num= 60  precision= 1.0
sample_size= 0.8  total_test_num= 600  normal_num= 540  abnormal_num= 60  precision= 1.0
sample_size= 0.9  total_test_num= 600  normal_num= 540  abnormal_num= 60  precision= 1.0
sample_size= 1  total_test_num= 600  normal_num= 540  abnormal_num= 60  precision= 1.0


sorted_result_array format

| index                               | indicator               | log_predictive_density |
|-------------------------------------|-------------------------|:----------------------:|
| sampled indexes of testing dataset | normal(0) / abnormal(1) | average log predictive density of the series (rows sorted ascending) |

test_array format

| index                               | indicator               |
|-------------------------------------|-------------------------|
| sampled indexes of  testing dataset | normal(0) / abnormal(1) |