In [None]:
!pip install tensorflow==1.14.0 
!pip install gast==0.2.2

In [None]:
import random
import collections
import tensorflow as tf
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler,MinMaxScaler
from sklearn import preprocessing, metrics
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split
import seaborn as sns
import warnings
import itertools
from PIL import Image
warnings.filterwarnings("ignore")
%matplotlib inline


In [None]:
# Show which TensorFlow build is loaded (the install cell pins 1.14.0),
# then clear the default graph so re-running the notebook starts from an empty graph.
print(tf.__version__)
tf.reset_default_graph()

In [None]:
# Read the candidate table from pulsar_stars.csv (path is relative to the notebook).
data = pd.read_csv(r"../input/hr2u-vae/pulsar_stars.csv")
# Alternative input file, currently disabled:
# data = pd.read_csv("../input/hr2u-vae/HTRU_2.csv")

# Preview the first rows.
data.head()

In [None]:
# Re-encode the labels to the {-1, 1} convention used by the detectors below:
# original class 1 becomes -1, then original class 0 becomes 1.
# The order matters: mapping 0 -> 1 first would send every row to -1.
data['target_class'] = data['target_class'].replace(1, -1).replace(0, 1)

In [None]:
# Row count per value of target_class.
data['target_class'].value_counts()


In [None]:
# (rows, columns) of the loaded table.
data.shape

In [None]:
# Strip surrounding whitespace from the header names ...
data.columns = data.columns.str.strip()
# ... then replace all nine headers with short names (this assignment supersedes the strip above
# and requires the frame to have exactly nine columns, the last one being the label).
data.columns = ['IP Mean', 'IP Sd', 'IP Kurtosis', 'IP Skewness', 
              'DM-SNR Mean', 'DM-SNR Sd', 'DM-SNR Kurtosis', 'DM-SNR Skewness', 'target_class']

In [None]:
# View the share of rows in each target_class value, as a fraction of all rows.
# Plain true division is used: the `np.float` alias is deprecated and absent from recent NumPy.
data['target_class'].value_counts() / len(data)

In [None]:
# Separate the label vector from the feature matrix (every column except the label).
y = data['target_class']

X = data.drop(columns=['target_class'])

In [None]:
# Hold out 33% of the rows for evaluation; the fixed random_state makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.33, random_state = 10)
# Second name for the test labels; this is what the evaluation calls at the end receive.
X_test_label = y_test
print(X_train.shape)
print(X_test.shape)
print(X_test_label.shape)


In [None]:
# Column dtypes and non-null counts of the training features.
X_train.info()

In [None]:
# Convert the pandas objects to plain ndarrays for the models below.
# np.asarray is used instead of .to_numpy() so the cell can be re-run safely:
# an ndarray has no .to_numpy(), whereas np.asarray returns an ndarray unchanged.
X_train = np.asarray(X_train)
y_train = np.asarray(y_train)
X_test = np.asarray(X_test)
X_test_label = np.asarray(X_test_label)
print(X_train.shape)
print(X_test.shape)
print(X_test_label.shape)


In [None]:
# Dump the training features, the test labels and the test features,
# each on its own line(s) with a row of asterisks between them.
print(X_train, "*"*30, X_test_label, "*"*30, X_test, sep="\n")

In [None]:
# NOTE(review): both scalers are already imported in the import cell at the top; this re-import is redundant.
from sklearn.preprocessing import StandardScaler, MinMaxScaler

scaler = StandardScaler()

# Fit the scaling statistics on the training split only ...
X_train = scaler.fit_transform(X_train)

# ... and apply those same statistics to the test split.
X_test = scaler.transform(X_test)

# Alternative scaling, currently disabled:
# scaler = MinMaxScaler()
# X_train = scaler.fit_transform(X_train)
# X_test = scaler.transform(X_test)

In [None]:
"""
MLP Variational AutoEncoder for Anomaly Detection
reference: https://pdfs.semanticscholar.org/0611/46b1d7938d7a8dae70e3531a00fceb3c78e8.pdf
"""



def lrelu(x, leak=0.2, name='lrelu'):
    """Leaky ReLU activation, computed as ``max(x, leak * x)``.

    Args:
        x: input tensor.
        leak: slope applied to the input on the negative side.
        name: name given to the resulting TensorFlow op.

    Returns:
        The element-wise maximum of ``x`` and ``leak * x``.
    """
    # Forward `name` so the op is identifiable in the graph instead of being silently ignored.
    return tf.maximum(x, leak * x, name=name)


def build_dense(input_vector,unit_no,activation):
    """Create a dense layer with Xavier-initialised weights and zero-initialised biases.

    Args:
        input_vector: tensor fed into the layer.
        unit_no: number of output units.
        activation: activation callable applied to the layer output (or None).

    Returns:
        The output tensor of ``tf.layers.dense``.
    """
    weight_init = tf.contrib.layers.xavier_initializer()
    bias_init = tf.zeros_initializer()
    return tf.layers.dense(
        input_vector,
        unit_no,
        activation=activation,
        kernel_initializer=weight_init,
        bias_initializer=bias_init,
    )

class MLP_VAE:
    """Fully connected variational autoencoder used as a reconstruction-error anomaly detector.

    Constructing an instance builds the encoder/decoder graph in the default
    TensorFlow graph, opens a ``tf.Session`` and initialises all variables.
    ``train`` fits the network and derives a threshold on the per-sample
    squared reconstruction error; ``judge`` then labels a sample ``1`` when its
    error is below that threshold and ``-1`` otherwise.
    """

    def __init__(self,input_dim,lat_dim, outliers_fraction):
        """Build the graph and start a session.

        Args:
            input_dim: input dimension for X.
            lat_dim: latent dimension for Z (it may exceed ``input_dim``).
            outliers_fraction: pre-estimated fraction of outliers in the training
                dataset; used to compute the anomaly-score threshold.
        """
        self.outliers_fraction = outliers_fraction
        self.input_dim = input_dim
        self.lat_dim = lat_dim

        self.input_X = tf.placeholder(tf.float32,shape=[None,self.input_dim],name='source_x')

        self.learning_rate = 0.0005
        # batch_size should be smaller than a normal setting to get
        # a relatively lower anomaly-score threshold
        self.batch_size = 32
        self.train_iter = 8000
        self.hidden_units = 128
        self._build_VAE()
        self.sess = tf.Session()
        self.sess.run(tf.global_variables_initializer())
        # index of the next mini-batch handed out by _fecth_data
        self.pointer = 0

    def _encoder(self):
        """Map ``input_X`` to the latent mean, the latent scale and one sampled latent vector."""
        with tf.variable_scope('encoder',reuse=tf.AUTO_REUSE):
            l1 = build_dense(self.input_X,self.hidden_units,activation=lrelu)
            l2 = build_dense(l1,self.hidden_units,activation=lrelu)
            mu = tf.layers.dense(l2,self.lat_dim)
            # softplus keeps the scale strictly positive
            sigma = tf.layers.dense(l2,self.lat_dim,activation=tf.nn.softplus)
            # reparameterisation: z = mu + sigma * eps with eps ~ N(0, 1)
            sole_z = mu + sigma * tf.random_normal(tf.shape(mu),0,1,dtype=tf.float32)
        return mu,sigma,sole_z

    def _decoder(self,z):
        """Map a latent vector ``z`` back to a reconstruction with ``input_dim`` features."""
        with tf.variable_scope('decoder',reuse=tf.AUTO_REUSE):
            l1 = build_dense(z,self.hidden_units,activation=lrelu)
            l2 = build_dense(l1,self.hidden_units,activation=lrelu)
            recons_X = tf.layers.dense(l2,self.input_dim)
        return recons_X

    def _build_VAE(self):
        """Wire encoder and decoder together and define the losses and the training op.

        Sets ``all_loss`` (per-sample squared reconstruction error, used for
        scoring), ``loss`` (batch mean of reconstruction error plus KL
        divergence, used for optimisation) and ``train_op``.
        """
        self.mu_z,self.sigma_z,sole_z = self._encoder()
        self.recons_X = self._decoder(sole_z)

        with tf.variable_scope('loss'):
            # KL(N(mu, sigma^2) || N(0, 1)) summed over latent dimensions; 1e-8 guards log(0)
            KL_divergence = 0.5 * tf.reduce_sum(tf.square(self.mu_z) + tf.square(self.sigma_z) - tf.log(1e-8 + tf.square(self.sigma_z)) - 1, 1)
            mse_loss = tf.reduce_sum(tf.square(self.input_X-self.recons_X), 1)
            self.all_loss = mse_loss
            self.loss = tf.reduce_mean(mse_loss + KL_divergence)

        with tf.variable_scope('train'):
            self.train_op = tf.train.AdamOptimizer(self.learning_rate).minimize(self.loss)

    def _fecth_data(self,input_data):
        """Return the next consecutive mini-batch of rows from ``input_data``.

        Batches are handed out in order; the batch that reaches the end of the
        data contains the remaining rows (it may be shorter than ``batch_size``)
        and resets the pointer so the following call starts from row 0 again.
        """
        start = self.pointer * self.batch_size
        stop = (self.pointer + 1) * self.batch_size
        if stop >= input_data.shape[0]:
            return_data = input_data[start:,:]
            self.pointer = 0
        else:
            return_data = input_data[start:stop,:]
            self.pointer = self.pointer + 1
        return return_data

    def _recons_losses(self,input_data):
        """Evaluate the per-sample squared reconstruction error for every row of ``input_data``."""
        return self.sess.run(self.all_loss,feed_dict={
                self.input_X: input_data
                })

    def train(self,train_X):
        """Run ``train_iter`` optimisation steps on ``train_X``, then set the anomaly threshold."""
        for _ in range(self.train_iter):
            this_X = self._fecth_data(train_X)
            self.sess.run([self.train_op],feed_dict={
                        self.input_X: this_X
                        })
        self.arrage_recons_loss(train_X)

    def arrage_recons_loss(self,input_data):
        """Set ``judge_loss`` to the ``(1 - outliers_fraction)`` percentile of the reconstruction errors."""
        all_losses = self._recons_losses(input_data)
        self.judge_loss = np.percentile(all_losses,(1-self.outliers_fraction)*100)

    def judge(self,input_data):
        """Label every row of ``input_data`` as normal (``1``) or anomalous (``-1``).

        Args:
            input_data: 2-D array with one sample per row.

        Returns:
            A list with one label per row: ``1`` when the sample's
            reconstruction error is strictly below ``judge_loss``, else ``-1``.
        """
        if input_data.shape[0] == 0:
            return []
        # Score with the same per-sample reconstruction error the threshold was computed
        # from; the scalar training loss also contains the KL term and is therefore not
        # comparable to judge_loss. One batched run replaces a session call per row.
        scores = self._recons_losses(input_data)
        return np.where(scores < self.judge_loss, 1, -1).tolist()
       
def plot_confusion_matrix(y_true, y_pred, labels,title, normalize=True):
    """Draw a confusion matrix as an annotated heat map using the ``binary`` colormap.

    Args:
        y_true: ground-truth labels, passed to ``confusion_matrix``.
        y_pred: predicted labels, passed to ``confusion_matrix``.
        labels: display names used as tick labels on both axes; their count
            determines how many cells are annotated.
        title: figure title.
        normalize: when True (default) each row is divided by its total and the
            cells show fractions; when False the raw counts are shown.
    """
    cm = confusion_matrix(y_true, y_pred)
    if normalize:
        row_totals = cm.sum(axis=1)[:, np.newaxis]
        # A class without any true sample has a zero row total; keep that row at 0
        # rather than dividing by zero and plotting NaN.
        shown = np.divide(cm, row_totals,
                          out=np.zeros(cm.shape, dtype=float),
                          where=row_totals != 0)
    else:
        shown = cm

    n_labels = len(labels)
    plt.figure(figsize=(4, 2), dpi=120)
    for row in range(n_labels):
        for col in range(n_labels):
            cell = shown[row][col]
            if not normalize:
                text, size = "%d" % (cell,), 8
            elif cell > 0.01:
                text, size = "%0.2f" % (cell,), 7
            else:
                # fractions at or below 0.01 are displayed as a plain 0
                text, size = "%d" % (0,), 7
            plt.text(col, row, text, color='red', fontsize=size, va='center', ha='center')
    plt.imshow(shown, interpolation='nearest', cmap=plt.cm.binary)

    # Minor ticks sit on the cell boundaries so the minor grid outlines each cell.
    cell_edges = np.arange(n_labels) + 0.5
    plt.gca().set_xticks(cell_edges, minor=True)
    plt.gca().set_yticks(cell_edges, minor=True)
    plt.gca().xaxis.set_ticks_position('none')
    plt.gca().yaxis.set_ticks_position('none')
    plt.grid(True, which='minor', linestyle='-')
    plt.gcf().subplots_adjust(bottom=0.15)
    plt.title(title)
    plt.colorbar()
    cell_centres = np.arange(n_labels)
    plt.xticks(cell_centres, labels)
    plt.yticks(cell_centres, labels)
    plt.ylabel('Index of True Classes')
    plt.xlabel('Index of Predict Classes')
    plt.show()
 
def mlp_vae_predict(train,test,test_label):
    """Train an ``MLP_VAE`` on ``train`` and report how it labels ``test``.

    Prints the label counts of the ground truth and of the predictions, prints
    the ROC-AUC of the predicted labels and plots the confusion matrix.

    Args:
        train: 2-D training array; its column count sets the model's input dimension.
        test: 2-D array of samples to label.
        test_label: ground-truth labels for ``test`` (``1`` normal, ``-1`` anomaly).
    """
    # Derive the input width from the data instead of hard-coding the feature count.
    # Latent size 20 and outlier fraction 0.07 are this experiment's fixed settings.
    mlp_vae = MLP_VAE(train.shape[1],20,0.07)
    mlp_vae.train(train)
    mlp_vae_predict_label = mlp_vae.judge(test)
    print(collections.Counter(test_label))
    print(collections.Counter(mlp_vae_predict_label))
    print(metrics.roc_auc_score(test_label, mlp_vae_predict_label ))
    plot_confusion_matrix(test_label, mlp_vae_predict_label, ['anomaly','normal'],'MLP_VAE Confusion-Matrix')

def iforest_predict(train,test,test_label):
    """Fit an Isolation Forest on ``train`` and report how it labels ``test``.

    Prints the label counts of the ground truth and of the predictions, prints
    the ROC-AUC of the predicted labels and plots the confusion matrix.
    """
    from sklearn.ensemble import IsolationForest

    # NOTE(review): the `behaviour` keyword is tied to the scikit-learn release this
    # notebook ran against — confirm it is still accepted before upgrading.
    detector = IsolationForest(contamination=0.01, behaviour="new",
                               max_samples='auto')
    detector.fit(train)
    predicted = detector.predict(test)

    truth_counts = collections.Counter(test_label)
    print(truth_counts)
    predicted_counts = collections.Counter(predicted)
    print(predicted_counts)
    auc = metrics.roc_auc_score(test_label, predicted)
    print(auc)
    plot_confusion_matrix(test_label, predicted, ['anomaly','normal'],'iforest Confusion-Matrix')
    
def random_predict(train,y_train,test,test_label):
    """Fit a supervised random-forest classifier and report how it labels ``test``.

    Unlike the unsupervised detectors in this file, this baseline is trained
    with the labels ``y_train``. Prints the label counts of the ground truth
    and of the predictions, prints the ROC-AUC of the predicted labels and
    plots the confusion matrix.

    Args:
        train: training feature array.
        y_train: labels for ``train``.
        test: feature array to label.
        test_label: ground-truth labels for ``test``.
    """
    from sklearn.ensemble import RandomForestClassifier
    rforest = RandomForestClassifier()

    rforest.fit(train,y_train)
    rforest_predict_label = rforest.predict(test)
    print(collections.Counter(test_label))
    print(collections.Counter(rforest_predict_label))
    print(metrics.roc_auc_score(test_label, rforest_predict_label ))
    # Title names this model (it previously carried the Isolation Forest title).
    plot_confusion_matrix(test_label, rforest_predict_label, ['anomaly','normal'],'Random Forest Confusion-Matrix')

def lof_predict(train,test,test_label):
    """Fit a Local Outlier Factor model in novelty mode on ``train`` and report how it labels ``test``.

    Prints the label counts of the ground truth and of the predictions, prints
    the ROC-AUC of the predicted labels and plots the confusion matrix.
    """
    from sklearn.neighbors import LocalOutlierFactor

    detector = LocalOutlierFactor(contamination=0.01, novelty=True)
    detector.fit(train)
    predicted = detector.predict(test)

    truth_counts = collections.Counter(test_label)
    print(truth_counts)
    predicted_counts = collections.Counter(predicted)
    print(predicted_counts)
    auc = metrics.roc_auc_score(test_label, predicted)
    print(auc)
    plot_confusion_matrix(test_label, predicted, ['anomaly','normal'],'LOF Confusion-Matrix')

# Run the three unsupervised detectors on the same scaled train/test split.
# The supervised random_predict baseline is defined above but not invoked here.
if __name__ == '__main__':
    mlp_vae_predict(X_train,X_test,X_test_label)
    iforest_predict(X_train,X_test,X_test_label)
    lof_predict(X_train,X_test,X_test_label)
    





