# Outliers detection using AutoEncoders

In [12]:
from keras.models import Model, load_model
from keras.layers import Input, Dense
from keras.callbacks import ModelCheckpoint, TensorBoard
from keras import regularizers
from sklearn.model_selection import KFold
import scipy.io

%run ../util/load_data.py
%run ../util/data_visualization.py
%run ../util/outliers_statistics.py


## Data loading

In [2]:
data = load_data("../data")
mat = scipy.io.loadmat('../data/cover.mat')
df = pd.DataFrame(mat['X'], columns=data['numerical_attributes'])
target = mat['y']
df['target'] = target

You can access Timestamp as pandas.Timestamp
  pd.tslib.Timestamp,
  from pandas.lib import Timestamp


## Outliers detection with Auto-Encoders

### Network construction

In [9]:
nb_epoch = 10
batch_size = 128
input_dim = len(data['numerical_attributes']) #num of columns, 30
encoding_dim = 14
hidden_dim = int(encoding_dim / 2) #i.e. 7
learning_rate = 1e-5

input_layer = Input(shape=(input_dim, ))
encoder = Dense(encoding_dim, activation="tanh", activity_regularizer=regularizers.l1(learning_rate))(input_layer)
encoder = Dense(hidden_dim, activation="relu")(encoder)
decoder = Dense(hidden_dim, activation='tanh')(encoder)
decoder = Dense(input_dim, activation='relu')(decoder)



### Outliers detection

In [10]:
kf = KFold(n_splits=2)

outliers_ids = []

for train, test in kf.split(df):
    train_data = df.iloc[train, :]
    test_data = df.iloc[test, :]
    
    
    autoencoder = Model(inputs=input_layer, outputs=decoder)
    autoencoder.compile(metrics=['accuracy'],
                    loss='mean_squared_error',
                    optimizer='adam')
    
    autoencoder.fit(train_data[data['numerical_attributes']], train_data[data['numerical_attributes']],
                    epochs=nb_epoch,
                    batch_size=batch_size,
                    shuffle=True,
                    verbose=1)
    
    test_data_predictions = autoencoder.predict(test_data[data['numerical_attributes']])
    
    mse = np.mean(np.power(test_data[data['numerical_attributes']] - test_data_predictions, 2), axis=1)
    mse_mean = np.mean(mse)
    
    prediction_result = zip(
        test_data.index.values.tolist(),
        mse
    )
    
    outliers_ids += [id for id, distance in prediction_result if distance > mse_mean * 3]

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [13]:
precision, recall, F1_score = outliers_statistics(df, outliers_ids)
print("Precision: "+ str(precision))
print("Recall: "+ str(recall))
print("F1-score: "+ str(F1_score))

Target number of outliers: 2747
Found number of outliers: 10571
TP: 0
FP: 10571
FN: 2747
Precision: 0.0
Recall: 0.0
F1-score: 0
