In [1]:
#import required packages
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import recall_score, classification_report, f1_score, accuracy_score
from sklearn.model_selection import train_test_split

In [None]:
print('This script trains a sample classifier (using simple RandomForestClassifier) '
      'on CERT dataset. By default it takes CERT r4.2 extracted day data '
      ', train on data of 400 users in first half of the dataset, '
      'then output classification report (instance-based)')

In [2]:
#read dataset
data = pd.read_csv('best_features_dataset.csv')
# removed_cols = ['user','day','week','starttime','endtime','sessionid','insider']
# x_cols = [i for i in data.columns if i not in removed_cols]

In [3]:
shuffled_data = data.sample(frac=1)

In [4]:
run = 1
np.random.seed(run)

In [5]:
X = shuffled_data.drop('insider',axis=1)

In [6]:
shuffled_data["insider"] = np.where(shuffled_data["insider"] > 0, 1, shuffled_data["insider"])

In [7]:
Y = shuffled_data["insider"]

In [8]:
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.3)

In [9]:
X_train

Unnamed: 0,email_n-pc2,email_send_mail_n-pc2,usb_mean_usb_dur,workhouremail_n-pc2,n_usb,usb_mean_file_tree_len,workhouremail_send_mail_n-pc2,workhourusb_n-pc0,workhourusb_mean_usb_dur,usb_n-pc0,n_workhourusb,http_leakf_mean_url_len,http_n-pc0,day
306888,0,0,0.0,0,0,0.0,0,0,0.0,0,0,0.0,29,227
78891,0,0,0.0,0,0,0.0,0,0,0.0,0,0,0.0,95,60
258585,0,0,0.0,0,0,0.0,0,0,0.0,0,0,0.0,95,184
534265,0,0,0.0,0,0,0.0,0,0,0.0,0,0,0.0,48,394
517901,0,0,0.0,0,0,0.0,0,0,0.0,0,0,0.0,10,385
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
403681,0,0,0.0,0,0,0.0,0,0,0.0,0,0,0.0,95,297
519346,0,0,0.0,0,0,0.0,0,0,0.0,0,0,0.0,95,387
119011,0,0,0.0,0,0,0.0,0,0,0.0,0,0,0.0,29,85
591776,0,0,0.0,0,0,0.0,0,0,0.0,0,0,0.0,162,442


In [10]:
from keras.models import Sequential
from keras.layers import Dense, LSTM
from keras.layers import RepeatVector
from keras.layers import Dropout
from keras.layers import TimeDistributed
from keras.models import Model
from keras.layers import Input

In [11]:
X_train.shape

(485554, 14)

In [12]:
numx = X_train.to_numpy()
numx.shape
x_train = np.reshape(numx, (numx.shape[0],numx.shape[1],1))

In [13]:
x_train.shape

(485554, 14, 1)

In [14]:
Y_train.shape

(485554,)

In [15]:
numy = Y_train.to_numpy()
numy.shape
# y_train = np.reshape(numy, (numx.shape[0],numx.shape[1],1))

(485554,)

In [16]:
y_train = np.reshape(numy, (numy.shape[0],1))

In [17]:
y_train.shape

(485554, 1)

In [18]:
# Define LSTM autoencoder architecture
inputs = Input(shape=(X_train.shape[1], 1))
encoded = LSTM(32)(inputs)
decoded = RepeatVector(X_train.shape[1])(encoded)
decoded = LSTM(32, return_sequences=True)(decoded)
decoded = Dense(1, activation='sigmoid')(decoded)

autoencoder = Model(inputs, decoded)
autoencoder.compile(optimizer='adam', loss='binary_crossentropy',metrics=['accuracy'])

In [19]:
autoencoder.summary()

Model: "model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_1 (InputLayer)        [(None, 14, 1)]           0         
                                                                 
 lstm (LSTM)                 (None, 32)                4352      
                                                                 
 repeat_vector (RepeatVector  (None, 14, 32)           0         
 )                                                               
                                                                 
 lstm_1 (LSTM)               (None, 14, 32)            8320      
                                                                 
 dense (Dense)               (None, 14, 1)             33        
                                                                 
Total params: 12,705
Trainable params: 12,705
Non-trainable params: 0
_________________________________________________________

In [20]:
numtx = X_test.to_numpy()
numtx.shape
x_test = np.reshape(numtx, (numtx.shape[0],numtx.shape[1],1))

In [21]:
X_test.shape

(208095, 14)

In [22]:
x_test.shape

(208095, 14, 1)

In [23]:
numty = Y_test.to_numpy()
numty.shape


(208095,)

In [24]:
y_test = np.reshape(numty, (numty.shape[0],1))

In [25]:
y_test.shape

(208095, 1)

In [26]:
autoencoder.fit(x_train, y_train, epochs = 100, batch_size = 1000,validation_data=(x_test,y_test))



<keras.callbacks.History at 0x133a1880520>

In [27]:
# Evaluate LSTM autoencoder
score = autoencoder.evaluate(x_test, y_test, batch_size=32)
print('Test loss:', score)

Test loss: [0.013418491929769516, 0.9981498718261719]


In [28]:
y_pred_test = autoencoder.predict(x_test)



In [29]:
y_pred_train = autoencoder.predict(x_train)



In [89]:
y_pred_test.shape

(208095, 14, 1)

In [31]:
train_mae_loss = np.mean(np.abs(y_pred_train - x_train), axis=1)

# plt.hist(train_mae_loss, bins=50)
# plt.xlabel('Train MAE loss')
# plt.ylabel('Number of Samples');

threshold = np.max(train_mae_loss)
print(f'Reconstruction error threshold: {threshold}')

Reconstruction error threshold: 71.07311709987698


In [33]:
test_mae_loss = np.mean(np.abs(y_pred_test - x_test), axis=1)
# np.max(test_mae_loss)
print(f'Validation Reconstruction error loss: {np.max(test_mae_loss)}')

Validation Reconstruction error loss: 70.21597422212861


In [None]:
# Setting Reconstruction error treshold as 71.073 for future pedictions

In [91]:
autoencoder.save("model")



INFO:tensorflow:Assets written to: model\assets


INFO:tensorflow:Assets written to: model\assets


In [93]:
preds = np.argmax(autoencoder.predict(x_test), axis=1)



In [98]:
preds

array([[0],
       [0],
       [0],
       ...,
       [0],
       [0],
       [0]], dtype=int64)