In [2]:
import os
import csv
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from IPython.display import display
from sklearn.model_selection import train_test_split
from sklearn import svm
from sklearn.decomposition import PCA
from sklearn.externals import joblib

In [7]:
# Load the event log CSV into `df`; later cells pass this frame to learning().
event_log_path = 'eventlog_psexe.csv'
df = pd.read_csv(event_log_path)
print(df)

                   date  eventID          account             ip  \
0       2018/04/03 0:00     5140              dc$           fe80   
1       2018/04/03 0:00     5140              dc$           fe80   
2       2018/04/03 0:00     5140              dc$           fe80   
3       2018/04/03 0:00     5140              dc$           fe80   
4       2018/04/03 0:02     5140              dc$           fe80   
5       2018/04/03 0:02     5140              dc$           fe80   
6       2018/04/03 0:03     5140          fsadmin  192.168.2.102   
7       2018/04/03 0:03     5140          fsadmin  192.168.2.102   
8       2018/04/03 0:03     5140          fsadmin  192.168.2.102   
9       2018/04/03 0:03     5140          fsadmin  192.168.2.102   
10      2018/04/03 0:03     5140              dc$           fe80   
11      2018/04/03 0:03     5140              dc$           fe80   
12      2018/04/03 0:03     5140              dc$           fe80   
13      2018/04/03 0:03     5140              dc

One-class SVM

In [8]:
def learning(eventid, df, nu, gamma):
    """Train and evaluate a One-Class SVM on the rows of a single event ID.

    Rows of ``df`` whose ``eventID`` equals ``eventid`` are one-hot encoded
    and split by their ``target`` label: ``train`` rows fit the model,
    ``test`` rows are scored as the negative (normal) class and ``outlier``
    rows as the positive (anomalous) class. Evaluation metrics are printed.

    Files written to the current working directory (each call overwrites the
    files of a previous call with the same ``eventid``):

    * ``data_dummies_<eventid>.csv``  - the one-hot encoded frame (written
      even when the function returns early because a label is missing)
    * ``X_train_result<eventid>.csv``, ``X_test_result<eventid>.csv``,
      ``X_outliers_result<eventid>.csv`` - original rows with the predicted
      label (1 = inlier, -1 = outlier) appended as the last column
    * ``ocsvm_gt_<eventid>.pkl``      - the fitted classifier

    Parameters
    ----------
    eventid
        Value matched against ``df.eventID`` to select the rows to use.
    df : pandas.DataFrame
        Must provide ``eventID`` and ``target`` columns. The first column is
        carried through without encoding and is excluded from the features.
    nu, gamma
        Passed unchanged to ``svm.OneClassSVM`` (RBF kernel).

    Returns
    -------
    None
        Also returns ``None`` early, after printing a message, when
        ``target`` lacks a ``train``, ``test`` or ``outlier`` value.
    """

    def _ratio(numerator, denominator):
        # TP + FP is zero when the model flags nothing as an outlier; report
        # NaN instead of aborting a parameter sweep with ZeroDivisionError.
        return numerator / denominator if denominator else float('nan')

    df = df[df.eventID == eventid]
    data_dummies = pd.get_dummies(df.iloc[:, 1:])
    data_dummies = pd.concat([df.iloc[:, 0], data_dummies], axis=1)
    data_dummies.to_csv('data_dummies_' + str(eventid) + '.csv')

    # All three labels are required; stop at the first one that is missing.
    for label in ('train', 'test', 'outlier'):
        if label not in df.target.values:
            print('No ' + label + ' value in the target column')
            print('')
            return

    data_normal = data_dummies[data_dummies.target_train == 1]
    data_test = data_dummies[data_dummies.target_test == 1]
    data_outliers = data_dummies[data_dummies.target_outlier == 1]

    # Features are every column except the first one and the last three.
    # NOTE(review): this assumes the three target_* dummy columns are the
    # last three columns of data_dummies - confirm against the input schema.
    # `.iloc` replaces `.ix`, which was removed in pandas 1.0.
    X_train = data_normal.iloc[:, 1:-3].values
    X_test = data_test.iloc[:, 1:-3].values
    X_outliers = data_outliers.iloc[:, 1:-3].values

    clf = svm.OneClassSVM(nu=nu, kernel="rbf", gamma=gamma)
    clf.fit(X_train)

    #n_correct_test is True Negative
    #n_error_test is False Positive
    #n_correct_outliers is True Positive
    #n_error_outliers is False Negative

    X_pred_train = clf.predict(X_train)
    X_pred_test = clf.predict(X_test)
    X_pred_outliers = clf.predict(X_outliers)
    n_correct_train = X_pred_train[X_pred_train == 1].size
    n_error_train = X_pred_train[X_pred_train == -1].size
    n_correct_test = X_pred_test[X_pred_test == 1].size
    n_error_test = X_pred_test[X_pred_test == -1].size
    n_correct_outliers = X_pred_outliers[X_pred_outliers == -1].size
    n_error_outliers = X_pred_outliers[X_pred_outliers == 1].size

    recall = _ratio(n_correct_outliers, n_correct_outliers + n_error_outliers)
    precision = _ratio(n_correct_outliers, n_correct_outliers + n_error_test)
    specificity = _ratio(n_correct_test, n_correct_test + n_error_test)
    accuracy = _ratio(
        n_correct_test + n_correct_outliers,
        n_correct_test + n_error_test + n_correct_outliers + n_error_outliers,
    )
    f_value = _ratio(
        2 * n_correct_outliers,
        2 * n_correct_outliers + n_error_test + n_error_outliers,
    )

    print('svm.OneClassSVM(nu=' + str(nu) + ', kernel="rbf", gamma=' + str(gamma) + ')')
    print('Training Correct: ' + str(n_correct_train))
    print('Training Error: ' + str(n_error_train))
    print('True Negative: ' + str(n_correct_test))
    print('False Positive: ' + str(n_error_test))
    print('True Positive: ' + str(n_correct_outliers))
    print('False Negative: ' + str(n_error_outliers))
    print('Recall: ' + str(recall))
    print('Precision: ' + str(precision))
    print('Specificity: ' + str(specificity))
    print('Accuracy: ' + str(accuracy))
    print('F_Value: ' + str(f_value))
    print('N: ' + str(n_correct_train+n_error_train+n_correct_test+n_error_test+n_correct_outliers+n_error_outliers))
    print('')

    # Append each prediction vector as an extra column to the original
    # (un-encoded) rows of the matching split.
    X_train_result = np.concatenate((df[df['target'] == 'train'], X_pred_train[np.newaxis, :].T), axis=1)
    X_test_result = np.concatenate((df[df['target'] == 'test'], X_pred_test[np.newaxis, :].T), axis=1)
    X_outliers_result = np.concatenate((df[df['target'] == 'outlier'], X_pred_outliers[np.newaxis, :].T), axis=1)

    results = (
        ('X_train_result', X_train_result),
        ('X_test_result', X_test_result),
        ('X_outliers_result', X_outliers_result),
    )
    for prefix, rows in results:
        # newline='' is what the csv module requires of the file object;
        # without it every row is followed by a blank line on Windows.
        with open(prefix + str(eventid) + '.csv', 'w', newline='') as f:
            csv.writer(f).writerows(rows)

    # Optional 2-D PCA view of the encoded data, coloured by target label.
    #X_all = data_dummies.iloc[:, 1:-3].values
    #X_index = data_dummies.iloc[:, -3:].values
    #print('PCA')
    #pca = PCA(n_components=2)
    #X_pca = pca.fit_transform(X_all)

    #plt.figure()
    #plt.scatter(X_pca[:,0], X_pca[:,1], c=X_index)
    #plt.title('Red:' + data_dummies.columns[-3] + '  Green:' + data_dummies.columns[-2] + '  Blue:' + data_dummies.columns[-1])
    #plt.show()

    # The file name depends only on eventid, so in a nu/gamma sweep the model
    # left on disk is the one from the last combination that was run.
    joblib.dump(clf, 'ocsvm_gt_' + str(eventid) + '.pkl')
    

In [12]:
# Hyper-parameter grid for the One-Class SVM sweep.
nu_list = [0.1, 0.01, 0.001]
gamma_list = [0.1, 0.01, 0.001]

# Event ID evaluated by this sweep. Other IDs that can be swapped in here:
# 4672, 4673, 4688, 4768, 4769, 5140.
target_event_id = 4674

# Visit every (nu, gamma) pair, nu in the outer position.
for nu, gamma in [(n, g) for n in nu_list for g in gamma_list]:
    learning(target_event_id, df, nu, gamma)

svm.OneClassSVM(nu=0.1, kernel="rbf", gamma=0.1)
Training Correct: 815
Training Error: 123
True Negative: 47
False Positive: 5
True Positive: 15
False Negative: 14
Recall: 0.5172413793103449
Precision: 0.75
Specificity: 0.9038461538461539
Accuracy: 0.7654320987654321
F_Value: 0.6122448979591837
N: 1019

svm.OneClassSVM(nu=0.1, kernel="rbf", gamma=0.01)
Training Correct: 829
Training Error: 109
True Negative: 47
False Positive: 5
True Positive: 10
False Negative: 19
Recall: 0.3448275862068966
Precision: 0.6666666666666666
Specificity: 0.9038461538461539
Accuracy: 0.7037037037037037
F_Value: 0.45454545454545453
N: 1019

svm.OneClassSVM(nu=0.1, kernel="rbf", gamma=0.001)
Training Correct: 842
Training Error: 96
True Negative: 47
False Positive: 5
True Positive: 8
False Negative: 21
Recall: 0.27586206896551724
Precision: 0.6153846153846154
Specificity: 0.9038461538461539
Accuracy: 0.6790123456790124
F_Value: 0.38095238095238093
N: 1019

svm.OneClassSVM(nu=0.01, kernel="rbf", gamma=0.1)
Tra

In [15]:
# Single run for event ID 4688 with one fixed (nu, gamma) pair.
learning(eventid=4688, df=df, nu=0.001, gamma=0.1)

svm.OneClassSVM(nu=0.001, kernel="rbf", gamma=0.1)
Training Correct: 1970
Training Error: 39
True Negative: 1448
False Positive: 26
True Positive: 8
False Negative: 8
Recall: 0.5
Precision: 0.23529411764705882
Specificity: 0.9823609226594301
Accuracy: 0.9771812080536912
F_Value: 0.32
N: 3499

