In [1]:
import os
import csv

import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from IPython.display import display
from sklearn.model_selection import train_test_split
from sklearn import svm
from sklearn.decomposition import PCA

# scikit-learn 0.23 removed the vendored sklearn.externals.joblib. Prefer the
# standalone joblib package (a hard dependency of current scikit-learn) and
# fall back to the vendored copy only on old installations.
try:
    import joblib
except ImportError:
    from sklearn.externals import joblib

In [2]:
# Load the labelled event log used by every cell below. learning() filters this
# frame on its `eventID` column and reads the split label ('train' / 'test' /
# 'outlier') from its `target` column.
df = pd.read_csv('eventlog_gui9.csv')
# Plain-text dump of the frame (pandas truncates long frames when printing).
print(df) 

       eventID        account  \
0         4688        dcadmin   
1         4674         hacked   
2         5140            dc$   
3         5140            dc$   
4         5140            dc$   
5         5140            dc$   
6         5140            dc$   
7         5140            dc$   
8         5140        fsadmin   
9         5140        fsadmin   
10        5140        fsadmin   
11        5140        fsadmin   
12        5140            dc$   
13        5140            dc$   
14        5140            dc$   
15        5140            dc$   
16        5140  local service   
17        5140        filesv$   
18        5140        filesv$   
19        5140            dc$   
20        5140            dc$   
21        5140  local service   
22        5140        filesv$   
23        5140        filesv$   
24        5140            dc$   
25        5140            dc$   
26        5140            dc$   
27        5140            dc$   
28        5140            dc$   
29        

One-class SVM

In [3]:
def _safe_ratio(numerator, denominator):
    """Return numerator / denominator, or NaN when the denominator is zero."""
    if denominator == 0:
        return float('nan')
    return numerator / denominator


def _write_result_csv(path, rows, predictions):
    """Write rows to path as CSV, appending each row's prediction as the last field."""
    table = np.concatenate((rows, predictions[np.newaxis, :].T), axis=1)
    # The csv module expects files opened with newline=''; without it every
    # record is followed by a blank line on Windows.
    with open(path, 'w', newline='') as f:
        csv.writer(f).writerows(table)


def learning(eventid, df, nu, gamma):
    """Fit a One-class SVM on one event ID and report how well it flags outliers.

    Rows of ``df`` whose ``eventID`` equals ``eventid`` are one-hot encoded.
    The model is fitted on the rows labelled ``'train'`` in the ``target``
    column and evaluated on the rows labelled ``'test'`` (expected inliers,
    prediction +1) and ``'outlier'`` (expected outliers, prediction -1).

    Parameters
    ----------
    eventid : value compared against ``df.eventID`` to select the rows.
    df : pandas.DataFrame whose first column is the event ID and which has a
        ``target`` column holding the split label of each row.
    nu, gamma : passed straight to ``svm.OneClassSVM`` (RBF kernel).

    Side effects
    ------------
    Prints the confusion-matrix counts and derived metrics, and writes into
    the working directory:
    ``data_dummies_<eventid>.csv`` (written before the labels are validated),
    ``X_train_result<eventid>.csv``, ``X_test_result<eventid>.csv``,
    ``X_outliers_result<eventid>.csv`` (raw rows plus the prediction) and
    ``ocsvm_gt_<eventid>.pkl`` (the fitted model).

    Returns
    -------
    None. If any of the three labels is missing for this event ID a message
    is printed and nothing is fitted.
    """
    df = df[df.eventID == eventid]
    data_dummies = pd.get_dummies(df.iloc[:, 1:])
    data_dummies = pd.concat([df.iloc[:, 0], data_dummies], axis=1)
    data_dummies.to_csv('data_dummies_' + str(eventid) + '.csv')

    # Row masks come from the raw label column, so they do not depend on how
    # get_dummies names or orders the encoded label columns.
    split_names = ('train', 'test', 'outlier')
    masks = {name: (df['target'] == name).values for name in split_names}
    for name in split_names:
        if not masks[name].any():
            print('No ' + name + ' value in the target column')
            print('')
            return

    # Drop the encoded label columns by name rather than assuming they are the
    # last three columns: that assumption leaks a label into the features as
    # soon as `target` has a fourth value or is not the last encoded column.
    label_columns = ['target_' + str(value) for value in df['target'].dropna().unique()]
    is_feature = ~data_dummies.columns.isin(label_columns)
    features = data_dummies.loc[:, is_feature].values.astype(float)

    X_train = features[masks['train']]
    X_test = features[masks['test']]
    X_outliers = features[masks['outlier']]

    clf = svm.OneClassSVM(nu=nu, kernel="rbf", gamma=gamma)
    clf.fit(X_train)

    X_pred_train = clf.predict(X_train)
    X_pred_test = clf.predict(X_test)
    X_pred_outliers = clf.predict(X_outliers)

    # "Positive" means "flagged as an outlier" (prediction == -1):
    #   n_correct_test     -> true negatives   (inlier kept)
    #   n_error_test       -> false positives  (inlier flagged)
    #   n_correct_outliers -> true positives   (outlier flagged)
    #   n_error_outliers   -> false negatives  (outlier missed)
    n_correct_train = int(np.count_nonzero(X_pred_train == 1))
    n_error_train = int(np.count_nonzero(X_pred_train == -1))
    n_correct_test = int(np.count_nonzero(X_pred_test == 1))
    n_error_test = int(np.count_nonzero(X_pred_test == -1))
    n_correct_outliers = int(np.count_nonzero(X_pred_outliers == -1))
    n_error_outliers = int(np.count_nonzero(X_pred_outliers == 1))

    # Precision is undefined when nothing is flagged at all; report NaN
    # instead of raising ZeroDivisionError.
    recall = _safe_ratio(n_correct_outliers, n_correct_outliers + n_error_outliers)
    precision = _safe_ratio(n_correct_outliers, n_correct_outliers + n_error_test)
    specificity = _safe_ratio(n_correct_test, n_correct_test + n_error_test)
    accuracy = _safe_ratio(
        n_correct_test + n_correct_outliers,
        n_correct_test + n_error_test + n_correct_outliers + n_error_outliers)
    f_value = _safe_ratio(
        2 * n_correct_outliers,
        2 * n_correct_outliers + n_error_test + n_error_outliers)

    print('svm.OneClassSVM(nu=' + str(nu) + ', kernel="rbf", gamma=' + str(gamma) + ')')
    print('Training Correct: ' + str(n_correct_train))
    print('Training Error: ' + str(n_error_train))
    print('True Negative: ' + str(n_correct_test))
    print('False Positive: ' + str(n_error_test))
    print('True Positive: ' + str(n_correct_outliers))
    print('False Negative: ' + str(n_error_outliers))
    print('Recall: ' + str(recall))
    print('Precision: ' + str(precision))
    print('Specificity: ' + str(specificity))
    print('Accuracy: ' + str(accuracy))
    print('F_Value: ' + str(f_value))
    print('N: ' + str(n_correct_train+n_error_train+n_correct_test+n_error_test+n_correct_outliers+n_error_outliers))
    print('')

    # Raw (un-encoded) rows of each split with the model's +1 / -1 verdict appended.
    _write_result_csv('X_train_result' + str(eventid) + '.csv',
                      df[masks['train']], X_pred_train)
    _write_result_csv('X_test_result' + str(eventid) + '.csv',
                      df[masks['test']], X_pred_test)
    _write_result_csv('X_outliers_result' + str(eventid) + '.csv',
                      df[masks['outlier']], X_pred_outliers)

    # Persist the fitted model; each call overwrites the previous file for this event ID.
    joblib.dump(clf, 'ocsvm_gt_' + str(eventid) + '.pkl')
    

In [4]:
# Hyper-parameter grid for the One-class SVM sweep.
nu_list = [0.1, 0.01, 0.001]
gamma_list = [0.1, 0.01, 0.001]

# Evaluate every (nu, gamma) combination on event ID 4688, nu varying slowest.
# To sweep another event ID (4672, 4673, 4674, 4768, 4769 or 5140), change the
# first argument of learning().
for nu, gamma in ((n, g) for n in nu_list for g in gamma_list):
    learning(4688, df, nu, gamma)

svm.OneClassSVM(nu=0.1, kernel="rbf", gamma=0.1)
Training Correct: 2103
Training Error: 524
True Negative: 694
False Positive: 183
True Positive: 96
False Negative: 5
Recall: 0.9504950495049505
Precision: 0.34408602150537637
Specificity: 0.7913340935005702
Accuracy: 0.8077709611451943
F_Value: 0.5052631578947369
N: 3605

svm.OneClassSVM(nu=0.1, kernel="rbf", gamma=0.01)
Training Correct: 1944
Training Error: 683
True Negative: 726
False Positive: 151
True Positive: 98
False Negative: 3
Recall: 0.9702970297029703
Precision: 0.39357429718875503
Specificity: 0.8278221208665907
Accuracy: 0.8425357873210634
F_Value: 0.56
N: 3605

svm.OneClassSVM(nu=0.1, kernel="rbf", gamma=0.001)
Training Correct: 2164
Training Error: 463
True Negative: 757
False Positive: 120
True Positive: 96
False Negative: 5
Recall: 0.9504950495049505
Precision: 0.4444444444444444
Specificity: 0.863169897377423
Accuracy: 0.8721881390593047
F_Value: 0.6056782334384858
N: 3605

svm.OneClassSVM(nu=0.01, kernel="rbf", gamma

In [20]:
# Re-run event ID 4688 on its own at nu=0.01, gamma=0.1 (one of the grid points swept above).
# NOTE(review): the saved output below reports 881 test / 97 outlier rows, while the
# sweep's saved output reports 877 / 101 for the same event ID. Presumably df was
# reloaded from an edited CSV between executions; restart and run all cells to confirm.
learning(4688, df, 0.01, 0.1)

svm.OneClassSVM(nu=0.01, kernel="rbf", gamma=0.1)
Training Correct: 2384
Training Error: 243
True Negative: 822
False Positive: 59
True Positive: 90
False Negative: 7
Recall: 0.9278350515463918
Precision: 0.6040268456375839
Specificity: 0.9330306469920545
Accuracy: 0.9325153374233128
F_Value: 0.7317073170731707
N: 3605



In [22]:
# Fit and evaluate the One-class SVM for event ID 4674 with nu=0.001, gamma=0.1.
learning(4674, df, 0.001, 0.1)

svm.OneClassSVM(nu=0.001, kernel="rbf", gamma=0.1)
Training Correct: 401
Training Error: 72
True Negative: 45
False Positive: 5
True Positive: 143
False Negative: 0
Recall: 1.0
Precision: 0.9662162162162162
Specificity: 0.9
Accuracy: 0.9740932642487047
F_Value: 0.9828178694158075
N: 666

