In [2]:
import csv
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from IPython.display import display
from sklearn import svm
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split

try:
    # Standalone joblib; sklearn.externals.joblib was removed in scikit-learn 0.23.
    import joblib
except ImportError:
    # Older scikit-learn releases shipped a vendored copy of joblib.
    from sklearn.externals import joblib

In [3]:
# Load the event log. `learning` below filters it by the `eventID` column and
# splits it by the `target` column ('train' / 'test' / 'outlier').
df = pd.read_csv(filepath_or_buffer='eventlog.csv')
print(df)

       eventID        account                                        process  \
0         4688        dcadmin                     c:\windows\system32\at.exe   
1         4674         hacked               c:\windows\system32\services.exe   
2         5140            dc$                                            NaN   
3         5140            dc$                                            NaN   
4         5140            dc$                                            NaN   
5         5140            dc$                                            NaN   
6         5140            dc$                                            NaN   
7         5140            dc$                                            NaN   
8         5140        fsadmin                                            NaN   
9         5140        fsadmin                                            NaN   
10        5140        fsadmin                                            NaN   
11        5140        fsadmin           

One-class SVM

In [4]:
def _safe_ratio(numerator, denominator):
    """Return ``numerator / denominator``, or NaN when the denominator is zero."""
    if denominator == 0:
        return float('nan')
    return numerator / denominator


def _write_rows(path, rows):
    """Write an iterable of rows to ``path`` as CSV."""
    # newline='' hands line-ending control to the csv module; without it every
    # row is followed by a blank line on Windows.
    with open(path, 'w', newline='') as f:
        csv.writer(f).writerows(rows)


def learning(eventid, df, nu, gamma):
    """Fit and evaluate a One-class SVM on the rows of one event ID.

    Rows of ``df`` whose ``eventID`` equals ``eventid`` are one-hot encoded with
    ``pd.get_dummies`` and split by their ``target`` value: ``'train'`` rows fit
    the model, ``'test'`` rows are scored as expected inliers (prediction ``1``)
    and ``'outlier'`` rows as expected outliers (prediction ``-1``). The
    confusion-matrix counts and derived metrics are printed.

    Files written to the current working directory:

    * ``data_dummies_<eventid>.csv`` -- the encoded frame (written even when
      the function returns early);
    * ``X_train_result<eventid>.csv``, ``X_test_result<eventid>.csv``,
      ``X_outliers_result<eventid>.csv`` -- the filtered rows of each split
      with the prediction appended as the last column (no header row);
    * ``ocsvm_gt_<eventid>.pkl`` -- the fitted model, dumped with joblib.

    Parameters
    ----------
    eventid : value compared against ``df.eventID`` to select rows.
    df : pandas.DataFrame with at least ``eventID`` and ``target`` columns.
        NOTE(review): the feature slice below assumes ``eventID`` is the first
        column and ``target`` the last one -- confirm against eventlog.csv.
    nu, gamma : passed through to ``svm.OneClassSVM`` (RBF kernel).

    Returns
    -------
    None. If any of the three ``target`` labels is missing for this event ID,
    a message is printed and no model is trained.
    """
    df = df[df.eventID == eventid]
    data_dummies = pd.get_dummies(df.iloc[:, 1:])
    data_dummies = pd.concat([df.iloc[:, 0], data_dummies], axis=1)
    data_dummies.to_csv('data_dummies_' + str(eventid) + '.csv')

    # All three splits are required: the model is fitted on 'train' and the
    # metrics need both 'test' and 'outlier' rows.
    labels_present = set(df['target'].unique())
    for label in ('train', 'test', 'outlier'):
        if label not in labels_present:
            print('No ' + label + ' value in the target column')
            print('')
            return

    # Feature matrix = every column except the trailing three target dummies
    # (target_outlier, target_test, target_train). `.iloc` replaces the
    # `.ix` indexer, which was removed in pandas 1.0; the slice is positional,
    # exactly as `.ix` behaved on string column labels.
    def features_for(flag_column):
        selected = data_dummies[data_dummies[flag_column] == 1]
        return selected.iloc[:, :-3].values

    X_train = features_for('target_train')
    X_test = features_for('target_test')
    X_outliers = features_for('target_outlier')

    clf = svm.OneClassSVM(nu=nu, kernel="rbf", gamma=gamma)
    clf.fit(X_train)

    X_pred_train = clf.predict(X_train)
    X_pred_test = clf.predict(X_test)
    X_pred_outliers = clf.predict(X_outliers)

    # "Positive" means "flagged as outlier" (prediction == -1):
    #   n_correct_test     -> True Negative
    #   n_error_test       -> False Positive
    #   n_correct_outliers -> True Positive
    #   n_error_outliers   -> False Negative
    n_correct_train = int(np.count_nonzero(X_pred_train == 1))
    n_error_train = int(np.count_nonzero(X_pred_train == -1))
    n_correct_test = int(np.count_nonzero(X_pred_test == 1))
    n_error_test = int(np.count_nonzero(X_pred_test == -1))
    n_correct_outliers = int(np.count_nonzero(X_pred_outliers == -1))
    n_error_outliers = int(np.count_nonzero(X_pred_outliers == 1))

    # Precision is undefined when nothing at all is flagged (TP + FP == 0);
    # _safe_ratio reports NaN there instead of aborting a whole grid search
    # with ZeroDivisionError.
    recall = _safe_ratio(n_correct_outliers, n_correct_outliers + n_error_outliers)
    precision = _safe_ratio(n_correct_outliers, n_correct_outliers + n_error_test)
    specificity = _safe_ratio(n_correct_test, n_correct_test + n_error_test)
    accuracy = _safe_ratio(
        n_correct_test + n_correct_outliers,
        n_correct_test + n_error_test + n_correct_outliers + n_error_outliers,
    )
    f_value = _safe_ratio(
        2 * n_correct_outliers,
        2 * n_correct_outliers + n_error_test + n_error_outliers,
    )

    print('svm.OneClassSVM(nu=' + str(nu) + ', kernel="rbf", gamma=' + str(gamma) + ')')
    print('Training Correct: ' + str(n_correct_train))
    print('Training Error: ' + str(n_error_train))
    print('True Negative: ' + str(n_correct_test))
    print('False Positive: ' + str(n_error_test))
    print('True Positive: ' + str(n_correct_outliers))
    print('False Negative: ' + str(n_error_outliers))
    print('Recall: ' + str(recall))
    print('Precision: ' + str(precision))
    print('Specificity: ' + str(specificity))
    print('Accuracy: ' + str(accuracy))
    print('F_Value: ' + str(f_value))
    print('N: ' + str(n_correct_train + n_error_train + n_correct_test
                      + n_error_test + n_correct_outliers + n_error_outliers))
    print('')

    # Persist each split's original (un-encoded) rows with the prediction
    # appended as an extra last column.
    for label, prefix, predictions in (
        ('train', 'X_train_result', X_pred_train),
        ('test', 'X_test_result', X_pred_test),
        ('outlier', 'X_outliers_result', X_pred_outliers),
    ):
        rows = np.concatenate(
            (df[df['target'] == label].values, predictions[:, np.newaxis]),
            axis=1,
        )
        _write_rows(prefix + str(eventid) + '.csv', rows)

    joblib.dump(clf, 'ocsvm_gt_' + str(eventid) + '.pkl')
    

In [7]:
# Grid-search the One-class SVM hyper-parameters for event ID 4688.
# To evaluate another event type, swap the ID for one of
# 4672, 4673, 4674, 4768, 4769 or 5140.
nu_list = [0.1, 0.01, 0.001]
gamma_list = [0.1, 0.01, 0.001]

# nu varies slowest and gamma fastest, i.e. every gamma is tried for each nu.
for nu, gamma in ((n, g) for n in nu_list for g in gamma_list):
    learning(4688, df, nu, gamma)

svm.OneClassSVM(nu=0.1, kernel="rbf", gamma=0.1)
Training Correct: 1642
Training Error: 2039
True Negative: 669
False Positive: 208
True Positive: 99
False Negative: 2
Recall: 0.9801980198019802
Precision: 0.32247557003257327
Specificity: 0.7628278221208666
Accuracy: 0.7852760736196319
F_Value: 0.4852941176470588
N: 4659

svm.OneClassSVM(nu=0.1, kernel="rbf", gamma=0.01)
Training Correct: 2974
Training Error: 707
True Negative: 618
False Positive: 259
True Positive: 95
False Negative: 6
Recall: 0.9405940594059405
Precision: 0.268361581920904
Specificity: 0.7046750285062714
Accuracy: 0.7290388548057259
F_Value: 0.4175824175824176
N: 4659

svm.OneClassSVM(nu=0.1, kernel="rbf", gamma=0.001)
Training Correct: 2136
Training Error: 1545
True Negative: 723
False Positive: 154
True Positive: 96
False Negative: 5
Recall: 0.9504950495049505
Precision: 0.384
Specificity: 0.8244013683010262
Accuracy: 0.8374233128834356
F_Value: 0.5470085470085471
N: 4659

svm.OneClassSVM(nu=0.01, kernel="rbf", gam

In [45]:
# Single run for event ID 4688 with one fixed (nu, gamma) pair.
learning(4688, df, nu=0.01, gamma=0.001)

svm.OneClassSVM(nu=0.01, kernel="rbf", gamma=0.001)
Training Correct: 3574
Training Error: 107
True Negative: 867
False Positive: 10
True Positive: 90
False Negative: 11
Recall: 0.8910891089108911
Precision: 0.9
Specificity: 0.9885974914481186
Accuracy: 0.9785276073619632
F_Value: 0.8955223880597015
N: 4659



In [47]:
# Single run for event ID 4674 with one fixed (nu, gamma) pair.
learning(4674, df, nu=0.001, gamma=0.1)

svm.OneClassSVM(nu=0.001, kernel="rbf", gamma=0.1)
Training Correct: 401
Training Error: 72
True Negative: 41
False Positive: 0
True Positive: 148
False Negative: 4
Recall: 0.9736842105263158
Precision: 1.0
Specificity: 1.0
Accuracy: 0.9792746113989638
F_Value: 0.9866666666666667
N: 666

