# Dataset variations

In [1]:
%load_ext autoreload
%autoreload

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB, MultinomialNB, BernoulliNB
from sklearn.preprocessing import StandardScaler, MinMaxScaler

import sys
sys.path.append('../utils')
from utils import perf

In [32]:
# Control dataset, 'gaussian' option, third argument False.
# NOTE(review): perf comes from ../utils; argument meanings are not visible here.
perf(
    '../../dumps/dataset_variations/control_8000_false_3.csv',
    'gaussian',
    False,
)

Most important : 

Accuracy on training set: 0.627
Accuracy on test set: 0.607


In [33]:
# Control dataset, 'bernoulli' option, third argument False.
# NOTE(review): perf comes from ../utils; argument meanings are not visible here.
perf(
    '../../dumps/dataset_variations/control_8000_false_3.csv',
    'bernoulli',
    False,
)

Most important : 

Accuracy on training set: 0.869
Accuracy on test set: 0.851


## Different thresholds


### Gaussian

In [34]:
# default_1.csv, 'gaussian' option, third argument False.
perf(
    '../../dumps/dataset_variations/default_1.csv',
    'gaussian',
    False,
)

Most important : 

Accuracy on training set: 0.438
Accuracy on test set: 0.430


In [35]:
# default_2.csv, 'gaussian' option, third argument False.
perf(
    '../../dumps/dataset_variations/default_2.csv',
    'gaussian',
    False,
)

Most important : 

Accuracy on training set: 0.627
Accuracy on test set: 0.613


In [36]:
# default_4.csv, 'gaussian' option, third argument False.
perf(
    '../../dumps/dataset_variations/default_4.csv',
    'gaussian',
    False,
)

Most important : 

Accuracy on training set: 0.864
Accuracy on test set: 0.851


In [37]:
# default_5.csv, 'gaussian' option, third argument False.
perf(
    '../../dumps/dataset_variations/default_5.csv',
    'gaussian',
    False,
)

Most important : 

Accuracy on training set: 0.864
Accuracy on test set: 0.851


### Bernoulli

In [38]:
# default_1.csv, 'bernoulli' option, third argument False.
perf(
    '../../dumps/dataset_variations/default_1.csv',
    'bernoulli',
    False,
)

Most important : 

Accuracy on training set: 0.893
Accuracy on test set: 0.894


In [39]:
# default_2.csv, 'bernoulli' option, third argument False.
perf(
    '../../dumps/dataset_variations/default_2.csv',
    'bernoulli',
    False,
)

Most important : 

Accuracy on training set: 0.889
Accuracy on test set: 0.874


In [40]:
# default_4.csv, 'bernoulli' option, third argument False.
perf(
    '../../dumps/dataset_variations/default_4.csv',
    'bernoulli',
    False,
)

Most important : 

Accuracy on training set: 0.949
Accuracy on test set: 0.946


In [41]:
# default_5.csv, 'bernoulli' option, third argument False.
perf(
    '../../dumps/dataset_variations/default_5.csv',
    'bernoulli',
    False,
)

Most important : 

Accuracy on training set: 0.949
Accuracy on test set: 0.946


## Detector errors treated as a positive (packed) result


### Gaussian

In [42]:
# error_as_packed.csv, 'gaussian' option, third argument False.
perf(
    '../../dumps/dataset_variations/error_as_packed.csv',
    'gaussian',
    False,
)

Most important : 

Accuracy on training set: 0.616
Accuracy on test set: 0.605


### Bernoulli

In [43]:
# error_as_packed.csv, 'bernoulli' option, third argument False.
perf(
    '../../dumps/dataset_variations/error_as_packed.csv',
    'bernoulli',
    False,
)

Most important : 

Accuracy on training set: 0.889
Accuracy on test set: 0.874


## Check influence of each detector

### Gaussian

In [2]:
# not_die.csv, 'gaussian' option, third argument False.
perf(
    '../../dumps/dataset_variations/not_die.csv',
    'gaussian',
    False,
)

Most important : 

Accuracy on training set: 0.615
Accuracy on test set: 0.589


In [3]:
# not_cisco.csv, 'gaussian' option, third argument False.
perf(
    '../../dumps/dataset_variations/not_cisco.csv',
    'gaussian',
    False,
)

Most important : 

Accuracy on training set: 0.898
Accuracy on test set: 0.894


In [46]:
# not_manalyze.csv, 'gaussian' option, third argument False.
perf(
    '../../dumps/dataset_variations/not_manalyze.csv',
    'gaussian',
    False,
)

Most important : 

Accuracy on training set: 0.842
Accuracy on test set: 0.826


In [47]:
# not_peid.csv, 'gaussian' option, third argument False.
perf(
    '../../dumps/dataset_variations/not_peid.csv',
    'gaussian',
    False,
)

Most important : 

Accuracy on training set: 0.863
Accuracy on test set: 0.848


In [48]:
# not_peframe.csv, 'gaussian' option, third argument False.
perf(
    '../../dumps/dataset_variations/not_peframe.csv',
    'gaussian',
    False,
)

Most important : 

Accuracy on training set: 0.865
Accuracy on test set: 0.852


### Bernoulli

In [49]:
# not_die.csv, 'bernoulli' option, third argument False.
perf(
    '../../dumps/dataset_variations/not_die.csv',
    'bernoulli',
    False,
)

Most important : 

Accuracy on training set: 0.872
Accuracy on test set: 0.849


In [50]:
# not_cisco.csv, 'bernoulli' option, third argument False.
perf(
    '../../dumps/dataset_variations/not_cisco.csv',
    'bernoulli',
    False,
)

Most important : 

Accuracy on training set: 0.926
Accuracy on test set: 0.921


In [51]:
# not_manalyze.csv, 'bernoulli' option, third argument False.
perf(
    '../../dumps/dataset_variations/not_manalyze.csv',
    'bernoulli',
    False,
)

Most important : 

Accuracy on training set: 0.939
Accuracy on test set: 0.936


In [52]:
# not_peframe.csv, 'bernoulli' option, third argument False.
perf(
    '../../dumps/dataset_variations/not_peframe.csv',
    'bernoulli',
    False,
)

Most important : 

Accuracy on training set: 0.950
Accuracy on test set: 0.947


In [53]:
# not_peid.csv, 'bernoulli' option, third argument False.
perf(
    '../../dumps/dataset_variations/not_peid.csv',
    'bernoulli',
    False,
)

Most important : 

Accuracy on training set: 0.946
Accuracy on test set: 0.943


## Only boolean values

### Gaussian

In [54]:
# Control dataset, 'gaussian' option, third argument True.
# With True the recorded output also contains an "All features" section.
perf(
    '../../dumps/dataset_variations/control_8000_false_3.csv',
    'gaussian',
    True,
)

Most important : 

Accuracy on training set: 0.753
Accuracy on test set: 0.731
---------------

All features : 

Accuracy on training set: 0.768
Accuracy on test set: 0.751


### Bernoulli

In [55]:
# Control dataset, 'bernoulli' option, third argument True.
# With True the recorded output also contains an "All features" section.
perf(
    '../../dumps/dataset_variations/control_8000_false_3.csv',
    'bernoulli',
    True,
)

Most important : 

Accuracy on training set: 0.926
Accuracy on test set: 0.914
---------------

All features : 

Accuracy on training set: 0.926
Accuracy on test set: 0.918


## Agreement on same packer

### Gaussian

In [3]:
# default_agreement.csv, 'gaussian' option, third argument False.
perf(
    '../../dumps/dataset_variations/default_agreement.csv',
    'gaussian',
    False,
)

Most important : 

Accuracy on training set: 0.930
Accuracy on test set: 0.929


### Bernoulli

In [4]:
# default_agreement.csv, 'bernoulli' option, third argument False.
perf(
    '../../dumps/dataset_variations/default_agreement.csv',
    'bernoulli',
    False,
)

Most important : 

Accuracy on training set: 0.953
Accuracy on test set: 0.955


## Test with Thomas datasets

In [2]:
# Compare three Naive Bayes variants on the 2019-08 Thomas dataset.
# Every column except 'label' is used as a feature; 'label' is the target.
gt = pd.read_csv("../../dumps/thomas_datasets/2019-08.Merged_thomas.csv")
cols = [col for col in gt.columns if col != 'label']

# --- GaussianNB / BernoulliNB on standardized features ----------------------
data_train, data_test, target_train, target_test = train_test_split(
    gt[cols], gt['label'], test_size=0.20, random_state=0
)

# The scaler is fitted on the training split only and then applied to both
# splits. The scaled arrays get their own names so the raw splits stay intact.
scaler = StandardScaler().fit(data_train)
data_train_scaled = scaler.transform(data_train)
data_test_scaled = scaler.transform(data_test)

# NOTE(review): BernoulliNB binarizes its input at 0.0 by default, so on
# standardized features it thresholds each column at its training mean.
# Confirm this is the intended encoding.
for model in (GaussianNB(), BernoulliNB()):
    model.fit(data_train_scaled, target_train)
    print("Accuracy on training set: {:.3f}".format(model.score(data_train_scaled, target_train)))
    print("Accuracy on test set: {:.3f}".format(model.score(data_test_scaled, target_test)))

# --- MultinomialNB on raw, non-negative features ----------------------------
# MultinomialNB rejects negative feature values, so rows holding a negative
# value in any feature column are removed (one vectorized mask instead of a
# per-column drop loop) and the unscaled features are used.
has_negative = (gt[cols] < 0).any(axis=1)
gt = gt.loc[~has_negative]
data = gt[cols]
target = gt['label']

data_train, data_test, target_train, target_test = train_test_split(
    data, target, test_size=0.20, random_state=0
)

model = MultinomialNB()
model.fit(data_train, target_train)
print("Accuracy on training set: {:.3f}".format(model.score(data_train, target_train)))
print("Accuracy on test set: {:.3f}".format(model.score(data_test, target_test)))

Accuracy on training set: 0.886
Accuracy on test set: 0.890
Accuracy on training set: 0.903
Accuracy on test set: 0.901
Accuracy on training set: 0.248
Accuracy on test set: 0.244


In [3]:
# Compare three Naive Bayes variants on the 2019-09 Thomas dataset.
# Every column except 'label' is used as a feature; 'label' is the target.
gt = pd.read_csv("../../dumps/thomas_datasets/2019-09.Merged_thomas.csv")
cols = [col for col in gt.columns if col != 'label']

# --- GaussianNB / BernoulliNB on standardized features ----------------------
data_train, data_test, target_train, target_test = train_test_split(
    gt[cols], gt['label'], test_size=0.20, random_state=0
)

# The scaler is fitted on the training split only and then applied to both
# splits. The scaled arrays get their own names so the raw splits stay intact.
scaler = StandardScaler().fit(data_train)
data_train_scaled = scaler.transform(data_train)
data_test_scaled = scaler.transform(data_test)

# NOTE(review): BernoulliNB binarizes its input at 0.0 by default, so on
# standardized features it thresholds each column at its training mean.
# Confirm this is the intended encoding.
for model in (GaussianNB(), BernoulliNB()):
    model.fit(data_train_scaled, target_train)
    print("Accuracy on training set: {:.3f}".format(model.score(data_train_scaled, target_train)))
    print("Accuracy on test set: {:.3f}".format(model.score(data_test_scaled, target_test)))

# --- MultinomialNB on raw, non-negative features ----------------------------
# MultinomialNB rejects negative feature values, so rows holding a negative
# value in any feature column are removed (one vectorized mask instead of a
# per-column drop loop) and the unscaled features are used.
has_negative = (gt[cols] < 0).any(axis=1)
gt = gt.loc[~has_negative]
data = gt[cols]
target = gt['label']

data_train, data_test, target_train, target_test = train_test_split(
    data, target, test_size=0.20, random_state=0
)

model = MultinomialNB()
model.fit(data_train, target_train)
print("Accuracy on training set: {:.3f}".format(model.score(data_train, target_train)))
print("Accuracy on test set: {:.3f}".format(model.score(data_test, target_test)))

Accuracy on training set: 0.989
Accuracy on test set: 0.986
Accuracy on training set: 0.933
Accuracy on test set: 0.930
Accuracy on training set: 0.223
Accuracy on test set: 0.218
