# Dataset variations

In [2]:
%load_ext autoreload
%autoreload

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB, MultinomialNB, BernoulliNB
from sklearn.preprocessing import StandardScaler, MinMaxScaler

import sys
sys.path.append('../utils')
from utils import perf

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [3]:
# Baseline: 'gaussian' model on the T3 dataset variant. `perf` is the project helper
# imported from ../utils; it prints the training and test accuracy shown in the output.
perf('../../dumps/dataset_variations/16K-19-05-T3.csv','gaussian',False)

Most important : 

Accuracy on training set: 0.905
Accuracy on test set: 0.907


In [4]:
# Baseline: 'bernoulli' model on the T3 dataset variant.
perf('../../dumps/dataset_variations/16K-19-05-T3.csv','bernoulli',False)

Most important : 

Accuracy on training set: 0.945
Accuracy on test set: 0.945


## Different thresholds


### Gaussian

In [5]:
# 'gaussian' model on the T1 dataset variant (presumably threshold 1 — confirm against the dump naming).
perf('../../dumps/dataset_variations/16K-19-05-T1.csv','gaussian',False)

Most important : 

Accuracy on training set: 0.772
Accuracy on test set: 0.768


In [6]:
# 'gaussian' model on the T2 dataset variant (presumably threshold 2 — confirm against the dump naming).
perf('../../dumps/dataset_variations/16K-19-05-T2.csv','gaussian',False)

Most important : 

Accuracy on training set: 0.684
Accuracy on test set: 0.675


In [7]:
# 'gaussian' model on the T4 dataset variant (presumably threshold 4 — confirm against the dump naming).
perf('../../dumps/dataset_variations/16K-19-05-T4.csv','gaussian',False)

Most important : 

Accuracy on training set: 0.951
Accuracy on test set: 0.953


In [8]:
# 'gaussian' model on the T5 dataset variant (presumably threshold 5 — confirm against the dump naming).
perf('../../dumps/dataset_variations/16K-19-05-T5.csv','gaussian',False)

Most important : 

Accuracy on training set: 0.974
Accuracy on test set: 0.974


### Bernoulli

In [9]:
# 'bernoulli' model on the T1 dataset variant (presumably threshold 1 — confirm against the dump naming).
perf('../../dumps/dataset_variations/16K-19-05-T1.csv','bernoulli',False)

Most important : 

Accuracy on training set: 0.880
Accuracy on test set: 0.881


In [10]:
# 'bernoulli' model on the T2 dataset variant (presumably threshold 2 — confirm against the dump naming).
perf('../../dumps/dataset_variations/16K-19-05-T2.csv','bernoulli',False)

Most important : 

Accuracy on training set: 0.940
Accuracy on test set: 0.942


In [11]:
# 'bernoulli' model on the T4 dataset variant (presumably threshold 4 — confirm against the dump naming).
perf('../../dumps/dataset_variations/16K-19-05-T4.csv','bernoulli',False)

Most important : 

Accuracy on training set: 0.988
Accuracy on test set: 0.987


In [12]:
# 'bernoulli' model on the T5 dataset variant (presumably threshold 5 — confirm against the dump naming).
perf('../../dumps/dataset_variations/16K-19-05-T5.csv','bernoulli',False)

Most important : 

Accuracy on training set: 0.992
Accuracy on test set: 0.993


## Errors considered as a positive result from the detector


### Gaussian

In [13]:
# 'gaussian' model on the "error" dataset variant (16K-22-05-error.csv).
perf('../../dumps/dataset_variations/16K-22-05-error.csv','gaussian',False)

Most important : 

Accuracy on training set: 0.901
Accuracy on test set: 0.900


### Bernoulli

In [14]:
# 'bernoulli' model on the "error" dataset variant (16K-22-05-error.csv).
perf('../../dumps/dataset_variations/16K-22-05-error.csv','bernoulli',False)

Most important : 

Accuracy on training set: 0.945
Accuracy on test set: 0.945


## Check influence of each detector

### Gaussian

In [15]:
# 'gaussian' model on the DIE dataset variant.
# NOTE(review): whether this detector is isolated or left out in the CSV is not visible here — confirm.
perf('../../dumps/dataset_variations/16K-22-05-DIE.csv','gaussian',False)

Most important : 

Accuracy on training set: 0.906
Accuracy on test set: 0.894


In [16]:
# 'gaussian' model on the CISCO dataset variant.
# NOTE(review): whether this detector is isolated or left out in the CSV is not visible here — confirm.
perf('../../dumps/dataset_variations/16K-22-05-CISCO.csv','gaussian',False)

Most important : 

Accuracy on training set: 0.893
Accuracy on test set: 0.889


In [17]:
# 'gaussian' model on the MANALYZE dataset variant.
# NOTE(review): whether this detector is isolated or left out in the CSV is not visible here — confirm.
perf('../../dumps/dataset_variations/16K-22-05-MANALYZE.csv','gaussian',False)

Most important : 

Accuracy on training set: 0.909
Accuracy on test set: 0.910


In [18]:
# 'gaussian' model on the PEID dataset variant.
# NOTE(review): whether this detector is isolated or left out in the CSV is not visible here — confirm.
perf('../../dumps/dataset_variations/16K-22-05-PEID.csv','gaussian',False)

Most important : 

Accuracy on training set: 0.919
Accuracy on test set: 0.917


In [19]:
# 'gaussian' model on the PEFRAME dataset variant.
# NOTE(review): whether this detector is isolated or left out in the CSV is not visible here — confirm.
perf('../../dumps/dataset_variations/16K-22-05-PEFRAME.csv','gaussian',False)

Most important : 

Accuracy on training set: 0.936
Accuracy on test set: 0.940


### Bernoulli

In [20]:
# 'bernoulli' model on the DIE dataset variant.
# NOTE(review): whether this detector is isolated or left out in the CSV is not visible here — confirm.
perf('../../dumps/dataset_variations/16K-22-05-DIE.csv','bernoulli',False)

Most important : 

Accuracy on training set: 0.943
Accuracy on test set: 0.942


In [21]:
# 'bernoulli' model on the CISCO dataset variant.
# NOTE(review): whether this detector is isolated or left out in the CSV is not visible here — confirm.
perf('../../dumps/dataset_variations/16K-22-05-CISCO.csv','bernoulli',False)

Most important : 

Accuracy on training set: 0.987
Accuracy on test set: 0.985


In [22]:
# 'bernoulli' model on the MANALYZE dataset variant.
# NOTE(review): whether this detector is isolated or left out in the CSV is not visible here — confirm.
perf('../../dumps/dataset_variations/16K-22-05-MANALYZE.csv','bernoulli',False)

Most important : 

Accuracy on training set: 0.941
Accuracy on test set: 0.940


In [29]:
# 'bernoulli' model on the PEID dataset variant.
# NOTE(review): whether this detector is isolated or left out in the CSV is not visible here — confirm.
perf('../../dumps/dataset_variations/16K-22-05-PEID.csv','bernoulli',False)

Most important : 

Accuracy on training set: 0.990
Accuracy on test set: 0.987


In [30]:
# 'bernoulli' model on the PEFRAME dataset variant.
# NOTE(review): whether this detector is isolated or left out in the CSV is not visible here — confirm.
perf('../../dumps/dataset_variations/16K-22-05-PEFRAME.csv','bernoulli',False)

Most important : 

Accuracy on training set: 0.991
Accuracy on test set: 0.992


## Only boolean values

### Gaussian

In [25]:
# 'gaussian' model on the T3 dataset variant with the third argument set to True
# (it is False in the other perf calls of this notebook). The recorded output for this
# call additionally contains an "All features" section; what the flag selects is
# defined in utils.perf — confirm there.
perf('../../dumps/dataset_variations/16K-19-05-T3.csv','gaussian',True)

Most important : 

Accuracy on training set: 0.916
Accuracy on test set: 0.915
---------------

All features : 

Accuracy on training set: 0.979
Accuracy on test set: 0.974


### Bernoulli

In [26]:
# 'bernoulli' model on the T3 dataset variant with the third argument set to True
# (it is False in the other perf calls of this notebook). The recorded output for this
# call additionally contains an "All features" section; what the flag selects is
# defined in utils.perf — confirm there.
perf('../../dumps/dataset_variations/16K-19-05-T3.csv','bernoulli',True)

Most important : 

Accuracy on training set: 0.913
Accuracy on test set: 0.914
---------------

All features : 

Accuracy on training set: 0.948
Accuracy on test set: 0.947


## Agreement on the same packer

### Gaussian

In [27]:
# 'gaussian' model on the "agreement" dataset variant (16K-22-05-agreement.csv).
perf('../../dumps/dataset_variations/16K-22-05-agreement.csv','gaussian',False)

Most important : 

Accuracy on training set: 0.908
Accuracy on test set: 0.908


### Bernoulli

In [31]:
# 'bernoulli' model on the "agreement" dataset variant (16K-22-05-agreement.csv).
perf('../../dumps/dataset_variations/16K-22-05-agreement.csv','bernoulli',False)

Most important : 

Accuracy on training set: 0.947
Accuracy on test set: 0.947


## Test with Thomas datasets

In [28]:
# Compare three naive Bayes variants on the 2019-08 merged dataset.
# Output order: GaussianNB, BernoulliNB, MultinomialNB — each prints its training
# accuracy followed by its test accuracy.
gt = pd.read_csv("../../dumps/thomas_datasets/2019-08.Merged_thomas.csv")
cols = [col for col in gt.columns if col != 'label']
data = gt[cols]
target = gt['label']

data_train, data_test, target_train, target_test = train_test_split(
    data, target, test_size=0.20, random_state=0)

# Standardize using statistics learned from the training split only, then apply
# the same transform to the test split.
scaler = StandardScaler()
data_train = scaler.fit_transform(data_train)
data_test = scaler.transform(data_test)

# Both models are trained on the standardized features.
# NOTE: BernoulliNB binarizes its input at 0.0 by default, which on standardized
# features means "above the training-split mean".
for clf in (GaussianNB(), BernoulliNB()):
    clf.fit(data_train, target_train)
    print("Accuracy on training set: {:.3f}".format(clf.score(data_train, target_train)))
    print("Accuracy on test set: {:.3f}".format(clf.score(data_test, target_test)))

# MultinomialNB rejects negative feature values, so it is trained on the raw
# (unscaled) features, keeping only rows with no negative entry in any feature
# column. A single boolean mask replaces the per-column drop loop.
gt = gt[~(gt[cols] < 0).any(axis=1)]
data = gt[cols]
target = gt['label']

data_train, data_test, target_train, target_test = train_test_split(
    data, target, test_size=0.20, random_state=0)

clf = MultinomialNB()
clf.fit(data_train, target_train)
print("Accuracy on training set: {:.3f}".format(clf.score(data_train, target_train)))
print("Accuracy on test set: {:.3f}".format(clf.score(data_test, target_test)))

Accuracy on training set: 0.886
Accuracy on test set: 0.890
Accuracy on training set: 0.903
Accuracy on test set: 0.901
Accuracy on training set: 0.248
Accuracy on test set: 0.244


In [29]:
# Compare three naive Bayes variants on the 2019-09 merged dataset.
# Output order: GaussianNB, BernoulliNB, MultinomialNB — each prints its training
# accuracy followed by its test accuracy.
gt = pd.read_csv("../../dumps/thomas_datasets/2019-09.Merged_thomas.csv")
cols = [col for col in gt.columns if col != 'label']
data = gt[cols]
target = gt['label']

data_train, data_test, target_train, target_test = train_test_split(
    data, target, test_size=0.20, random_state=0)

# Standardize using statistics learned from the training split only, then apply
# the same transform to the test split.
scaler = StandardScaler()
data_train = scaler.fit_transform(data_train)
data_test = scaler.transform(data_test)

# Both models are trained on the standardized features.
# NOTE: BernoulliNB binarizes its input at 0.0 by default, which on standardized
# features means "above the training-split mean".
for clf in (GaussianNB(), BernoulliNB()):
    clf.fit(data_train, target_train)
    print("Accuracy on training set: {:.3f}".format(clf.score(data_train, target_train)))
    print("Accuracy on test set: {:.3f}".format(clf.score(data_test, target_test)))

# MultinomialNB rejects negative feature values, so it is trained on the raw
# (unscaled) features, keeping only rows with no negative entry in any feature
# column. A single boolean mask replaces the per-column drop loop.
gt = gt[~(gt[cols] < 0).any(axis=1)]
data = gt[cols]
target = gt['label']
data_train, data_test, target_train, target_test = train_test_split(
    data, target, test_size=0.20, random_state=0)

clf = MultinomialNB()
clf.fit(data_train, target_train)
print("Accuracy on training set: {:.3f}".format(clf.score(data_train, target_train)))
print("Accuracy on test set: {:.3f}".format(clf.score(data_test, target_test)))

Accuracy on training set: 0.989
Accuracy on test set: 0.986
Accuracy on training set: 0.933
Accuracy on test set: 0.930
Accuracy on training set: 0.223
Accuracy on test set: 0.218
