In [2]:
%load_ext autoreload
%autoreload

import sys

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier

# Add the shared helper directory to the import path. The path is relative,
# so it resolves against the notebook's working directory.
sys.path.append('../../utils')
# thomas_parser wraps the Thomas dataset paths before they are handed to
# pd.read_csv in the "Test with Thomas datasets" cells.
from utils import thomas_parser
# convert is applied to the feature frame in the "Only boolean values" cells.
# NOTE(review): presumably turns features into booleans, with the flag choosing
# between all / most important features — confirm in toBoolean.py.
from toBoolean import convert

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


### Comparison between different kinds of datasets

Let's first look at the performance on the default dataset, which means: a threshold of 3 out of 5 detectors, a limit of 8000 malware samples, all features considered, all detectors considered, and detector errors treated as a non-packed result.

In [3]:
# Baseline run on the control dataset.
gt = pd.read_csv('../../../dumps/dataset_variations/control_8000_false_3.csv')
target = gt['label']
# Every column except the label is used as a feature.
cols = [name for name in gt.columns if name != 'label']
data = gt[cols]

# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
data_train, data_test, target_train, target_test = train_test_split(
    data, target, test_size=0.20, random_state=0
)

tree = DecisionTreeClassifier(
    max_depth=7,
    min_samples_split=10,
    min_samples_leaf=7,
    random_state=0,
)
tree.fit(data_train, target_train)

train_accuracy = tree.score(data_train, target_train)
print("Accuracy on training set: {:.3f}".format(train_accuracy))
test_accuracy = tree.score(data_test, target_test)
print("Accuracy on test set: {:.3f}".format(test_accuracy))

Accuracy on training set: 0.993
Accuracy on test set: 0.990


#### Different thresholds

In [4]:
# Threshold variation: default_1.csv
# (presumably a threshold of 1 detector, judging by the file name — confirm).
gt = pd.read_csv('../../../dumps/dataset_variations/default_1.csv')
target = gt['label']
# Every column except the label is used as a feature.
cols = [name for name in gt.columns if name != 'label']
data = gt[cols]

# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
data_train, data_test, target_train, target_test = train_test_split(
    data, target, test_size=0.20, random_state=0
)

tree = DecisionTreeClassifier(
    max_depth=7,
    min_samples_split=10,
    min_samples_leaf=7,
    random_state=0,
)
tree.fit(data_train, target_train)

train_accuracy = tree.score(data_train, target_train)
print("Accuracy on training set: {:.3f}".format(train_accuracy))
test_accuracy = tree.score(data_test, target_test)
print("Accuracy on test set: {:.3f}".format(test_accuracy))

Accuracy on training set: 0.978
Accuracy on test set: 0.975


In [5]:
# Threshold variation: default_2.csv
# (presumably a threshold of 2 detectors, judging by the file name — confirm).
gt = pd.read_csv('../../../dumps/dataset_variations/default_2.csv')
target = gt['label']
# Every column except the label is used as a feature.
cols = [name for name in gt.columns if name != 'label']
data = gt[cols]

# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
data_train, data_test, target_train, target_test = train_test_split(
    data, target, test_size=0.20, random_state=0
)

tree = DecisionTreeClassifier(
    max_depth=7,
    min_samples_split=10,
    min_samples_leaf=7,
    random_state=0,
)
tree.fit(data_train, target_train)

train_accuracy = tree.score(data_train, target_train)
print("Accuracy on training set: {:.3f}".format(train_accuracy))
test_accuracy = tree.score(data_test, target_test)
print("Accuracy on test set: {:.3f}".format(test_accuracy))

Accuracy on training set: 0.991
Accuracy on test set: 0.988


In [6]:
# Threshold variation: default_4.csv
# (presumably a threshold of 4 detectors, judging by the file name — confirm).
gt = pd.read_csv('../../../dumps/dataset_variations/default_4.csv')
target = gt['label']
# Every column except the label is used as a feature.
cols = [name for name in gt.columns if name != 'label']
data = gt[cols]

# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
data_train, data_test, target_train, target_test = train_test_split(
    data, target, test_size=0.20, random_state=0
)

tree = DecisionTreeClassifier(
    max_depth=7,
    min_samples_split=10,
    min_samples_leaf=7,
    random_state=0,
)
tree.fit(data_train, target_train)

train_accuracy = tree.score(data_train, target_train)
print("Accuracy on training set: {:.3f}".format(train_accuracy))
test_accuracy = tree.score(data_test, target_test)
print("Accuracy on test set: {:.3f}".format(test_accuracy))

Accuracy on training set: 0.990
Accuracy on test set: 0.987


In [7]:
# Threshold variation: default_5.csv
# (presumably a threshold of 5 detectors, judging by the file name — confirm).
gt = pd.read_csv('../../../dumps/dataset_variations/default_5.csv')
target = gt['label']
# Every column except the label is used as a feature.
cols = [name for name in gt.columns if name != 'label']
data = gt[cols]

# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
data_train, data_test, target_train, target_test = train_test_split(
    data, target, test_size=0.20, random_state=0
)

tree = DecisionTreeClassifier(
    max_depth=7,
    min_samples_split=10,
    min_samples_leaf=7,
    random_state=0,
)
tree.fit(data_train, target_train)

train_accuracy = tree.score(data_train, target_train)
print("Accuracy on training set: {:.3f}".format(train_accuracy))
test_accuracy = tree.score(data_test, target_test)
print("Accuracy on test set: {:.3f}".format(test_accuracy))

Accuracy on training set: 0.990
Accuracy on test set: 0.987


#### Errors considered as a positive result from the detector

In [8]:
# Dataset variant in which detector errors count as a "packed" verdict
# (error_as_packed.csv).
gt = pd.read_csv('../../../dumps/dataset_variations/error_as_packed.csv')
target = gt['label']
# Every column except the label is used as a feature.
cols = [name for name in gt.columns if name != 'label']
data = gt[cols]

# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
data_train, data_test, target_train, target_test = train_test_split(
    data, target, test_size=0.20, random_state=0
)

tree = DecisionTreeClassifier(
    max_depth=7,
    min_samples_split=10,
    min_samples_leaf=7,
    random_state=0,
)
tree.fit(data_train, target_train)

train_accuracy = tree.score(data_train, target_train)
print("Accuracy on training set: {:.3f}".format(train_accuracy))
test_accuracy = tree.score(data_test, target_test)
print("Accuracy on test set: {:.3f}".format(test_accuracy))

Accuracy on training set: 0.991
Accuracy on test set: 0.988


#### Check influence of each detector

In [9]:
# Detector influence: not_die.csv
# (presumably built without the DIE detector — confirm against the dump script).
gt = pd.read_csv('../../../dumps/dataset_variations/not_die.csv')
target = gt['label']
# Every column except the label is used as a feature.
cols = [name for name in gt.columns if name != 'label']
data = gt[cols]

# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
data_train, data_test, target_train, target_test = train_test_split(
    data, target, test_size=0.20, random_state=0
)

tree = DecisionTreeClassifier(
    max_depth=7,
    min_samples_split=10,
    min_samples_leaf=7,
    random_state=0,
)
tree.fit(data_train, target_train)

train_accuracy = tree.score(data_train, target_train)
print("Accuracy on training set: {:.3f}".format(train_accuracy))
test_accuracy = tree.score(data_test, target_test)
print("Accuracy on test set: {:.3f}".format(test_accuracy))

Accuracy on training set: 0.992
Accuracy on test set: 0.988


In [10]:
# Detector influence: not_cisco.csv
# (presumably built without the Cisco detector — confirm against the dump script).
gt = pd.read_csv('../../../dumps/dataset_variations/not_cisco.csv')
target = gt['label']
# Every column except the label is used as a feature.
cols = [name for name in gt.columns if name != 'label']
data = gt[cols]

# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
data_train, data_test, target_train, target_test = train_test_split(
    data, target, test_size=0.20, random_state=0
)

tree = DecisionTreeClassifier(
    max_depth=7,
    min_samples_split=10,
    min_samples_leaf=7,
    random_state=0,
)
tree.fit(data_train, target_train)

train_accuracy = tree.score(data_train, target_train)
print("Accuracy on training set: {:.3f}".format(train_accuracy))
test_accuracy = tree.score(data_test, target_test)
print("Accuracy on test set: {:.3f}".format(test_accuracy))

Accuracy on training set: 0.994
Accuracy on test set: 0.990


In [11]:
# Detector influence: not_manalyze.csv
# (presumably built without the Manalyze detector — confirm against the dump script).
gt = pd.read_csv('../../../dumps/dataset_variations/not_manalyze.csv')
target = gt['label']
# Every column except the label is used as a feature.
cols = [name for name in gt.columns if name != 'label']
data = gt[cols]

# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
data_train, data_test, target_train, target_test = train_test_split(
    data, target, test_size=0.20, random_state=0
)

tree = DecisionTreeClassifier(
    max_depth=7,
    min_samples_split=10,
    min_samples_leaf=7,
    random_state=0,
)
tree.fit(data_train, target_train)

train_accuracy = tree.score(data_train, target_train)
print("Accuracy on training set: {:.3f}".format(train_accuracy))
test_accuracy = tree.score(data_test, target_test)
print("Accuracy on test set: {:.3f}".format(test_accuracy))

Accuracy on training set: 0.990
Accuracy on test set: 0.983


In [12]:
# Detector influence: not_peid.csv
# (presumably built without the PEiD detector — confirm against the dump script).
gt = pd.read_csv('../../../dumps/dataset_variations/not_peid.csv')
target = gt['label']
# Every column except the label is used as a feature.
cols = [name for name in gt.columns if name != 'label']
data = gt[cols]

# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
data_train, data_test, target_train, target_test = train_test_split(
    data, target, test_size=0.20, random_state=0
)

tree = DecisionTreeClassifier(
    max_depth=7,
    min_samples_split=10,
    min_samples_leaf=7,
    random_state=0,
)
tree.fit(data_train, target_train)

train_accuracy = tree.score(data_train, target_train)
print("Accuracy on training set: {:.3f}".format(train_accuracy))
test_accuracy = tree.score(data_test, target_test)
print("Accuracy on test set: {:.3f}".format(test_accuracy))

Accuracy on training set: 0.990
Accuracy on test set: 0.982


In [13]:
# Detector influence: not_peframe.csv
# (presumably built without the PEframe detector — confirm against the dump script).
gt = pd.read_csv('../../../dumps/dataset_variations/not_peframe.csv')
target = gt['label']
# Every column except the label is used as a feature.
cols = [name for name in gt.columns if name != 'label']
data = gt[cols]

# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
data_train, data_test, target_train, target_test = train_test_split(
    data, target, test_size=0.20, random_state=0
)

tree = DecisionTreeClassifier(
    max_depth=7,
    min_samples_split=10,
    min_samples_leaf=7,
    random_state=0,
)
tree.fit(data_train, target_train)

train_accuracy = tree.score(data_train, target_train)
print("Accuracy on training set: {:.3f}".format(train_accuracy))
test_accuracy = tree.score(data_test, target_test)
print("Accuracy on test set: {:.3f}".format(test_accuracy))

Accuracy on training set: 0.991
Accuracy on test set: 0.988


#### Only boolean values (either most important or all features considered)

In [14]:
# Control dataset with the features passed through convert(..., False).
# NOTE(review): the meaning of the boolean flag is defined in toBoolean.py —
# presumably it selects all features vs. only the most important ones; confirm.
gt = pd.read_csv('../../../dumps/dataset_variations/control_8000_false_3.csv')
# Every column except the label is used as a feature.
cols = [name for name in gt.columns if name != 'label']
data = convert(gt[cols], False)
target = gt['label']

# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
data_train, data_test, target_train, target_test = train_test_split(
    data, target, test_size=0.20, random_state=0
)

tree = DecisionTreeClassifier(
    max_depth=7,
    min_samples_split=10,
    min_samples_leaf=7,
    random_state=0,
)
tree.fit(data_train, target_train)

train_accuracy = tree.score(data_train, target_train)
print("Accuracy on training set: {:.3f}".format(train_accuracy))
test_accuracy = tree.score(data_test, target_test)
print("Accuracy on test set: {:.3f}".format(test_accuracy))

Accuracy on training set: 0.978
Accuracy on test set: 0.973


In [15]:
# Control dataset with the features passed through convert(..., True).
# NOTE(review): the meaning of the boolean flag is defined in toBoolean.py —
# presumably it selects all features vs. only the most important ones; confirm.
gt = pd.read_csv('../../../dumps/dataset_variations/control_8000_false_3.csv')
# Every column except the label is used as a feature.
cols = [name for name in gt.columns if name != 'label']
data = convert(gt[cols], True)
target = gt['label']

# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
data_train, data_test, target_train, target_test = train_test_split(
    data, target, test_size=0.20, random_state=0
)

tree = DecisionTreeClassifier(
    max_depth=7,
    min_samples_split=10,
    min_samples_leaf=7,
    random_state=0,
)
tree.fit(data_train, target_train)

train_accuracy = tree.score(data_train, target_train)
print("Accuracy on training set: {:.3f}".format(train_accuracy))
test_accuracy = tree.score(data_test, target_test)
print("Accuracy on test set: {:.3f}".format(test_accuracy))

Accuracy on training set: 0.987
Accuracy on test set: 0.985


#### Agreement on same packer

In [16]:
# NOTE(review): this cell loads the same control CSV and applies the same
# convert(..., True) step as the second "Only boolean values" cell, so it
# reproduces that result exactly. It does not appear to use a dataset specific
# to "agreement on same packer" — TODO confirm which file was intended here.
gt = pd.read_csv('../../../dumps/dataset_variations/control_8000_false_3.csv')
# Every column except the label is used as a feature.
cols = [name for name in gt.columns if name != 'label']
data = convert(gt[cols], True)
target = gt['label']

# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
data_train, data_test, target_train, target_test = train_test_split(
    data, target, test_size=0.20, random_state=0
)

tree = DecisionTreeClassifier(
    max_depth=7,
    min_samples_split=10,
    min_samples_leaf=7,
    random_state=0,
)
tree.fit(data_train, target_train)

train_accuracy = tree.score(data_train, target_train)
print("Accuracy on training set: {:.3f}".format(train_accuracy))
test_accuracy = tree.score(data_test, target_test)
print("Accuracy on test set: {:.3f}".format(test_accuracy))

Accuracy on training set: 0.987
Accuracy on test set: 0.985


#### Test with Thomas datasets

In [17]:
# Thomas dataset, 2019-08: thomas_parser handles the raw ".Merged" path and its
# result is what pd.read_csv loads.
thomas_source = thomas_parser("../../../dumps/thomas_datasets/2019-08.Merged")
gt = pd.read_csv(thomas_source)
target = gt['label']
# Every column except the label is used as a feature.
cols = [name for name in gt.columns if name != 'label']
data = gt[cols]

# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
data_train, data_test, target_train, target_test = train_test_split(
    data, target, test_size=0.20, random_state=0
)

tree = DecisionTreeClassifier(
    max_depth=7,
    min_samples_split=10,
    min_samples_leaf=7,
    random_state=0,
)
tree.fit(data_train, target_train)

train_accuracy = tree.score(data_train, target_train)
print("Accuracy on training set: {:.3f}".format(train_accuracy))
test_accuracy = tree.score(data_test, target_test)
print("Accuracy on test set: {:.3f}".format(test_accuracy))

Accuracy on training set: 0.987
Accuracy on test set: 0.986


In [18]:
# Thomas dataset, 2019-09: thomas_parser handles the raw ".Merged" path and its
# result is what pd.read_csv loads.
thomas_source = thomas_parser("../../../dumps/thomas_datasets/2019-09.Merged")
gt = pd.read_csv(thomas_source)
target = gt['label']
# Every column except the label is used as a feature.
cols = [name for name in gt.columns if name != 'label']
data = gt[cols]

# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
data_train, data_test, target_train, target_test = train_test_split(
    data, target, test_size=0.20, random_state=0
)

tree = DecisionTreeClassifier(
    max_depth=7,
    min_samples_split=10,
    min_samples_leaf=7,
    random_state=0,
)
tree.fit(data_train, target_train)

train_accuracy = tree.score(data_train, target_train)
print("Accuracy on training set: {:.3f}".format(train_accuracy))
test_accuracy = tree.score(data_test, target_test)
print("Accuracy on test set: {:.3f}".format(test_accuracy))

Accuracy on training set: 0.991
Accuracy on test set: 0.987
