### Subset Distributions

In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.neighbors import KNeighborsClassifier
from mlxtend.feature_selection import SequentialFeatureSelector as SFS
from wakeful import log_munger, metrics, virus_total, pipelining, preprocessing
import matplotlib.pyplot as plt
import matplotlib.cm as cm
from bat.dataframe_to_matrix import DataFrameToMatrix
%matplotlib inline

#### dnscat2

In [2]:
%ls data/*.h5

data/dnscat2_2017_12_31_conn_test.h5
data/dnscat2_2017_12_31_conn_train.h5
data/dnscat2_2017_12_31_dns_test.h5
data/dnscat2_2017_12_31_dns_train.h5
data/iodine_forwarded_2017_12_31_conn_test.h5
data/iodine_forwarded_2017_12_31_conn_train.h5
data/iodine_forwarded_2017_12_31_dns_test.h5
data/iodine_forwarded_2017_12_31_dns_train.h5
data/iodine_raw_2017_12_31_conn_test.h5
data/iodine_raw_2017_12_31_conn_train.h5
data/iodine_raw_2017_12_31_dns_test.h5
data/iodine_raw_2017_12_31_dns_train.h5


In [3]:
df_train = log_munger.hdf5_to_df('dnscat2_2017_12_31_conn_train', './data')
df_test = log_munger.hdf5_to_df('dnscat2_2017_12_31_conn_test', './data')
print('train value counts:\n', df_train.label.value_counts())
print('test value counts:\n', df_test.label.value_counts())

train value counts:
 1    25660
0    20397
Name: label, dtype: int64
test value counts:
 0    20658
1     1509
Name: label, dtype: int64


In [4]:
'label' in df_train.columns, 'label' in df_test.columns

(True, True)

In [5]:
y_train = df_train.pop('label')
X_train = df_train
to_matrix = DataFrameToMatrix()
X_train_mat = to_matrix.fit_transform(X_train)


Normalizing column orig_bytes...
Normalizing column orig_ip_bytes...
Normalizing column orig_pkts...
Normalizing column resp_bytes...
Normalizing column resp_ip_bytes...
Normalizing column resp_pkts...
Normalizing column pcr...


In [6]:
y_test = df_test.pop('label')
X_test = df_test
to_matrix = DataFrameToMatrix()
X_test_mat = to_matrix.fit_transform(X_test)

Normalizing column orig_bytes...
Normalizing column orig_ip_bytes...
Normalizing column orig_pkts...
Normalizing column resp_bytes...
Normalizing column resp_ip_bytes...
Normalizing column resp_pkts...
Normalizing column pcr...


### PCA Feature Reduction

In [10]:
pca = PCA(n_components=6)
pca.fit(X_train_mat)
print(sum(pca.explained_variance_ratio_), '=', pca.explained_variance_ratio_)
print(pca.n_components_)
X_test_mat_pca = pca.fit_transform(X_test_mat)

0.999994366965 = [  5.18925032e-01   3.27810657e-01   9.43712562e-02   5.80887481e-02
   7.05145935e-04   9.35283651e-05]
6


In [11]:
X_test_mat_pca.shape

(22167, 6)

### SequentialFeature Reduction

In [13]:
df_train.columns

Index(['local_orig', 'local_resp', 'orig_bytes', 'orig_ip_bytes', 'orig_pkts',
       'resp_bytes', 'resp_ip_bytes', 'resp_pkts', 'pcr', 'is_ipv4_host',
       'is_ipv6_host', 'is_ipv4_resp', 'is_ipv6_resp'],
      dtype='object')

In [14]:
df_test.columns

Index(['local_orig', 'local_resp', 'orig_bytes', 'orig_ip_bytes', 'orig_pkts',
       'resp_bytes', 'resp_ip_bytes', 'resp_pkts', 'pcr', 'is_ipv4_host',
       'is_ipv6_host', 'is_ipv4_resp', 'is_ipv6_resp'],
      dtype='object')

In [12]:
pipelining.feature_selection_pipeline(train_df=df_train, test_df=df_test)

KeyError: 'label'

In [None]:
sns.pairplot(df_train_conn_dnscat2, hue='label')

### Distributions

In [None]:
%ls ./data

In [None]:
keys = [
        ('iodine-forwarded-2017-12-31-conn-test', 'iodine-forwarded-2017-12-31-conn-train'),
        ('iodine-raw-2017-12-31-conn-test', 'iodine-raw-2017-12-31-conn-train'),
        ('dnscat2-2017-12-31-conn-test', 'dnscat2-2017-12-31-conn-train'),
        ('iodine-forwarded-2017-12-31-dns-test', 'iodine-forwarded-2017-12-31-dns-train'),
        ('iodine-raw-2017-12-31-dns-test', 'iodine-raw-2017-12-31-dns-train'),
        ('dnscat2-2017-12-31-dns-test', 'dnscat2-2017-12-31-dns-train'),]

In [None]:
train_key = 'iodine-forwarded-2017-12-31-conn-train'
test_key = 'iodine-forwarded-2017-12-31-conn-test'
data_dir='./data'

In [None]:
train_df = log_munger.hdf5_to_df(key, data_dir)
test_df = log_munger.hdf5_to_df(key, data_dir)

In [None]:
df.columns

In [None]:
pipelining.feature_selection_pipeline(train_df=train_df, test_df=test_df)

In [None]:
df = df[['local_orig', 'local_resp', 'orig_ip_bytes', 'pcr', 'label']]

In [None]:
df = df.dropna(axis=0, how='any')

In [None]:
sns.pairplot(df, hue='label')