# Capstone Phase 2 - Modeling  
The previous notebook (capstone-phase1) was getting a bit long to run comfortably, so let's start a new one for ML.


#### Import some libraries we'll need

In [1]:
import pandas as pd
import numpy as np
from sklearn import preprocessing
from zipfile import ZipFile

import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

#### Load in the training and testing data sets.

In [2]:
with ZipFile('./Data/UNSW_NB15 training and testing sets.zip') as datasets_zip:
    # open each file and read in to a DataFrame
    with datasets_zip.open('UNSW_NB15_training-set.csv') as training_file:
        training_data = pd.read_csv(training_file, index_col='id')
    with datasets_zip.open('UNSW_NB15_testing-set.csv') as testing_file:
        testing_data = pd.read_csv(testing_file, index_col='id')

In [66]:
training_data.shape

(175341, 44)

In [67]:
testing_data.shape

(82332, 44)

## Apply transformations determined during exploration phase.

In [7]:
# split labels and categories from the training data
training = training_data.drop(['label', 'attack_cat'], axis=1)
training_labels = training_data['label'].copy()
training_cats = training_data['attack_cat'].copy()

# whatever we do to the training set should also be done to the testing set
testing = testing_data.drop(['label', 'attack_cat'], axis=1)
testing_labels = testing_data['label'].copy()
testing_cats = testing_data['attack_cat'].copy()

In [21]:
# expand categorical features
training_data_onehot = pd.get_dummies(training, columns=['proto', 'service', 'state'], prefix=['proto', 'service', 'state'])
#training_data_onehot.head()

# one-hot the testing set as well
testing_data_onehot = pd.get_dummies(testing, columns=['proto', 'service', 'state'], prefix=['proto', 'service', 'state'])

# convert target classes to binary vectors
binarizer = preprocessing.LabelBinarizer()
binarized_cats = binarizer.fit_transform(training_cats)
print(binarizer.classes_)

# and for testing classes
test_binarizer = preprocessing.LabelBinarizer()
bin_test_cats = test_binarizer.fit_transform(testing_cats)
print(test_binarizer.classes_)

['Analysis' 'Backdoor' 'DoS' 'Exploits' 'Fuzzers' 'Generic' 'Normal'
 'Reconnaissance' 'Shellcode' 'Worms']
['Analysis' 'Backdoor' 'DoS' 'Exploits' 'Fuzzers' 'Generic' 'Normal'
 'Reconnaissance' 'Shellcode' 'Worms']


Let's also try a simple integer mapping for the classes, it might work better for some methods

In [80]:
y_train = training_cats.map({'Normal':0, 'Generic':1, 'Exploits':2, 'Fuzzers':3, 'DoS':4, 
                           'Reconnaissance':5, 'Analysis':6, 'Backdoor':7, 'Shellcode':8, 'Worms':9}).astype(int)
y_test = testing_cats.map({'Normal':0, 'Generic':1, 'Exploits':2, 'Fuzzers':3, 'DoS':4, 
                           'Reconnaissance':5, 'Analysis':6, 'Backdoor':7, 'Shellcode':8, 'Worms':9}).astype(int)
#y_test.head()

In [30]:
print(training_data_onehot.shape)
print(testing_data_onehot.shape)

(175341, 194)
(82332, 190)


The testing set doesn't appear to have all of the categorical values seen in the training set....

In [56]:
missing_test_cols = [x for x in list(training_data_onehot.columns) if x not in list(testing_data_onehot.columns)]
print(missing_test_cols)

['proto_icmp', 'proto_rtp', 'state_ECO', 'state_PAR', 'state_URN', 'state_no']


In [60]:
print('Training protos: ', training['proto'].nunique())
print('Testing protos: ', testing['proto'].nunique())
print('Training services: ', training['service'].nunique())
print('Testing services: ', testing['service'].nunique())
print('Training states: ', training['state'].nunique())
print('Testing states: ', testing['state'].nunique())

Training protos:  133
Testing protos:  131
Training services:  13
Testing services:  13
Training states:  9
Testing states:  7


In [64]:
missing_protos = set(training['proto'].unique())-set(testing['proto'].unique())
missing_states = set(training['state'].unique())-set(testing['state'].unique())
print(missing_protos, missing_states)

{'icmp', 'rtp'} {'ECO', 'PAR', 'no', 'URN'}


In [63]:
print(training['state'].unique())
print(testing['state'].unique())

['FIN' 'INT' 'CON' 'ECO' 'REQ' 'RST' 'PAR' 'URN' 'no']
['INT' 'FIN' 'REQ' 'ACC' 'CON' 'RST' 'CLO']


It seems the training set doesn't have any instances of 'ACC' and 'CLO' states....

In [31]:
# apply standard scaler to the data
training_scaler = preprocessing.StandardScaler().fit(training_data_onehot)
training_data_standardized = training_scaler.transform(training_data_onehot)

# and the testing set
testing_scaler = preprocessing.StandardScaler().fit(testing_data_onehot)
testing_data_standardized = testing_scaler.transform(testing_data_onehot)

In [32]:
# convert standardized data back to DataFrame
training_data_std = pd.DataFrame(training_data_standardized, columns=training_data_onehot.columns)
# and testing data
testing_data_std = pd.DataFrame(testing_data_standardized, columns=testing_data_onehot.columns)

In [33]:
training_data_std.head()

Unnamed: 0,dur,spkts,dpkts,sbytes,dbytes,rate,sttl,dttl,sload,dload,...,service_ssl,state_CON,state_ECO,state_FIN,state_INT,state_PAR,state_REQ,state_RST,state_URN,state_no
0,-0.191029,-0.104456,-0.135769,-0.049134,-0.102726,-0.576371,0.703839,1.5781,-0.389897,-0.2737,...,-0.017874,-0.284764,-0.008273,1.119382,-0.940239,-0.002388,-0.10717,-0.021762,-0.002388,-0.002388
1,-0.109485,-0.046014,0.172599,-0.04641,0.188544,-0.576345,-1.141901,1.560002,-0.389928,-0.069233,...,-0.017874,-0.284764,-0.008273,1.119382,-0.940239,-0.002388,-0.10717,-0.021762,-0.002388,-0.002388
2,0.040699,-0.089845,-0.026933,-0.048527,-0.012133,-0.576734,-1.141901,1.560002,-0.389964,-0.252044,...,-0.017874,-0.284764,-0.008273,1.119382,-0.940239,-0.002388,-0.10717,-0.021762,-0.002388,-0.002388
3,0.049729,-0.060624,-0.063212,-0.047016,-0.098563,-0.576737,-1.141901,1.560002,-0.389958,-0.275821,...,-0.017874,-0.284764,-0.008273,1.119382,-0.940239,-0.002388,-0.10717,-0.021762,-0.002388,-0.002388
4,-0.140417,-0.075235,-0.11763,-0.047554,-0.102057,-0.576617,0.723268,1.560002,-0.389927,-0.275561,...,-0.017874,-0.284764,-0.008273,1.119382,-0.940239,-0.002388,-0.10717,-0.021762,-0.002388,-0.002388


In [34]:
testing_data_std.head()

Unnamed: 0,dur,spkts,dpkts,sbytes,dbytes,rate,sttl,dttl,sload,dload,...,service_snmp,service_ssh,service_ssl,state_ACC,state_CLO,state_CON,state_FIN,state_INT,state_REQ,state_RST
0,-0.213727,-0.124455,-0.151816,-0.043684,-0.087369,0.057181,0.71944,-0.820395,0.643913,-0.263498,...,-0.018771,-0.049839,-0.019092,-0.00697,-0.003485,-0.304403,-0.956561,1.187424,-0.151277,-0.003485
1,-0.213728,-0.124455,-0.151816,-0.036308,-0.087369,0.286565,0.71944,-0.820395,4.539351,-0.263498,...,-0.018771,-0.049839,-0.019092,-0.00697,-0.003485,-0.304403,-0.956561,1.187424,-0.151277,-0.003485
2,-0.213729,-0.124455,-0.151816,-0.040351,-0.087369,0.791209,0.71944,-0.820395,4.391459,-0.263498,...,-0.018771,-0.049839,-0.019092,-0.00697,-0.003485,-0.304403,-0.956561,1.187424,-0.151277,-0.003485
3,-0.213729,-0.124455,-0.151816,-0.04133,-0.087369,0.566923,0.71944,-0.820395,2.977031,-0.263498,...,-0.018771,-0.049839,-0.019092,-0.00697,-0.003485,-0.304403,-0.956561,1.187424,-0.151277,-0.003485
4,-0.213728,-0.124455,-0.151816,-0.034187,-0.087369,0.11835,0.71944,-0.820395,4.369219,-0.263498,...,-0.018771,-0.049839,-0.019092,-0.00697,-0.003485,-0.304403,-0.956561,1.187424,-0.151277,-0.003485


## Load feature subsets

In [35]:
subsets = pd.read_pickle("./data/subsets.pkl")

In [36]:
subsets.head()

Unnamed: 0,Info Gain,ANOVA F-val,Var Thresh,MAD,Corr
0,sbytes,ct_dst_sport_ltm,spkts,sload,dur
1,smean,ct_dst_src_ltm,dpkts,rate,spkts
2,sload,ct_srv_dst,sbytes,dload,dpkts
3,dbytes,ct_src_dport_ltm,dbytes,sbytes,rate
4,rate,ct_srv_src,rate,dbytes,sttl


Make a subset of the Info Gain features

In [39]:
ig_cols = subsets['Info Gain'] # pandas Series
select_cols = training_data_std[ig_cols]

# test set
sel_test_cols = testing_data_std[ig_cols]

In [40]:
train_std_ig = select_cols.copy()
test_std_ig = sel_test_cols.copy()

In [41]:
train_std_ig.head()

Unnamed: 0,sbytes,smean,sload,dbytes,rate,dmean,dur,ct_dst_sport_ltm,ct_srv_dst,ct_state_ttl,dttl,dload,ct_dst_src_ltm,ct_srv_src,dinpkt,sttl,ct_src_dport_ltm,dpkts,ct_dst_ltm,sinpkt
0,-0.049134,-0.458048,-0.389897,-0.102726,-0.576371,-0.31424,-0.191029,-0.554373,-0.753074,-1.366486,1.5781,-0.2737,-0.705529,-0.775991,-0.080885,0.703839,-0.544736,-0.135769,-0.645013,-0.132788
1,-0.04641,-0.414076,-0.389928,0.188544,-0.576345,3.800869,-0.109485,-0.554373,-0.288257,-0.318711,1.560002,-0.069233,-0.614256,3.147666,-0.073735,-1.141901,-0.544736,0.172599,-0.645013,-0.129251
2,-0.048527,-0.443391,-0.389964,-0.012133,-0.576734,2.709185,0.040699,-0.554373,-0.288257,-0.318711,1.560002,-0.252044,-0.522983,-0.215468,0.014711,-1.141901,-0.544736,-0.026933,-0.520827,-0.104126
3,-0.047016,-0.414076,-0.389958,-0.098563,-0.576737,-0.232945,0.049729,-0.554373,-0.753074,-0.318711,1.560002,-0.275821,-0.522983,-0.775991,0.002046,-1.141901,-0.544736,-0.063212,-0.520827,-0.115034
4,-0.047554,-0.40919,-0.389927,-0.102057,-0.576617,-0.306498,-0.140417,-0.554373,2.779535,-0.318711,1.560002,-0.275561,2.854115,3.147666,-0.012721,0.723268,-0.420468,-0.11763,-0.520827,-0.129549


In [42]:
test_std_ig.head()

Unnamed: 0,sbytes,smean,sload,dbytes,rate,dmean,dur,ct_dst_sport_ltm,ct_srv_dst,ct_state_ttl,dttl,dload,ct_dst_src_ltm,ct_srv_src,dinpkt,sttl,ct_src_dport_ltm,dpkts,ct_dst_ltm,sinpkt
0,-0.043684,0.520319,0.643913,-0.087369,0.057181,-0.475371,-0.213727,-0.450186,-0.64419,0.591021,-0.820395,-0.263498,-0.477994,-0.680474,-0.094169,0.71944,-0.468312,-0.151816,-0.56366,-0.122179
1,-0.036308,3.556716,4.539351,-0.087369,0.286565,-0.475371,-0.213728,-0.450186,-0.64419,0.591021,-0.820395,-0.263498,-0.477994,-0.680474,-0.094169,0.71944,-0.468312,-0.151816,-0.56366,-0.12218
2,-0.040351,1.892214,4.391459,-0.087369,0.791209,-0.475371,-0.213729,-0.450186,-0.554273,0.591021,-0.820395,-0.263498,-0.390391,-0.590304,-0.094169,0.71944,-0.468312,-0.151816,-0.56366,-0.12218
3,-0.04133,1.48928,2.977031,-0.087369,0.566923,-0.475371,-0.213729,-0.450186,-0.554273,0.591021,-0.820395,-0.263498,-0.390391,-0.590304,-0.094169,0.71944,-0.349115,-0.151816,-0.444868,-0.12218
4,-0.034187,4.42974,4.369219,-0.087369,0.11835,-0.475371,-0.213728,-0.450186,-0.554273,0.591021,-0.820395,-0.263498,-0.390391,-0.590304,-0.094169,0.71944,-0.349115,-0.151816,-0.444868,-0.122179


## K-nearest neighbors

In [13]:
from sklearn.neighbors import KNeighborsClassifier

In [81]:
# start with default settings; k=5, distance=minkowski
knn_model = KNeighborsClassifier()
knn_model.fit(train_std_ig, y_train)#binarized_cats)

KNeighborsClassifier()

In [82]:
# predict classes for the test set
y_pred = knn_model.predict(test_std_ig)

Let's see how we did

In [44]:
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

In [86]:
con_mat = confusion_matrix(y_test, y_pred)#bin_test_cats, y_pred)
print(con_mat)

[[27061   368   991  7597    55   130   774    10    14     0]
 [  689 15887   408  1828    42     9     2     0     3     3]
 [  495   275  8044   682   924   264   252   187     9     0]
 [ 1768   197   667  2934   269    31    91   100     5     0]
 [  209   180  2036   229  1022    50   198   157     8     0]
 [  332    18   900   684   127  1400    20    14     1     0]
 [   55    86   264    29   123     1    69    50     0     0]
 [   47    82   228    42    99     5    36    44     0     0]
 [   47     9   117    89    12    75     0     0    29     0]
 [    3     1    25     6     2     0     0     0     0     7]]


In [87]:
class_rep = classification_report(y_test, y_pred)#bin_test_cats, y_pred)
print(class_rep)

              precision    recall  f1-score   support

           0       0.88      0.73      0.80     37000
           1       0.93      0.84      0.88     18871
           2       0.59      0.72      0.65     11132
           3       0.21      0.48      0.29      6062
           4       0.38      0.25      0.30      4089
           5       0.71      0.40      0.51      3496
           6       0.05      0.10      0.07       677
           7       0.08      0.08      0.08       583
           8       0.42      0.08      0.13       378
           9       0.70      0.16      0.26        44

    accuracy                           0.69     82332
   macro avg       0.49      0.38      0.40     82332
weighted avg       0.76      0.69      0.71     82332



I may need to rethink the Binarized Labels for this... Confusion Matrix and Classification Report probably need a simple integer mapping instead....

In [85]:
accuracy = accuracy_score(y_test, y_pred)#bin_test_cats, y_pred)
print(accuracy)

0.6862094932711461


68.62% with 5 neighbors, not great results

In [88]:
knn_model2 = KNeighborsClassifier(n_neighbors=7)
knn_model2.fit(train_std_ig, y_train)#binarized_cats)
y_pred2 = knn_model2.predict(test_std_ig)

In [90]:
con_mat = confusion_matrix(y_test, y_pred2)#bin_test_cats, y_pred)
print(con_mat)
class_rep = classification_report(y_test, y_pred2)#bin_test_cats, y_pred)
print(class_rep)
acc2 = accuracy_score(y_test, y_pred2)#bin_test_cats, y_pred)
print(acc2)

[[27017   395   830  7659    52   165   857     9    16     0]
 [  628 16189   413  1580    42    11     1     0     3     4]
 [  379   217  8389   678   768   326   242   119    13     1]
 [ 1683   197   722  3007   240    42    87    80     4     0]
 [  175   131  2378   242   830    74   167    86     6     0]
 [  136    16   748   781   103  1688    16     7     1     0]
 [   42    81   297    27   118     2    70    40     0     0]
 [   32    81   254    41    94     7    36    38     0     0]
 [   42    13   101    94     6    86     0     0    36     0]
 [    2     1    25     7     2     0     0     0     0     7]]
              precision    recall  f1-score   support

           0       0.90      0.73      0.80     37000
           1       0.93      0.86      0.89     18871
           2       0.59      0.75      0.66     11132
           3       0.21      0.50      0.30      6062
           4       0.37      0.20      0.26      4089
           5       0.70      0.48      0.57 

69.56% with 7 neighbors, very slighly better...maybe fewer will work better? or maybe we need significantly more?

In [91]:
knn_model3 = KNeighborsClassifier(n_neighbors=3)
knn_model3.fit(train_std_ig, y_train)#binarized_cats)
y_pred3 = knn_model3.predict(test_std_ig)

In [92]:
con_mat = confusion_matrix(y_test, y_pred3)#bin_test_cats, y_pred)
print(con_mat)
class_rep = classification_report(y_test, y_pred3)#bin_test_cats, y_pred)
print(class_rep)
acc3 = accuracy_score(y_test, y_pred3)#bin_test_cats, y_pred)
print(acc3)

[[27078   370   964  7476    61   134   882    23    11     1]
 [  737 15970   323  1760    56    12     2     7     2     2]
 [  533   296  7303   931   743   573   305   425    20     3]
 [ 1812   204   579  3002   159    45   156    98     6     1]
 [  210   160  1690   406   792   162   200   457    12     0]
 [  259    19   544   879   269  1445    14    55     2    10]
 [   49    87   203    79    79     9   111    60     0     0]
 [   54    87   185    91    63     6    71    25     1     0]
 [   59    13    80    82    22    89     0     0    31     2]
 [    3     1    19     5     6     3     0     0     0     7]]
              precision    recall  f1-score   support

           0       0.88      0.73      0.80     37000
           1       0.93      0.85      0.89     18871
           2       0.61      0.66      0.63     11132
           3       0.20      0.50      0.29      6062
           4       0.35      0.19      0.25      4089
           5       0.58      0.41      0.48 

67.73% with 3 neighbors, this model probably just isn't well suited to this data set  
Let's try one more, there are 10 classes to predict, let's try 11 neighbors

In [93]:
knn_model4 = KNeighborsClassifier(n_neighbors=11)
knn_model4.fit(train_std_ig, y_train)#binarized_cats)
y_pred4 = knn_model4.predict(test_std_ig)

In [94]:
con_mat = confusion_matrix(y_test, y_pred4)#bin_test_cats, y_pred)
print(con_mat)
class_rep = classification_report(y_test, y_pred4)#bin_test_cats, y_pred)
print(class_rep)
acc4 = accuracy_score(y_test, y_pred4)#bin_test_cats, y_pred)
print(acc4)

[[26929   371   842  7785    55   165   829     6    18     0]
 [  630 16136   442  1593    48    10     3     2     3     4]
 [  355   173  8420   724   872   280   186   107    14     1]
 [ 1596   178   721  3110   273    21   102    58     3     0]
 [  155   118  2326   251   985    58   102    89     5     0]
 [  127    10   710   775   135  1723     7     9     0     0]
 [   56    75   294    20   130     1    72    29     0     0]
 [   36    75   272    30    92     4    48    26     0     0]
 [   34    10    96    94    14    89     0     0    41     0]
 [    3     1    22     8     0     4     0     0     0     6]]
              precision    recall  f1-score   support

           0       0.90      0.73      0.80     37000
           1       0.94      0.86      0.90     18871
           2       0.60      0.76      0.67     11132
           3       0.22      0.51      0.30      6062
           4       0.38      0.24      0.29      4089
           5       0.73      0.49      0.59 

69.77%, again not much improvement
We either need different features, or more likely a more powerful model

Checking the results shows where the misclassificatons are happening most often:  
- Normal, ok  
- Generic, ok  
- Exploits, ok, highest misclass=DoS  
- Fuzzers, highest misclass=Normal  
- DoS, not ok, highest misclass=Exploits  
- Reconnaissance, not great, highest misclass=Exploits  
- Analysis, poor, highest misclass=Exploits, DoS  
- Backdoor, poor, highest miclass=Exploits  
- Shellcode, poor, highest misclass=Exploits, Fuzzers  
- Worms, poor, highest misclass=Exploits  

This is probably due to the relatively small number of some attack categories.  Those classes are often misclassified as *Exploits*.