In [None]:
# shutil.copyfile is used to duplicate the custom helper module
from shutil import copyfile

# Place mlwpy.py in the working directory so the next cell can import it.
# Arguments are (source, destination).
copyfile("../input/packages/mlwpy.py", "../working/mlwpy.py")

In [None]:
from mlwpy import *
%matplotlib inline

# Preparing the dataset

In [None]:
# Directory holding the device-1 capture files, and the seed that makes the
# down-sampling below repeatable across "Restart Kernel -> Run All".
DATA_DIR = '../input/nbaiot-dataset'
SAMPLE_SEED = 42


def load_traffic_sample(label, file_stem, frac):
    """Read one capture file, keep a random fraction of its rows and label them.

    Parameters
    ----------
    label : str
        Value written to the ``type`` column of every returned row.
    file_stem : str
        Middle part of the file name, e.g. ``'mirai.ack'`` for
        ``1.mirai.ack.csv``.
    frac : float
        Fraction of the file's rows to keep (sampled without replacement).

    Returns
    -------
    pandas.DataFrame
        The sampled rows plus a trailing ``type`` column holding ``label``.
    """
    frame = pd.read_csv('{}/1.{}.csv'.format(DATA_DIR, file_stem))
    sampled = frame.sample(frac=frac, replace=False, random_state=SAMPLE_SEED)
    # assign() builds a new frame instead of writing into the result of
    # sample(), and appends the column at the end like `df['type'] = ...` did.
    return sampled.assign(type=label)


# Using half of the benign population so that its size matches the number of
# sampled mirai_ack instances (mirai_ack is the bigger dataset).
benign = load_traffic_sample('benign', 'benign', 0.50)

# Mirai attack captures
mirai_ack = load_traffic_sample('mirai_ack', 'mirai.ack', 0.24)
mirai_scan = load_traffic_sample('mirai_scan', 'mirai.scan', 0.22)
mirai_syn = load_traffic_sample('mirai_syn', 'mirai.syn', 0.2)
mirai_udp = load_traffic_sample('mirai_udp', 'mirai.udp', 0.1)
mirai_udp_plain = load_traffic_sample('mirai_udp_plain', 'mirai.udpplain', 0.3)

# Gafgyt (BASHLITE) attack captures
gafgyt_combo = load_traffic_sample('gafgyt_combo', 'gafgyt.combo', 0.4)
gafgyt_junk = load_traffic_sample('gafgyt_junk', 'gafgyt.junk', 0.8)
gafgyt_scan = load_traffic_sample('gafgyt_scan', 'gafgyt.scan', 0.8)
gafgyt_tcp = load_traffic_sample('gafgyt_tcp', 'gafgyt.tcp', 0.25)
gafgyt_udp = load_traffic_sample('gafgyt_udp', 'gafgyt.udp', 0.23)

# Stack every class into one frame with a fresh 0..n-1 index.
data = pd.concat([benign,
                  mirai_ack, mirai_scan, mirai_syn, mirai_udp, mirai_udp_plain,
                  gafgyt_combo, gafgyt_junk, gafgyt_scan, gafgyt_tcp, gafgyt_udp],
                 axis=0, sort=False, ignore_index=True)

In [None]:
# Number of rows available for each traffic class after sampling
data['type'].groupby(data['type']).count()

# Shuffle the data

In [None]:
# Draw a random ordering of the row positions and reorder the frame with it,
# so the classes are no longer grouped in concatenation order.
shuffled_positions = np.random.permutation(len(data))
data = data.iloc[shuffled_positions]
data.head()

# Dataset Normalisation

In [None]:
# Predictors: every column except the class label.
features = data.drop(columns=['type'])
# Label: kept as a one-column DataFrame (independent copy of the 'type' column).
target = data[['type']].copy()

In [None]:
from sklearn.preprocessing import StandardScaler

# Standardise every feature column; the result is a plain NumPy array.
# NOTE(review): the scaler is fitted on all rows, before the train/test split
# performed further down, so test rows influence the scaling statistics.
# Consider fitting on the training split only.
scaler = StandardScaler()
scaler.fit(features)
data_st = scaler.transform(features)

In [None]:
# Encode the string class labels of the target column as integer codes.
# One explicit mapping replaces the chain of eleven .replace() calls, which
# scanned the column once per label and relied on pandas silently converting
# the object column to integers afterwards.
CLASS_CODES = {
    'benign': 0,
    'mirai_ack': 1,
    'mirai_scan': 2,
    'mirai_syn': 3,
    'mirai_udp': 4,
    'mirai_udp_plain': 5,
    'gafgyt_combo': 6,
    'gafgyt_junk': 7,
    'gafgyt_scan': 8,
    'gafgyt_tcp': 9,
    'gafgyt_udp': 10,
}

# Values that are not keys of CLASS_CODES (e.g. codes produced by an earlier
# run of this cell) pass through unchanged, so re-running the cell is safe.
# The explicit cast guarantees an integer column and raises if an unknown
# string label is still present.
target['type'] = (target['type']
                  .map(lambda label: CLASS_CODES.get(label, label))
                  .astype('int64'))

In [None]:
# Bundle the scaled feature matrix and the encoded labels in one dict:
#   'data'   -> standardised features (output of the scaler)
#   'target' -> NumPy array copied from the encoded 'type' column
data_v2 = dict(
    data=data_st,
    target=np.array(target['type'].values),
)

# Split Train and Test Data

In [None]:
# Hold out 25% of the rows for testing; the remaining 75% are used for training.
split_parts = skms.train_test_split(data_v2['data'], data_v2['target'],
                                    test_size=0.25)
iot_train, iot_test, iot_train_tgt, iot_test_tgt = split_parts

# Train Several Classifiers (Logistic Regression, Discriminant Analysis, Naive Bayes, Decision Tree, KNN) to Classify Attacks

In [None]:
# Candidate models, keyed by the short label used in printed scores and plot
# titles. Insertion order is the order in which they are trained and plotted.
classifiers = {}

# Linear models
classifiers['LogReg(1)'] = linear_model.LogisticRegression(max_iter=1000)
# NOTE(review): newer scikit-learn releases renamed loss='log' to
# loss='log_loss' -- verify against the installed version.
classifiers['LogReg(2)'] = linear_model.SGDClassifier(loss='log', max_iter=1000)

# Generative / discriminant-analysis models
classifiers['QDA'] = discriminant_analysis.QuadraticDiscriminantAnalysis()
classifiers['LDA'] = discriminant_analysis.LinearDiscriminantAnalysis()
classifiers['GNB'] = naive_bayes.GaussianNB()

# The two linear SVM variants, svm.SVC(kernel="linear") and svm.LinearSVC(),
# are intentionally left out (they were disabled in this notebook).

# Tree and nearest-neighbour models (KNeighborsClassifier defaults to 5 neighbours)
classifiers['DTC'] = tree.DecisionTreeClassifier()
classifiers['5NN-C'] = neighbors.KNeighborsClassifier()
classifiers['10NN-C'] = neighbors.KNeighborsClassifier(n_neighbors=10)

In [None]:
# Fit every candidate on the training split and report its accuracy on the
# held-out test split.
for name, model in classifiers.items():
    fitted = model.fit(iot_train, iot_train_tgt)
    preds = fitted.predict(iot_test)

    accuracy = metrics.accuracy_score(iot_test_tgt, preds)
    print("{:>4s}: {:5.2f}".format(name, accuracy))

# Confusion Matrix

In [None]:
# One confusion-matrix heat map per classifier, laid out on a 2 x 4 grid
# (eight panels for the eight entries of `classifiers`).
fig, axes = plt.subplots(nrows=2, ncols=4, figsize=(30, 15),
                         sharex=True, sharey=True)

for (name, model), ax in zip(classifiers.items(), axes.flat):
    # Refit on the training split, then predict the held-out rows
    model.fit(iot_train, iot_train_tgt)
    preds = model.predict(iot_test)

    # Rows are the actual classes, columns the predicted classes
    cm = metrics.confusion_matrix(iot_test_tgt, preds)
    sns.heatmap(cm, annot=True, cbar=False, ax=ax)
    ax.set_title(name)

# Axes are shared, so labelling one panel per direction is enough
axes[0, 0].set_ylabel('Actual')
axes[1, 0].set_xlabel('Predicted')

# Lift Charts (class 1 vs rest)

In [None]:
# Lift charts for "class 1 vs rest" (class 1 is the code given to 'mirai_ack'
# by the encoding cell), one figure per classifier. A combined chart with all
# curves on one axis follows in the next cell.

# `classes` is passed by keyword: recent scikit-learn releases reject it as a
# positional argument, while the keyword form works on old and new versions.
iot_multi_tgt = skpre.label_binarize(data_v2['target'],
                                     classes=[0,1,2,3,4,5,6,7,8,9,10])

# NOTE(review): this multilabel split is not used by the lift loop below; it
# is kept so the names it defines remain available.
(iotmulti_train_ftrs, iotmulti_test_ftrs,
 iotmulti_train_tgt, iotmulti_test_tgt) = skms.train_test_split(data_v2['data'], iot_multi_tgt, test_size=.33)

# Binary target: True where the encoded label equals 1
is_first = data_v2['target'] == 1
tts_1c = skms.train_test_split(data_v2['data'], is_first, test_size=.33)

(iot_1c_train_ftrs, iot_1c_test_ftrs,
 iot_1c_train_tgt, iot_1c_test_tgt) = tts_1c

# x axis: fraction of the test population inspected so far (1/N ... 1).
# It is the same for every model, so it is computed once.
N = iot_1c_test_tgt.size
xs = np.linspace(1/N, 1, N)

for name, model in classifiers.items():
    # Predicted probability of the positive (True) class for each test row
    prob_true = (model.fit(iot_1c_train_ftrs, iot_1c_train_tgt)
                 .predict_proba(iot_1c_test_ftrs)[:,1])

    # negate because we want big values first
    myorder = np.argsort(-prob_true)

    # cumulative count of true positives, then to a fraction of all positives
    # (the last cumulative value is the total)
    realpct_myorder = iot_1c_test_tgt[myorder].cumsum()
    realpct_myorder = realpct_myorder / realpct_myorder[-1]

    fig, ax = plt.subplots(figsize=(8,4))

    # lift = fraction of positives found / fraction of population inspected
    # (xs is strictly positive, so no guard against division by zero is needed)
    ax.plot(xs, realpct_myorder / xs)

    ax.set_title("Lift " + name)
    ax.set_ylabel("X-Fold Improvement")
    ax.set_xlabel("Percent of Population\n" + "Starting with Highest Predicted Hits")
    ax.yaxis.tick_right()
    ax.yaxis.set_label_position('right')

    # Called after the artists and labels exist so they are taken into account
    fig.tight_layout()

In [None]:
# Lift chart for "class 1 vs rest" (class 1 is the code given to 'mirai_ack'
# by the encoding cell) with the curves of all classifiers on a single axis.

# `classes` is passed by keyword: recent scikit-learn releases reject it as a
# positional argument, while the keyword form works on old and new versions.
iot_multi_tgt = skpre.label_binarize(data_v2['target'],
                                     classes=[0,1,2,3,4,5,6,7,8,9,10])

# NOTE(review): this multilabel split is not used by the lift loop below; it
# is kept so the names it defines remain available.
(iotmulti_train_ftrs, iotmulti_test_ftrs,
 iotmulti_train_tgt, iotmulti_test_tgt) = skms.train_test_split(data_v2['data'], iot_multi_tgt, test_size=.33)

# Binary target: True where the encoded label equals 1
is_first = data_v2['target'] == 1
tts_1c = skms.train_test_split(data_v2['data'], is_first, test_size=.33)

(iot_1c_train_ftrs, iot_1c_test_ftrs,
 iot_1c_train_tgt, iot_1c_test_tgt) = tts_1c

# x axis: fraction of the test population inspected so far (1/N ... 1).
# It is the same for every model, so it is computed once.
N = iot_1c_test_tgt.size
xs = np.linspace(1/N, 1, N)

fig, ax = plt.subplots(figsize=(8,4))
for name, model in classifiers.items():
    # Predicted probability of the positive (True) class for each test row
    prob_true = (model.fit(iot_1c_train_ftrs, iot_1c_train_tgt)
                 .predict_proba(iot_1c_test_ftrs)[:,1])

    # negate because we want big values first
    myorder = np.argsort(-prob_true)

    # cumulative count of true positives, then to a fraction of all positives
    # (the last cumulative value is the total)
    realpct_myorder = iot_1c_test_tgt[myorder].cumsum()
    realpct_myorder = realpct_myorder / realpct_myorder[-1]

    # lift = fraction of positives found / fraction of population inspected
    # (xs is strictly positive, so no guard against division by zero is needed)
    ax.plot(xs, realpct_myorder / xs, label="{}".format(name))

# The axis is shared by every model, so it is decorated once, after the loop.
# Previously the title was overwritten on every iteration (ending up naming
# only the last model) and the line labels were never shown.
ax.set_title("Lift (class 1 vs rest)")
ax.set_ylabel("X-Fold Improvement")
ax.set_xlabel("Percent of Population\n" + "Starting with Highest Predicted Hits")
ax.yaxis.tick_right()
ax.yaxis.set_label_position('right')
ax.legend()

# Called after the artists and labels exist so they are taken into account
fig.tight_layout()

# ROC and AUC (one vs rest)

In [None]:
# One-vs-rest ROC curves: one figure per classifier, one curve per class.

# Turn the integer labels into an (n_samples, 11) indicator matrix.
# `classes` is passed by keyword: recent scikit-learn releases reject it as a
# positional argument, while the keyword form works on old and new versions.
iot_multi_tgt = skpre.label_binarize(data_v2['target'],
                                     classes=[0,1,2,3,4,5,6,7,8,9,10])

(im_train_ftrs, im_test_ftrs,
 im_train_tgt, im_test_tgt) = skms.train_test_split(data_v2['data'], iot_multi_tgt, test_size=.33)

# Legend entry template: class id, area under the ROC curve, model name
lbl_fmt = "Class {} vs Rest (AUC = {:.2f}) / {}"

for name, model in classifiers.items():
    # Wrap the model so that one binary classifier is fitted per indicator column
    ovr_model = skmulti.OneVsRestClassifier(model)
    pred_probs = ovr_model.fit(im_train_ftrs, im_train_tgt).predict_proba(im_test_ftrs)

    fig, ax = plt.subplots(figsize=(8,4))
    for cls in [0,1,2,3,4,5,6,7,8,9,10]:
        # Column `cls` of both matrices refers to the same class
        fpr, tpr, _ = metrics.roc_curve(im_test_tgt[:,cls],
                                        pred_probs[:,cls])
        label = lbl_fmt.format(cls, metrics.auc(fpr, tpr), name)
        ax.plot(fpr, tpr, 'o--', label=label)

    ax.legend()
    ax.set_xlabel("FPR")
    ax.set_ylabel("TPR")

# Precision-Recall Curves (one vs rest)

In [None]:
# One-vs-rest precision-recall curves: one figure per classifier, one curve
# per class.

# Turn the integer labels into an (n_samples, 11) indicator matrix.
# `classes` is passed by keyword: recent scikit-learn releases reject it as a
# positional argument, while the keyword form works on old and new versions.
iot_multi_tgt = skpre.label_binarize(data_v2['target'],
                                     classes=[0,1,2,3,4,5,6,7,8,9,10])

(im_train_ftrs, im_test_ftrs,
 im_train_tgt, im_test_tgt) = skms.train_test_split(data_v2['data'],
                                                    iot_multi_tgt,
                                                    test_size=.33)

# Short alias and legend template; neither depends on the model or the class
prc = metrics.precision_recall_curve
lbl_format = "Class {} vs Rest (AUC = {:.2f}) / {}"

for name, model in classifiers.items():
    # Wrap the model so that one binary classifier is fitted per indicator column
    ovr_model = skmulti.OneVsRestClassifier(model)
    pred_probs = ovr_model.fit(im_train_ftrs, im_train_tgt).predict_proba(im_test_ftrs)

    fig, ax = plt.subplots(figsize=(8,4))
    for cls in [0,1,2,3,4,5,6,7,8,9,10]:
        # Column `cls` of both matrices refers to the same class
        precision, recall, _ = prc(im_test_tgt[:,cls],
                                   pred_probs[:,cls])
        # Area under the precision-recall curve (recall on the x axis)
        prc_auc = metrics.auc(recall, precision)
        label = lbl_format.format(cls, prc_auc, name)
        ax.plot(recall, precision, 'o--', label=label)

    ax.legend()
    ax.set_xlabel("Recall")
    ax.set_ylabel("Precision")

# Lift

In [None]:
# Lift-versus-random curves built from cross-validated probabilities on the
# training split.
#
# A lift curve needs a binary target, so the multiclass labels are reduced to
# "class 1 vs rest", the same positive class used by the earlier lift cells.
lift_tgt = (iot_train_tgt == 1)

# Cross-validated probability of the positive class for every training row,
# keyed by classifier name. This dictionary was previously read without ever
# being defined, which made the cell fail with a NameError.
cv_prob_true = {}
for name, model in classifiers.items():
    cv_probs = skms.cross_val_predict(model, iot_train, lift_tgt,
                                      cv=10, method='predict_proba')
    # Columns follow the sorted class order [False, True]; keep P(True)
    cv_prob_true[name] = cv_probs[:, 1]

fig, (ax2) = plt.subplots(1, 1, figsize=(20, 10))

# x axis: fraction of the training population inspected so far (1/N ... 1)
N = len(iot_train_tgt)
xs = np.linspace(1/N, 1, N)
for name, model in classifiers.items():
    # Negate so big values come first
    myorder = np.argsort(-cv_prob_true[name])
    # cumulative count of positives found, as a fraction of all positives
    realpct_myorder = lift_tgt[myorder].cumsum()
    realpct_myorder = realpct_myorder / realpct_myorder[-1]
    # lift = fraction of positives found / fraction of population inspected
    # (xs is strictly positive, so no guard against division by zero is needed)
    ax2.plot(xs, realpct_myorder / xs, label=name)
ax2.legend()
ax2.set_title("Lift versus Random")
ax2.set_title("Lift versus Random")

# Cross-Validated Accuracy, Precision and Recall

In [None]:
# Per-fold accuracy, macro precision and macro recall for every classifier
# (10-fold cross-validation on the training split), one subplot per metric.
macro_precision = metrics.make_scorer(metrics.precision_score, average='macro')
macro_recall = metrics.make_scorer(metrics.recall_score, average='macro')

msrs = ['accuracy', macro_precision, macro_recall]
# Readable subplot titles, in the same order as `msrs`; the scorer objects
# themselves would otherwise be rendered through their repr.
msr_names = ['accuracy', 'macro precision', 'macro recall']

fig, axes = plt.subplots(len(msrs), 1, figsize=(10, 2*len(msrs)))

cvs = skms.cross_val_score
for name, model in classifiers.items():
    # One array of 10 fold scores per metric. The results used to be stored
    # under one name and read back under another, which raised a NameError.
    cv_results = {msr: cvs(model, iot_train, iot_train_tgt,
                           scoring=msr, cv=10) for msr in msrs}

    for ax, msr in zip(axes, msrs):
        msr_results = cv_results[msr]
        # Legend entry: model name, mean and standard deviation across folds
        my_lbl = "{:12s} {:.3f} {:.2f}".format(name, msr_results.mean(), msr_results.std())
        ax.plot(msr_results, 'o--', label=my_lbl)

# Decorate each subplot once, after every model has been drawn
for ax, msr_name in zip(axes, msr_names):
    ax.set_title(msr_name)
    ax.legend(loc='lower left')

# Called after titles and legends exist so they are taken into account
fig.tight_layout()

In [None]:
# Accuracy of every classifier on each of 10 cross-validation folds over the
# full dataset; the legend shows the mean accuracy per model.
fig, ax = plt.subplots(figsize=(6,4))

for name, model in classifiers.items():
    fold_accuracy = skms.cross_val_score(model,
                                         data_v2['data'], data_v2['target'],
                                         cv=10, scoring='accuracy', n_jobs=-1)
    legend_text = "{} {:4.3f}".format(name, fold_accuracy.mean())
    ax.plot(fold_accuracy, '-o', label=legend_text)

# Fixed y range so the curves of all models are directly comparable
ax.set(ylim=(0.0, 1.1), xlabel='Fold', ylabel='Accuracy')
ax.legend(ncol=2)

# -

In [None]:
import seaborn as sns

# 3-nearest-neighbour classifier scored with 5-fold cross-validation.
# NOTE(review): the metric is (negated) mean squared error computed on the
# integer class codes. Those codes are arbitrary identifiers, so the resulting
# RMSE depends on how the classes were numbered -- confirm this is intended,
# or use a classification metric such as accuracy instead.
model = neighbors.KNeighborsClassifier(n_neighbors=3)
neg_mse_per_fold = skms.cross_val_score(model,
                                        data_v2['data'], data_v2['target'],
                                        cv=5, scoring='neg_mean_squared_error')
# The scorer returns negated MSE; flip the sign before taking the square root
scores = pd.Series(np.sqrt(-neg_mse_per_fold))

# One row per fold, summarised as a table and as a swarm plot
df = scores.to_frame(name='RMSE')
df.index.name = 'Repeat'
display(df.describe().T)
ax = sns.swarmplot(y='RMSE', data=df)
ax.set_xlabel('Over Repeated\nTrain-Test Splits')