In [None]:
# shutil.copyfile is used to bring the custom helper module next to the notebook
from shutil import copyfile

# Place mlwpy.py in the working directory so the notebook can import it
copyfile("../input/d/shikomba/packages/mlwpy.py", "../working/mlwpy.py")

In [None]:
from mlwpy import *
%matplotlib inline

# Preparing the dataset

In [None]:
# Load the two raw captures: benign traffic and the Mirai ACK attack traffic
benign = pd.read_csv('../input/nbaiot-dataset/1.benign.csv')
mirai_ack = pd.read_csv('../input/nbaiot-dataset/1.mirai.ack.csv')

# Down-sample without replacement: keep half of the benign rows and a quarter
# of the mirai_ack rows. The intent (per the original note) is to bring the two
# class sizes closer together, mirai_ack being the bigger dataset.
# A fixed random_state makes the drawn subsets identical on every run.
benign = benign.sample(frac=0.50, replace=False, random_state=42)
mirai_ack = mirai_ack.sample(frac=0.25, replace=False, random_state=42)

# Label every row with its class before the two frames are merged
benign['type']='benign'
mirai_ack['type']='mirai_ack'

# Stack both classes into one frame with a fresh 0..n-1 index
data = pd.concat([benign, mirai_ack], axis=0, sort=False, ignore_index=True)

In [None]:
# Class balance check: number of rows for each traffic type
class_counts = data.groupby('type')['type'].count()
class_counts

# Shuffle the data

In [None]:
# Shuffle the rows of the dataframe so the two classes are interleaved.
# A seeded generator is used so the row order is the same on every fresh run.
sampler = np.random.default_rng(42).permutation(len(data))
data = data.take(sampler)
data.head()

# Dataset Normalisation

In [None]:
# Separate the label column from the predictor columns
target = data.filter(items=['type'])
features = data.drop(columns=['type'])

In [None]:
from sklearn.preprocessing import StandardScaler

# Standardise the feature columns with scikit-learn's StandardScaler.
# NOTE(review): the scaler is fitted on all rows before the train/test split,
# so test rows influence the scaling statistics - consider fitting on the
# training split only.
scaler = StandardScaler()
data_st = scaler.fit(features).transform(features)

In [None]:
# Encode the target column as integers: benign -> 1, mirai_ack -> 0.
# An explicit mapping plus astype guarantees an integer dtype instead of
# relying on replace() silently downcasting an object column (deprecated in
# pandas 2.2). Reading the labels from `data` keeps this cell safe to re-run;
# the assignment aligns on the shared row index.
target['type'] = data['type'].map({'benign': 1, 'mirai_ack': 0}).astype('int64')

In [None]:
# Bundle the scaled features and the encoded labels into one dict
data_v2 = dict(data=data_st,
               target=np.array(target['type'].values))

# Split Train and Test Data

In [None]:
# Split data into train/test sets, holding out 25% of the rows for testing.
# random_state pins the split itself so the hold-out scores below do not
# change from one run to the next.
(iot_train, iot_test,
 iot_train_tgt, iot_test_tgt) = skms.train_test_split(data_v2['data'], 
                                                 data_v2['target'], 
                                                 test_size=0.25,
                                                 random_state=42)

# Train QDA, k-NN, Decision Tree and Naive Bayes Models to Classify Attacks

In [None]:
# Candidate models, keyed by the short label used in the reports and plots
classifiers = dict([
    ('QDA', discriminant_analysis.QuadraticDiscriminantAnalysis()),
    ('5NN', neighbors.KNeighborsClassifier(n_neighbors=5)),
    ('DTC', tree.DecisionTreeClassifier()),
    ('NB', naive_bayes.GaussianNB()),
])

In [None]:
# Hold-out evaluation: train each candidate on the training split and
# print its accuracy on the test split
for name, model in classifiers.items():
    fitted = model.fit(iot_train, iot_train_tgt)
    predictions = fitted.predict(iot_test)

    accuracy = metrics.accuracy_score(iot_test_tgt, predictions)
    print("{:>4s}: {:5.2f}".format(name, accuracy))

In [None]:
# Compare the classifiers with 10-fold cross-validation on the full data set,
# drawing one accuracy-per-fold line for each model.
fig, ax = plt.subplots(figsize=(6,4))

for name, model in classifiers.items():
    cv_scores = skms.cross_val_score(model, data_v2['data'], data_v2['target'], cv=10,
                                    scoring='accuracy', n_jobs=-1)
    # the legend entry carries the model's mean accuracy across the folds
    my_lbl= "{} {:4.3f}".format(name, cv_scores.mean())
    ax.plot(cv_scores, '-o', label=my_lbl)
    
ax.set_ylim(0.0, 1.1)
ax.set_xlabel('Fold')
ax.set_ylabel('Accuracy')
ax.legend(ncol=2)
# Axes objects have no show() method (ax.show() raises AttributeError);
# rendering goes through pyplot instead
plt.show()

# Classifier Evaluation

In [None]:
# 10 data set sizes: 10% - 100%
# (that much data is piped to a 5-fold CV)
train_sizes = np.linspace(.1, 1.0, 10)
nn = neighbors.KNeighborsClassifier()
(train_N, train_scores, test_scores) = skms.learning_curve(nn, data_v2['data'], data_v2['target'], cv=5, train_sizes=train_sizes)

# collapse across the 5 CV scores; one result for each data set size.
# The np.int alias was removed from NumPy (1.24), so the builtin int is used;
# rounding first keeps float noise from truncating a percentage label.
pct_used = (train_sizes * 100).round().astype(int)
df = pd.DataFrame(test_scores, index=pct_used)
df['Mean 5-CV'] = df.mean(axis='columns')
df.index.name = "% Data Used"
display(df)

# Classifier Comparison

In [None]:
import seaborn as sns

# Score a 3-NN classifier with 5-fold CV using negated mean squared error
model = neighbors.KNeighborsClassifier(n_neighbors=3)
neg_mse = skms.cross_val_score(model, data_v2['data'], data_v2['target'],
                               cv=5, scoring='neg_mean_squared_error')

# Flip the sign back and take the square root to get one RMSE per fold
scores = pd.Series(np.sqrt(-neg_mse))

# One-column table of the per-fold RMSE values
df = pd.DataFrame({'RMSE':scores})
df.index.name = 'Repeat'

# Summary statistics as a single row, followed by a swarm plot of the folds
rmse_summary = df.describe().transpose()
display(rmse_summary)
ax = sns.swarmplot(y='RMSE', data=df)
ax.set_xlabel('Over Repeated\nTrain-Test Splits')