In [None]:
# shutil's copyfile is only needed to stage the custom helper module
from shutil import copyfile

# Copy mlwpy.py from the input dataset folder into the working directory
copyfile("../input/d/shikomba/packages/mlwpy.py", "../working/mlwpy.py")

In [None]:
from mlwpy import *
%matplotlib inline

In [None]:
# Load the benign capture and the Mirai "scan" attack capture.
benign = pd.read_csv('../input/nbaiot-dataset/1.benign.csv')
mirai_scan = pd.read_csv('../input/nbaiot-dataset/1.mirai.scan.csv')

# Down-sample both captures before combining them: half of the benign rows
# and a quarter of the mirai_scan rows. The attack subsample is written back
# to `mirai_scan`, so it is the subsample (not the full capture) that reaches
# the concat below. A fixed random_state makes the draw repeatable after a
# kernel restart.
benign = benign.sample(frac=0.50, replace=False, random_state=42)
mirai_scan = mirai_scan.sample(frac=0.25, replace=False, random_state=42)

# The sampled attack rows used to be stored only under this name; keep it
# bound (unlabelled, as before) in case other cells refer to it.
mirai_ack = mirai_scan

# Attach the class label. assign() returns new frames, so the frames sampled
# above are not modified in place.
benign = benign.assign(type='benign')
mirai_scan = mirai_scan.assign(type='mirai_scan')

# Stack both classes into a single frame with a fresh 0..n-1 index.
data = pd.concat([benign, mirai_scan], axis=0, sort=False, ignore_index=True)

In [None]:
# Number of rows per class label in the combined dataset
(
    data
    .groupby(by='type')['type']
    .count()
)

In [None]:
# Shuffle the rows of the dataframe.
# The permutation of positional indices comes from a seeded generator so the
# row order (and everything computed from it) is the same on every fresh run.
sampler = np.random.default_rng(42).permutation(len(data))
data = data.take(sampler)
data.head()

In [None]:
# One-hot encode the class labels into 'type'-prefixed indicator columns.
# The result lives in its own frame; `data` itself is not changed here.
labels_full = pd.get_dummies(data=data.loc[:, 'type'], prefix='type')
labels_full.tail()

In [None]:
# Standardize numerical columns
def standardize(df, col):
    """Standardize one column of ``df`` in place (subtract mean, divide by std).

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the column; it is modified in place.
    col : hashable
        Label of the numeric column to rescale.

    Returns
    -------
    None
        The rescaled values are written back to ``df[col]``.

    Notes
    -----
    The spread is whatever ``Series.std()`` returns with its default settings.
    When that spread is zero (constant column) or NaN (e.g. a single row),
    dividing would fill the column with NaN, so the column is only centered
    in that case.
    """
    centered = df[col] - df[col].mean()
    spread = df[col].std()
    # `not spread > 0` is True for both 0 and NaN (std is never negative).
    if not spread > 0:
        df[col] = centered
    else:
        df[col] = centered / spread
    
# Work on a copy so the unscaled `data` frame stays available as-is.
data_st = data.copy()

# Standardize every feature column of the copy; the 'type' label column is
# left out. The scaling has to be applied to `data_st` (not `data`), because
# `data_st` is the frame the later cells build the model inputs from.
for col in data_st.columns.drop('type'):
    standardize(data_st, col)

data_st.head()

In [None]:
# Encode the target column as integers: benign -> 1, mirai_scan -> 0.
# One mapping handles both labels, and the explicit cast pins an integer
# dtype instead of depending on replace() to downcast the object column on
# its own (that implicit downcasting is deprecated in pandas 2.2).
# Re-running the cell is harmless: already-encoded values pass through.
data_st['type'] = (
    data_st['type']
    .replace({'benign': 1, 'mirai_scan': 0})
    .astype('int64')
)

In [None]:
# Bundle the model inputs as numpy arrays:
#   'data'   -> every column of data_st except the 'type' label
#   'target' -> the 'type' label column on its own
data_v2 = dict(
    data=np.array(data_st.drop(columns=['type']).values),
    target=np.array(data_st['type'].values),
)

In [None]:
# Split features and targets into train and test parts, holding out 25% of
# the rows for testing. random_state fixes the shuffle so the same rows land
# in each part on every run, which keeps the scores comparable across runs.
# NOTE(review): assumes `skms` is sklearn.model_selection (it comes from the
# mlwpy star import) -- confirm.
(iot_train, iot_test,
 iot_train_tgt, iot_test_tgt) = skms.train_test_split(data_v2['data'],
                                                      data_v2['target'],
                                                      test_size=0.25,
                                                      random_state=42)

In [None]:
# Candidate models: k-nearest-neighbours with k = 3, 5, 10, plus Gaussian
# naive Bayes (added last so the dict keeps the order 3NN, 5NN, 10NN, NB).
models = {label: neighbors.KNeighborsClassifier(n_neighbors=k)
          for label, k in (('3NN', 3), ('5NN', 5), ('10NN', 10))}
models['NB'] = naive_bayes.GaussianNB()

# Train each model on the training part and report its accuracy on the
# held-out test part.
for name, model in models.items():
    fit = model.fit(iot_train, iot_train_tgt)
    preds = fit.predict(iot_test)

    # Despite its name, this holds the accuracy of whichever model is
    # current, including the naive Bayes one.
    knn_score = metrics.accuracy_score(iot_test_tgt, preds)
    print("{:>4s}: {:5.2f}".format(name, knn_score))

In [None]:
# Fresh, unfitted estimators for cross-validation, in the order
# 3NN, 5NN, 10NN, NB.
classifiers = {label: neighbors.KNeighborsClassifier(n_neighbors=k)
               for label, k in (("3NN", 3), ("5NN", 5), ("10NN", 10))}
classifiers["NB"] = naive_bayes.GaussianNB()

fig, ax = plt.subplots(figsize=(6,4))

# One line per classifier: accuracy on each of the 10 folds, with the mean
# accuracy shown in the legend entry.
for name, model in classifiers.items():
    cv_scores = skms.cross_val_score(
        model, data_v2['data'], data_v2['target'],
        cv=10, scoring='accuracy', n_jobs=-1,
    )
    my_lbl = "{} {:4.3f}".format(name, cv_scores.mean())
    ax.plot(cv_scores, '-o', label=my_lbl)

# Axis limits and labels set in one call.
ax.set(ylim=(0.0, 1.1), xlabel='Fold', ylabel='Accuracy')
ax.legend(ncol=2)

In [None]:
import seaborn as sns

# 5-fold cross-validation of a 3-NN classifier, scored with negated mean
# squared error.
model = neighbors.KNeighborsClassifier(n_neighbors=3)
scores = skms.cross_val_score(
    model, data_v2['data'], data_v2['target'],
    cv=5, scoring='neg_mean_squared_error',
)
# Flip the sign back and take the square root to get one RMSE per fold.
scores = pd.Series(np.sqrt(np.negative(scores)))

# One-column frame of RMSE values, indexed by fold number.
df = scores.to_frame('RMSE')
df.index.name = 'Repeat'

# Summary statistics as a single row, then each fold's RMSE as a point.
display(df.describe().T)
ax = sns.swarmplot(y='RMSE', data=df)
ax.set_xlabel('Over Repeated\nTrain-Test Splits')