In [1]:
import os
import csv

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import xgboost as xgb
from sklearn.externals import joblib
from sklearn.utils import compute_class_weight
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
from sklearn.metrics import classification_report, f1_score, roc_auc_score, accuracy_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, LabelBinarizer, StandardScaler, LabelEncoder
from sklearn.linear_model.logistic import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from keras import Sequential
from keras import optimizers
from keras.callbacks import ModelCheckpoint, EarlyStopping
from keras.layers import Dropout, Dense
from keras.models import load_model

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


Load cleaned data from pickle file

In [2]:
cleaned_df = joblib.load('pickles/cleaned_df.pkl')

Load top 50 features and extract top 50 features into a list

In [3]:
trimmed_feats_list = joblib.load('pickles/57var.pkl')
trimmed_feats_list.append('device_category')

Filter dataset to only use the 57 relevant features

In [4]:
filtered_df = cleaned_df.filter(items=trimmed_feats_list, axis='columns')

In [5]:
filtered_df.shape

(399348, 57)

Obtain list of unique classes and create list of names of feature engineered columns

In [47]:
distinct_classes = filtered_df.device_category.unique()
training_cols = ['sd', 'sem', 'var', 'skew', 'kurt', 'mad', 'shortest_dist_sd', 'shortest_dist_diff', 'sd_diff_smallest2next', 'known']

Build training dataset for known/unknown detection classifier

In [48]:
training_df = pd.DataFrame()
knn = KNeighborsClassifier(n_neighbors=9, weights='uniform', algorithm='auto', p=2, n_jobs=-1, leaf_size=30)

for out_class in distinct_classes:
    nine_classes_list = distinct_classes.tolist()
    nine_classes_list.remove(out_class)
    
    in_classes_samples = pd.DataFrame()
    nine_classes_df = filtered_df[filtered_df['device_category'] != out_class]
    out_class_df = filtered_df[filtered_df['device_category'] == out_class]
    
    for in_class in nine_classes_list:
        in_class_df = filtered_df[filtered_df['device_category'] == in_class]
        in_class_sample = in_class_df.sample(500, random_state=1, axis=0, replace=True)
        index_to_drop = in_class_sample.index.tolist()
        nine_classes_df = nine_classes_df.drop(index_to_drop)
        in_class_sample = in_class_sample.reset_index()
        in_classes_samples = pd.concat([in_classes_samples, in_class_sample], axis=0, ignore_index=True)
    
    train_x, _, train_y, _ = train_test_split(nine_classes_df.iloc[:, :-1], nine_classes_df.iloc[:, -1], train_size=900, 
                                              test_size=9, random_state=1, shuffle=True, stratify=nine_classes_df.iloc[:, -1])
    knn.fit(train_x, train_y)
    
    val_x, val_y = in_classes_samples.iloc[:, 1:-1], in_classes_samples.iloc[:, -1]
    val_output = knn.kneighbors(val_x, n_neighbors=9, return_distance=True)
    val_distance_df = pd.DataFrame([[distance for distance in row] for row in val_output[0]])
    val_sd = val_distance_df.std(axis=1, numeric_only=True)
    val_min = val_distance_df.min(axis=1, numeric_only=True)
    val_mean = val_distance_df.mean(axis=1, numeric_only=True)
    val_2_smallest_dist_df = pd.DataFrame(np.sort(val_distance_df.values))
    val_2nd_smallest_dist_df = val_2_smallest_dist_df.iloc[:, -2]
    val_smallest_dist_df = val_2_smallest_dist_df.iloc[:, -1]
    
    val_distance_df['sd'] = val_sd
    val_distance_df['sem'] = val_distance_df.sem(axis=1, numeric_only=True)
    val_distance_df['var'] = val_distance_df.var(axis=1, numeric_only=True)
    val_distance_df['skew'] = val_distance_df.skew(axis=1, numeric_only=True)
    val_distance_df['kurt'] = val_distance_df.kurt(axis=1, numeric_only=True)
    val_distance_df['mad'] = val_distance_df.mad(axis=1)
    val_distance_df['shortest_dist_sd'] = val_smallest_dist_df / val_sd
    val_distance_df['shortest_dist_diff'] = (val_mean - val_min) / val_sd
    val_distance_df['sd_diff_smallest2next'] = (val_2nd_smallest_dist_df - val_min) / val_sd 
    val_distance_df['known'] = pd.Series([1 for _ in range(len(val_distance_df.index))]) 
    
    
    out_class_df = out_class_df.sample(n=4500, random_state=1, axis=0, replace=True)
    test_x, test_y = out_class_df.iloc[:, :-1], out_class_df.iloc[:, -1]
    test_output = knn.kneighbors(test_x, n_neighbors=9, return_distance=True)
    
    test_distance_df = pd.DataFrame([[distance for distance in row] for row in test_output[0]])
    test_sd = test_distance_df.std(axis=1, numeric_only=True)
    test_min = test_distance_df.min(axis=1, numeric_only=True)
    test_mean = test_distance_df.mean(axis=1, numeric_only=True)
    test_2_smallest_dist_df = pd.DataFrame(np.sort(test_distance_df.values))
    test_2nd_smallest_dist_df = test_2_smallest_dist_df.iloc[:, -2]
    test_smallest_dist_df = test_2_smallest_dist_df.iloc[:, -1]

    
    test_distance_df['sd'] = test_sd
    test_distance_df['sem'] = test_distance_df.sem(axis=1, numeric_only=True)
    test_distance_df['var'] = test_distance_df.var(axis=1, numeric_only=True)
    test_distance_df['skew'] = test_distance_df.skew(axis=1, numeric_only=True)
    test_distance_df['kurt'] = test_distance_df.kurt(axis=1, numeric_only=True)
    test_distance_df['mad'] = test_distance_df.mad(axis=1)
    test_distance_df['shortest_dist_sd'] = test_smallest_dist_df / test_sd
    test_distance_df['shortest_dist_diff'] = (test_mean - test_min) / test_sd
    test_distance_df['sd_diff_smallest2next'] = (test_2nd_smallest_dist_df - test_min) / test_sd 
    test_distance_df['known'] = pd.Series([0 for _ in range(len(test_distance_df.index))]) 
    
    
    combined_df = pd.concat([val_distance_df, test_distance_df], axis=0, ignore_index=True).filter(items=training_cols, axis='columns')
    training_df = pd.concat([training_df, combined_df], axis=0, ignore_index=True)

Check counts of labels

In [49]:
training_df['known'].value_counts()

1    45000
0    45000
Name: known, dtype: int64

Clean up the dataset

In [50]:
training_df = training_df.replace([np.inf, -np.inf], np.nan)
training_df = training_df.replace(np.nan, 0)

Save the labelled dataset obtained from KNN models

In [51]:
joblib.dump(training_df, 'pickles/known_classifier_dataset_v4.pkl', compress=9)

['pickles/known_classifier_dataset_v4.pkl']

Load the labelled dataset obtained from KNN models

In [52]:
training_df = joblib.load('pickles/known_classifier_dataset_v4.pkl')

In [118]:
training_df.head()

Unnamed: 0,sd,sem,var,skew,kurt,mad,shortest_dist_sd,shortest_dist_diff,sd_diff_smallest2next,known
0,1.4e-05,0.028573,0.012098,-1.336372,10.752406,1.413319,19755.275816,0.974932,2.185684,1
1,0.004105,0.00142,2.6e-05,-0.359471,12.909395,1.715045,3.768646,1.942897,2.882156,1
2,0.002004,0.000802,9e-06,-0.434234,12.976511,1.7251,5.493141,1.27881,1.885052,1
3,0.062109,0.018761,0.003192,2.760041,12.842279,1.902902,3.101878,0.501448,0.733738,1
4,70.996114,22.394521,5703.710788,3.451707,12.918827,742.258942,3.667352,0.966265,1.835213,1


### Prepare feature engineered dataset, splitting to train-test using 80-20 ratio
Scale using standard scaling so that models are able to better learn from dataset

In [53]:
x, y = training_df.drop('known', axis=1), training_df['known']
class_weights = compute_class_weight('balanced', np.unique(y), y)
scaler = StandardScaler()
x = scaler.fit_transform(x)
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=1)

### Logistic Regression Model for KnownUnknown Class Detection
#### Using Stochastic Average Gradient for negate the large dataset impact on training time

In [54]:
log_reg = LogisticRegression(penalty='l2', dual=False, class_weight='balanced', random_state=5, max_iter=500, 
                             multi_class='ovr', verbose=2, n_jobs=-1, solver='sag')
log_reg.fit(x_train, y_train)

convergence after 30 epochs took 1 seconds


[Parallel(n_jobs=-1)]: Done   1 out of   1 | elapsed:    3.0s finished


LogisticRegression(C=1.0, class_weight='balanced', dual=False,
          fit_intercept=True, intercept_scaling=1, max_iter=500,
          multi_class='ovr', n_jobs=-1, penalty='l2', random_state=5,
          solver='sag', tol=0.0001, verbose=2, warm_start=False)

Logistic Regression Model Predictions

In [55]:
log_reg_preds = log_reg.predict(x_test)

Classification Report to validate model performance

In [56]:
print(classification_report(y_test, log_reg_preds))

             precision    recall  f1-score   support

          0       0.58      0.68      0.62      8980
          1       0.61      0.51      0.56      9020

avg / total       0.60      0.59      0.59     18000



### Random Forest Model for KnownUnknown Class Detection
#### Using 1000 Decision Trees, Bagging and balancing class weights

In [57]:
rf = RandomForestClassifier(1000, criterion='gini', oob_score=True, n_jobs=-1,
                            random_state=5, class_weight='balanced', verbose=1)
rf.fit(x_train, y_train)

[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:    3.6s
[Parallel(n_jobs=-1)]: Done 192 tasks      | elapsed:   16.6s
[Parallel(n_jobs=-1)]: Done 442 tasks      | elapsed:   36.3s
[Parallel(n_jobs=-1)]: Done 792 tasks      | elapsed:  1.0min
[Parallel(n_jobs=-1)]: Done 1000 out of 1000 | elapsed:  1.3min finished


RandomForestClassifier(bootstrap=True, class_weight='balanced',
            criterion='gini', max_depth=None, max_features='auto',
            max_leaf_nodes=None, min_impurity_decrease=0.0,
            min_impurity_split=None, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=1000, n_jobs=-1, oob_score=True, random_state=5,
            verbose=1, warm_start=False)

Random Forest Model Predictions

In [58]:
rf_preds = rf.predict(x_test)

[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    0.0s
[Parallel(n_jobs=4)]: Done 192 tasks      | elapsed:    1.4s
[Parallel(n_jobs=4)]: Done 442 tasks      | elapsed:    2.1s
[Parallel(n_jobs=4)]: Done 792 tasks      | elapsed:    3.1s
[Parallel(n_jobs=4)]: Done 1000 out of 1000 | elapsed:    3.6s finished


In [59]:
print(classification_report(y_test, rf_preds))

             precision    recall  f1-score   support

          0       0.92      0.87      0.89      8980
          1       0.87      0.92      0.90      9020

avg / total       0.90      0.90      0.90     18000



### XGBoost Model for KnownUnknown Class Detection

In [60]:
dtrain = xgb.DMatrix(x_train, label=np.array(y_train))
dtest = xgb.DMatrix(x_test, label=np.array(y_test))

In [124]:
x_train.shape

(72000, 9)

In [76]:
param = {
    'max_depth': 10,
    'eta': 0.1,
    'silent': 0,
    'objective': 'binary:logistic',
    'eval_metric': 'error'
}
num_round = 1000

In [77]:
bst = xgb.train(param, dtrain, num_round, evals=[(dtest,'eval'), (dtrain,'train')])

[0]	eval-error:0.243778	train-error:0.240611
[1]	eval-error:0.240833	train-error:0.235528
[2]	eval-error:0.242167	train-error:0.236486
[3]	eval-error:0.242444	train-error:0.234361
[4]	eval-error:0.242722	train-error:0.234861
[5]	eval-error:0.244667	train-error:0.239167
[6]	eval-error:0.244278	train-error:0.238736
[7]	eval-error:0.244	train-error:0.238125
[8]	eval-error:0.239722	train-error:0.233222
[9]	eval-error:0.240222	train-error:0.234194
[10]	eval-error:0.239389	train-error:0.233181
[11]	eval-error:0.239778	train-error:0.233319
[12]	eval-error:0.239944	train-error:0.233278
[13]	eval-error:0.240611	train-error:0.233458
[14]	eval-error:0.240167	train-error:0.232972
[15]	eval-error:0.231	train-error:0.220639
[16]	eval-error:0.2325	train-error:0.221653
[17]	eval-error:0.232556	train-error:0.222236
[18]	eval-error:0.232944	train-error:0.221903
[19]	eval-error:0.224333	train-error:0.213069
[20]	eval-error:0.224111	train-error:0.212778
[21]	eval-error:0.223056	train-error:0.211944
[22]	e

[178]	eval-error:0.207167	train-error:0.17475
[179]	eval-error:0.206833	train-error:0.174667
[180]	eval-error:0.206722	train-error:0.174375
[181]	eval-error:0.206778	train-error:0.174236
[182]	eval-error:0.206722	train-error:0.174194
[183]	eval-error:0.206556	train-error:0.174153
[184]	eval-error:0.206556	train-error:0.174069
[185]	eval-error:0.206611	train-error:0.173986
[186]	eval-error:0.206556	train-error:0.173903
[187]	eval-error:0.206444	train-error:0.173903
[188]	eval-error:0.206444	train-error:0.173764
[189]	eval-error:0.206444	train-error:0.173764
[190]	eval-error:0.206444	train-error:0.173625
[191]	eval-error:0.206611	train-error:0.173278
[192]	eval-error:0.2065	train-error:0.173
[193]	eval-error:0.2065	train-error:0.173
[194]	eval-error:0.2065	train-error:0.172972
[195]	eval-error:0.2065	train-error:0.172958
[196]	eval-error:0.2065	train-error:0.172931
[197]	eval-error:0.2065	train-error:0.172917
[198]	eval-error:0.206444	train-error:0.172903
[199]	eval-error:0.2065	train-er

[355]	eval-error:0.203944	train-error:0.165417
[356]	eval-error:0.203778	train-error:0.165375
[357]	eval-error:0.203667	train-error:0.165389
[358]	eval-error:0.203778	train-error:0.165347
[359]	eval-error:0.203778	train-error:0.165347
[360]	eval-error:0.203944	train-error:0.165292
[361]	eval-error:0.203778	train-error:0.165278
[362]	eval-error:0.203889	train-error:0.165125
[363]	eval-error:0.204111	train-error:0.165125
[364]	eval-error:0.204167	train-error:0.165111
[365]	eval-error:0.204278	train-error:0.165042
[366]	eval-error:0.204278	train-error:0.165042
[367]	eval-error:0.204333	train-error:0.165042
[368]	eval-error:0.204444	train-error:0.164972
[369]	eval-error:0.204444	train-error:0.164972
[370]	eval-error:0.204389	train-error:0.164972
[371]	eval-error:0.204389	train-error:0.164972
[372]	eval-error:0.2045	train-error:0.164958
[373]	eval-error:0.2045	train-error:0.164931
[374]	eval-error:0.204278	train-error:0.164931
[375]	eval-error:0.204389	train-error:0.164819
[376]	eval-error:

[531]	eval-error:0.203222	train-error:0.162056
[532]	eval-error:0.203111	train-error:0.162028
[533]	eval-error:0.203111	train-error:0.162
[534]	eval-error:0.203	train-error:0.161903
[535]	eval-error:0.203056	train-error:0.161861
[536]	eval-error:0.203	train-error:0.161806
[537]	eval-error:0.202944	train-error:0.161736
[538]	eval-error:0.202778	train-error:0.161667
[539]	eval-error:0.202722	train-error:0.161653
[540]	eval-error:0.202889	train-error:0.161639
[541]	eval-error:0.202889	train-error:0.161625
[542]	eval-error:0.202944	train-error:0.161625
[543]	eval-error:0.203	train-error:0.161625
[544]	eval-error:0.203056	train-error:0.161611
[545]	eval-error:0.203111	train-error:0.161597
[546]	eval-error:0.203056	train-error:0.161583
[547]	eval-error:0.203056	train-error:0.161528
[548]	eval-error:0.203056	train-error:0.161528
[549]	eval-error:0.203167	train-error:0.161528
[550]	eval-error:0.203222	train-error:0.161542
[551]	eval-error:0.203167	train-error:0.161458
[552]	eval-error:0.203333

[707]	eval-error:0.203167	train-error:0.160125
[708]	eval-error:0.203278	train-error:0.160111
[709]	eval-error:0.203222	train-error:0.160111
[710]	eval-error:0.203222	train-error:0.160111
[711]	eval-error:0.203222	train-error:0.160097
[712]	eval-error:0.203222	train-error:0.160097
[713]	eval-error:0.203222	train-error:0.160097
[714]	eval-error:0.203222	train-error:0.160097
[715]	eval-error:0.203222	train-error:0.160083
[716]	eval-error:0.203222	train-error:0.160083
[717]	eval-error:0.203222	train-error:0.160083
[718]	eval-error:0.203222	train-error:0.160083
[719]	eval-error:0.203222	train-error:0.160083
[720]	eval-error:0.203222	train-error:0.160083
[721]	eval-error:0.203111	train-error:0.160083
[722]	eval-error:0.203167	train-error:0.160083
[723]	eval-error:0.203111	train-error:0.160083
[724]	eval-error:0.203111	train-error:0.160083
[725]	eval-error:0.203111	train-error:0.160083
[726]	eval-error:0.203111	train-error:0.160083
[727]	eval-error:0.203111	train-error:0.160083
[728]	eval-er

[884]	eval-error:0.2035	train-error:0.159292
[885]	eval-error:0.2035	train-error:0.159292
[886]	eval-error:0.203611	train-error:0.159292
[887]	eval-error:0.203611	train-error:0.159306
[888]	eval-error:0.203611	train-error:0.159306
[889]	eval-error:0.203611	train-error:0.159319
[890]	eval-error:0.203444	train-error:0.159292
[891]	eval-error:0.203444	train-error:0.159292
[892]	eval-error:0.203444	train-error:0.159306
[893]	eval-error:0.2035	train-error:0.159306
[894]	eval-error:0.203333	train-error:0.159306
[895]	eval-error:0.203333	train-error:0.159306
[896]	eval-error:0.203333	train-error:0.159306
[897]	eval-error:0.203278	train-error:0.159292
[898]	eval-error:0.203333	train-error:0.159292
[899]	eval-error:0.203333	train-error:0.159292
[900]	eval-error:0.203278	train-error:0.159292
[901]	eval-error:0.203333	train-error:0.159292
[902]	eval-error:0.203389	train-error:0.159306
[903]	eval-error:0.203389	train-error:0.159306
[904]	eval-error:0.203333	train-error:0.159306
[905]	eval-error:0.

In [155]:
xgb_prob_preds = bst.predict(dtest)
xgb_preds = np.asarray([1 if line > 0.5 else 0 for line in xgb_prob_preds]).astype('int64')

In [79]:
print(classification_report(y_test, rf_preds))

             precision    recall  f1-score   support

          0       0.92      0.87      0.89      8980
          1       0.87      0.92      0.90      9020

avg / total       0.90      0.90      0.90     18000



### Neural Network Model for KnownUnknown Class Detection (3 Hidden Layers)
#### Using relu as activation functions for hidden layers due to its robustness and the absence of vanishing gradient problem when using it. Sigmoid is used as the activation function for the output as it is a binary classification problem.

#### Loss function of binary crossentropy is used as it is a binary problem.

In [112]:
model_name = 'known_classifier_nn.h5'

nn_model = Sequential()

nn_model.add(Dense(250, activation='relu', input_shape=tuple(x.shape[1:])))
nn_model.add(Dropout(0.5))
nn_model.add(Dense(500, activation='relu'))
nn_model.add(Dropout(0.5))
nn_model.add(Dense(1000, activation='relu'))
nn_model.add(Dropout(0.5))
nn_model.add(Dense(500, activation='relu'))
nn_model.add(Dropout(0.5))
nn_model.add(Dense(250, activation='relu'))
nn_model.add(Dropout(0.5))
nn_model.add(Dense(1, activation='sigmoid'))

op = optimizers.Adam(lr=0.0001)

nn_model.compile(optimizer=op, metrics=['accuracy'], loss='binary_crossentropy')

save_checkpoint = ModelCheckpoint(model_name, save_best_only=True, verbose=1)
early_stop = EarlyStopping(min_delta=0.01, patience=500, verbose=1, mode='min')

nn_model.fit(x_train, y_train, epochs=50000, batch_size=16, verbose=2, 
             class_weight=class_weights, shuffle=True, validation_split=0.2, callbacks=[save_checkpoint, early_stop])

Train on 57600 samples, validate on 14400 samples
Epoch 1/50000
 - 122s - loss: 0.6868 - acc: 0.5554 - val_loss: 0.6709 - val_acc: 0.5875

Epoch 00001: val_loss improved from inf to 0.67086, saving model to known_classifier_nn.h5
Epoch 2/50000
 - 123s - loss: 0.6740 - acc: 0.5789 - val_loss: 0.6626 - val_acc: 0.6030

Epoch 00002: val_loss improved from 0.67086 to 0.66258, saving model to known_classifier_nn.h5
Epoch 3/50000
 - 115s - loss: 0.6677 - acc: 0.5895 - val_loss: 0.6559 - val_acc: 0.6033

Epoch 00003: val_loss improved from 0.66258 to 0.65590, saving model to known_classifier_nn.h5
Epoch 4/50000
 - 120s - loss: 0.6592 - acc: 0.5960 - val_loss: 0.6382 - val_acc: 0.6188

Epoch 00004: val_loss improved from 0.65590 to 0.63821, saving model to known_classifier_nn.h5
Epoch 5/50000
 - 120s - loss: 0.6442 - acc: 0.6060 - val_loss: 0.6275 - val_acc: 0.6284

Epoch 00005: val_loss improved from 0.63821 to 0.62753, saving model to known_classifier_nn.h5
Epoch 6/50000
 - 121s - loss: 0.63


Epoch 00048: val_loss improved from 0.54677 to 0.54138, saving model to known_classifier_nn.h5
Epoch 49/50000
 - 135s - loss: 0.5623 - acc: 0.6859 - val_loss: 0.5412 - val_acc: 0.7062

Epoch 00049: val_loss improved from 0.54138 to 0.54123, saving model to known_classifier_nn.h5
Epoch 50/50000
 - 153s - loss: 0.5611 - acc: 0.6878 - val_loss: 0.5365 - val_acc: 0.7085

Epoch 00050: val_loss improved from 0.54123 to 0.53650, saving model to known_classifier_nn.h5
Epoch 51/50000
 - 127s - loss: 0.5584 - acc: 0.6896 - val_loss: 0.5377 - val_acc: 0.7080

Epoch 00051: val_loss did not improve
Epoch 52/50000
 - 129s - loss: 0.5557 - acc: 0.6903 - val_loss: 0.5398 - val_acc: 0.7063

Epoch 00052: val_loss did not improve
Epoch 53/50000
 - 127s - loss: 0.5552 - acc: 0.6909 - val_loss: 0.5357 - val_acc: 0.7058

Epoch 00053: val_loss improved from 0.53650 to 0.53566, saving model to known_classifier_nn.h5
Epoch 54/50000
 - 127s - loss: 0.5559 - acc: 0.6911 - val_loss: 0.5331 - val_acc: 0.7102

Epo


Epoch 00102: val_loss did not improve
Epoch 103/50000
 - 128s - loss: 0.5264 - acc: 0.7153 - val_loss: 0.5052 - val_acc: 0.7336

Epoch 00103: val_loss did not improve
Epoch 104/50000
 - 150s - loss: 0.5265 - acc: 0.7159 - val_loss: 0.5007 - val_acc: 0.7375

Epoch 00104: val_loss did not improve
Epoch 105/50000
 - 137s - loss: 0.5237 - acc: 0.7167 - val_loss: 0.5009 - val_acc: 0.7356

Epoch 00105: val_loss did not improve
Epoch 106/50000
 - 145s - loss: 0.5243 - acc: 0.7182 - val_loss: 0.4994 - val_acc: 0.7382

Epoch 00106: val_loss did not improve
Epoch 107/50000
 - 145s - loss: 0.5230 - acc: 0.7178 - val_loss: 0.4971 - val_acc: 0.7427

Epoch 00107: val_loss improved from 0.49825 to 0.49713, saving model to known_classifier_nn.h5
Epoch 108/50000
 - 154s - loss: 0.5218 - acc: 0.7182 - val_loss: 0.4969 - val_acc: 0.7379

Epoch 00108: val_loss improved from 0.49713 to 0.49689, saving model to known_classifier_nn.h5
Epoch 109/50000
 - 157s - loss: 0.5239 - acc: 0.7175 - val_loss: 0.4974 -


Epoch 00162: val_loss did not improve
Epoch 163/50000
 - 128s - loss: 0.5170 - acc: 0.7219 - val_loss: 0.4847 - val_acc: 0.7482

Epoch 00163: val_loss did not improve
Epoch 164/50000
 - 121s - loss: 0.5176 - acc: 0.7200 - val_loss: 0.4923 - val_acc: 0.7460

Epoch 00164: val_loss did not improve
Epoch 165/50000
 - 120s - loss: 0.5182 - acc: 0.7200 - val_loss: 0.4833 - val_acc: 0.7489

Epoch 00165: val_loss did not improve
Epoch 166/50000
 - 130s - loss: 0.5181 - acc: 0.7199 - val_loss: 0.4860 - val_acc: 0.7476

Epoch 00166: val_loss did not improve
Epoch 167/50000
 - 114s - loss: 0.5184 - acc: 0.7227 - val_loss: 0.4813 - val_acc: 0.7527

Epoch 00167: val_loss improved from 0.48225 to 0.48133, saving model to known_classifier_nn.h5
Epoch 168/50000
 - 116s - loss: 0.5172 - acc: 0.7215 - val_loss: 0.4822 - val_acc: 0.7494

Epoch 00168: val_loss did not improve
Epoch 169/50000
 - 113s - loss: 0.5098 - acc: 0.7252 - val_loss: 0.4794 - val_acc: 0.7484

Epoch 00169: val_loss improved from 0.4

 - 114s - loss: 0.5032 - acc: 0.7308 - val_loss: 0.4727 - val_acc: 0.7549

Epoch 00221: val_loss did not improve
Epoch 222/50000
 - 116s - loss: 0.5050 - acc: 0.7307 - val_loss: 0.4759 - val_acc: 0.7503

Epoch 00222: val_loss did not improve
Epoch 223/50000
 - 113s - loss: 0.5059 - acc: 0.7297 - val_loss: 0.4738 - val_acc: 0.7542

Epoch 00223: val_loss did not improve
Epoch 224/50000
 - 113s - loss: 0.5115 - acc: 0.7249 - val_loss: 0.4798 - val_acc: 0.7554

Epoch 00224: val_loss did not improve
Epoch 225/50000
 - 114s - loss: 0.5146 - acc: 0.7219 - val_loss: 0.4771 - val_acc: 0.7558

Epoch 00225: val_loss did not improve
Epoch 226/50000
 - 114s - loss: 0.5123 - acc: 0.7264 - val_loss: 0.4781 - val_acc: 0.7542

Epoch 00226: val_loss did not improve
Epoch 227/50000
 - 114s - loss: 0.5091 - acc: 0.7278 - val_loss: 0.4795 - val_acc: 0.7539

Epoch 00227: val_loss did not improve
Epoch 228/50000
 - 114s - loss: 0.5094 - acc: 0.7271 - val_loss: 0.4804 - val_acc: 0.7533

Epoch 00228: val_loss 


Epoch 00282: val_loss did not improve
Epoch 283/50000
 - 115s - loss: 0.5012 - acc: 0.7349 - val_loss: 0.4719 - val_acc: 0.7586

Epoch 00283: val_loss did not improve
Epoch 284/50000
 - 114s - loss: 0.5049 - acc: 0.7289 - val_loss: 0.4793 - val_acc: 0.7571

Epoch 00284: val_loss did not improve
Epoch 285/50000
 - 116s - loss: 0.5085 - acc: 0.7308 - val_loss: 0.4684 - val_acc: 0.7584

Epoch 00285: val_loss did not improve
Epoch 286/50000
 - 114s - loss: 0.5053 - acc: 0.7318 - val_loss: 0.4674 - val_acc: 0.7597

Epoch 00286: val_loss did not improve
Epoch 287/50000
 - 115s - loss: 0.5059 - acc: 0.7292 - val_loss: 0.4756 - val_acc: 0.7541

Epoch 00287: val_loss did not improve
Epoch 288/50000
 - 111s - loss: 0.5099 - acc: 0.7271 - val_loss: 0.4741 - val_acc: 0.7569

Epoch 00288: val_loss did not improve
Epoch 289/50000
 - 115s - loss: 0.5091 - acc: 0.7261 - val_loss: 0.4730 - val_acc: 0.7590

Epoch 00289: val_loss did not improve
Epoch 290/50000
 - 116s - loss: 0.5057 - acc: 0.7315 - val


Epoch 00345: val_loss did not improve
Epoch 346/50000
 - 118s - loss: 0.4982 - acc: 0.7345 - val_loss: 0.4671 - val_acc: 0.7601

Epoch 00346: val_loss did not improve
Epoch 347/50000
 - 112s - loss: 0.4996 - acc: 0.7359 - val_loss: 0.4661 - val_acc: 0.7582

Epoch 00347: val_loss did not improve
Epoch 348/50000
 - 117s - loss: 0.5021 - acc: 0.7355 - val_loss: 0.4709 - val_acc: 0.7585

Epoch 00348: val_loss did not improve
Epoch 349/50000
 - 116s - loss: 0.5047 - acc: 0.7324 - val_loss: 0.4662 - val_acc: 0.7602

Epoch 00349: val_loss did not improve
Epoch 350/50000
 - 115s - loss: 0.4986 - acc: 0.7360 - val_loss: 0.4641 - val_acc: 0.7627

Epoch 00350: val_loss improved from 0.46414 to 0.46407, saving model to known_classifier_nn.h5
Epoch 351/50000
 - 114s - loss: 0.4987 - acc: 0.7351 - val_loss: 0.4650 - val_acc: 0.7616

Epoch 00351: val_loss did not improve
Epoch 352/50000
 - 113s - loss: 0.4985 - acc: 0.7369 - val_loss: 0.4703 - val_acc: 0.7583

Epoch 00352: val_loss did not improve
E

 - 108s - loss: 0.5040 - acc: 0.7323 - val_loss: 0.4662 - val_acc: 0.7592

Epoch 00408: val_loss did not improve
Epoch 409/50000
 - 106s - loss: 0.5026 - acc: 0.7337 - val_loss: 0.4672 - val_acc: 0.7594

Epoch 00409: val_loss did not improve
Epoch 410/50000
 - 106s - loss: 0.5044 - acc: 0.7325 - val_loss: 0.4711 - val_acc: 0.7585

Epoch 00410: val_loss did not improve
Epoch 411/50000
 - 111s - loss: 0.5015 - acc: 0.7341 - val_loss: 0.4682 - val_acc: 0.7602

Epoch 00411: val_loss did not improve
Epoch 412/50000
 - 111s - loss: 0.5015 - acc: 0.7339 - val_loss: 0.4639 - val_acc: 0.7606

Epoch 00412: val_loss did not improve
Epoch 413/50000
 - 105s - loss: 0.4995 - acc: 0.7338 - val_loss: 0.4615 - val_acc: 0.7626

Epoch 00413: val_loss did not improve
Epoch 414/50000
 - 109s - loss: 0.4996 - acc: 0.7354 - val_loss: 0.4637 - val_acc: 0.7594

Epoch 00414: val_loss did not improve
Epoch 415/50000
 - 110s - loss: 0.5024 - acc: 0.7320 - val_loss: 0.4671 - val_acc: 0.7586

Epoch 00415: val_loss 


Epoch 00471: val_loss did not improve
Epoch 472/50000
 - 108s - loss: 0.4976 - acc: 0.7353 - val_loss: 0.4639 - val_acc: 0.7588

Epoch 00472: val_loss did not improve
Epoch 473/50000
 - 106s - loss: 0.4983 - acc: 0.7364 - val_loss: 0.4625 - val_acc: 0.7601

Epoch 00473: val_loss did not improve
Epoch 474/50000
 - 105s - loss: 0.4974 - acc: 0.7375 - val_loss: 0.4638 - val_acc: 0.7587

Epoch 00474: val_loss did not improve
Epoch 475/50000
 - 107s - loss: 0.5002 - acc: 0.7364 - val_loss: 0.4657 - val_acc: 0.7603

Epoch 00475: val_loss did not improve
Epoch 476/50000
 - 107s - loss: 0.4980 - acc: 0.7374 - val_loss: 0.4663 - val_acc: 0.7617

Epoch 00476: val_loss did not improve
Epoch 477/50000
 - 105s - loss: 0.4961 - acc: 0.7366 - val_loss: 0.4682 - val_acc: 0.7576

Epoch 00477: val_loss did not improve
Epoch 478/50000
 - 105s - loss: 0.4986 - acc: 0.7361 - val_loss: 0.4644 - val_acc: 0.7598

Epoch 00478: val_loss did not improve
Epoch 479/50000
 - 108s - loss: 0.4960 - acc: 0.7372 - val

Epoch 535/50000
 - 106s - loss: 0.4993 - acc: 0.7367 - val_loss: 0.4635 - val_acc: 0.7613

Epoch 00535: val_loss did not improve
Epoch 536/50000
 - 110s - loss: 0.4963 - acc: 0.7367 - val_loss: 0.4643 - val_acc: 0.7580

Epoch 00536: val_loss did not improve
Epoch 537/50000
 - 106s - loss: 0.4998 - acc: 0.7364 - val_loss: 0.4629 - val_acc: 0.7617

Epoch 00537: val_loss did not improve
Epoch 538/50000
 - 105s - loss: 0.4992 - acc: 0.7356 - val_loss: 0.4707 - val_acc: 0.7592

Epoch 00538: val_loss did not improve
Epoch 539/50000
 - 105s - loss: 0.4980 - acc: 0.7355 - val_loss: 0.4626 - val_acc: 0.7610

Epoch 00539: val_loss did not improve
Epoch 540/50000
 - 105s - loss: 0.4963 - acc: 0.7375 - val_loss: 0.4641 - val_acc: 0.7610

Epoch 00540: val_loss did not improve
Epoch 541/50000
 - 107s - loss: 0.4972 - acc: 0.7367 - val_loss: 0.4638 - val_acc: 0.7595

Epoch 00541: val_loss did not improve
Epoch 542/50000
 - 108s - loss: 0.4961 - acc: 0.7376 - val_loss: 0.4634 - val_acc: 0.7582

Epoch 

 - 107s - loss: 0.5007 - acc: 0.7350 - val_loss: 0.4640 - val_acc: 0.7597

Epoch 00598: val_loss did not improve
Epoch 599/50000
 - 106s - loss: 0.4990 - acc: 0.7359 - val_loss: 0.4700 - val_acc: 0.7549

Epoch 00599: val_loss did not improve
Epoch 600/50000
 - 111s - loss: 0.4991 - acc: 0.7353 - val_loss: 0.4642 - val_acc: 0.7598

Epoch 00600: val_loss did not improve
Epoch 601/50000
 - 105s - loss: 0.4984 - acc: 0.7368 - val_loss: 0.4679 - val_acc: 0.7553

Epoch 00601: val_loss did not improve
Epoch 602/50000
 - 104s - loss: 0.4969 - acc: 0.7370 - val_loss: 0.4632 - val_acc: 0.7608

Epoch 00602: val_loss did not improve
Epoch 603/50000
 - 106s - loss: 0.4983 - acc: 0.7370 - val_loss: 0.4632 - val_acc: 0.7576

Epoch 00603: val_loss did not improve
Epoch 604/50000
 - 105s - loss: 0.5036 - acc: 0.7330 - val_loss: 0.4818 - val_acc: 0.7430

Epoch 00604: val_loss did not improve
Epoch 605/50000
 - 105s - loss: 0.5067 - acc: 0.7321 - val_loss: 0.4732 - val_acc: 0.7517

Epoch 00605: val_loss 

 - 108s - loss: 0.5039 - acc: 0.7283 - val_loss: 0.4808 - val_acc: 0.7451

Epoch 00662: val_loss did not improve
Epoch 663/50000
 - 107s - loss: 0.5112 - acc: 0.7245 - val_loss: 0.4789 - val_acc: 0.7417

Epoch 00663: val_loss did not improve
Epoch 664/50000
 - 109s - loss: 0.5029 - acc: 0.7303 - val_loss: 0.4598 - val_acc: 0.7627

Epoch 00664: val_loss did not improve
Epoch 665/50000
 - 108s - loss: 0.5069 - acc: 0.7299 - val_loss: 0.4771 - val_acc: 0.7565

Epoch 00665: val_loss did not improve
Epoch 666/50000
 - 106s - loss: 0.5024 - acc: 0.7337 - val_loss: 0.4657 - val_acc: 0.7589

Epoch 00666: val_loss did not improve
Epoch 667/50000
 - 107s - loss: 0.5008 - acc: 0.7350 - val_loss: 0.4652 - val_acc: 0.7597

Epoch 00667: val_loss did not improve
Epoch 668/50000
 - 105s - loss: 0.4960 - acc: 0.7369 - val_loss: 0.4690 - val_acc: 0.7541

Epoch 00668: val_loss did not improve
Epoch 669/50000
 - 110s - loss: 0.4975 - acc: 0.7352 - val_loss: 0.4645 - val_acc: 0.7590

Epoch 00669: val_loss 

KeyboardInterrupt: 

Neural Network Model Predictions

In [81]:
nn_preds = nn_model.predict(x_test)
nn_preds = np.array([1 if x > 0.5 else 0 for x in nn_preds]).astype('int64')

In [82]:
print(classification_report(y_test, nn_preds))

             precision    recall  f1-score   support

          0       0.73      0.67      0.70      8980
          1       0.69      0.75      0.72      9020

avg / total       0.71      0.71      0.71     18000



### Accuracy scores for all 4 models

In [83]:
y_test = np.array(y_test)

print('Logistic Regression Model F1 Score and AUC: %f, %f, %f' % (accuracy_score(y_test, log_reg_preds), f1_score(y_test, log_reg_preds), roc_auc_score(y_test, log_reg_preds)))
print('Random Forest Model F1 Score and AUC: %f, %f, %f' % (accuracy_score(y_test, rf_preds), f1_score(y_test, rf_preds), roc_auc_score(y_test, rf_preds)))
print('XGBoost Model F1 Score and AUC: %f, %f, %f' % (accuracy_score(y_test, xgb_preds), f1_score(y_test, xgb_preds), roc_auc_score(y_test, xgb_preds)))
print('Neural Network Model F1 Score and AUC: %f, %f, %f' % (accuracy_score(y_test, nn_preds), f1_score(y_test, nn_preds), roc_auc_score(y_test, nn_preds)))

Logistic Regression Model F1 Score and AUC: 0.594167, 0.558637, 0.594348
Random Forest Model F1 Score and AUC: 0.895389, 0.898430, 0.895327
XGBoost Model F1 Score and AUC: 0.797222, 0.777385, 0.797424
Neural Network Model F1 Score and AUC: 0.708278, 0.719932, 0.708189


### Predicting on test set using best model

Loading submission dataset and filter to 57 features

In [84]:
submission_data = pd.read_csv('data/hackathon_IoT_validation_set_based_on_01mar2017_ANONYMIZED.csv')
submission_data_filtered = submission_data.filter(items=trimmed_feats_list, axis='columns')

Training KNN Model first to obtain distances

In [88]:
knn_x, knn_y = filtered_df.iloc[:, :-1].sample(200000), filtered_df.iloc[:, -1].sample(200000)
real_knn = KNeighborsClassifier(n_neighbors=10, weights='uniform', algorithm='auto', p=2, n_jobs=-1, leaf_size=20)
real_knn.fit(knn_x, knn_y)
sub_output = real_knn.kneighbors(submission_data_filtered, n_neighbors=10, return_distance=True)

Obtaining engineered features from submission dataset

In [94]:
sub_distance_df = pd.DataFrame([[distance for distance in row] for row in sub_output[0]])
sub_sd = sub_distance_df.std(axis=1, numeric_only=True)
sub_min = sub_distance_df.min(axis=1, numeric_only=True)
sub_mean = sub_distance_df.mean(axis=1, numeric_only=True)
sub_2_smallest_dist_df = pd.DataFrame(np.sort(sub_distance_df.values))
sub_2nd_smallest_dist_df = sub_2_smallest_dist_df.iloc[:, -2]
sub_smallest_dist_df = sub_2_smallest_dist_df.iloc[:, -1]

sub_distance_df['sd'] = sub_sd
sub_distance_df['sem'] = sub_distance_df.sem(axis=1, numeric_only=True)
sub_distance_df['var'] = sub_distance_df.var(axis=1, numeric_only=True)
sub_distance_df['skew'] = sub_distance_df.skew(axis=1, numeric_only=True)
sub_distance_df['kurt'] = sub_distance_df.kurt(axis=1, numeric_only=True)
sub_distance_df['mad'] = sub_distance_df.mad(axis=1)
sub_distance_df['shortest_dist_sd'] = sub_smallest_dist_df / sub_sd
sub_distance_df['shortest_dist_diff'] = (sub_mean - sub_min) / sub_sd
sub_distance_df['sd_diff_smallest2next'] = (sub_2nd_smallest_dist_df - sub_min) / sub_sd 

In [95]:
sub_distance_df = sub_distance_df.replace([np.inf, -np.inf], np.nan)
sub_distance_df = sub_distance_df.replace(np.nan, 0)

In [160]:
bst.feature_names

['f0', 'f1', 'f2', 'f3', 'f4', 'f5', 'f6', 'f7', 'f8']

In [161]:
known_sub_x = sub_distance_df.filter(items=training_cols, axis='columns')

# additional preprocessing for xgboost
sub_x = known_sub_x
sub_x.columns = bst.feature_names
sub_x = xgb.DMatrix(sub_x, label=np.zeros((sub_x.shape[0], 1)))

# known_preds = log_reg.predict(known_sub_x)
# known_preds = rf.predict(known_sub_x)
known_preds = bst.predict(sub_x)
# known_preds = nn_model.predict(known_sub_x)

In [162]:
known_preds

array([0.5200784 , 0.57257104, 0.5200784 , 0.5200784 , 0.5200784 ,
       0.660773  , 0.660773  , 0.660773  , 0.65189266, 0.660773  ,
       0.49470606, 0.56060416, 0.660773  , 0.65189266, 0.5200784 ,
       0.49470606, 0.660773  , 0.49470606, 0.5200784 , 0.660773  ,
       0.5200784 , 0.40510288, 0.660773  , 0.65189266, 0.49470606,
       0.65189266, 0.65189266, 0.49470606, 0.57257104, 0.5200784 ,
       0.65189266, 0.49470606, 0.57257104, 0.65189266, 0.5200784 ,
       0.65189266, 0.660773  , 0.660773  , 0.5200784 , 0.65189266,
       0.660773  , 0.65189266, 0.5200784 , 0.56060416, 0.660773  ,
       0.5200784 , 0.65189266, 0.5200784 , 0.5200784 , 0.660773  ,
       0.56060416, 0.56060416, 0.49470606, 0.65189266, 0.5200784 ,
       0.65189266, 0.5200784 , 0.65189266, 0.5200784 , 0.5200784 ,
       0.5200784 , 0.49470606, 0.5200784 , 0.6612021 , 0.5200784 ,
       0.65189266, 0.5200784 , 0.6271014 , 0.5200784 , 0.660773  ,
       0.57257104, 0.5200784 , 0.5200784 , 0.65189266, 0.66077

In [163]:
known_preds_list = known_preds.tolist()


# log reg model output conversion
# No additional steps required

# rf model output conversion
# No additional steps required

# xgb model output conversion
known_preds_list = [1 if x > 0.5 else 0 for x in known_preds_list]

# nn model output conversion
# known_preds_list = [1 if x[0] > 0.5 else 0 for x in known_preds_list]


formatted_preds = pd.Series(['unknown' if row == 0 else None for row in known_preds_list])

In [164]:
model1_data_filtered = pd.concat([submission_data_filtered, formatted_preds], axis=1, )

In [165]:
model1_data_filtered.head()

Unnamed: 0,ttl_avg,domain_is_samsung,http_dom_host_alexaRank,http_has_location,packet_inter_arrivel_avg,packet_inter_arrivel_firstQ,packet_inter_arrivel_max,packet_inter_arrivel_A_avg,packet_inter_arrivel_stdev,packet_inter_arrivel_median,...,packet_size_A_min,ttl_thirdQ,B_is_dynamic_and_or_private_port,packet_inter_arrivel_A_entropy,domain_is_dlink,packet_inter_arrivel_A_median,suffix_is_cloudfront.net,http_cookie_values_entropy,http_inter_arrivel_stdev,0
0,58.518519,0,2147483647,0,4.653678,0.015224,37.375881,7.562226,9.632666,0.176993,...,0,64.0,0,4.0,0,0.430734,0,0,0.0,
1,59.375,1,295,1,0.019548,0.002045,0.060264,0.034209,0.021989,0.014985,...,0,64.0,0,2.0,0,0.028752,0,0,0.0,
2,129.902439,0,2147483647,0,0.04789,7.8e-05,0.269578,0.087072,0.088786,0.003968,...,0,225.0,0,4.368523,0,0.018532,0,0,0.0,
3,255.0,0,2147483647,0,140.821363,0.499576,1828.971601,140.821363,470.474201,0.500134,...,0,255.0,0,3.906891,0,0.500134,0,0,0.0,
4,255.0,0,2147483647,0,217.456576,0.498901,2208.373405,217.456576,662.483985,0.499896,...,0,255.0,0,3.459432,0,0.499896,0,0,0.0,


In [166]:
model1_data_filtered.iloc[:, -1].value_counts()

unknown    100
Name: 0, dtype: int64

In [111]:
joblib.dump(model1_data_filtered, 'unknown_classified_data.pkl', compress=9)

['unknown_classified_data.pkl']