In [2]:
import os
import csv

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import xgboost as xgb
from sklearn.externals import joblib
from sklearn.utils import compute_class_weight
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
from sklearn.metrics import classification_report, f1_score, roc_auc_score, accuracy_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, LabelBinarizer, StandardScaler, LabelEncoder
from sklearn.linear_model.logistic import LogisticRegression, LogisticRegressionCV
from sklearn.ensemble import RandomForestClassifier
from keras import Sequential
from keras import optimizers
from keras.callbacks import ModelCheckpoint, EarlyStopping
from keras.layers import Dropout, Dense
from keras.models import load_model

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


Load cleaned data from pickle file

In [3]:
cleaned_df = joblib.load('pickles/cleaned_df.pkl')

Load top 50 features and extract top 50 features into a list

In [4]:
trimmed_feats_list = joblib.load('pickles/57var.pkl')
trimmed_feats_list.append('device_category')

Filter dataset to only use the 57 relevant features

In [5]:
filtered_df = cleaned_df.filter(items=trimmed_feats_list, axis='columns')
filtered_df.shape

(399348, 57)

Obtain list of unique classes and create list of names of feature engineered columns

In [6]:
distinct_classes = filtered_df.device_category.unique()
training_cols = ['sd', 'sem', 'var', 'skew', 'kurt', 'mad', 'shortest_dist_sd', 'shortest_dist_diff', 'sd_diff_smallest2next', 'known']

Feature Engineering Function

In [7]:
def dist_engineered_df(knn_distance_output, training=False, label=0):
    distance_df = pd.DataFrame([[distance for distance in row] for row in knn_distance_output[0]])
    output_df = pd.DataFrame()
    
    sd = distance_df.std(axis=1, numeric_only=True)
    min = distance_df.min(axis=1, numeric_only=True)
    mean = distance_df.mean(axis=1, numeric_only=True)
    two_smallest_dist_df = pd.DataFrame(np.sort(distance_df.values))
    second_smallest_dist_df = two_smallest_dist_df.iloc[:, -2]
    smallest_dist_df = two_smallest_dist_df.iloc[:, -1]
    
    output_df['sd'] = sd
    output_df['sem'] = distance_df.sem(axis=1, numeric_only=True)
    output_df['var'] = distance_df.var(axis=1, numeric_only=True)
    output_df['skew'] = distance_df.skew(axis=1, numeric_only=True)
    output_df['kurt'] = distance_df.kurt(axis=1, numeric_only=True)
    output_df['mad'] = distance_df.mad(axis=1)
    output_df['shortest_dist_sd'] = smallest_dist_df / sd
    output_df['shortest_dist_diff'] = (mean - min) / sd
    output_df['sd_diff_smallest2next'] = (second_smallest_dist_df - min) / sd
    
    if training:
        output_df['known'] = pd.Series([label for _ in range(len(output_df.index))])
    
    return output_df

Build training dataset for known/unknown detection classifier

In [79]:
training_df = pd.DataFrame()
knn = KNeighborsClassifier(n_neighbors=9, weights='uniform', algorithm='auto', p=2, n_jobs=-1, leaf_size=30)

for out_class in distinct_classes:
    nine_classes_list = distinct_classes.tolist()
    nine_classes_list.remove(out_class)
    
    in_classes_samples = pd.DataFrame()
    nine_classes_df = filtered_df[filtered_df['device_category'] != out_class]
    out_class_df = filtered_df[filtered_df['device_category'] == out_class]
    
    for in_class in nine_classes_list:
        in_class_df = filtered_df[filtered_df['device_category'] == in_class]
        in_class_sample = in_class_df.sample(250, random_state=1, axis=0, replace=True)
        index_to_drop = in_class_sample.index.tolist()
        nine_classes_df = nine_classes_df.drop(index_to_drop)
        in_class_sample = in_class_sample.reset_index()
        in_classes_samples = pd.concat([in_classes_samples, in_class_sample], axis=0, ignore_index=True)
    
    train_x, _, train_y, _ = train_test_split(nine_classes_df.iloc[:, :-1], nine_classes_df.iloc[:, -1], train_size=90000, 
                                              test_size=9, random_state=1, shuffle=True, stratify=nine_classes_df.iloc[:, -1])
    knn.fit(train_x, train_y)
    
    val_x, val_y = in_classes_samples.iloc[:, 1:-1], in_classes_samples.iloc[:, -1]
    val_output = knn.kneighbors(val_x, n_neighbors=9, return_distance=True)
    val_distance_df = dist_engineered_df(val_output, training=True, label=1)
        
    out_class_df = out_class_df.sample(n=250, random_state=1, axis=0, replace=True)
    test_x, test_y = out_class_df.iloc[:, :-1], out_class_df.iloc[:, -1]
    test_output = knn.kneighbors(test_x, n_neighbors=9, return_distance=True)
    test_distance_df = dist_engineered_df(test_output, training=True, label=0)
    
    combined_df = pd.concat([val_distance_df, test_distance_df], axis=0, ignore_index=True).filter(items=training_cols, axis='columns')
    training_df = pd.concat([training_df, combined_df], axis=0, ignore_index=True)

Check counts of labels

In [80]:
training_df['known'].value_counts()

1    22500
0     2500
Name: known, dtype: int64

Clean up the dataset

In [81]:
training_df = training_df.replace([np.inf, -np.inf], np.nan)
training_df = training_df.replace(np.nan, 0)

Save the labelled dataset obtained from KNN models

In [82]:
joblib.dump(training_df, 'pickles/known_classifier_dataset_v5.pkl', compress=9)

['pickles/known_classifier_dataset_v5.pkl']

Load the labelled dataset obtained from KNN models

In [83]:
training_df = joblib.load('pickles/known_classifier_dataset_v5.pkl')
training_df.head()

Unnamed: 0,sd,sem,var,skew,kurt,mad,shortest_dist_sd,shortest_dist_diff,sd_diff_smallest2next,known
0,0.000698,0.000233,4.865071e-07,-0.12733,-0.868442,0.000558,6.112792,1.655023,2.788364,1
1,0.000468,0.000156,2.190185e-07,-0.634079,0.131796,0.000353,5.778269,1.891482,2.888499,1
2,0.000264,8.8e-05,6.987286e-08,0.516662,-1.500261,0.000223,7.378833,1.111323,2.235341,1
3,0.000296,9.9e-05,8.745665e-08,-0.651498,-0.340732,0.000235,5.670681,1.841909,2.874019,1
4,5.617791,1.872597,31.55958,0.075837,-1.263892,4.633068,5.432344,1.436004,2.600108,1


### Prepare feature engineered dataset, splitting to train-test using 80-20 ratio
Scale using standard scaling so that models are able to better learn from dataset

In [109]:
x, y = training_df.drop('known', axis=1), training_df['known']
class_weights = compute_class_weight('balanced', np.unique(y), y)
scaler = StandardScaler()
x = scaler.fit_transform(x)
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=1, stratify=y)

### Logistic Regression Model for KnownUnknown Class Detection
#### Using Stochastic Average Gradient for negate the large dataset impact on training time

In [110]:
log_reg = LogisticRegressionCV(cv=10, dual=False, penalty='l2', scoring='accuracy', solver='sag', max_iter=1500, n_jobs=-1, 
                               verbose=2, multi_class='ovr', random_state=5)
log_reg.fit(x_train, y_train)

convergence after 37 epochs took 3 seconds
convergence after 37 epochs took 3 seconds
convergence after 37 epochs took 3 seconds
convergence after 38 epochs took 3 seconds
convergence after 60 epochs took 4 seconds
convergence after 60 epochs took 4 seconds
convergence after 61 epochs took 4 seconds
convergence after 61 epochs took 4 seconds
convergence after 277 epochs took 19 seconds
convergence after 279 epochs took 20 seconds
convergence after 281 epochs took 20 seconds
convergence after 281 epochs took 20 seconds
convergence after 840 epochs took 58 seconds
convergence after 843 epochs took 59 seconds
convergence after 855 epochs took 59 seconds
convergence after 895 epochs took 62 seconds
max_iter reached after 110 seconds




max_iter reached after 110 seconds
max_iter reached after 111 seconds
max_iter reached after 110 seconds
max_iter reached after 110 seconds
max_iter reached after 111 seconds
max_iter reached after 112 seconds
max_iter reached after 111 seconds
max_iter reached after 111 seconds
max_iter reached after 111 seconds
max_iter reached after 111 seconds
max_iter reached after 113 seconds
max_iter reached after 112 seconds
max_iter reached after 113 seconds
max_iter reached after 112 seconds
max_iter reached after 114 seconds
max_iter reached after 112 seconds
max_iter reached after 112 seconds
max_iter reached after 112 seconds
max_iter reached after 113 seconds
convergence after 212 epochs took 16 seconds
convergence after 38 epochs took 3 seconds
convergence after 60 epochs took 4 seconds
convergence after 289 epochs took 22 seconds
convergence after 218 epochs took 16 seconds
convergence after 35 epochs took 3 seconds
convergence after 38 epochs took 3 seconds
convergence after 60 epochs 

[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed: 25.8min finished


convergence after 40 epochs took 1 seconds


LogisticRegressionCV(Cs=10, class_weight=None, cv=10, dual=False,
           fit_intercept=True, intercept_scaling=1.0, max_iter=1500,
           multi_class='ovr', n_jobs=-1, penalty='l2', random_state=5,
           refit=True, scoring='accuracy', solver='sag', tol=0.0001,
           verbose=2)

Logistic Regression Model Predictions

In [111]:
log_reg_preds = log_reg.predict(x_test)
print(classification_report(y_test, log_reg_preds))

             precision    recall  f1-score   support

          0       1.00      0.00      0.00       500
          1       0.90      1.00      0.95      4500

avg / total       0.91      0.90      0.85      5000



### Random Forest Model for KnownUnknown Class Detection
#### Using 1000 Decision Trees, Bagging and balancing class weights

In [112]:
rf = RandomForestClassifier(1000, criterion='gini', oob_score=True, n_jobs=-1,
                            random_state=5, class_weight='balanced', verbose=1)
rf.fit(x_train, y_train)

[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:    1.5s
[Parallel(n_jobs=-1)]: Done 192 tasks      | elapsed:    6.3s
[Parallel(n_jobs=-1)]: Done 442 tasks      | elapsed:   15.0s
[Parallel(n_jobs=-1)]: Done 792 tasks      | elapsed:   26.0s
[Parallel(n_jobs=-1)]: Done 1000 out of 1000 | elapsed:   32.4s finished


RandomForestClassifier(bootstrap=True, class_weight='balanced',
            criterion='gini', max_depth=None, max_features='auto',
            max_leaf_nodes=None, min_impurity_decrease=0.0,
            min_impurity_split=None, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=1000, n_jobs=-1, oob_score=True, random_state=5,
            verbose=1, warm_start=False)

Random Forest Model Predictions

In [113]:
rf_preds = rf.predict(x_test)
print(classification_report(y_test, rf_preds))

[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    0.0s
[Parallel(n_jobs=4)]: Done 192 tasks      | elapsed:    0.1s
[Parallel(n_jobs=4)]: Done 442 tasks      | elapsed:    0.4s
[Parallel(n_jobs=4)]: Done 792 tasks      | elapsed:    0.8s
[Parallel(n_jobs=4)]: Done 1000 out of 1000 | elapsed:    1.0s finished


             precision    recall  f1-score   support

          0       0.65      0.36      0.47       500
          1       0.93      0.98      0.96      4500

avg / total       0.90      0.92      0.91      5000



### XGBoost Model for KnownUnknown Class Detection

In [114]:
dtrain = xgb.DMatrix(x_train, label=np.array(y_train))
dtest = xgb.DMatrix(x_test, label=np.array(y_test))

In [115]:
param = {
    'max_depth': 10,
    'eta': 0.1,
    'silent': 0,
    'objective': 'binary:logistic',
    'eval_metric': 'error'
}
num_round = 1000
early_stop = 100

In [116]:
bst = xgb.train(param, dtrain, num_round, evals=[(dtest,'eval')], verbose_eval=100, 
                early_stopping_rounds=early_stop)

[0]	eval-error:0.0686
Will train until eval-error hasn't improved in 100 rounds.
[100]	eval-error:0.0632
Stopping. Best iteration:
[17]	eval-error:0.0624



In [117]:
xgb_prob_preds = bst.predict(dtest)
xgb_preds = np.asarray([1 if line > 0.5 else 0 for line in xgb_prob_preds]).astype('int64')
print(classification_report(y_test, xgb_preds))

             precision    recall  f1-score   support

          0       0.89      0.42      0.57       500
          1       0.94      0.99      0.97      4500

avg / total       0.93      0.94      0.93      5000



### Neural Network Model for KnownUnknown Class Detection (3 Hidden Layers)
#### Using relu as activation functions for hidden layers due to its robustness and the absence of vanishing gradient problem when using it. Sigmoid is used as the activation function for the output as it is a binary classification problem.

#### Loss function of binary crossentropy is used as it is a binary classification problem.

In [118]:
model_name = 'known_classifier_nn.h5'

nn_model = Sequential()

nn_model.add(Dense(256, activation='relu', input_shape=tuple(x.shape[1:])))
nn_model.add(Dropout(0.5))
nn_model.add(Dense(512, activation='relu'))
nn_model.add(Dropout(0.5))
nn_model.add(Dense(256, activation='relu'))
nn_model.add(Dropout(0.5))
nn_model.add(Dense(1, activation='sigmoid'))

op = optimizers.Adam(lr=0.001)

nn_model.compile(optimizer=op, metrics=['accuracy'], loss='binary_crossentropy')

save_checkpoint = ModelCheckpoint(model_name, save_best_only=True, verbose=1)
early_stop = EarlyStopping(min_delta=0.005, patience=100, verbose=1, mode='min')

nn_model.fit(x_train, y_train, epochs=10000, batch_size=128, verbose=2, 
             class_weight=class_weights, shuffle=True, validation_split=0.2, callbacks=[save_checkpoint, early_stop])

Train on 16000 samples, validate on 4000 samples
Epoch 1/10000
 - 4s - loss: 0.3465 - acc: 0.8973 - val_loss: 0.3311 - val_acc: 0.8958

Epoch 00001: val_loss improved from inf to 0.33109, saving model to known_classifier_nn.h5
Epoch 2/10000
 - 3s - loss: 0.3240 - acc: 0.9011 - val_loss: 0.3191 - val_acc: 0.8958

Epoch 00002: val_loss improved from 0.33109 to 0.31908, saving model to known_classifier_nn.h5
Epoch 3/10000
 - 4s - loss: 0.3206 - acc: 0.9011 - val_loss: 0.3174 - val_acc: 0.8958

Epoch 00003: val_loss improved from 0.31908 to 0.31736, saving model to known_classifier_nn.h5
Epoch 4/10000
 - 3s - loss: 0.3214 - acc: 0.9012 - val_loss: 0.3170 - val_acc: 0.8958

Epoch 00004: val_loss improved from 0.31736 to 0.31704, saving model to known_classifier_nn.h5
Epoch 5/10000
 - 3s - loss: 0.3183 - acc: 0.9009 - val_loss: 0.3192 - val_acc: 0.8958

Epoch 00005: val_loss did not improve from 0.31704
Epoch 6/10000
 - 3s - loss: 0.3153 - acc: 0.9011 - val_loss: 0.3147 - val_acc: 0.8970

Ep

<keras.callbacks.History at 0x2380df20be0>

Neural Network Model Predictions

In [119]:
nn_preds = nn_model.predict(x_test)
nn_preds = np.array([1 if x > 0.5 else 0 for x in nn_preds]).astype('int64')
print(classification_report(y_test, nn_preds))

             precision    recall  f1-score   support

          0       0.79      0.20      0.32       500
          1       0.92      0.99      0.95      4500

avg / total       0.91      0.92      0.89      5000



### Accuracy scores for all 4 models

In [120]:
y_test = np.array(y_test)

print('Logistic Regression Model F1 Score and AUC: %f, %f, %f' % (accuracy_score(y_test, log_reg_preds), f1_score(y_test, log_reg_preds), roc_auc_score(y_test, log_reg_preds)))
print('Random Forest Model F1 Score and AUC: %f, %f, %f' % (accuracy_score(y_test, rf_preds), f1_score(y_test, rf_preds), roc_auc_score(y_test, rf_preds)))
print('XGBoost Model F1 Score and AUC: %f, %f, %f' % (accuracy_score(y_test, xgb_preds), f1_score(y_test, xgb_preds), roc_auc_score(y_test, xgb_preds)))
print('Neural Network Model F1 Score and AUC: %f, %f, %f' % (accuracy_score(y_test, nn_preds), f1_score(y_test, nn_preds), roc_auc_score(y_test, nn_preds)))

Logistic Regression Model F1 Score and AUC: 0.900200, 0.947468, 0.501000
Random Forest Model F1 Score and AUC: 0.917000, 0.955004, 0.670333
XGBoost Model F1 Score and AUC: 0.936800, 0.965889, 0.707111
Neural Network Model F1 Score and AUC: 0.915000, 0.954647, 0.599000


### Predicting on test set using best model

Loading submission dataset and filter to 57 features

In [121]:
submission_data = pd.read_csv('data/hackathon_IoT_validation_set_based_on_01mar2017_ANONYMIZED.csv')
submission_data_filtered = submission_data.filter(items=trimmed_feats_list, axis='columns')

Training KNN Model first to obtain distances

In [22]:
knn_x, knn_y = filtered_df.iloc[:, :-1], filtered_df.iloc[:, -1]
real_knn = KNeighborsClassifier(n_neighbors=10, weights='uniform', algorithm='auto', p=2, n_jobs=-1, leaf_size=20)
real_knn.fit(knn_x, knn_y)
sub_output = real_knn.kneighbors(submission_data_filtered, n_neighbors=10, return_distance=True)

Obtaining engineered features from submission dataset

In [122]:
sub_distance_df = dist_engineered_df(sub_output)
sub_distance_df = sub_distance_df.replace([np.inf, -np.inf, np.nan], 0)

Using XGBoost model as it has the highest AUC value amongst the 4 models for novelty prediction

In [123]:
known_sub_x = sub_distance_df.filter(items=training_cols, axis='columns')

# additional preprocessing for xgboost
sub_x = known_sub_x
sub_x.columns = bst.feature_names
sub_x = xgb.DMatrix(sub_x, label=np.zeros((sub_x.shape[0], 1)))

# log_reg_known_preds = log_reg.predict(known_sub_x)
# rf_known_preds = rf.predict(known_sub_x)
xgb_known_preds = bst.predict(sub_x)
# nn_known_preds = nn_model.predict(known_sub_x)

[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    0.0s
[Parallel(n_jobs=4)]: Done 192 tasks      | elapsed:    0.0s
[Parallel(n_jobs=4)]: Done 442 tasks      | elapsed:    0.0s
[Parallel(n_jobs=4)]: Done 792 tasks      | elapsed:    0.1s
[Parallel(n_jobs=4)]: Done 1000 out of 1000 | elapsed:    0.1s finished


In [124]:
# log_reg_known_preds_list = log_reg_known_preds.tolist()
# rf_known_preds_list = rf_known_preds.tolist()
xgb_known_preds_list = xgb_known_preds.tolist()
# nn_known_preds_list = nn_known_preds.tolist()

# log reg model output conversion
# No additional steps required

# rf model output conversion
# No additional steps required

# xgb model output conversion
xgb_known_preds_list = [1 if x > 0.5 else 0 for x in xgb_known_preds_list]

# nn model output conversion
# nn_known_preds_list = [1 if x[0] > 0.5 else 0 for x in nn_known_preds_list]

# log_reg_formatted_preds = pd.Series(['unknown' if row == 0 else None for row in log_reg_known_preds_list])
# rf_formatted_preds = pd.Series(['unknown' if row == 0 else None for row in rf_known_preds_list])
xgb_formatted_preds = pd.Series(['unknown' if row == 0 else None for row in xgb_known_preds_list])
# nn_formatted_preds = pd.Series(['unknown' if row == 0 else None for row in nn_known_preds_list])

In [125]:
# log_reg_data_filtered = pd.concat([submission_data_filtered, log_reg_formatted_preds], axis=1)
# rf_data_filtered = pd.concat([submission_data_filtered, rf_formatted_preds], axis=1)
xgb_data_filtered = pd.concat([submission_data_filtered, xgb_formatted_preds], axis=1)
# nn_data_filtered = pd.concat([submission_data_filtered, nn_formatted_preds], axis=1)

In [128]:
xgb_data_filtered.iloc[:, -1].value_counts()

unknown    141
Name: 0, dtype: int64

In [130]:
joblib.dump(xgb_data_filtered, 'unknown_classified_data.pkl', compress=9)

['unknown_classified_data.pkl']