In [105]:
import os
import csv

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import xgboost as xgb
from sklearn.externals import joblib
from sklearn.utils import compute_class_weight
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
from sklearn.metrics import classification_report, f1_score, roc_auc_score, accuracy_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, LabelBinarizer, StandardScaler, LabelEncoder
from sklearn.linear_model.logistic import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from keras import Sequential
from keras import optimizers
from keras.callbacks import ModelCheckpoint, EarlyStopping
from keras.layers import Dropout, Dense
from keras.models import load_model

Load cleaned data from pickle file

In [2]:
cleaned_df = joblib.load('pickles/cleaned_df.pkl')

Load top 50 features and extract top 50 features into a list

In [3]:
trimmed_feats_list = joblib.load('pickles/57var.pkl')
trimmed_feats_list.append('device_category')

Filter dataset to only use the 57 relevant features

In [4]:
filtered_df = cleaned_df.filter(items=trimmed_feats_list, axis='columns')

In [5]:
filtered_df.shape

(399348, 57)

Obtain list of unique classes and create list of names of feature engineered columns

In [6]:
distinct_classes = filtered_df.device_category.unique()
training_cols = ['sd', 'sem', 'var', 'skew', 'kurt', 'mad', 'shortest_dist_sd', 'sd_diff_smallest2next', 'known']

Build training dataset for known/unknown detection classifier

In [None]:
training_df = pd.DataFrame()
knn = KNeighborsClassifier(n_neighbors=9, weights='uniform', algorithm='auto', p=2, n_jobs=-1, leaf_size=30)

for out_class in distinct_classes:
    nine_classes_list = distinct_classes.tolist()
    nine_classes_list.remove(out_class)
    
    in_classes_samples = pd.DataFrame()
    nine_classes_df = filtered_df[filtered_df['device_category'] != out_class]
    out_class_df = filtered_df[filtered_df['device_category'] == out_class]
    
    for in_class in nine_classes_list:
        in_class_df = filtered_df[filtered_df['device_category'] == in_class]
        in_class_sample = in_class_df.sample(50)
        index_to_drop = in_class_sample.index.tolist()
        nine_classes_df = nine_classes_df.drop(index_to_drop)
        in_class_sample = in_class_sample.reset_index()
        in_classes_samples = pd.concat([in_classes_samples, in_class_sample], axis=0, ignore_index=True)
    
    train_x, _, train_y, _ = train_test_split(nine_classes_df.iloc[:, :-1], nine_classes_df.iloc[:, -1], train_size=900, 
                                              test_size=9, random_state=1, shuffle=True, stratify=nine_classes_df.iloc[:, -1])
    knn.fit(train_x, train_y)
    
    val_x, val_y = in_classes_samples.iloc[:, 1:-1], in_classes_samples.iloc[:, -1]
    val_output = knn.kneighbors(val_x, n_neighbors=9, return_distance=True)
    val_distance_df = pd.DataFrame([[distance for distance in row] for row in val_output[0]])
    val_sd = val_distance_df.std(axis=1, numeric_only=True)
    val_min = val_distance_df.min(axis=1, numeric_only=True)
    val_mean = val_distance_df.mean(axis=1, numeric_only=True)
    val_2nd_smallest_dist_df = pd.DataFrame(np.sort(val_distance_df.values))
    val_2nd_smallest_dist_df = val_2nd_smallest_dist_df.iloc[:, -2]
    
    val_distance_df['sd'] = val_sd
    val_distance_df['sem'] = val_distance_df.sem(axis=1, numeric_only=True)
    val_distance_df['var'] = val_distance_df.var(axis=1, numeric_only=True)
    val_distance_df['skew'] = val_distance_df.skew(axis=1, numeric_only=True)
    val_distance_df['kurt'] = val_distance_df.kurt(axis=1, numeric_only=True)
    val_distance_df['mad'] = val_distance_df.mad(axis=1)
    val_distance_df['shortest_dist_sd'] = (val_mean - val_min) / val_sd
    val_distance_df['sd_diff_smallest2next'] = (val_2nd_smallest_dist_df - val_min) / val_sd 
    val_distance_df['known'] = pd.Series([1 for _ in range(len(val_distance_df.index))]) 
    
    
    out_class_df = out_class_df.sample(n=450, random_state=1, axis=0, replace=True)
    test_x, test_y = out_class_df.iloc[:, :-1], out_class_df.iloc[:, -1]
    test_output = knn.kneighbors(test_x, n_neighbors=9, return_distance=True)
    
    test_distance_df = pd.DataFrame([[distance for distance in row] for row in test_output[0]])
    test_sd = test_distance_df.std(axis=1, numeric_only=True)
    test_min = test_distance_df.min(axis=1, numeric_only=True)
    test_mean = test_distance_df.mean(axis=1, numeric_only=True)
    test_2nd_smallest_dist_df = pd.DataFrame(np.sort(test_distance_df.values))
    test_2nd_smallest_dist_df = test_2nd_smallest_dist_df.iloc[:, -2]
    
    test_distance_df['sd'] = test_sd
    test_distance_df['sem'] = test_distance_df.sem(axis=1, numeric_only=True)
    test_distance_df['var'] = test_distance_df.var(axis=1, numeric_only=True)
    test_distance_df['skew'] = test_distance_df.skew(axis=1, numeric_only=True)
    test_distance_df['kurt'] = test_distance_df.kurt(axis=1, numeric_only=True)
    test_distance_df['mad'] = test_distance_df.mad(axis=1)
    test_distance_df['shortest_dist_sd'] = (test_mean - test_min) / test_sd
    test_distance_df['sd_diff_smallest2next'] = (test_2nd_smallest_dist_df - test_min) / test_sd 
    test_distance_df['known'] = pd.Series([0 for _ in range(len(test_distance_df.index))]) 
    
    
    combined_df = pd.concat([val_distance_df, test_distance_df], axis=0, ignore_index=True).filter(items=training_cols, axis='columns')
    training_df = pd.concat([training_df, combined_df], axis=0, ignore_index=True)

Check distribution of labels

In [None]:
training_df['known'].value_counts()

Clean up the dataset

In [None]:
training_df = training_df.replace([np.inf, -np.inf], np.nan)
training_df = training_df.replace(np.nan, 0)

Save the labelled dataset obtained from KNN models

In [None]:
joblib.dump(training_df, 'pickles/known_classifier_dataset_v2.pkl', compress=9)

Load the labelled dataset obtained from KNN models

In [7]:
training_df = joblib.load('pickles/known_classifier_dataset_v2.pkl')

### Prepare feature engineered dataset, splitting to train-test using 80-20 ratio
Scale using standard scaling so that models are able to better learn from dataset

In [8]:
x, y = training_df.drop('known', axis=1), training_df['known']
class_weights = compute_class_weight('balanced', np.unique(y), y)
scaler = StandardScaler()
x = scaler.fit_transform(x)
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=1)

### Logistic Regression Model for KnownUnknown Class Detection
#### Using Stochastic Average Gradient for negate the large dataset impact on training time

In [9]:
log_reg = LogisticRegression(penalty='l2', dual=False, class_weight='balanced', random_state=5, max_iter=2000, 
                             multi_class='ovr', verbose=1, n_jobs=-1, solver='sag')
log_reg.fit(x_train, y_train)

convergence after 123 epochs took 0 seconds


[Parallel(n_jobs=-1)]: Done   1 out of   1 | elapsed:    1.7s finished


LogisticRegression(C=1.0, class_weight='balanced', dual=False,
          fit_intercept=True, intercept_scaling=1, max_iter=2000,
          multi_class='ovr', n_jobs=-1, penalty='l2', random_state=5,
          solver='sag', tol=0.0001, verbose=2, warm_start=False)

Logistic Regression Model Predictions

In [10]:
log_reg_preds = log_reg.predict(x_test)

Classification Report to validate model performance

In [11]:
print(classification_report(y_test, log_reg_preds))

             precision    recall  f1-score   support

          0       0.56      0.67      0.61       894
          1       0.60      0.49      0.54       906

avg / total       0.58      0.58      0.57      1800



### Random Forest Model for KnownUnknown Class Detection
#### Using 1000 Decision Trees, Bagging and balancing class weights

In [12]:
rf = RandomForestClassifier(1000, criterion='gini', oob_score=True, n_jobs=-1,
                            random_state=5, class_weight='balanced', verbose=1)
rf.fit(x_train, y_train)

[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:    0.1s
[Parallel(n_jobs=-1)]: Done 192 tasks      | elapsed:    1.5s
[Parallel(n_jobs=-1)]: Done 442 tasks      | elapsed:    2.9s
[Parallel(n_jobs=-1)]: Done 792 tasks      | elapsed:    4.7s
[Parallel(n_jobs=-1)]: Done 1000 out of 1000 | elapsed:    6.0s finished


RandomForestClassifier(bootstrap=True, class_weight='balanced',
            criterion='gini', max_depth=None, max_features='auto',
            max_leaf_nodes=None, min_impurity_decrease=0.0,
            min_impurity_split=None, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=1000, n_jobs=-1, oob_score=True, random_state=5,
            verbose=1, warm_start=False)

Random Forest Model Predictions

In [13]:
rf_preds = rf.predict(x_test)

[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    0.0s
[Parallel(n_jobs=4)]: Done 192 tasks      | elapsed:    0.6s
[Parallel(n_jobs=4)]: Done 442 tasks      | elapsed:    0.9s
[Parallel(n_jobs=4)]: Done 792 tasks      | elapsed:    1.1s
[Parallel(n_jobs=4)]: Done 1000 out of 1000 | elapsed:    1.4s finished


In [14]:
print(classification_report(y_test, rf_preds))

             precision    recall  f1-score   support

          0       0.89      0.74      0.81       894
          1       0.78      0.91      0.84       906

avg / total       0.83      0.82      0.82      1800



### XGBoost Model for KnownUnknown Class Detection

In [19]:
dtrain = xgb.DMatrix(x_train, label=np.array(y_train))
dtest = xgb.DMatrix(x_test, label=np.array(y_test))

In [116]:
param = {
    'max_depth': 10,
    'eta': 0.1,
    'silent': 0,
    'objective': 'binary:logistic',
    'eval_metric': 'error'
}
num_round = 1000

In [67]:
cv = xgb.cv(param, dtrain, num_round, folds=5, stratified=True, metrics='error', verbose_eval=1, seed=42, shuffle=True)
cv

[0]	train-error:0.17625+0.00233854	test-error:0.238056+0.00540402
[1]	train-error:0.174583+0.00172609	test-error:0.237639+0.00510673
[2]	train-error:0.175139+0.00215375	test-error:0.2375+0.00450057
[3]	train-error:0.174791+0.00230112	test-error:0.238195+0.00501556
[4]	train-error:0.174791+0.00206239	test-error:0.238472+0.00538285
[5]	train-error:0.175+0.0037071	test-error:0.236945+0.00665207
[6]	train-error:0.174653+0.00398666	test-error:0.236945+0.00665207
[7]	train-error:0.173194+0.00202971	test-error:0.235694+0.00594482
[8]	train-error:0.171389+0.00475163	test-error:0.23+0.0116268
[9]	train-error:0.165486+0.00413993	test-error:0.225833+0.0121385
[10]	train-error:0.165139+0.00478827	test-error:0.225139+0.0124737
[11]	train-error:0.164792+0.00417041	test-error:0.22375+0.0124164
[12]	train-error:0.165694+0.00383521	test-error:0.225694+0.0126669
[13]	train-error:0.16375+0.00478997	test-error:0.223472+0.0120251
[14]	train-error:0.162778+0.00587359	test-error:0.223195+0.0122538
[15]	train

In [117]:
bst = xgb.train(param, dtrain, num_round)

In [122]:
xgb_prob_preds = bst.predict(dtest)
xgb_preds = np.asarray([1 if line > 0.5 else 0 for line in xgb_prob_preds]).astype('int64')

In [64]:
print(classification_report(y_test, rf_preds))

             precision    recall  f1-score   support

          0       0.89      0.74      0.81       894
          1       0.78      0.91      0.84       906

avg / total       0.83      0.82      0.82      1800



### Neural Network Model for KnownUnknown Class Detection (3 Hidden Layers)
#### Using relu as activation functions for hidden layers due to its robustness and the absence of vanishing gradient problem when using it. Sigmoid is used as the activation function for the output as it is a binary classification problem.

#### Loss function of binary crossentropy is used as it is a binary problem.

In [None]:
model_name = 'known_classifier_nn.h5'

nn_model = Sequential()

nn_model.add(Dense(500, activation='relu', input_shape=tuple(x.shape[1:])))
nn_model.add(Dropout(0.3))
nn_model.add(Dense(1000, activation='relu'))
nn_model.add(Dropout(0.3))
nn_model.add(Dense(500, activation='relu'))
nn_model.add(Dropout(0.3))
# nn_model.add(Dense(400, activation='relu'))
# nn_model.add(Dropout(0.2))
# nn_model.add(Dense(400, activation='relu'))
# nn_model.add(Dropout(0.2))
nn_model.add(Dense(1, activation='sigmoid'))

op = optimizers.Adam(lr=0.001)

nn_model.compile(optimizer=op, metrics=['accuracy'], loss='binary_crossentropy')

save_checkpoint = ModelCheckpoint(model_name, save_best_only=True, verbose=1)
early_stop = EarlyStopping(min_delta=0.01, patience=500, verbose=1, mode='min')

nn_model.fit(x_train, y_train, epochs=50000, batch_size=16, verbose=2, 
             class_weight=class_weights, shuffle=True, validation_split=0.2, callbacks=[save_checkpoint, early_stop])

Neural Network Model Predictions

In [85]:
nn_preds = nn_model.predict(x_test)
nn_preds = np.array([1 if x > 0.5 else 0 for x in nn_preds]).astype('int64')

In [77]:
print(classification_report(y_test, nn_preds))

             precision    recall  f1-score   support

          0       0.76      0.70      0.73       894
          1       0.72      0.78      0.75       906

avg / total       0.74      0.74      0.74      1800



### Accuracy scores for all 4 models

In [124]:
y_test = np.array(y_test)

print('Logistic Regression Model F1 Score and AUC: %f, %f, %f' % (accuracy_score(y_test, log_reg_preds), f1_score(y_test, log_reg_preds), roc_auc_score(y_test, log_reg_preds)))
print('Random Forest Model F1 Score and AUC: %f, %f, %f' % (accuracy_score(y_test, rf_preds), f1_score(y_test, rf_preds), roc_auc_score(y_test, rf_preds)))
print('XGBoost Model F1 Score and AUC: %f, %f, %f' % (accuracy_score(y_test, xgb_preds), f1_score(y_test, xgb_preds), roc_auc_score(y_test, xgb_preds)))
print('Neural Network Model F1 Score and AUC: %f, %f, %f' % (accuracy_score(y_test, nn_preds), f1_score(y_test, nn_preds), roc_auc_score(y_test, nn_preds)))

Logistic Regression Model F1 Score and AUC: 0.576667, 0.535932, 0.577278
Random Forest Model F1 Score and AUC: 0.822778, 0.837494, 0.822211
XGBoost Model F1 Score and AUC: 0.814444, 0.823467, 0.814140
Neural Network Model F1 Score and AUC: 0.739444, 0.750665, 0.739177


### Predicting on test set using best model

Loading submission dataset and filter to 57 features

In [None]:
submission_data = pd.read_csv('data/hackathon_IoT_validation_set_based_on_01mar2017_ANONYMIZED.csv')
submission_data_filtered = submission_data.filter(items=trimmed_feats_list, axis='columns')

Training KNN Model first to obtain distances

In [None]:
knn_x, knn_y = filtered_df.iloc[:, :-1].sample(10000, filtered_df.iloc[:, -1]
real_knn = KNeighborsClassifier(n_neighbors=10, weights='uniform', algorithm='auto', p=2, n_jobs=-1, leaf_size=20)
real_knn.fit(knn_x, knn_y)
sub_output = real_knn.kneighbors(submission_data_filtered, n_neighbors=10, return_distance=True)

Obtaining engineered features from submission dataset

In [None]:
sub_distance_df = pd.DataFrame([[distance for distance in row] for row in sub_output[0]])
sub_sd = sub_distance_df.std(axis=1, numeric_only=True)
sub_min = sub_distance_df.min(axis=1, numeric_only=True)
sub_mean = sub_distance_df.mean(axis=1, numeric_only=True)
sub_2nd_smallest_dist_df = pd.DataFrame(np.sort(sub_distance_df.values))
sub_2nd_smallest_dist_df = sub_2nd_smallest_dist_df.iloc[:, -2]

sub_distance_df['sd'] = sub_sd
sub_distance_df['sem'] = sub_distance_df.sem(axis=1, numeric_only=True)
sub_distance_df['var'] = sub_distance_df.var(axis=1, numeric_only=True)
sub_distance_df['skew'] = sub_distance_df.skew(axis=1, numeric_only=True)
sub_distance_df['kurt'] = sub_distance_df.kurt(axis=1, numeric_only=True)
sub_distance_df['mad'] = sub_distance_df.mad(axis=1)
sub_distance_df['shortest_dist_sd'] = (sub_mean - sub_min) / sub_sd
sub_distance_df['sd_diff_smallest2next'] = (sub_2nd_smallest_dist_df - sub_min) / sub_sd 
sub_distance_df['known'] = pd.Series([0 for _ in range(len(sub_distance_df.index))]) 

In [None]:
known_sub_x = sub_distance_df.filter(items=training_cols, axis='columns')
known_preds =  rf_model.predict(known_sub_x)

In [None]:
known_preds_list = known_preds.tolist()
formatted_preds = pd.Series(['unknown' if row == 0 else None for row in known_preds_list])

In [None]:
model1_data_filtered = pd.concat([submission_data_filtered, formatted_preds], axis=1, ignore_index=True)

In [None]:
jobllib.dump(model1_data_filtered, 'unknown_classified_data.pkl', compress=9)