In [52]:
import os
import csv

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import xgboost as xgb
from sklearn.externals import joblib
from sklearn.utils import compute_class_weight
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
from sklearn.metrics import classification_report, f1_score, roc_auc_score, accuracy_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, LabelBinarizer, StandardScaler, LabelEncoder
from sklearn.linear_model.logistic import LogisticRegression, LogisticRegressionCV
from sklearn.ensemble import RandomForestClassifier
from keras import Sequential
from keras import optimizers
from keras.callbacks import ModelCheckpoint, EarlyStopping
from keras.layers import Dropout, Dense
from keras.models import load_model

Load cleaned data from pickle file

In [2]:
cleaned_df = joblib.load('pickles/cleaned_df.pkl')

Load top 50 features and extract top 50 features into a list

In [3]:
trimmed_feats_list = joblib.load('pickles/57var.pkl')
trimmed_feats_list.append('device_category')

Filter dataset to only use the 57 relevant features

In [4]:
filtered_df = cleaned_df.filter(items=trimmed_feats_list, axis='columns')
filtered_df.shape

(399348, 57)

Obtain list of unique classes and create list of names of feature engineered columns

In [5]:
distinct_classes = filtered_df.device_category.unique()
training_cols = ['sd', 'sem', 'var', 'skew', 'kurt', 'mad', 'shortest_dist_sd', 'shortest_dist_diff', 'sd_diff_smallest2next', 'known']

Feature Engineering Function

In [13]:
def dist_engineered_df(knn_distance_output, training=False, label=0):
    distance_df = pd.DataFrame([[distance for distance in row] for row in knn_distance_output[0]])
    
    sd = distance_df.std(axis=1, numeric_only=True)
    min = distance_df.min(axis=1, numeric_only=True)
    mean = distance_df.mean(axis=1, numeric_only=True)
    two_smallest_dist_df = pd.DataFrame(np.sort(distance_df.values))
    second_smallest_dist_df = two_smallest_dist_df.iloc[:, -2]
    smallest_dist_df = two_smallest_dist_df.iloc[:, -1]
    
    distance_df['sd'] = sd
    distance_df['sem'] = distance_df.sem(axis=1, numeric_only=True)
    distance_df['var'] = distance_df.var(axis=1, numeric_only=True)
    distance_df['skew'] = distance_df.skew(axis=1, numeric_only=True)
    distance_df['kurt'] = distance_df.kurt(axis=1, numeric_only=True)
    distance_df['mad'] = distance_df.mad(axis=1)
    distance_df['shortest_dist_sd'] = smallest_dist_df / sd
    distance_df['shortest_dist_diff'] = (mean - min) / sd
    distance_df['sd_diff_smallest2next'] = (second_smallest_dist_df - min) / sd
    
    if training:
        distance_df['known'] = pd.Series([label for _ in range(len(distance_df.index))]) 
    
    return distance_df

Build training dataset for known/unknown detection classifier

In [14]:
training_df = pd.DataFrame()
knn = KNeighborsClassifier(n_neighbors=9, weights='uniform', algorithm='auto', p=2, n_jobs=-1, leaf_size=30)

for out_class in distinct_classes:
    nine_classes_list = distinct_classes.tolist()
    nine_classes_list.remove(out_class)
    
    in_classes_samples = pd.DataFrame()
    nine_classes_df = filtered_df[filtered_df['device_category'] != out_class]
    out_class_df = filtered_df[filtered_df['device_category'] == out_class]
    
    for in_class in nine_classes_list:
        in_class_df = filtered_df[filtered_df['device_category'] == in_class]
        in_class_sample = in_class_df.sample(1000, random_state=1, axis=0, replace=True)
        index_to_drop = in_class_sample.index.tolist()
        nine_classes_df = nine_classes_df.drop(index_to_drop)
        in_class_sample = in_class_sample.reset_index()
        in_classes_samples = pd.concat([in_classes_samples, in_class_sample], axis=0, ignore_index=True)
    
    train_x, _, train_y, _ = train_test_split(nine_classes_df.iloc[:, :-1], nine_classes_df.iloc[:, -1], train_size=1800, 
                                              test_size=9, random_state=1, shuffle=True, stratify=nine_classes_df.iloc[:, -1])
    knn.fit(train_x, train_y)
    
    val_x, val_y = in_classes_samples.iloc[:, 1:-1], in_classes_samples.iloc[:, -1]
    val_output = knn.kneighbors(val_x, n_neighbors=9, return_distance=True)
    val_distance_df = dist_engineered_df(val_output, training=True, label=1)
    
#   val_distance_df = pd.DataFrame([[distance for distance in row] for row in val_output[0]])

#     val_sd = val_distance_df.std(axis=1, numeric_only=True)
#     val_min = val_distance_df.min(axis=1, numeric_only=True)
#     val_mean = val_distance_df.mean(axis=1, numeric_only=True)
#     val_2_smallest_dist_df = pd.DataFrame(np.sort(val_distance_df.values))
#     val_2nd_smallest_dist_df = val_2_smallest_dist_df.iloc[:, -2]
#     val_smallest_dist_df = val_2_smallest_dist_df.iloc[:, -1]
    
#     val_distance_df['sd'] = val_sd
#     val_distance_df['sem'] = val_distance_df.sem(axis=1, numeric_only=True)
#     val_distance_df['var'] = val_distance_df.var(axis=1, numeric_only=True)
#     val_distance_df['skew'] = val_distance_df.skew(axis=1, numeric_only=True)
#     val_distance_df['kurt'] = val_distance_df.kurt(axis=1, numeric_only=True)
#     val_distance_df['mad'] = val_distance_df.mad(axis=1)
#     val_distance_df['shortest_dist_sd'] = val_smallest_dist_df / val_sd
#     val_distance_df['shortest_dist_diff'] = (val_mean - val_min) / val_sd
#     val_distance_df['sd_diff_smallest2next'] = (val_2nd_smallest_dist_df - val_min) / val_sd 
#     val_distance_df['known'] = pd.Series([1 for _ in range(len(val_distance_df.index))]) 
        
    out_class_df = out_class_df.sample(n=9000, random_state=1, axis=0, replace=True)
    test_x, test_y = out_class_df.iloc[:, :-1], out_class_df.iloc[:, -1]
    test_output = knn.kneighbors(test_x, n_neighbors=9, return_distance=True)
    test_distance_df = dist_engineered_df(test_output, training=True, label=0)
    
#     test_distance_df = pd.DataFrame([[distance for distance in row] for row in test_output[0]])
#     test_sd = test_distance_df.std(axis=1, numeric_only=True)
#     test_min = test_distance_df.min(axis=1, numeric_only=True)
#     test_mean = test_distance_df.mean(axis=1, numeric_only=True)
#     test_2_smallest_dist_df = pd.DataFrame(np.sort(test_distance_df.values))
#     test_2nd_smallest_dist_df = test_2_smallest_dist_df.iloc[:, -2]
#     test_smallest_dist_df = test_2_smallest_dist_df.iloc[:, -1]

#     test_distance_df['sd'] = test_sd
#     test_distance_df['sem'] = test_distance_df.sem(axis=1, numeric_only=True)
#     test_distance_df['var'] = test_distance_df.var(axis=1, numeric_only=True)
#     test_distance_df['skew'] = test_distance_df.skew(axis=1, numeric_only=True)
#     test_distance_df['kurt'] = test_distance_df.kurt(axis=1, numeric_only=True)
#     test_distance_df['mad'] = test_distance_df.mad(axis=1)
#     test_distance_df['shortest_dist_sd'] = test_smallest_dist_df / test_sd
#     test_distance_df['shortest_dist_diff'] = (test_mean - test_min) / test_sd
#     test_distance_df['sd_diff_smallest2next'] = (test_2nd_smallest_dist_df - test_min) / test_sd 
#     test_distance_df['known'] = pd.Series([0 for _ in range(len(test_distance_df.index))]) 
    
    combined_df = pd.concat([val_distance_df, test_distance_df], axis=0, ignore_index=True).filter(items=training_cols, axis='columns')
    training_df = pd.concat([training_df, combined_df], axis=0, ignore_index=True)

Check counts of labels

In [16]:
training_df['known'].value_counts()

1    90000
0    90000
Name: known, dtype: int64

Clean up the dataset

In [17]:
training_df = training_df.replace([np.inf, -np.inf], np.nan)
training_df = training_df.replace(np.nan, 0)

Save the labelled dataset obtained from KNN models

In [18]:
joblib.dump(training_df, 'pickles/known_classifier_dataset_v5.pkl', compress=9)

['pickles/known_classifier_dataset_v5.pkl']

Load the labelled dataset obtained from KNN models

In [19]:
training_df = joblib.load('pickles/known_classifier_dataset_v5.pkl')
training_df.head()

### Prepare feature engineered dataset, splitting to train-test using 80-20 ratio
Scale using standard scaling so that models are able to better learn from dataset

In [21]:
x, y = training_df.drop('known', axis=1), training_df['known']
class_weights = compute_class_weight('balanced', np.unique(y), y)
scaler = StandardScaler()
x = scaler.fit_transform(x)
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=1)

### Logistic Regression Model for KnownUnknown Class Detection
#### Using Stochastic Average Gradient for negate the large dataset impact on training time

In [22]:
# log_reg = LogisticRegression(penalty='l2', dual=False, class_weight='balanced', random_state=5, max_iter=500, 
#                              multi_class='ovr', verbose=2, n_jobs=-1, solver='sag')
# log_reg.fit(x_train, y_train)

convergence after 28 epochs took 2 seconds


[Parallel(n_jobs=-1)]: Done   1 out of   1 | elapsed:    2.5s finished


LogisticRegression(C=1.0, class_weight='balanced', dual=False,
          fit_intercept=True, intercept_scaling=1, max_iter=500,
          multi_class='ovr', n_jobs=-1, penalty='l2', random_state=5,
          solver='sag', tol=0.0001, verbose=2, warm_start=False)

In [53]:
log_reg = LogisticRegressionCV(cv=10, dual=False, penalty='l2', scoring='accuracy', solver='sag', max_iter=5000, n_jobs=-1, 
                               verbose=2, multi_class='ovr', random_state=5)
log_reg.fit(x_train, y_train)

convergence after 12 epochs took 6 seconds
convergence after 13 epochs took 6 seconds
convergence after 13 epochs took 6 seconds
convergence after 13 epochs took 6 seconds
convergence after 15 epochs took 7 seconds
convergence after 16 epochs took 8 seconds
convergence after 19 epochs took 8 seconds
convergence after 19 epochs took 9 seconds
convergence after 20 epochs took 9 seconds
convergence after 21 epochs took 9 seconds
convergence after 21 epochs took 10 seconds
convergence after 21 epochs took 10 seconds
convergence after 21 epochs took 10 seconds
convergence after 22 epochs took 11 seconds
convergence after 23 epochs took 11 seconds
convergence after 23 epochs took 11 seconds
convergence after 14 epochs took 7 seconds
convergence after 16 epochs took 7 seconds
convergence after 16 epochs took 7 seconds
convergence after 7 epochs took 3 seconds
convergence after 2 epochs took 1 seconds
convergence after 16 epochs took 7 seconds
convergence after 2 epochs took 1 seconds
converge

[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed:  1.9min finished


convergence after 14 epochs took 1 seconds


LogisticRegressionCV(Cs=10, class_weight=None, cv=10, dual=False,
           fit_intercept=True, intercept_scaling=1.0, max_iter=5000,
           multi_class='ovr', n_jobs=-1, penalty='l2', random_state=5,
           refit=True, scoring='accuracy', solver='sag', tol=0.0001,
           verbose=2)

Logistic Regression Model Predictions

In [54]:
log_reg_preds = log_reg.predict(x_test)
print(classification_report(y_test, log_reg_preds))

             precision    recall  f1-score   support

          0       0.61      0.67      0.64     17911
          1       0.64      0.58      0.61     18089

avg / total       0.63      0.62      0.62     36000



### Random Forest Model for KnownUnknown Class Detection
#### Using 1000 Decision Trees, Bagging and balancing class weights

In [58]:
rf = RandomForestClassifier(5000, criterion='gini', oob_score=True, n_jobs=-1,
                            random_state=5, class_weight='balanced', verbose=1)
rf.fit(x_train, y_train)

[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:    5.7s
[Parallel(n_jobs=-1)]: Done 192 tasks      | elapsed:   25.7s
[Parallel(n_jobs=-1)]: Done 442 tasks      | elapsed:  1.0min
[Parallel(n_jobs=-1)]: Done 792 tasks      | elapsed:  1.8min
[Parallel(n_jobs=-1)]: Done 1242 tasks      | elapsed:  2.9min
[Parallel(n_jobs=-1)]: Done 1792 tasks      | elapsed:  4.2min
[Parallel(n_jobs=-1)]: Done 2442 tasks      | elapsed:  5.8min
[Parallel(n_jobs=-1)]: Done 3192 tasks      | elapsed:  7.7min
[Parallel(n_jobs=-1)]: Done 4042 tasks      | elapsed: 11.1min
[Parallel(n_jobs=-1)]: Done 4992 tasks      | elapsed: 14.9min
[Parallel(n_jobs=-1)]: Done 5000 out of 5000 | elapsed: 14.9min finished


RandomForestClassifier(bootstrap=True, class_weight='balanced',
            criterion='gini', max_depth=None, max_features='auto',
            max_leaf_nodes=None, min_impurity_decrease=0.0,
            min_impurity_split=None, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=5000, n_jobs=-1, oob_score=True, random_state=5,
            verbose=1, warm_start=False)

Random Forest Model Predictions

In [59]:
rf_preds = rf.predict(x_test)
print(classification_report(y_test, rf_preds))

[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    0.2s
[Parallel(n_jobs=4)]: Done 192 tasks      | elapsed:    1.2s
[Parallel(n_jobs=4)]: Done 442 tasks      | elapsed:    4.7s
[Parallel(n_jobs=4)]: Done 792 tasks      | elapsed:   10.6s
[Parallel(n_jobs=4)]: Done 1242 tasks      | elapsed:   17.5s
[Parallel(n_jobs=4)]: Done 1792 tasks      | elapsed:   26.6s
[Parallel(n_jobs=4)]: Done 2442 tasks      | elapsed:   38.3s
[Parallel(n_jobs=4)]: Done 3192 tasks      | elapsed:   51.7s
[Parallel(n_jobs=4)]: Done 4042 tasks      | elapsed:  1.1min
[Parallel(n_jobs=4)]: Done 4992 tasks      | elapsed:  1.4min
[Parallel(n_jobs=4)]: Done 5000 out of 5000 | elapsed:  1.4min finished


             precision    recall  f1-score   support

          0       0.89      0.88      0.89     17911
          1       0.88      0.89      0.89     18089

avg / total       0.89      0.89      0.89     36000



### XGBoost Model for KnownUnknown Class Detection

In [35]:
dtrain = xgb.DMatrix(x_train, label=np.array(y_train))
dtest = xgb.DMatrix(x_test, label=np.array(y_test))

In [55]:
param = {
    'max_depth': 10,
    'eta': 0.1,
    'silent': 0,
    'objective': 'binary:logistic',
    'eval_metric': 'error'
}
num_round = 5000
early_stop = 500

In [56]:
bst = xgb.train(param, dtrain, num_round, evals=[(dtest,'eval')], verbose_eval=100, 
                early_stopping_rounds=early_stop)

[0]	eval-error:0.246778
Will train until eval-error hasn't improved in 500 rounds.
[100]	eval-error:0.178667
[200]	eval-error:0.174
[300]	eval-error:0.173167
[400]	eval-error:0.170944
[500]	eval-error:0.169917
[600]	eval-error:0.17
[700]	eval-error:0.169944
[800]	eval-error:0.169417
[900]	eval-error:0.169194
[1000]	eval-error:0.169028
[1100]	eval-error:0.169278
[1200]	eval-error:0.169
[1300]	eval-error:0.169028
[1400]	eval-error:0.169
[1500]	eval-error:0.168944
[1600]	eval-error:0.169056
[1700]	eval-error:0.169111
[1800]	eval-error:0.169167
[1900]	eval-error:0.16925
Stopping. Best iteration:
[1447]	eval-error:0.168778



In [None]:
xgb_prob_preds = bst.predict(dtest)
xgb_preds = np.asarray([1 if line > 0.5 else 0 for line in xgb_prob_preds]).astype('int64')
print(classification_report(y_test, xgb_preds))

### Neural Network Model for KnownUnknown Class Detection (3 Hidden Layers)
#### Using relu as activation functions for hidden layers due to its robustness and the absence of vanishing gradient problem when using it. Sigmoid is used as the activation function for the output as it is a binary classification problem.

#### Loss function of binary crossentropy is used as it is a binary classification problem.

In [61]:
model_name = 'known_classifier_nn.h5'

nn_model = Sequential()

nn_model.add(Dense(256, activation='relu', input_shape=tuple(x.shape[1:])))
nn_model.add(Dropout(0.5))
nn_model.add(Dense(512, activation='relu'))
nn_model.add(Dropout(0.5))
nn_model.add(Dense(256, activation='relu'))
nn_model.add(Dropout(0.5))
nn_model.add(Dense(1, activation='sigmoid'))

op = optimizers.Adam(lr=0.0001)

nn_model.compile(optimizer=op, metrics=['accuracy'], loss='binary_crossentropy')

save_checkpoint = ModelCheckpoint(model_name, save_best_only=True, verbose=1)
early_stop = EarlyStopping(min_delta=0.01, patience=100, verbose=1, mode='min')

nn_model.fit(x_train, y_train, epochs=10000, batch_size=128, verbose=2, 
             class_weight=class_weights, shuffle=True, validation_split=0.2, callbacks=[save_checkpoint, early_stop])

Train on 115200 samples, validate on 28800 samples
Epoch 1/10000
 - 26s - loss: 0.6636 - acc: 0.6045 - val_loss: 0.6364 - val_acc: 0.6388

Epoch 00001: val_loss improved from inf to 0.63637, saving model to known_classifier_nn.h5
Epoch 2/10000
 - 20s - loss: 0.6456 - acc: 0.6281 - val_loss: 0.6247 - val_acc: 0.6524

Epoch 00002: val_loss improved from 0.63637 to 0.62473, saving model to known_classifier_nn.h5
Epoch 3/10000
 - 21s - loss: 0.6365 - acc: 0.6374 - val_loss: 0.6175 - val_acc: 0.6545

Epoch 00003: val_loss improved from 0.62473 to 0.61747, saving model to known_classifier_nn.h5
Epoch 4/10000
 - 23s - loss: 0.6290 - acc: 0.6424 - val_loss: 0.6108 - val_acc: 0.6573

Epoch 00004: val_loss improved from 0.61747 to 0.61084, saving model to known_classifier_nn.h5
Epoch 5/10000
 - 20s - loss: 0.6232 - acc: 0.6435 - val_loss: 0.6062 - val_acc: 0.6511

Epoch 00005: val_loss improved from 0.61084 to 0.60623, saving model to known_classifier_nn.h5
Epoch 6/10000
 - 21s - loss: 0.6172 - 

<keras.callbacks.History at 0x2aa8c5456a0>

Neural Network Model Predictions

In [62]:
nn_preds = nn_model.predict(x_test)
nn_preds = np.array([1 if x > 0.5 else 0 for x in nn_preds]).astype('int64')
print(classification_report(y_test, nn_preds))

             precision    recall  f1-score   support

          0       0.77      0.71      0.74     17911
          1       0.73      0.79      0.76     18089

avg / total       0.75      0.75      0.75     36000



### Accuracy scores for all 4 models

In [63]:
y_test = np.array(y_test)

print('Logistic Regression Model F1 Score and AUC: %f, %f, %f' % (accuracy_score(y_test, log_reg_preds), f1_score(y_test, log_reg_preds), roc_auc_score(y_test, log_reg_preds)))
print('Random Forest Model F1 Score and AUC: %f, %f, %f' % (accuracy_score(y_test, rf_preds), f1_score(y_test, rf_preds), roc_auc_score(y_test, rf_preds)))
print('XGBoost Model F1 Score and AUC: %f, %f, %f' % (accuracy_score(y_test, xgb_preds), f1_score(y_test, xgb_preds), roc_auc_score(y_test, xgb_preds)))
print('Neural Network Model F1 Score and AUC: %f, %f, %f' % (accuracy_score(y_test, nn_preds), f1_score(y_test, nn_preds), roc_auc_score(y_test, nn_preds)))

Logistic Regression Model F1 Score and AUC: 0.624139, 0.605981, 0.624382
Random Forest Model F1 Score and AUC: 0.886889, 0.888138, 0.886855
XGBoost Model F1 Score and AUC: 0.830861, 0.832669, 0.830828
Neural Network Model F1 Score and AUC: 0.747806, 0.757758, 0.747621


### Predicting on test set using best model

Loading submission dataset and filter to 57 features

In [64]:
submission_data = pd.read_csv('data/hackathon_IoT_validation_set_based_on_01mar2017_ANONYMIZED.csv')
submission_data_filtered = submission_data.filter(items=trimmed_feats_list, axis='columns')

Training KNN Model first to obtain distances

In [None]:
# knn_x, knn_y = filtered_df.iloc[:, :-1].sample(200000), filtered_df.iloc[:, -1].sample(200000)
knn_x, knn_y = filtered_df.iloc[:, :-1], filtered_df.iloc[:, -1]
real_knn = KNeighborsClassifier(n_neighbors=10, weights='uniform', algorithm='auto', p=2, n_jobs=-1, leaf_size=20)
real_knn.fit(knn_x, knn_y)
sub_output = real_knn.kneighbors(submission_data_filtered, n_neighbors=10, return_distance=True)

Obtaining engineered features from submission dataset

In [None]:
# sub_distance_df = pd.DataFrame([[distance for distance in row] for row in sub_output[0]])
# sub_sd = sub_distance_df.std(axis=1, numeric_only=True)
# sub_min = sub_distance_df.min(axis=1, numeric_only=True)
# sub_mean = sub_distance_df.mean(axis=1, numeric_only=True)
# sub_2_smallest_dist_df = pd.DataFrame(np.sort(sub_distance_df.values))
# sub_2nd_smallest_dist_df = sub_2_smallest_dist_df.iloc[:, -2]
# sub_smallest_dist_df = sub_2_smallest_dist_df.iloc[:, -1]

# sub_distance_df['sd'] = sub_sd
# sub_distance_df['sem'] = sub_distance_df.sem(axis=1, numeric_only=True)
# sub_distance_df['var'] = sub_distance_df.var(axis=1, numeric_only=True)
# sub_distance_df['skew'] = sub_distance_df.skew(axis=1, numeric_only=True)
# sub_distance_df['kurt'] = sub_distance_df.kurt(axis=1, numeric_only=True)
# sub_distance_df['mad'] = sub_distance_df.mad(axis=1)
# sub_distance_df['shortest_dist_sd'] = sub_smallest_dist_df / sub_sd
# sub_distance_df['shortest_dist_diff'] = (sub_mean - sub_min) / sub_sd
# sub_distance_df['sd_diff_smallest2next'] = (sub_2nd_smallest_dist_df - sub_min) / sub_sd 

In [None]:
sub_distance_df = dist_engineered_df(sub_output)
sub_distance_df = sub_distance_df.replace([np.inf, -np.inf], 0)

In [None]:
known_sub_x = sub_distance_df.filter(items=training_cols, axis='columns')

# additional preprocessing for xgboost
sub_x = known_sub_x
sub_x.columns = bst.feature_names
sub_x = xgb.DMatrix(sub_x, label=np.zeros((sub_x.shape[0], 1)))

# known_preds = log_reg.predict(known_sub_x)
# known_preds = rf.predict(known_sub_x)
known_preds = bst.predict(sub_x)
# known_preds = nn_model.predict(known_sub_x)

In [None]:
known_preds_list = known_preds.tolist()

# log reg model output conversion
# No additional steps required

# rf model output conversion
# No additional steps required

# xgb model output conversion
known_preds_list = [1 if x > 0.5 else 0 for x in known_preds_list]

# nn model output conversion
# known_preds_list = [1 if x[0] > 0.5 else 0 for x in known_preds_list]

formatted_preds = pd.Series(['unknown' if row == 0 else None for row in known_preds_list])

In [None]:
model1_data_filtered = pd.concat([submission_data_filtered, formatted_preds], axis=1, )

In [None]:
model1_data_filtered

In [None]:
model1_data_filtered.iloc[:, -1].value_counts()

In [None]:
joblib.dump(model1_data_filtered, 'unknown_classified_data.pkl', compress=9)