**DATASET**:<br>
v3-B dataset (split by sensor)<br> 
**FEATURES**:<br>
gas_scan_1 - gas_scan_9<br>
**REFERENCE**:<br>
https://github.com/dreamquark-ai/tabnet<br>
https://www.kaggle.com/mrisdal/pytorch-tabnet-example<br>
https://towardsdatascience.com/tabnet-deep-neural-network-for-structured-tabular-data-39eb4b27a9e4

In [2]:
from google.colab import drive

In [3]:
drive.mount('/content/drive')

Mounted at /content/drive


In [4]:
!pip install pytorch-tabnet

Collecting pytorch-tabnet
  Downloading pytorch_tabnet-3.1.1-py3-none-any.whl (39 kB)
Installing collected packages: pytorch-tabnet
Successfully installed pytorch-tabnet-3.1.1


# **0. LIBRARY**

In [80]:
import os
import sys
import pickle
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import RandomizedSearchCV

import torch
import pytorch_tabnet
from pytorch_tabnet.tab_model import TabNetClassifier

from sklearn.externals import joblib

from sklearn.metrics import confusion_matrix, classification_report, accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, roc_curve

# **1. LOAD DATA**

In [55]:
base = '/content/drive/MyDrive/Proyek/Omdena/Dryad/dataset-v3-B'

train = pd.read_csv(os.path.join(base, 'train_set.csv')).drop(columns=['Unnamed: 0','Unnamed: 0.1'], axis=1) 
val = pd.read_csv(os.path.join(base, 'valid_set.csv')).drop(columns=['Unnamed: 0','Unnamed: 0.1'], axis=1) 
test = pd.read_csv(os.path.join(base, 'test_set.csv')).drop(columns=['Unnamed: 0','Unnamed: 0.1'], axis=1) 

In [56]:
train.head()

Unnamed: 0,sensor_node_id,scan_time,temp_0,temp_1,temp_2,temp_3,temp_4,temp_5,temp_6,temp_7,temp_8,temp_9,humid_0,humid_1,humid_2,humid_3,humid_4,humid_5,humid_6,humid_7,humid_8,humid_9,gas_scan_0,gas_scan_1,gas_scan_2,gas_scan_3,gas_scan_4,gas_scan_5,gas_scan_6,gas_scan_7,gas_scan_8,gas_scan_9,gas_scan_cnt,encoded_specimen,trigger,burn_material,burn_material_amount(g),end_time,hotplate_start,hotplate_temp,experiment,sensor_hotplate_distance,start_time,venue
0,146,2021-09-01 09:03:26.565743+00:00,19.3,19.5,20.0,20.0,20.1,20.3,20.4,20.3,20.5,20.6,68.0,68.0,68.0,68.0,68.0,68.0,68.0,67.0,67.0,67.0,102400000.0,350300.0,17541700.0,300900.0,1726800.0,1395500.0,1204700.0,254200.0,408900.0,453400.0,4.0,0,manual,SP2,600.0,2021-09-01 18:50:46+00:00,2021-09-01 10:03:57+00:00,500,6,30m,2021-09-01 08:30:22+00:00,hall
1,146,2021-09-01 09:04:03.635421991+00:00,19.4,19.5,20.0,20.0,20.1,20.4,20.4,20.4,20.6,20.6,68.0,68.0,68.0,68.0,68.0,68.0,67.0,67.0,67.0,67.0,102400000.0,343900.0,17305500.0,296200.0,1697800.0,1378600.0,1195200.0,252000.0,408200.0,456800.0,5.0,0,manual,SP2,600.0,2021-09-01 18:50:46+00:00,2021-09-01 10:03:57+00:00,500,6,30m,2021-09-01 08:30:22+00:00,hall
2,146,2021-09-01 09:04:40.378066+00:00,19.4,19.6,20.1,20.1,20.1,20.4,20.4,20.4,20.6,20.7,67.0,67.0,68.0,68.0,68.0,67.0,67.0,67.0,67.0,67.0,102400000.0,362900.0,18762000.0,314400.0,1813900.0,1470700.0,1273200.0,262200.0,418400.0,469000.0,6.0,0,manual,SP2,600.0,2021-09-01 18:50:46+00:00,2021-09-01 10:03:57+00:00,500,6,30m,2021-09-01 08:30:22+00:00,hall
3,143,2021-09-01 09:08:14.481864929+00:00,19.1,19.3,19.8,19.8,19.8,20.1,20.1,20.1,20.3,20.4,69.0,69.0,69.0,69.0,69.0,69.0,68.0,68.0,68.0,68.0,102400000.0,156800.0,6305100.0,122700.0,619200.0,457700.0,376600.0,103200.0,225600.0,259500.0,4.0,0,manual,SP2,600.0,2021-09-01 18:50:46+00:00,2021-09-01 10:03:57+00:00,500,6,30m,2021-09-01 08:30:22+00:00,hall
4,143,2021-09-01 09:08:51.848716020+00:00,19.2,19.3,19.8,19.8,19.9,20.1,20.2,20.2,20.4,20.4,68.0,69.0,69.0,69.0,69.0,68.0,68.0,68.0,68.0,68.0,102400000.0,147200.0,5740700.0,117500.0,585300.0,438000.0,356100.0,99400.0,225500.0,260000.0,5.0,0,manual,SP2,600.0,2021-09-01 18:50:46+00:00,2021-09-01 10:03:57+00:00,500,6,30m,2021-09-01 08:30:22+00:00,hall


In [57]:
val.head()

Unnamed: 0,sensor_node_id,scan_time,temp_0,temp_1,temp_2,temp_3,temp_4,temp_5,temp_6,temp_7,temp_8,temp_9,humid_0,humid_1,humid_2,humid_3,humid_4,humid_5,humid_6,humid_7,humid_8,humid_9,gas_scan_0,gas_scan_1,gas_scan_2,gas_scan_3,gas_scan_4,gas_scan_5,gas_scan_6,gas_scan_7,gas_scan_8,gas_scan_9,gas_scan_cnt,encoded_specimen,trigger,burn_material,burn_material_amount(g),end_time,hotplate_start,hotplate_temp,experiment,sensor_hotplate_distance,start_time,venue
0,134,2021-09-01 09:03:04.271744966+00:00,19.6,19.7,20.2,20.2,20.3,20.6,20.6,20.6,20.8,20.9,69.0,69.0,69.0,69.0,69.0,69.0,69.0,69.0,68.0,68.0,102400000.0,510800.0,33833700.0,453800.0,2713400.0,2365500.0,2154600.0,391500.0,560700.0,630700.0,4.0,0,manual,SP2,600.0,2021-09-01 18:50:46+00:00,2021-09-01 10:03:57+00:00,500,6,30m,2021-09-01 08:30:22+00:00,hall
1,134,2021-09-01 09:03:41.425556898+00:00,19.6,19.7,20.3,20.3,20.3,20.6,20.7,20.6,20.9,20.9,69.0,69.0,69.0,69.0,69.0,69.0,68.0,68.0,68.0,68.0,102400000.0,502200.0,33191100.0,446900.0,2671000.0,2341200.0,2104800.0,386400.0,559400.0,635400.0,5.0,0,manual,SP2,600.0,2021-09-01 18:50:46+00:00,2021-09-01 10:03:57+00:00,500,6,30m,2021-09-01 08:30:22+00:00,hall
2,134,2021-09-01 09:04:17.778662919+00:00,19.6,19.8,20.3,20.3,20.4,20.6,20.7,20.7,20.9,20.9,68.0,68.0,69.0,69.0,69.0,68.0,68.0,68.0,68.0,68.0,102400000.0,526600.0,36008700.0,471000.0,2852300.0,2493000.0,2230300.0,404100.0,577800.0,648700.0,6.0,0,manual,SP2,600.0,2021-09-01 18:50:46+00:00,2021-09-01 10:03:57+00:00,500,6,30m,2021-09-01 08:30:22+00:00,hall
3,134,2021-09-01 18:30:08.843652963+00:00,21.0,21.1,21.7,21.7,21.8,22.0,22.1,22.1,22.3,22.3,69.0,69.0,69.0,69.0,69.0,69.0,68.0,68.0,68.0,68.0,102400000.0,382500.0,20563500.0,328000.0,1840800.0,1587200.0,1401300.0,279200.0,442600.0,505500.0,4.0,1,manual,SP2,600.0,2021-09-01 18:50:46+00:00,2021-09-01 10:03:57+00:00,500,6,30m,2021-09-01 08:30:22+00:00,hall
4,134,2021-09-01 18:30:45.454761981+00:00,21.0,21.2,21.7,21.7,21.8,22.1,22.1,22.1,22.3,22.4,68.0,68.0,69.0,69.0,69.0,68.0,68.0,68.0,68.0,68.0,102400000.0,412900.0,22466900.0,349600.0,2004400.0,1706300.0,1517500.0,294400.0,464200.0,527000.0,5.0,1,manual,SP2,600.0,2021-09-01 18:50:46+00:00,2021-09-01 10:03:57+00:00,500,6,30m,2021-09-01 08:30:22+00:00,hall


In [58]:
test.head()

Unnamed: 0,sensor_node_id,scan_time,temp_0,temp_1,temp_2,temp_3,temp_4,temp_5,temp_6,temp_7,temp_8,temp_9,humid_0,humid_1,humid_2,humid_3,humid_4,humid_5,humid_6,humid_7,humid_8,humid_9,gas_scan_0,gas_scan_1,gas_scan_2,gas_scan_3,gas_scan_4,gas_scan_5,gas_scan_6,gas_scan_7,gas_scan_8,gas_scan_9,gas_scan_cnt,encoded_specimen,trigger,burn_material,burn_material_amount(g),end_time,hotplate_start,hotplate_temp,experiment,sensor_hotplate_distance,start_time,venue
0,137,2021-09-01 09:04:02.821827+00:00,19.7,19.9,20.4,20.4,20.5,20.7,20.7,20.7,20.9,21.0,68.0,68.0,69.0,69.0,69.0,68.0,68.0,68.0,68.0,67.0,102400000.0,167900.0,6073700.0,139100.0,682400.0,560300.0,489200.0,119000.0,215000.0,245100.0,4.0,0,manual,SP2,600.0,2021-09-01 18:50:46+00:00,2021-09-01 10:03:57+00:00,500,6,30m,2021-09-01 08:30:22+00:00,hall
1,137,2021-09-01 09:04:39.888919115+00:00,19.7,19.9,20.4,20.4,20.5,20.7,20.8,20.8,21.0,21.0,68.0,68.0,69.0,68.0,69.0,68.0,68.0,68.0,68.0,67.0,102400000.0,156300.0,5593700.0,131700.0,645000.0,528200.0,465800.0,114900.0,214900.0,245500.0,5.0,0,manual,SP2,600.0,2021-09-01 18:50:46+00:00,2021-09-01 10:03:57+00:00,500,6,30m,2021-09-01 08:30:22+00:00,hall
2,137,2021-09-01 09:05:16.514827013+00:00,19.8,19.9,20.4,20.5,20.6,20.8,20.8,20.8,21.0,21.1,68.0,68.0,68.0,68.0,68.0,68.0,68.0,68.0,67.0,67.0,102400000.0,171200.0,6420000.0,142600.0,706400.0,575900.0,507000.0,121900.0,220700.0,250300.0,6.0,0,manual,SP2,600.0,2021-09-01 18:50:46+00:00,2021-09-01 10:03:57+00:00,500,6,30m,2021-09-01 08:30:22+00:00,hall
3,136,2021-09-01 09:00:44.123855113+00:00,19.2,19.4,19.8,19.8,19.9,20.2,20.2,20.2,20.4,20.5,67.0,67.0,67.0,67.0,67.0,67.0,67.0,67.0,66.0,66.0,102400000.0,185800.0,6929100.0,150100.0,752900.0,586300.0,501400.0,128000.0,259100.0,294900.0,4.0,0,manual,SP2,600.0,2021-09-01 18:50:46+00:00,2021-09-01 10:03:57+00:00,500,6,30m,2021-09-01 08:30:22+00:00,hall
4,136,2021-09-01 09:01:21.075597+00:00,19.3,19.4,19.9,19.9,20.0,20.2,20.3,20.2,20.5,20.5,67.0,67.0,67.0,67.0,67.0,67.0,66.0,66.0,66.0,66.0,102400000.0,174000.0,6401200.0,144300.0,702000.0,558000.0,481600.0,124300.0,259100.0,295900.0,5.0,0,manual,SP2,600.0,2021-09-01 18:50:46+00:00,2021-09-01 10:03:57+00:00,500,6,30m,2021-09-01 08:30:22+00:00,hall


# **2. PREPARATION**

In [59]:
train.shape, val.shape, test.shape

((2238, 44), (329, 44), (673, 44))

In [60]:
features = ['gas_scan_1','gas_scan_2','gas_scan_3', 
            'gas_scan_4','gas_scan_5','gas_scan_6', 
            'gas_scan_7','gas_scan_8','gas_scan_9']

In [61]:
scl = StandardScaler()

In [62]:
X_train = train[features].to_numpy()
X_train_scale = scl.fit_transform(X_train)
y_train = train['encoded_specimen'].to_numpy()

X_val = val[features].to_numpy()
X_val_scale = scl.transform(X_val)
y_val = val['encoded_specimen'].to_numpy()

X_test = test[features].to_numpy()
X_test_scale = scl.transform(X_test)
y_test = test['encoded_specimen'].to_numpy()

# **3. MODELING**

In [63]:
def tune_save_model(estimator, param_distributions, X_train, y_train, X_val, y_val):

  rand_grid = RandomizedSearchCV(estimator, param_distributions, random_state=41, scoring='f1')

  rand_grid.fit(X_train, y_train,
              eval_set=[(X_val, y_val)],
              eval_name=['valid'],
              eval_metric=['accuracy'],
              max_epochs=1000,
              patience=50,
              batch_size=256,
              virtual_batch_size=128,
              num_workers=0,
              weights=1,
              drop_last=False)
  
  joblib.dump(rand_grid.best_estimator_, 'best_tabnet.pkl')

  return rand_grid.best_estimator_

In [65]:
def model_metrics(y_true, y_pred):

    # confusion matrix
    cm = confusion_matrix(y_true, y_pred)
    print('Confusion matrix: \n', cm,'\n')

    # classification report
    print('Classification report: \n', classification_report(y_true, y_pred),'\n')

    # precision 
    print('Precision: {:0.2f}%'.format(round(precision_score(y_true, y_pred), 4)*100))

    # recall 
    print('Recall: {:0.2f}%'.format(round(recall_score(y_true, y_pred), 4)*100))

    # specificity 
    tn, fp, fn, tp = cm.ravel()
    print('Specificity: {:0.2f}%'.format(round(tn/(tn+fp), 4)*100)) 

    # f1-score
    print('F1-Score: {:0.2f}%'.format(round(f1_score(y_true, y_pred), 4)*100)) 

In [66]:
# try all possible hyperparameters

param = {'n_d' : np.arange(8,65),
         'n_a' : np.arange(8,65),
         'n_steps' : np.arange(3,10),
         'gamma' : np.arange(1,2.1,0.1),
         'n_independent' : np.arange(1,6),
         'n_shared' : np.arange(1,6)}

In [74]:
best_model = tune_save_model(TabNetClassifier(), param, X_train, y_train, X_val, y_val)

Device used : cpu
Device used : cpu
Device used : cpu
epoch 0  | loss: 1.26871 | valid_accuracy: 0.67173 |  0:00:01s
epoch 1  | loss: 1.02209 | valid_accuracy: 0.6079  |  0:00:02s
epoch 2  | loss: 1.2247  | valid_accuracy: 0.33739 |  0:00:03s
epoch 3  | loss: 0.80684 | valid_accuracy: 0.25228 |  0:00:04s
epoch 4  | loss: 0.66536 | valid_accuracy: 0.38298 |  0:00:05s
epoch 5  | loss: 0.66362 | valid_accuracy: 0.18541 |  0:00:06s
epoch 6  | loss: 0.66998 | valid_accuracy: 0.58055 |  0:00:07s
epoch 7  | loss: 0.64617 | valid_accuracy: 0.52888 |  0:00:08s
epoch 8  | loss: 0.66055 | valid_accuracy: 0.71733 |  0:00:09s
epoch 9  | loss: 0.5886  | valid_accuracy: 0.85106 |  0:00:11s
epoch 10 | loss: 0.57505 | valid_accuracy: 0.85714 |  0:00:12s
epoch 11 | loss: 0.52701 | valid_accuracy: 0.8845  |  0:00:13s
epoch 12 | loss: 0.53512 | valid_accuracy: 0.90881 |  0:00:14s
epoch 13 | loss: 0.4966  | valid_accuracy: 0.87842 |  0:00:15s
epoch 14 | loss: 0.46768 | valid_accuracy: 0.89058 |  0:00:16s
e

In [75]:
best_model

TabNetClassifier(n_d=16, n_a=47, n_steps=7, gamma=1.6000000000000005, cat_idxs=[], cat_dims=[], cat_emb_dim=1, n_independent=5, n_shared=1, epsilon=1e-15, momentum=0.02, lambda_sparse=0.001, seed=0, clip_value=1, verbose=1, optimizer_fn=<class 'torch.optim.adam.Adam'>, optimizer_params={'lr': 0.02}, scheduler_fn=None, scheduler_params={}, mask_type='sparsemax', input_dim=9, output_dim=2, device_name='auto')

In [76]:
y_pred = best_model.predict(X_test)

In [77]:
model_metrics(y_test, y_pred)

Confusion matrix: 
 [[  6  49]
 [ 25 593]] 

Classification report: 
               precision    recall  f1-score   support

           0       0.19      0.11      0.14        55
           1       0.92      0.96      0.94       618

    accuracy                           0.89       673
   macro avg       0.56      0.53      0.54       673
weighted avg       0.86      0.89      0.88       673
 

Precision: 92.37%
Recall: 95.95%
Specificity: 10.91%
F1-Score: 94.13%


In [82]:
print('Model size in kilobytes: {}'.format(sys.getsizeof(pickle.dumps(best_model)) / 1000))

Model size in kilobytes: 5942.564
