**DATASET:**<br>
v3-C (gas_cnt<=3 dropped and split randomly)<br>
**FEATURES:**<br>
gas_scan_0 - gas_scan_9 (suggested to exclude temperature and humidity)

# **0. IMPORT LIBRARIES**

In [144]:
import pickle
import joblib
from datetime import datetime
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import RandomizedSearchCV

import tensorflow as tf
from keras.models import Sequential
from keras.layers import Dense
from keras.wrappers.scikit_learn import KerasClassifier

from sklearn.metrics import confusion_matrix, classification_report, accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, roc_curve

In [80]:
# Seed NumPy and TensorFlow before any model is built so that Keras' random
# operations (e.g. weight initialisation) start from a fixed state on every
# Restart & Run All. 42 matches the random_state used for the parameter search.
# NOTE(review): GPU kernels may still introduce small run-to-run differences.
np.random.seed(42)
tf.random.set_seed(42)

# Apply seaborn's default theme to every matplotlib figure in the notebook.
sns.set_theme()

# **1. LOAD DATA**

In [81]:
# Google Drive links to the three CSV splits, in the order train / val / test.
train_url, val_url, test_url = (
    'https://drive.google.com/uc?id=1ckLvp7H-GkhKxAZ78iG3lE8qBbhVszSB',
    'https://drive.google.com/uc?id=1ooyHriS3TBzVDx9z5HJgbbhjshx0rhex',
    'https://drive.google.com/uc?id=1wGA0ImoO-CQlIPYVi3fGr0arqYnBP2Eq',
)

In [82]:
# Read the three splits (train, val, test - in that order). The two 'Unnamed'
# columns (presumably row indices saved by earlier CSV exports) are dropped
# from each frame.
train, val, test = (
    pd.read_csv(url).drop(columns=['Unnamed: 0', 'Unnamed: 0.1'])
    for url in (train_url, val_url, test_url)
)

In [83]:
# Preview the first rows of the training split.
train.head()

Unnamed: 0,sensor_node_id,scan_time,temp_0,temp_1,temp_2,temp_3,temp_4,temp_5,temp_6,temp_7,temp_8,temp_9,humid_0,humid_1,humid_2,humid_3,humid_4,humid_5,humid_6,humid_7,humid_8,humid_9,gas_scan_0,gas_scan_1,gas_scan_2,gas_scan_3,gas_scan_4,gas_scan_5,gas_scan_6,gas_scan_7,gas_scan_8,gas_scan_9,gas_scan_cnt,encoded_specimen,trigger,burn_material,burn_material_amount(g),end_time,hotplate_start,hotplate_temp,experiment,sensor_hotplate_distance,start_time,venue
0,143,2021-09-21 19:47:05.761527061+00:00,17.8,17.9,18.4,18.4,18.5,18.7,18.8,18.7,18.9,19.0,61.0,61.0,62.0,62.0,62.0,62.0,61.0,61.0,61.0,61.0,102400000.0,273800.0,12566800.0,221500.0,1223000.0,919200.0,762100.0,180100.0,340800.0,396500.0,5.0,1,manual,BM2,600.0,2021-09-22 08:00:00+00:00,2021-09-21 12:04:00+00:00,320,22,30m,2021-09-21 08:30:00+00:00,hall
1,132,2021-09-06 12:02:01.676606893+00:00,22.0,22.1,22.6,22.6,22.7,22.9,23.0,23.0,23.2,23.2,56.0,56.0,57.0,57.0,57.0,57.0,56.0,56.0,56.0,56.0,11074000.0,39100.0,437500.0,31600.0,109000.0,67300.0,51000.0,22500.0,54400.0,61600.0,6.0,1,manual,Scott Pine branches in fire bowl,,2021-09-06 16:30:22+00:00,2021-09-06 10:58:09+00:00,open fire,9,30m,2021-09-06 09:04:03+00:00,hall
2,146,2021-09-16 13:59:47.711469+00:00,21.6,21.8,22.3,22.3,22.4,22.7,22.7,22.7,22.9,22.9,62.0,62.0,62.0,62.0,62.0,62.0,61.0,61.0,61.0,61.0,102400000.0,306800.0,11141700.0,255000.0,1366200.0,1010300.0,827300.0,212000.0,393800.0,447200.0,4.0,1,manual,SP1,700.0,2021-09-16 15:33:00+00:00,2021-09-16 10:25:00+00:00,320,18,30m,2021-09-16 08:37:00+00:00,hall
3,134,2021-09-21 14:00:00.936173915+00:00,18.8,18.9,19.4,19.4,19.5,19.8,19.8,19.8,20.0,20.1,58.0,58.0,58.0,58.0,58.0,58.0,58.0,58.0,58.0,57.0,102400000.0,375200.0,17741200.0,311900.0,1719200.0,1420200.0,1216500.0,256700.0,431400.0,496700.0,4.0,1,manual,BM2,600.0,2021-09-22 08:00:00+00:00,2021-09-21 12:04:00+00:00,320,22,30m,2021-09-21 08:30:00+00:00,hall
4,139,2021-09-22 11:52:18.181519985+00:00,18.6,18.8,19.3,19.3,19.3,19.6,19.6,19.6,19.8,19.8,67.0,67.0,67.0,67.0,67.0,67.0,67.0,67.0,66.0,66.0,33755300.0,71600.0,654900.0,54200.0,206500.0,123300.0,97600.0,43700.0,114900.0,131600.0,6.0,1,manual,Other,,2021-09-23 08:07:00+00:00,2021-09-22 10:32:00+00:00,open fire,23,30m,2021-09-22 08:30:00+00:00,hall


In [84]:
# Preview the first rows of the validation split.
val.head()

Unnamed: 0,sensor_node_id,scan_time,temp_0,temp_1,temp_2,temp_3,temp_4,temp_5,temp_6,temp_7,temp_8,temp_9,humid_0,humid_1,humid_2,humid_3,humid_4,humid_5,humid_6,humid_7,humid_8,humid_9,gas_scan_0,gas_scan_1,gas_scan_2,gas_scan_3,gas_scan_4,gas_scan_5,gas_scan_6,gas_scan_7,gas_scan_8,gas_scan_9,gas_scan_cnt,encoded_specimen,trigger,burn_material,burn_material_amount(g),end_time,hotplate_start,hotplate_temp,experiment,sensor_hotplate_distance,start_time,venue
0,136,2021-09-22 13:07:24.506938934+00:00,18.7,18.9,19.4,19.4,19.4,19.7,19.7,19.7,19.9,20.0,64.0,64.0,64.0,64.0,64.0,64.0,64.0,64.0,64.0,63.0,17669400.0,60700.0,482600.0,44600.0,163200.0,107100.0,87300.0,40600.0,122400.0,141700.0,6.0,1,manual,Other,,2021-09-23 08:07:00+00:00,2021-09-22 10:32:00+00:00,open fire,23,30m,2021-09-22 08:30:00+00:00,hall
1,130,2021-09-17 13:41:44.463860034+00:00,20.0,20.1,20.6,20.6,20.7,20.9,21.0,20.9,21.1,21.2,63.0,63.0,63.0,63.0,63.0,63.0,63.0,63.0,63.0,62.0,102400000.0,609300.0,36459500.0,528600.0,3414700.0,2814100.0,2477100.0,448100.0,635400.0,708600.0,6.0,1,manual,SP1,700.0,2021-09-20 19:30:00+00:00,2021-09-17 10:15:00+00:00,500,21,30m,2021-09-17 08:45:00+00:00,hall
2,143,2021-09-22 21:08:29.079322099+00:00,17.8,17.9,18.4,18.4,18.5,18.7,18.8,18.8,19.0,19.0,66.0,66.0,66.0,66.0,66.0,66.0,66.0,66.0,65.0,65.0,102400000.0,201400.0,6124800.0,152000.0,795000.0,556200.0,456500.0,132400.0,287700.0,324600.0,4.0,1,manual,Other,,2021-09-23 08:07:00+00:00,2021-09-22 10:32:00+00:00,open fire,23,30m,2021-09-22 08:30:00+00:00,hall
3,139,2021-09-22 01:41:03.301168+00:00,17.4,17.6,18.1,18.1,18.2,18.4,18.4,18.4,18.6,18.6,67.0,67.0,67.0,67.0,67.0,67.0,66.0,66.0,66.0,66.0,102400000.0,358700.0,17223600.0,304100.0,1827300.0,1427600.0,1208900.0,257100.0,435200.0,489600.0,5.0,1,manual,BM2,600.0,2021-09-22 08:00:00+00:00,2021-09-21 12:04:00+00:00,320,22,30m,2021-09-21 08:30:00+00:00,hall
4,132,2021-09-22 15:15:14.457976102+00:00,19.1,19.2,19.7,19.7,19.8,20.0,20.1,20.0,20.2,20.3,64.0,64.0,64.0,64.0,64.0,64.0,63.0,63.0,63.0,62.0,102400000.0,197800.0,6736800.0,159300.0,839500.0,635400.0,524500.0,127800.0,239900.0,274700.0,6.0,1,manual,Other,,2021-09-23 08:07:00+00:00,2021-09-22 10:32:00+00:00,open fire,23,30m,2021-09-22 08:30:00+00:00,hall


In [85]:
# Preview the first rows of the test split.
test.head()

Unnamed: 0,sensor_node_id,scan_time,temp_0,temp_1,temp_2,temp_3,temp_4,temp_5,temp_6,temp_7,temp_8,temp_9,humid_0,humid_1,humid_2,humid_3,humid_4,humid_5,humid_6,humid_7,humid_8,humid_9,gas_scan_0,gas_scan_1,gas_scan_2,gas_scan_3,gas_scan_4,gas_scan_5,gas_scan_6,gas_scan_7,gas_scan_8,gas_scan_9,gas_scan_cnt,encoded_specimen,trigger,burn_material,burn_material_amount(g),end_time,hotplate_start,hotplate_temp,experiment,sensor_hotplate_distance,start_time,venue
0,140,2021-09-16 12:26:32.090624+00:00,21.5,21.7,22.1,22.1,22.2,22.5,22.5,22.5,22.7,22.7,64.0,65.0,64.0,64.0,64.0,64.0,64.0,64.0,64.0,64.0,102400000.0,541200.0,32524000.0,506600.0,3167200.0,2634900.0,2284400.0,412600.0,527000.0,582300.0,5.0,1,manual,SP1,700.0,2021-09-16 15:33:00+00:00,2021-09-16 10:25:00+00:00,320,18,30m,2021-09-16 08:37:00+00:00,hall
1,146,2021-09-23 02:17:25.933338+00:00,17.1,17.2,17.7,17.7,17.8,18.1,18.1,18.1,18.3,18.3,63.0,63.0,64.0,64.0,64.0,64.0,63.0,63.0,63.0,63.0,102400000.0,322800.0,12452200.0,261800.0,1450400.0,1096300.0,900400.0,221800.0,410900.0,460200.0,6.0,1,manual,Other,,2021-09-23 08:07:00+00:00,2021-09-22 10:32:00+00:00,open fire,23,30m,2021-09-22 08:30:00+00:00,hall
2,130,2021-09-21 11:00:14.395855903+00:00,18.4,18.5,19.0,19.0,19.1,19.3,19.4,19.4,19.6,19.6,57.0,57.0,57.0,57.0,57.0,57.0,57.0,57.0,57.0,57.0,102400000.0,822300.0,70054496.0,771200.0,5486900.0,4936400.0,4584200.0,675000.0,860100.0,938800.0,5.0,0,manual,BM2,600.0,2021-09-22 08:00:00+00:00,2021-09-21 12:04:00+00:00,320,22,30m,2021-09-21 08:30:00+00:00,hall
3,146,2021-09-22 06:40:33.705244064+00:00,17.3,17.4,17.9,17.9,18.0,18.3,18.3,18.3,18.5,18.5,67.0,67.0,67.0,67.0,67.0,67.0,66.0,66.0,66.0,66.0,102400000.0,361800.0,17655100.0,304600.0,1760200.0,1382800.0,1171600.0,257300.0,451900.0,507000.0,5.0,1,manual,BM2,600.0,2021-09-22 08:00:00+00:00,2021-09-21 12:04:00+00:00,320,22,30m,2021-09-21 08:30:00+00:00,hall
4,132,2021-09-22 16:15:40.700624942+00:00,19.0,19.2,19.6,19.6,19.7,20.0,20.0,20.0,20.2,20.2,63.0,63.0,63.0,63.0,63.0,63.0,63.0,63.0,63.0,63.0,102400000.0,218400.0,7850500.0,175100.0,947200.0,727500.0,600700.0,140300.0,259700.0,289600.0,5.0,1,manual,Other,,2021-09-23 08:07:00+00:00,2021-09-22 10:32:00+00:00,open fire,23,30m,2021-09-22 08:30:00+00:00,hall


# **2. DATA PREPROCESSING**

In [86]:
# (rows, columns) of each split after dropping the 'Unnamed' columns.
train.shape, val.shape, test.shape

((2239, 44), (302, 44), (699, 44))

In [87]:
# Model inputs: the ten gas-scan columns gas_scan_0 ... gas_scan_9.
features = ['gas_scan_{}'.format(step) for step in range(10)]

In [88]:
# Scaler shared by all three splits; it is fitted on the training features only.
scl = StandardScaler()

In [89]:
# Separate the model inputs from the label column for every split.
X_train, y_train = train[features], train['encoded_specimen']
X_val, y_val = val[features], val['encoded_specimen']
X_test, y_test = test[features], test['encoded_specimen']

# Learn the scaling statistics from the training split alone, then apply the
# same transformation to validation and test so no information leaks from them.
X_train_scale = scl.fit_transform(X_train)
X_val_scale = scl.transform(X_val)
X_test_scale = scl.transform(X_test)

In [90]:
# Shapes of the feature matrices and label vectors for each split.
X_train.shape, y_train.shape, X_val.shape, y_val.shape, X_test.shape, y_test.shape

((2239, 10), (2239,), (302, 10), (302,), (699, 10), (699,))

# **3. MODELING**

In [93]:
def build_model(num_node_first, num_node_second):
    """Build and compile a binary classifier with two hidden layers.

    The network is: Dense(num_node_first, relu) -> Dense(num_node_second, relu)
    -> Dense(1, sigmoid), compiled with the Adam optimizer, binary
    cross-entropy loss and accuracy as the tracked metric.

    The input width is read from the module-level ``X_train_scale`` array, so
    that array must exist before this function is called.

    Parameters
    ----------
    num_node_first : int
        Number of units in the first hidden layer.
    num_node_second : int
        Number of units in the second hidden layer.

    Returns
    -------
    The compiled ``Sequential`` model.
    """
    n_inputs = X_train_scale.shape[1]

    layers = [
        Dense(num_node_first, input_dim=n_inputs, activation='relu'),
        Dense(num_node_second, activation='relu'),
        Dense(1, activation='sigmoid'),
    ]
    network = Sequential(layers)

    network.compile(
        optimizer='adam',
        loss='binary_crossentropy',
        metrics=['accuracy'],
    )
    return network

In [140]:
def model_metrics(y_true, y_pred, y_pred_proba):
    """Print binary-classification metrics for one evaluation split.

    Printed in order: confusion matrix, scikit-learn classification report,
    then precision, recall, specificity and F1 (as percentages) and ROC AUC.

    Parameters
    ----------
    y_true : array-like
        Ground-truth labels; the binary metrics below treat 1 as the positive
        class and 0 as the negative class.
    y_pred : array-like
        Predicted hard labels (0/1).
    y_pred_proba : array-like
        Scores for the positive class; used only for the AUC.

    Returns
    -------
    None
        Everything is written to stdout.
    """
    # Pin the label order so the matrix is always 2x2. Without it the labels
    # are inferred from the data, and a split containing a single class yields
    # a 1x1 matrix that cannot be unpacked into tn/fp/fn/tp below.
    cm = confusion_matrix(y_true, y_pred, labels=[0, 1])
    print('Confusion matrix: \n', cm,'\n')

    # classification report
    print('Classification report: \n', classification_report(y_true, y_pred),'\n')

    # precision
    print('Precision: {:0.2f}%'.format(round(precision_score(y_true, y_pred), 4)*100))

    # recall
    print('Recall: {:0.2f}%'.format(round(recall_score(y_true, y_pred), 4)*100))

    # specificity = TN / (TN + FP); undefined (nan) when there are no negatives
    tn, fp, fn, tp = cm.ravel()
    negatives = tn + fp
    specificity = tn / negatives if negatives else float('nan')
    print('Specificity: {:0.2f}%'.format(round(specificity, 4)*100))

    # f1-score
    print('F1-Score: {:0.2f}%'.format(round(f1_score(y_true, y_pred), 4)*100))

    # AUC - roc_auc_score can raise ValueError when y_true holds a single
    # class; report that instead of aborting after the other metrics printed.
    try:
        auc = roc_auc_score(y_true, y_pred_proba)
    except ValueError as err:
        print('AUC: undefined ({})'.format(err))
    else:
        print('AUC: {:0.3f}'.format(round(auc, 4)))

In [97]:
# Wrap build_model as a scikit-learn style estimator so it can be handed to
# RandomizedSearchCV; every fit trains for 200 epochs with Keras logging off.
model = KerasClassifier(build_fn=build_model, epochs=200, verbose=0)

In [101]:
# Candidate hidden-layer widths (np.arange excludes the stop value):
# 10..20 units for the first layer, 5..20 units for the second.
first_nodes = np.arange(10, 21)
second_nodes = np.arange(5, 21)

# Keys must match the argument names of build_model.
param = {
    'num_node_first': first_nodes,
    'num_node_second': second_nodes,
}

In [102]:
# Randomised search over the layer widths in `param`; the number of sampled
# candidates, the CV scheme and the scoring are left at scikit-learn's defaults.
grid = RandomizedSearchCV(
    estimator=model,
    param_distributions=param,
    random_state=42,
)

# NOTE(review): validation_data is passed through as a fit parameter
# (presumably reaching Keras' fit for per-epoch validation metrics); nothing
# in this cell uses it to pick the best parameters - confirm that is intended.
grid.fit(
    X_train_scale,
    y_train,
    validation_data=(X_val_scale, y_val),
)

In [110]:
# Layer widths selected by the randomised search.
grid.best_params_

{'num_node_first': 20, 'num_node_second': 17}

In [142]:
# Score the test split once and derive the hard labels from those scores.
# grid.predict() returns class labels rather than probabilities, so the 0.5
# cut-off is applied to the positive-class probability instead; this also
# keeps y_pred one-dimensional.
y_pred_proba = grid.predict_proba(X_test_scale)[:, 1]  # column 1 = P(class 1)
y_pred = (y_pred_proba > 0.5).astype("int32")

In [143]:
# Report the metrics of the selected model on the held-out test split.
model_metrics(y_test, y_pred, y_pred_proba)

Confusion matrix: 
 [[ 20  55]
 [  2 622]] 

Classification report: 
               precision    recall  f1-score   support

           0       0.91      0.27      0.41        75
           1       0.92      1.00      0.96       624

    accuracy                           0.92       699
   macro avg       0.91      0.63      0.68       699
weighted avg       0.92      0.92      0.90       699
 

Precision: 91.88%
Recall: 99.68%
Specificity: 26.67%
F1-Score: 95.62%
AUC: 0.844
