In [18]:
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier, Pool, cv
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import auc, roc_auc_score, roc_curve 
from sklearn.metrics import f1_score, confusion_matrix, precision_score
from sklearn.metrics import recall_score, f1_score, accuracy_score

import pandas as pd
import numpy as np
import scipy.stats.distributions as dists

import warnings
import matplotlib.pyplot as plt
warnings.filterwarnings('ignore')

%matplotlib inline
warnings.filterwarnings("ignore")
plt.style.use('fivethirtyeight')

In [2]:
df = pd.read_csv('./public/data/raw_data.csv')
use_cols = ['MELT_TEMP', 'MOTORSPEED']

df['TAG'] = df['TAG'] == 'NG'

df.index = pd.date_range(start='3/4/2020', end='5/1/2020', freq='6S')[:-1]

df.drop(columns=['STD_DT', 'NUM', 'MELT_WEIGHT', 'INSP'], inplace=True)

df = df.astype(np.float32)
df['TAG'] = df['TAG'].astype(bool)

df.info()
df.describe()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 835200 entries, 2020-03-04 00:00:00 to 2020-04-30 23:59:54
Freq: 6S
Data columns (total 3 columns):
 #   Column      Non-Null Count   Dtype  
---  ------      --------------   -----  
 0   MELT_TEMP   835200 non-null  float32
 1   MOTORSPEED  835200 non-null  float32
 2   TAG         835200 non-null  bool   
dtypes: bool(1), float32(2)
memory usage: 13.5 MB


Unnamed: 0,MELT_TEMP,MOTORSPEED
count,835200.0,835200.0
mean,509.200714,459.78302
std,128.277512,639.436401
min,308.0,0.0
25%,430.0,119.0
50%,469.0,168.0
75%,502.0,218.0
max,832.0,1804.0


In [123]:
train_index = ((df.index.month == 3) & (df.index.day >= 15))
val_index = ((df.index.month == 4) & (df.index.day < 7))

# 기존 학습 데이터를 알아야 Scaler를 구할 수 있음.
# fine tuning 시 scaler는 기존 scaler로 사용
existing_df = df[(df.index.month == 3) & (df.index.day < 25)]

train_df = df[train_index]
val_df = df[val_index]

train_df, val_df

(                     MELT_TEMP  MOTORSPEED    TAG
 2020-03-15 00:00:00      458.0       138.0  False
 2020-03-15 00:00:06      442.0        70.0  False
 2020-03-15 00:00:12      474.0       170.0  False
 2020-03-15 00:00:18      378.0       202.0  False
 2020-03-15 00:00:24      718.0      1732.0  False
 ...                        ...         ...    ...
 2020-03-31 23:59:30      755.0      1732.0  False
 2020-03-31 23:59:36      420.0       213.0   True
 2020-03-31 23:59:42      458.0       161.0   True
 2020-03-31 23:59:48      421.0         0.0   True
 2020-03-31 23:59:54      514.0       141.0  False
 
 [244800 rows x 3 columns],
                      MELT_TEMP  MOTORSPEED    TAG
 2020-04-01 00:00:00      463.0       135.0  False
 2020-04-01 00:00:06      412.0       118.0   True
 2020-04-01 00:00:12      482.0       178.0  False
 2020-04-01 00:00:18      394.0       216.0   True
 2020-04-01 00:00:24      721.0      1733.0  False
 ...                        ...         ...    ...
 

In [124]:
existing_x_df = existing_df.copy(False)
existing_y_df = pd.DataFrame(existing_x_df.pop('TAG'), columns=['TAG'])

x_train = train_df.copy(False)
y_train = pd.DataFrame(x_train.pop('TAG'), columns=['TAG'])

x_val = val_df.copy(False)
y_val = pd.DataFrame(x_val.pop('TAG'), columns=['TAG'])

x_train, y_val

(                     MELT_TEMP  MOTORSPEED
 2020-03-15 00:00:00      458.0       138.0
 2020-03-15 00:00:06      442.0        70.0
 2020-03-15 00:00:12      474.0       170.0
 2020-03-15 00:00:18      378.0       202.0
 2020-03-15 00:00:24      718.0      1732.0
 ...                        ...         ...
 2020-03-31 23:59:30      755.0      1732.0
 2020-03-31 23:59:36      420.0       213.0
 2020-03-31 23:59:42      458.0       161.0
 2020-03-31 23:59:48      421.0         0.0
 2020-03-31 23:59:54      514.0       141.0
 
 [244800 rows x 2 columns],
                        TAG
 2020-04-01 00:00:00  False
 2020-04-01 00:00:06   True
 2020-04-01 00:00:12  False
 2020-04-01 00:00:18   True
 2020-04-01 00:00:24  False
 ...                    ...
 2020-04-06 23:59:30  False
 2020-04-06 23:59:36  False
 2020-04-06 23:59:42  False
 2020-04-06 23:59:48  False
 2020-04-06 23:59:54  False
 
 [86400 rows x 1 columns])

In [125]:
from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler()
scaler.fit(existing_x_df)

train_scaled = scaler.transform(x_train)
val_scaled = scaler.transform(x_val)

val_scaled.shape

(86400, 2)

In [126]:
from imblearn.over_sampling import SMOTE

smote = SMOTE(random_state=0)

train_scaled, _y_train = smote.fit_resample(
    X=train_scaled,
    y=y_train.values.squeeze()
)

print(y_train.shape[0], _y_train.shape[0])

y_train = _y_train

244800 259758


In [127]:
x_train = pd.DataFrame(
    train_scaled,
    columns=use_cols
)

x_val = pd.DataFrame(
    val_scaled,
    columns=use_cols
)

In [128]:
from joblib import load, dump

folder_root = './public/models/tree_detection'

lgbm = load(f'{folder_root}/lgbm.pkl')
cat = load(f'{folder_root}/cat.pkl')

In [129]:
lgbm.fit(x_train, y_train, eval_set=(x_val, y_val), verbose=True, early_stopping_rounds=3)

[1]	valid_0's binary_logloss: 0.660673
[2]	valid_0's binary_logloss: 0.631922
[3]	valid_0's binary_logloss: 0.606349
[4]	valid_0's binary_logloss: 0.58347
[5]	valid_0's binary_logloss: 0.562946
[6]	valid_0's binary_logloss: 0.544482
[7]	valid_0's binary_logloss: 0.527831
[8]	valid_0's binary_logloss: 0.512751
[9]	valid_0's binary_logloss: 0.499084
[10]	valid_0's binary_logloss: 0.486693
[11]	valid_0's binary_logloss: 0.475411
[12]	valid_0's binary_logloss: 0.46516
[13]	valid_0's binary_logloss: 0.455808
[14]	valid_0's binary_logloss: 0.447264
[15]	valid_0's binary_logloss: 0.439481
[16]	valid_0's binary_logloss: 0.43237
[17]	valid_0's binary_logloss: 0.42585
[18]	valid_0's binary_logloss: 0.419865
[19]	valid_0's binary_logloss: 0.414396
[20]	valid_0's binary_logloss: 0.409388
[21]	valid_0's binary_logloss: 0.4048
[22]	valid_0's binary_logloss: 0.400597
[23]	valid_0's binary_logloss: 0.396721
[24]	valid_0's binary_logloss: 0.393173
[25]	valid_0's binary_logloss: 0.389919
[26]	valid_0's 

In [130]:
train_pool = Pool(x_train, y_train)
val_pool = Pool(x_val, y_val)

cat.fit(train_pool, eval_set=val_pool, verbose=True, early_stopping_rounds=3)

Default metric period is 5 because AUC is/are not implemented for GPU


0:	learn: 0.6891970	test: 0.6882204	best: 0.6882204 (0)	total: 16.5ms	remaining: 1.63s
1:	total: 22.3ms	remaining: 1.09s
2:	total: 28ms	remaining: 906ms
3:	total: 33.8ms	remaining: 812ms
4:	total: 39.9ms	remaining: 758ms
5:	learn: 0.6705758	test: 0.6648842	best: 0.6648842 (5)	total: 48.3ms	remaining: 756ms
6:	total: 54.2ms	remaining: 720ms
7:	total: 60.4ms	remaining: 695ms
8:	total: 66.5ms	remaining: 672ms
9:	total: 73.3ms	remaining: 659ms
10:	learn: 0.6536933	test: 0.6435711	best: 0.6435711 (10)	total: 81.6ms	remaining: 660ms
11:	total: 87.4ms	remaining: 641ms
12:	total: 93.4ms	remaining: 625ms
13:	total: 99.6ms	remaining: 612ms
14:	total: 105ms	remaining: 598ms
15:	learn: 0.6383238	test: 0.6239570	best: 0.6239570 (15)	total: 114ms	remaining: 598ms
16:	total: 120ms	remaining: 586ms
17:	total: 126ms	remaining: 574ms
18:	total: 132ms	remaining: 562ms
19:	total: 138ms	remaining: 552ms
20:	learn: 0.6242845	test: 0.6058818	best: 0.6058818 (20)	total: 146ms	remaining: 550ms
21:	total: 152ms

<catboost.core.CatBoostClassifier at 0x7f93f8698070>

In [131]:
lgbm_res = lgbm.predict(x_val)
cat_res = cat.predict(data=x_val)

lgbm_pred_df = pd.DataFrame(lgbm_res.astype(int) == 1, columns=['TAG'])
cat_pred_df = pd.DataFrame((cat_res == 'True').astype(int), columns=['TAG'])

In [132]:
lgbm_pred_df

Unnamed: 0,TAG
0,True
1,True
2,False
3,True
4,False
...,...
86395,False
86396,True
86397,False
86398,False


In [133]:
p = precision_score(y_val, lgbm_pred_df)
r = recall_score(y_val, lgbm_pred_df)
f1 = f1_score(y_val, lgbm_pred_df)
acc = accuracy_score(y_val, lgbm_pred_df)

In [134]:
'%0.4f' % p, '%0.4f' % r, '%0.4f' % f1, '%0.4f' % acc

('0.8776', '0.9874', '0.9293', '0.9389')

In [135]:
p = precision_score(y_val, cat_pred_df)
r = recall_score(y_val, cat_pred_df)
f1 = f1_score(y_val, cat_pred_df)
acc = accuracy_score(y_val, cat_pred_df)

In [136]:
'%0.4f' % p, '%0.4f' % r, '%0.4f' % f1, '%0.4f' % acc

('0.8753', '0.9869', '0.9278', '0.9375')

In [137]:
from joblib import dump

folder_root = './public/models/tree_detection'

dump(lgbm, f'{folder_root}/lgbm_fine_tuned.pkl')
dump(cat, f'{folder_root}/cat_fine_tuned.pkl')

['./public/models/tree_detection/cat_fine_tuned.pkl']