In [129]:
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier, Pool, cv
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import auc, roc_auc_score, roc_curve 
from sklearn.metrics import f1_score, confusion_matrix, precision_score
from sklearn.metrics import recall_score, f1_score, accuracy_score

import pandas as pd
import numpy as np
import scipy.stats.distributions as dists

import warnings
import matplotlib.pyplot as plt
warnings.filterwarnings('ignore')

%matplotlib inline
warnings.filterwarnings("ignore")
plt.style.use('fivethirtyeight')

In [130]:
# Names of the feature columns.
use_cols = ['MELT_TEMP', 'MOTORSPEED']

df = pd.read_csv('./public/data/raw_data.csv')

# Turn the label into a boolean: True for rows whose TAG is 'NG'.
df['TAG'] = df['TAG'].eq('NG')

# Replace the index with a 6-second timestamp grid; [:-1] drops the end
# point so the grid stops just before 2020-05-01.
df.index = pd.date_range(start='3/4/2020', end='5/1/2020', freq='6S')[:-1]

# Remove the columns that are neither features nor the label.
df = df.drop(columns=['STD_DT', 'NUM', 'MELT_WEIGHT', 'INSP'])

# Cast everything to float32, then restore the label column to bool.
df = df.astype(np.float32).astype({'TAG': bool})

df.info()
df.describe()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 835200 entries, 2020-03-04 00:00:00 to 2020-04-30 23:59:54
Freq: 6S
Data columns (total 3 columns):
 #   Column      Non-Null Count   Dtype  
---  ------      --------------   -----  
 0   MELT_TEMP   835200 non-null  float32
 1   MOTORSPEED  835200 non-null  float32
 2   TAG         835200 non-null  bool   
dtypes: bool(1), float32(2)
memory usage: 13.5 MB


Unnamed: 0,MELT_TEMP,MOTORSPEED
count,835200.0,835200.0
mean,509.200714,459.78302
std,128.277512,639.436401
min,308.0,0.0
25%,430.0,119.0
50%,469.0,168.0
75%,502.0,218.0
max,832.0,1804.0


In [200]:
months, days = df.index.month, df.index.day

# Training window: March after the 18th, plus April before the 20th.
late_march = (months == 3) & (days > 18)
early_april = (months == 4) & (days < 20)
train_index = late_march | early_april

# Validation window: April 20th through 23rd.
val_index = (months == 4) & (days >= 20) & (days < 24)

# The scaler can only be derived from the data the existing model was trained on.
# When fine-tuning, that existing scaler is reused.
existing_df = df[(months == 3) & (days < 25)]

train_df = df[train_index]
val_df = df[val_index]

train_df, val_df

(                     MELT_TEMP  MOTORSPEED    TAG
 2020-03-19 00:00:00      482.0       121.0  False
 2020-03-19 00:00:06      458.0        66.0  False
 2020-03-19 00:00:12      514.0       163.0  False
 2020-03-19 00:00:18      392.0       180.0   True
 2020-03-19 00:00:24      767.0      1727.0  False
 ...                        ...         ...    ...
 2020-04-19 23:59:30      751.0      1720.0  False
 2020-04-19 23:59:36      418.0       221.0  False
 2020-04-19 23:59:42      481.0       174.0  False
 2020-04-19 23:59:48      457.0        74.0  False
 2020-04-19 23:59:54      505.0       127.0  False
 
 [460800 rows x 3 columns],
                      MELT_TEMP  MOTORSPEED    TAG
 2020-04-20 00:00:00      474.0       136.0  False
 2020-04-20 00:00:06      447.0         0.0  False
 2020-04-20 00:00:12      515.0       194.0  False
 2020-04-20 00:00:18      403.0       245.0  False
 2020-04-20 00:00:24      774.0      1730.0  False
 ...                        ...         ...    ...
 

In [201]:
def _split_features_target(frame, target='TAG'):
    """Split `frame` into (features, target).

    The features are every column except `target`; the target is returned
    as a one-column DataFrame so it keeps its column name and index.
    """
    features = frame.loc[:, frame.columns != target]
    return features, frame[[target]]


existing_x_df, existing_y_df = _split_features_target(existing_df)
x_train, y_train = _split_features_target(train_df)
x_val, y_val = _split_features_target(val_df)

x_train, y_val

(                     MELT_TEMP  MOTORSPEED
 2020-03-19 00:00:00      482.0       121.0
 2020-03-19 00:00:06      458.0        66.0
 2020-03-19 00:00:12      514.0       163.0
 2020-03-19 00:00:18      392.0       180.0
 2020-03-19 00:00:24      767.0      1727.0
 ...                        ...         ...
 2020-04-19 23:59:30      751.0      1720.0
 2020-04-19 23:59:36      418.0       221.0
 2020-04-19 23:59:42      481.0       174.0
 2020-04-19 23:59:48      457.0        74.0
 2020-04-19 23:59:54      505.0       127.0
 
 [460800 rows x 2 columns],
                        TAG
 2020-04-20 00:00:00  False
 2020-04-20 00:00:06  False
 2020-04-20 00:00:12  False
 2020-04-20 00:00:18  False
 2020-04-20 00:00:24  False
 ...                    ...
 2020-04-23 23:59:30  False
 2020-04-23 23:59:36  False
 2020-04-23 23:59:42  False
 2020-04-23 23:59:48  False
 2020-04-23 23:59:54  False
 
 [57600 rows x 1 columns])

In [202]:
from sklearn.preprocessing import MinMaxScaler

# The scaler is fitted on the pre-existing data only and then applied,
# unchanged, to the training and validation features.
scaler = MinMaxScaler().fit(existing_x_df)

train_scaled, val_scaled = (scaler.transform(part) for part in (x_train, x_val))

val_scaled.shape

(57600, 2)

In [203]:
from imblearn.over_sampling import SMOTE

smote = SMOTE(random_state=0)

# The resampler receives the labels as a squeezed array rather than a DataFrame.
labels_1d = y_train.values.squeeze()
n_rows_before = y_train.shape[0]

# NOTE: train_scaled and y_train are rebound below, so this cell is not
# idempotent - re-run the split and scaling cells before running it again.
train_scaled, _y_train = smote.fit_resample(X=train_scaled, y=labels_1d)

# Row count before vs. after oversampling.
print(n_rows_before, _y_train.shape[0])

y_train = _y_train

460800 577670


In [204]:
# Wrap the scaled arrays back into DataFrames labelled with the feature names.
x_train, x_val = (
    pd.DataFrame(scaled, columns=use_cols)
    for scaled in (train_scaled, val_scaled)
)

In [205]:
from joblib import load, dump

folder_root = './public/models/tree_detection'

# Restore the two previously saved estimators from disk.
# NOTE(review): joblib files are pickles - only load files from a trusted source.
lgbm_path = f'{folder_root}/lgbm.pkl'
cat_path = f'{folder_root}/cat.pkl'

lgbm = load(lgbm_path)
cat = load(cat_path)

In [209]:
# Hyper-parameter search space for the LightGBM model.
# scipy's uniform(loc, scale) samples from [loc, loc + scale]: the second
# argument is the WIDTH of the interval, not its upper bound. The widths
# below are written as (high - low) so the sampled ranges are exactly the
# ones named in the trailing comments.
pars = {
    "learning_rate": dists.uniform(0.01, 0.1 - 0.01),  # [0.01, 0.1]
    "boosting_type": ['gbdt'],
    "reg_alpha": dists.uniform(0.1, 1. - 0.1),  # [0.1, 1.0]
    "reg_lambda": dists.uniform(0.1, 1. - 0.1),  # [0.1, 1.0]
    "random_state": [0],
}

# 5 sampled candidates x 5 folds, then a refit of the best candidate.
# NOTE(review): the search result is stored in lgbm_rcv_; the name `lgbm`
# is not rebound by this cell, so later cells that call `lgbm` directly do
# not use the tuned parameters - confirm that is intended.
lgbm_rcv_ = RandomizedSearchCV(lgbm, param_distributions=pars, n_iter=5, cv=5, refit=True, random_state=0)
lgbm_rcv_.fit(x_train, y_train)

KeyboardInterrupt: 

In [206]:
# Wrap the training and validation splits in CatBoost pools.
train_pool = Pool(data=x_train, label=y_train)
val_pool = Pool(data=x_val, label=y_val)

# Fit the loaded CatBoost model, evaluating on the validation pool and
# using an early-stopping patience of 10 rounds.
cat.fit(
    train_pool,
    eval_set=val_pool,
    early_stopping_rounds=10,
    verbose=True,
)

[1]	valid_0's binary_logloss: 0.687126
[2]	valid_0's binary_logloss: 0.683214
[3]	valid_0's binary_logloss: 0.681016
[4]	valid_0's binary_logloss: 0.680208
[5]	valid_0's binary_logloss: 0.680561
[6]	valid_0's binary_logloss: 0.681844
[7]	valid_0's binary_logloss: 0.68392
[8]	valid_0's binary_logloss: 0.686618
[9]	valid_0's binary_logloss: 0.689838
[10]	valid_0's binary_logloss: 0.693476
[11]	valid_0's binary_logloss: 0.69745
[12]	valid_0's binary_logloss: 0.701689
[13]	valid_0's binary_logloss: 0.706128
[14]	valid_0's binary_logloss: 0.710717


Default metric period is 5 because AUC is/are not implemented for GPU


0:	learn: 0.6890744	test: 0.6921379	best: 0.6921379 (0)	total: 14.9ms	remaining: 1.47s
1:	total: 20.7ms	remaining: 1.01s
2:	total: 26.3ms	remaining: 849ms
3:	total: 32ms	remaining: 768ms
4:	total: 37.7ms	remaining: 717ms
5:	learn: 0.6698759	test: 0.6878422	best: 0.6878422 (5)	total: 45.8ms	remaining: 717ms
6:	total: 51.5ms	remaining: 685ms
7:	total: 57.3ms	remaining: 659ms
8:	total: 63.1ms	remaining: 638ms
9:	total: 68.7ms	remaining: 619ms
10:	learn: 0.6524640	test: 0.6846615	best: 0.6846615 (10)	total: 76.7ms	remaining: 621ms
11:	total: 82.4ms	remaining: 604ms
12:	total: 88.3ms	remaining: 591ms
13:	total: 94.2ms	remaining: 578ms
14:	total: 100ms	remaining: 567ms
15:	learn: 0.6365801	test: 0.6824365	best: 0.6824365 (15)	total: 108ms	remaining: 567ms
16:	total: 114ms	remaining: 555ms
17:	total: 120ms	remaining: 544ms
18:	total: 125ms	remaining: 535ms
19:	total: 134ms	remaining: 536ms
20:	learn: 0.6221015	test: 0.6810307	best: 0.6810307 (20)	total: 142ms	remaining: 534ms
21:	total: 148ms

<catboost.core.CatBoostClassifier at 0x7f4948185fa0>

In [161]:
def _to_bool_labels(pred):
    """Normalise classifier output to a flat boolean array.

    Accepts labels given as strings ('True'/'False', '1'/'0'), booleans or
    numbers, in either (n,) or (n, 1) layout, so the same conversion works
    whichever label type the model's predict() returns.
    """
    flat = np.asarray(pred).reshape(-1)
    if flat.dtype.kind in 'OUS':
        # String / object labels: compare on their text form.
        text = flat.astype(str)
        return (text == 'True') | (text == '1')
    # Boolean / numeric labels: non-zero means positive.
    return flat.astype(bool)


lgbm_res = lgbm.predict(x_val)
cat_res = cat.predict(data=x_val)

# Both prediction frames share the same layout: one boolean 'TAG' column.
lgbm_pred_df = pd.DataFrame(_to_bool_labels(lgbm_res), columns=['TAG'])
cat_pred_df = pd.DataFrame(_to_bool_labels(cat_res), columns=['TAG'])

In [162]:
# Show the LightGBM prediction frame as this cell's output.
lgbm_pred_df

Unnamed: 0,TAG
0,False
1,False
2,False
3,False
4,False
...,...
100795,False
100796,False
100797,False
100798,False


In [163]:
# Precision, recall, F1 and accuracy of the LightGBM predictions on the
# validation labels.
p, r, f1, acc = (
    metric(y_val, lgbm_pred_df)
    for metric in (precision_score, recall_score, f1_score, accuracy_score)
)

In [164]:
# Show the four scores formatted to four decimal places.
tuple('%0.4f' % score for score in (p, r, f1, acc))

('0.0000', '0.0000', '0.0000', '1.0000')

In [165]:
# Precision, recall, F1 and accuracy of the CatBoost predictions on the
# validation labels.
p, r, f1, acc = (
    metric(y_val, cat_pred_df)
    for metric in (precision_score, recall_score, f1_score, accuracy_score)
)

In [166]:
# Show the four scores formatted to four decimal places.
tuple('%0.4f' % score for score in (p, r, f1, acc))

('0.0000', '0.0000', '0.0000', '0.7571')