In [None]:
from darts import TimeSeries
from datetime import datetime
from matplotlib.pylab import rcParams
from tqdm import tqdm_notebook as tqdm
from imblearn.over_sampling import SMOTE
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import RandomizedSearchCV
from darts.dataprocessing.transformers import Scaler
from lightgbm import LGBMClassifier
from torch.utils.data import DataLoader, TensorDataset
from catboost import CatBoostClassifier, Pool
from sklearn.metrics import auc, roc_auc_score, roc_curve 
from sklearn.metrics import f1_score, confusion_matrix, precision_score
from sklearn.metrics import recall_score, f1_score, accuracy_score

import os
import torch
import shutil
import warnings
import itertools
import matplotlib
import numpy as np
import pandas as pd
import torch.nn as nn
import torch.optim as optim
from darts.metrics import mape
import matplotlib.pyplot as plt


%matplotlib inline
warnings.filterwarnings("ignore")
plt.style.use('fivethirtyeight')
rcParams['figure.figsize'] = 15, 5

train_mode = False

# 1. Data 처리

## 1-1. Data Load

In [None]:
# Feature columns consumed by the anomaly detectors.
use_cols = ['MELT_TEMP', 'MOTORSPEED']

df_ = pd.read_csv('./public/data/raw_data.csv')

# Encode the label as a boolean: True where TAG equals 'NG'.
df_ = df_.assign(TAG=df_['TAG'].eq('NG'))

# Replace the row index with a 6-second timestamp grid (the end point itself is dropped).
df_.index = pd.date_range(start='3/4/2020', end='5/1/2020', freq='6S')[:-1]

# Remove the unused columns, store everything as float32, then restore TAG as bool.
df_ = df_.drop(columns=['STD_DT', 'NUM', 'MELT_WEIGHT', 'INSP']).astype(np.float32)
df_['TAG'] = df_['TAG'].astype(bool)

df_.describe()

In [None]:
# Work on a shallow copy (deep=False) of the loaded frame; the column pops below act on `df`.
df = df_.copy(False)

# First day of March that belongs to the validation split.
val_start_day = 25

# Boolean masks over the timestamp index: March days before `val_start_day` are training rows,
# the remaining March days are validation rows. Rows outside March match neither mask.
train_index = (df.index.month == 3) & (df.index.day < val_start_day)
val_index = (df.index.month == 3) & (df.index.day >= val_start_day)

train_df = df[train_index]
val_df = df[val_index]
total_df = df

# Each `x_*` name is bound to the same object as its `*_df`, so `pop` removes TAG from
# that frame in place and the popped column becomes a one-column label frame.
x_train = train_df
y_train = pd.DataFrame(x_train.pop('TAG'), columns=['TAG'])

x_val = val_df
y_val = pd.DataFrame(x_val.pop('TAG'), columns=['TAG'])

# NOTE: `total_df` is `df` itself, so after this pop both names refer to a frame without TAG.
x_total = total_df
y_total = pd.DataFrame(x_total.pop('TAG'), columns=['TAG'])

# Convert every split to a darts TimeSeries on a 6-second frequency.
# fill_missing_dates=True asks darts to add timestamps missing from that grid
# (presumably as NaN rows -- confirm against the darts version in use).
x_train = TimeSeries.from_dataframe(x_train, freq='6S', fill_missing_dates=True)
y_train = TimeSeries.from_dataframe(y_train, freq='6S', fill_missing_dates=True)
x_val = TimeSeries.from_dataframe(x_val, freq='6S', fill_missing_dates=True)
y_val = TimeSeries.from_dataframe(y_val, freq='6S', fill_missing_dates=True)
x_total = TimeSeries.from_dataframe(x_total, freq='6S', fill_missing_dates=True)
y_total = TimeSeries.from_dataframe(y_total, freq='6S', fill_missing_dates=True)

# Show the number of time steps in each series.
len(x_train), len(x_val), len(x_total)

## 1-2. Data Scaling

In [None]:
# Min-max scaling is fitted on the training series only and then reused for the other
# two, so validation and full-range data never influence the scaling parameters.
scaler = Scaler(scaler=MinMaxScaler())

x_train = scaler.fit_transform(x_train).astype(np.float32)
x_val, x_total = (
    scaler.transform(series).astype(np.float32)
    for series in (x_val, x_total)
)

x_train

# 2. 시계열 예측 모델

## 2-1. 모델 구성

In [None]:
from darts.models import TransformerModel
from pytorch_lightning.callbacks.early_stopping import EarlyStopping

# Number of past time steps the forecaster receives for each prediction.
forecast_window_size = 30

# Stop training once `val_loss` has not improved by at least 0.001 for 3 validation checks.
my_stopper = EarlyStopping(
    monitor="val_loss",
    patience=3,
    min_delta=0.001,
    mode='min',
)

model_name = 'transformer'
epoch = 10

forecast_model = TransformerModel(
    input_chunk_length=forecast_window_size,
    output_chunk_length=1,
    batch_size=512,
    dropout=0.15,
    d_model=16,
    nhead=4,
    num_encoder_layers=2,
    num_decoder_layers=2,
    dim_feedforward=16,
    model_name=model_name,
    activation="relu",
    random_state=42,
    pl_trainer_kwargs={
        # Training is pinned to the first GPU.
        "accelerator": "gpu",
        "devices": [0],
        # Register the early-stopping callback; it was previously created but never
        # handed to the trainer, so it had no effect on training.
        "callbacks": [my_stopper],
    }
)

## 2-2. 모델 학습

In [None]:
# Restore the saved forecaster unless training was explicitly requested.
if not train_mode:
    forecast_model = forecast_model.load(
        f'./public/models/forecasting/{forecast_model.model_name}.pt'
    )
else:
    # Train on the March training range and validate on the held-out March days.
    forecast_model.fit(series=x_train, val_series=x_val, epochs=epoch, verbose=True)

## 2-3. 4월 첫째 주 데이터 추론

In [None]:
# Sample counts at the 6-second frequency: 10 per minute, scaled up to one week.
min_size = 10
hour_size = 60 * min_size
day_size = 24 * hour_size
week_size = 7 * day_size

# History handed to the forecaster: the span covered by the train and validation series.
history = x_total[:len(x_train) + len(x_val)]

pred = forecast_model.predict(series=history, n=week_size, verbose=False)

pred

In [None]:
forecasted_df = pred.pd_dataframe()

# Labels on the forecast timestamps, reshaped to an integer column vector.
observed_tags = y_total[pred.time_index].values()
ground_truth = observed_tags.reshape(-1, 1).astype(int)

forecasted_df['GT'] = ground_truth

forecasted_df

# 3. LightGBM + CatBoost 모델

## 3-1. SMOTE를 활용한 데이터 증강

In [None]:
# Oversample the training rows with SMOTE using a fixed seed.
smote = SMOTE(random_state=0)

x_train_over, y_train_over = smote.fit_resample(x_train.values(), y_train.values())

# Row counts before and after resampling.
len(x_train), len(x_train_over)

In [None]:
# Wrap the resampled arrays in DataFrames with named columns.
x_train_over_df = pd.DataFrame(data=x_train_over, columns=use_cols)
y_train_over_df = pd.DataFrame(data=y_train_over, columns=['TAG'])

# Validation data as DataFrames, taken straight from the darts series.
x_val_df, y_val_df = x_val.pd_dataframe(), y_val.pd_dataframe()

x_train_over_df

## 3-2. LGBM + CatBoost 학습

In [None]:
# Search space for LightGBM; every hyper-parameter lists a single candidate value.
pars = dict(
    learning_rate=[0.06680445610939323],
    boosting_type=['gbdt'],
    reg_alpha=[1.0255966382926611],
    reg_lambda=[0.17103605819788695],
    random_state=[0],
)

lgbm_clf = LGBMClassifier(random_state=0)

# 5-fold randomized search that refits the best configuration on all supplied rows.
lgbm_rcv_ = RandomizedSearchCV(
    estimator=lgbm_clf,
    param_distributions=pars,
    n_iter=15,
    cv=5,
    refit=True,
    random_state=0,
)
lgbm_rcv_.fit(x_train_over_df, y_train_over_df)

lgbm = lgbm_rcv_.best_estimator_

In [None]:
# CatBoost configuration (note: this rebinds `pars`, replacing the LightGBM search space).
pars = dict(
    iterations=100,
    random_seed=0,
    learning_rate=0.01,
    loss_function='Logloss',
    custom_metric=['Logloss', 'AUC'],
    early_stopping_rounds=20,
    bagging_temperature=1,
    verbose=False,
)

cat = CatBoostClassifier(**pars)

# Train on the oversampled features and labels wrapped in a CatBoost Pool.
train_pool = Pool(x_train_over_df, y_train_over_df)
cat.fit(train_pool)

cat

## 3-3. 예측된 4월 1일 ~ 4월 7일 데이터에 대한 이상탐지

In [None]:
# Forecasted sensor values are the inputs of both tree-based detectors.
forecated_input_df = forecasted_df[['MELT_TEMP', 'MOTORSPEED']]

lgbm_pred = lgbm.predict(forecated_input_df).astype(int)
cat_pred = cat.predict(forecated_input_df)

# Store both predictions as integer columns next to the ground truth.
forecasted_df['LGBM_PRED'] = lgbm_pred
forecasted_df['CAT_PRED'] = cat_pred.astype(int)

forecasted_df

# 4. BI-LSTM 기반 이상탐지 모델

## 4-1. BI-LSTM 모델 구성

In [None]:
# Hidden units per direction of the LSTM inside MockUpModel.
h_size = 8

# Number of consecutive time steps per detection window; also the input width of
# MockUpModel's `linear2` layer.
detection_window_size = 10

def make_dataset(data, label, window_size=10):
    """Cut a frame into sliding windows paired with the label that follows each window.

    Window ``k`` holds rows ``k .. k + window_size - 1`` of ``data`` and is paired
    with row ``k + window_size`` of ``label``.

    Args:
        data: pandas DataFrame of features, one row per time step.
        label: pandas object indexed positionally in step with ``data``.
        window_size: number of consecutive rows per window.

    Returns:
        Tuple ``(features, labels)`` of NumPy arrays holding
        ``len(data) - window_size`` windows; both are empty when ``data`` has
        no more than ``window_size`` rows.
    """
    window_starts = range(len(data) - window_size)
    windows = [np.array(data.iloc[start:start + window_size, :]) for start in window_starts]
    targets = [np.array(label.iloc[start + window_size]) for start in window_starts]
    return np.array(windows), np.array(targets)

class MockUpModel(nn.Module):
    """Bidirectional-LSTM binary classifier over fixed-length windows.

    Data flow for an input of shape ``(batch, window_size, input_size)``:
    the LSTM yields ``(batch, window_size, 2 * hidden_size)``, ``linear1`` maps
    every time step to one score, ``linear2`` combines the ``window_size``
    scores into a single value, and the sigmoid turns it into a probability.

    Args:
        input_size: number of features per time step.
        hidden_size: LSTM hidden units per direction. ``None`` uses the
            module-level ``h_size``.
        window_size: number of time steps per window. ``None`` uses the
            module-level ``detection_window_size``.
        dropout: value forwarded to ``nn.LSTM``.

    Calling ``MockUpModel()`` with no arguments builds the same network as before,
    with identical parameter names in its ``state_dict``.
    """

    def __init__(self, input_size=2, hidden_size=None, window_size=None, dropout=0.15):
        super().__init__()
        # Resolve the defaults lazily so the notebook-level constants stay the single
        # source of truth when no explicit value is given.
        if hidden_size is None:
            hidden_size = h_size
        if window_size is None:
            window_size = detection_window_size

        self.model = nn.ModuleDict({
            'lstm': nn.LSTM(
                input_size=input_size,
                hidden_size=hidden_size,
                # PyTorch applies LSTM dropout only between stacked layers, so with
                # the single layer used here it does not alter the outputs.
                dropout=dropout,
                num_layers=1,
                batch_first=True,
                bidirectional=True
            ),
            # Both directions are concatenated, hence 2 * hidden_size inputs.
            'linear1': nn.Linear(in_features=hidden_size * 2, out_features=1),
            'linear2': nn.Linear(in_features=window_size, out_features=1),
            'sigmoid': nn.Sigmoid()
        })

    def forward(self, x):
        """Return a ``(batch, 1)`` tensor of probabilities for a batch of windows."""
        out, _ = self.model['lstm'](x)
        out = self.model['linear1'](out)
        # Drop the trailing singleton axis: (batch, window, 1) -> (batch, window).
        out = out[:, :, -1]
        out = self.model['linear2'](out)
        out = self.model['sigmoid'](out)
        return out

## 4-2. 입력 윈도우 및 데이터 로더 구성

In [None]:
# Pass the window length explicitly instead of relying on make_dataset's default:
# it must equal `detection_window_size`, which fixes the input width of the
# network's `linear2` layer.
# NOTE(review): the training frame comes from SMOTE, so its rows are presumably not a
# contiguous time sequence -- confirm that windowing over it is intended.
x_train_window, y_train_window = make_dataset(
    x_train_over_df,
    y_train_over_df,
    detection_window_size
)

x_val_window, y_val_window = make_dataset(
    x_val_df,
    y_val_df,
    detection_window_size
)

In [None]:
train_bs = 256
val_bs = 1024

# Features and labels live in separate loaders; shuffling stays off everywhere so the
# batches of a features/labels pair remain aligned when the loaders are zipped.
x_train_dataloader = DataLoader(torch.FloatTensor(x_train_window), batch_size=train_bs, shuffle=False)
y_train_dataloader = DataLoader(torch.FloatTensor(y_train_window), batch_size=train_bs, shuffle=False)

x_val_dataloader = DataLoader(torch.FloatTensor(x_val_window), batch_size=val_bs, shuffle=False)
y_val_dataloader = DataLoader(torch.FloatTensor(y_val_window), batch_size=val_bs, shuffle=False)

## 4-3. 모델 학습

In [None]:
# Use the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Seed torch before the networks are created so their initial weights are repeatable,
# in line with the fixed seeds used for the other models in this notebook.
torch.manual_seed(0)

# Lowest validation loss seen so far (starts at a large sentinel value).
min_valid = 1e9

# `best_model` keeps a copy of the best weights; `lstm_model` is the network being trained.
best_model = MockUpModel().to(device)
lstm_model = MockUpModel().to(device)

lr = 1e-3
optimizer = torch.optim.Adam(lstm_model.parameters(), lr=lr)

# Binary cross-entropy on the probabilities produced by the model's sigmoid.
loss_fn = nn.BCELoss()

In [None]:
n_epochs = 20

for epoch_idx in range(n_epochs):
    # ---- training pass ----
    lstm_model.train()
    batch_loss = 0.
    for x, y in tqdm(zip(x_train_dataloader, y_train_dataloader)):
        optimizer.zero_grad()
        out = lstm_model(x.to(device))
        loss = loss_fn(out, y.to(device))
        loss.backward()
        optimizer.step()
        batch_loss += loss.item()

    # ---- validation pass (no gradients) ----
    lstm_model.eval()
    valid_loss = 0.
    with torch.no_grad():
        for x, y in zip(x_val_dataloader, y_val_dataloader):
            valid_loss += loss_fn(lstm_model(x.to(device)), y.to(device)).item()

    # Average the summed losses over the number of batches.
    batch_loss = batch_loss / len(x_train_dataloader)
    valid_loss = valid_loss / len(x_val_dataloader)

    # Keep a copy of the weights whenever validation loss matches or beats the best so far.
    if valid_loss <= min_valid:
        min_valid = valid_loss
        best_model.load_state_dict(lstm_model.state_dict())
    print(f'{epoch_idx}: loss: {batch_loss}, valid: {valid_loss}')

# Continue with the best checkpoint rather than the final-epoch weights.
lstm_model = best_model

## 4-4. 예측된 4월 1일 ~ 4월 7일 값에 대한 이상탐지 수행

In [None]:
# Number of rows that precede the forecast (train + validation span).
seened_index = len(x_train) + len(x_val)

# The last `detection_window_size` real observations before the forecast supply the
# context for the first windows. The length comes from the shared constant (it was a
# literal 10) so it cannot drift from the window size used by make_dataset.
past_data_for_window = x_total[seened_index - detection_window_size : seened_index].values()
past_y_data_for_window = y_total[seened_index - detection_window_size : seened_index].values()

forecasted_data = forecasted_df.loc[:, ['MELT_TEMP', 'MOTORSPEED']].values
forecasted_y_data = forecasted_df.loc[:, ['GT']].values

# Real context rows followed by the forecasted rows.
input_x_data = np.concatenate([past_data_for_window, forecasted_data])
input_y_data = np.concatenate([past_y_data_for_window, forecasted_y_data])

input_x_data, input_y_data = make_dataset(
    pd.DataFrame(input_x_data),
    pd.DataFrame(input_y_data),
    detection_window_size
)

# One window per forecasted row is required to fill the NN_PRED column later on.
assert len(input_x_data) == len(forecasted_df)

len(input_x_data) == len(input_y_data)

In [None]:
# Both inference loaders share the same batching options; order is preserved.
loader_opts = dict(batch_size=2048, shuffle=False)

x_dataloader = DataLoader(torch.FloatTensor(input_x_data), **loader_opts)
y_dataloader = DataLoader(torch.FloatTensor(input_y_data), **loader_opts)

In [None]:
# Switch to inference mode: the network restored from `best_model` was never put in
# eval mode after construction.
lstm_model.eval()

result = []

# Only the feature loader is needed for prediction; labels are not used here.
with torch.no_grad():
    for x in tqdm(x_dataloader):
        res = lstm_model(x.to(device)).cpu().numpy()
        # Threshold the probabilities at 0.5 and flatten (batch, 1) -> (batch,).
        result.append((res >= 0.5).astype(int).ravel())

# A flat 1-D vector with one 0/1 prediction per forecasted row.
forecasted_df['NN_PRED'] = np.concatenate(result)

# 5. 4월1일 ~ 4월7일 결과

In [None]:
# Average the three detector outputs and flag a row when the mean reaches 0.5
# (for 0/1 votes this means at least two detectors agree).
vote_total = forecasted_df['LGBM_PRED'] + forecasted_df['CAT_PRED'] + forecasted_df['NN_PRED']
forecasted_df['FINAL'] = vote_total.div(3).ge(0.5).astype(int)

pred_df = forecasted_df[['FINAL']]
real_df = forecasted_df[['GT']]

In [None]:
# Precision, recall, F1 and accuracy of the ensemble decision against the ground truth.
p, r, f1, acc = (
    metric(real_df, pred_df)
    for metric in (precision_score, recall_score, f1_score, accuracy_score)
)

In [None]:
# Display the four scores rounded to four decimals.
tuple(f'{score:0.4f}' for score in (p, r, f1, acc))

# 6. 4월8일 ~ 4월14일에 대한 동일한 진행
## 실제 시스템에는 MLOps가 필수적

In [None]:
# Sample counts at the 6-second frequency: 10 per minute, scaled up to one week.
min_size = 10
hour_size = 60 * min_size
day_size = 24 * hour_size
week_size = 7 * day_size

# Rows treated as already observed: all of March plus April 1-7.
ts_index = total_df.index
in_march = ts_index.month == 3
in_first_april_week = (ts_index.month == 4) & (ts_index.day <= 7)
idx = in_march | in_first_april_week
seened_index = len(total_df[idx])

pred = forecast_model.predict(series=x_total[:seened_index], n=week_size, verbose=False)

pred

In [None]:
forecasted_df = pred.pd_dataframe()

# Labels on the forecast timestamps, reshaped to an integer column vector.
observed_tags = y_total[pred.time_index].values()
ground_truth = observed_tags.reshape(-1, 1).astype(int)

forecasted_df['GT'] = ground_truth

forecasted_df

In [None]:
# Both tree-based detectors score the same forecasted sensor values.
week_features = forecasted_df[['MELT_TEMP', 'MOTORSPEED']]

lgbm_pred = lgbm.predict(week_features)
cat_pred = cat.predict(week_features)

# Store both predictions as integer columns.
forecasted_df['LGBM_PRED'] = lgbm_pred.astype(int)
forecasted_df['CAT_PRED'] = cat_pred.astype(int)

forecasted_df

In [None]:
# Number of rows that precede this forecast (March plus April 1-7). It is recomputed
# from the `idx` mask: the previous `seened_index = seened_idx` referenced a name that
# is never defined and raised NameError. Recomputing also keeps the cell correct if
# `seened_index` was rebound by re-running the week-1 cells.
seened_index = len(total_df[idx])

# The last `detection_window_size` real observations before the forecast supply the
# context for the first windows (shared constant instead of a literal 10).
past_data_for_window = x_total[seened_index - detection_window_size : seened_index].values()
past_y_data_for_window = y_total[seened_index - detection_window_size : seened_index].values()

forecasted_data = forecasted_df.loc[:, ['MELT_TEMP', 'MOTORSPEED']].values
forecasted_y_data = forecasted_df.loc[:, ['GT']].values

# Real context rows followed by the forecasted rows.
input_x_data = np.concatenate([past_data_for_window, forecasted_data])
input_y_data = np.concatenate([past_y_data_for_window, forecasted_y_data])

input_x_data, input_y_data = make_dataset(
    pd.DataFrame(input_x_data),
    pd.DataFrame(input_y_data),
    detection_window_size
)

# One window per forecasted row is required to fill the NN_PRED column later on.
assert len(input_x_data) == len(forecasted_df)

len(input_x_data) == len(input_y_data)

In [None]:
# Both inference loaders share the same batching options; order is preserved.
loader_opts = dict(batch_size=2048, shuffle=False)

x_dataloader = DataLoader(torch.FloatTensor(input_x_data), **loader_opts)
y_dataloader = DataLoader(torch.FloatTensor(input_y_data), **loader_opts)

In [None]:
# Switch to inference mode: the network restored from `best_model` was never put in
# eval mode after construction.
lstm_model.eval()

result = []

# Only the feature loader is needed for prediction; labels are not used here.
with torch.no_grad():
    for x in tqdm(x_dataloader):
        res = lstm_model(x.to(device)).cpu().numpy()
        # Threshold the probabilities at 0.5 and flatten (batch, 1) -> (batch,).
        result.append((res >= 0.5).astype(int).ravel())

# A flat 1-D vector with one 0/1 prediction per forecasted row.
forecasted_df['NN_PRED'] = np.concatenate(result)

In [None]:
# Average the three detector outputs and flag a row when the mean reaches 0.5
# (for 0/1 votes this means at least two detectors agree).
vote_total = forecasted_df['LGBM_PRED'] + forecasted_df['CAT_PRED'] + forecasted_df['NN_PRED']
forecasted_df['FINAL'] = vote_total.div(3).ge(0.5).astype(int)

pred_df = forecasted_df[['FINAL']]
real_df = forecasted_df[['GT']]

In [None]:
# Precision, recall, F1 and accuracy of the ensemble decision against the ground truth.
p, r, f1, acc = (
    metric(real_df, pred_df)
    for metric in (precision_score, recall_score, f1_score, accuracy_score)
)

In [None]:
# Display the four scores rounded to four decimals.
tuple(f'{score:0.4f}' for score in (p, r, f1, acc))