# Pipeline

In [1]:
from darts import TimeSeries
from datetime import datetime
from darts.dataprocessing.transformers import Scaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import auc, roc_auc_score, roc_curve 
from sklearn.metrics import f1_score, confusion_matrix, precision_score
from sklearn.metrics import recall_score, f1_score, accuracy_score

from dateutil.parser import parse
from torch.utils.data import DataLoader
from tqdm import tqdm_notebook as tqdm

import os
import torch
import shutil
import warnings
import itertools
import matplotlib
import numpy as np
import pandas as pd
import seaborn as sns
import torch.nn as nn
import torch.optim as optim
import statsmodels.api as sm
import matplotlib.pyplot as plt


# Render matplotlib figures inside the notebook output.
%matplotlib inline
# NOTE(review): called without a category, this hides every warning for the rest
# of the session, including deprecation and metric warnings from the cells below.
warnings.filterwarnings("ignore")
# Global plotting theme for all figures.
plt.style.use('fivethirtyeight')

## 1. Data Load

In [2]:
# Names of the two sensor columns kept as model inputs.
use_cols = ['MELT_TEMP', 'MOTORSPEED']

# Read the raw export, turn the string label into a boolean flag ('NG' -> True)
# and discard the columns that are not used afterwards.
df = (
    pd.read_csv('./public/data/raw_data.csv')
    .assign(TAG=lambda frame: frame['TAG'] == 'NG')
    .drop(columns=['STD_DT', 'NUM', 'MELT_WEIGHT', 'INSP'])
)

# Replace the positional index with a regular 6-second timeline; the trailing
# slice removes the end point itself. A length mismatch with the CSV raises here.
df.index = pd.date_range(start='3/4/2020', end='5/1/2020', freq='6S')[:-1]

# Store every column as float32, then restore TAG to boolean.
df = df.astype(np.float32).astype({'TAG': bool})

df.info()
df.describe()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 835200 entries, 2020-03-04 00:00:00 to 2020-04-30 23:59:54
Freq: 6S
Data columns (total 3 columns):
 #   Column      Non-Null Count   Dtype  
---  ------      --------------   -----  
 0   MELT_TEMP   835200 non-null  float32
 1   MOTORSPEED  835200 non-null  float32
 2   TAG         835200 non-null  bool   
dtypes: bool(1), float32(2)
memory usage: 13.5 MB


Unnamed: 0,MELT_TEMP,MOTORSPEED
count,835200.0,835200.0
mean,509.200714,459.78302
std,128.277512,639.436401
min,308.0,0.0
25%,430.0,119.0
50%,469.0,168.0
75%,502.0,218.0
max,832.0,1804.0


## 2. Data 준비

In [3]:
# All models were trained and evaluated on March data only.
# To check the importance of the pipeline, testing uses only one week of data
# close to the training data.
# NOTE(review): `unseened_index` below selects April 24 onward - confirm which
# week the sentence above refers to.

val_start_day = 25

# Calendar fields of every row, computed once and reused by all masks.
row_month = df.index.month
row_day = df.index.day

# Every March row.
seened_index = row_month == 3
# March rows before the validation start day.
trained_index = seened_index & (row_day < val_start_day)
# April rows from the 24th onward.
unseened_index = (row_month == 4) & (row_day >= 24)

total_df = df.copy(deep=False)

In [4]:
# Shallow copy of total_df; the label column is then removed from this copy.
x_total = total_df.copy(deep=False)

# Pull TAG out of the feature frame and keep it as a one-column label frame.
tag_column = x_total.pop('TAG')
y_total = tag_column.to_frame(name='TAG')

len(x_total)

835200

## 3. 시계열 예측 모델

### 3-1. Scaler 준비

In [5]:
def _to_series(frame):
    """Wrap a DataFrame as a darts TimeSeries on the 6-second grid."""
    return TimeSeries.from_dataframe(frame, freq='6S', fill_missing_dates=True)


train_x_series = _to_series(x_total.loc[trained_index, :])
total_x_series = _to_series(x_total)
total_y_series = _to_series(y_total)

# The min-max scaler is fitted on the training rows only and then applied to
# the full feature series.
series_scaler = Scaler(scaler=MinMaxScaler())
series_scaler.fit(train_x_series)
total_x_scaled = series_scaler.transform(total_x_series)

# Sanity check: scaled features and labels cover the same number of steps.
len(total_x_scaled) == len(total_y_series)

True

### 3-2. 모델 준비

In [6]:
from darts.models import TransformerModel

model_name = 'transformer'

# Number of past steps the forecaster reads for each prediction.
forecast_window_size = 30

# Hyper-parameters of the forecaster, collected in one place.
transformer_config = dict(
    input_chunk_length=forecast_window_size,
    output_chunk_length=1,
    batch_size=512,
    dropout=0.1,
    d_model=16,
    nhead=4,
    num_encoder_layers=2,
    num_decoder_layers=2,
    dim_feedforward=16,
    model_name=model_name,
    activation="relu",
    random_state=42,
    # NOTE(review): pins execution to GPU index 4 - adjust for other hosts.
    pl_trainer_kwargs={
        "accelerator": "gpu",
        "devices": [4]
    },
)

model = TransformerModel(**transformer_config)

# The freshly built instance is replaced by whatever `load` returns.
checkpoint_path = f'./public/models/forecasting/{model.model_name}.pt'
model = model.load(checkpoint_path)

### 3-3. 4월 마지막 주 데이터 추론

In [7]:
# Step counts on the 6-second grid: 10 steps per minute, scaled up to a week.
min_size = 10
hour_size = 60 * min_size
day_size = 24 * hour_size
week_size = 7 * day_size

# Everything before the final week is the history the forecaster conditions on.
history_series = total_x_scaled[:len(total_df[:-week_size])]

# Forecast one full week of steps past the end of the history.
pred = model.predict(n=week_size, series=history_series, verbose=False)

2022-11-08 15:31:11 pytorch_lightning.utilities.distributed INFO: GPU available: True, used: True
2022-11-08 15:31:11 pytorch_lightning.utilities.distributed INFO: TPU available: False, using: 0 TPU cores
2022-11-08 15:31:11 pytorch_lightning.utilities.distributed INFO: IPU available: False, using: 0 IPUs
2022-11-08 15:31:11 pytorch_lightning.accelerators.gpu INFO: LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1,2,3,4,5,6,7]


In [8]:
# Display the timestamps covered by the forecast.
pred.time_index

DatetimeIndex(['2020-04-24 00:00:00', '2020-04-24 00:00:06',
               '2020-04-24 00:00:12', '2020-04-24 00:00:18',
               '2020-04-24 00:00:24', '2020-04-24 00:00:30',
               '2020-04-24 00:00:36', '2020-04-24 00:00:42',
               '2020-04-24 00:00:48', '2020-04-24 00:00:54',
               ...
               '2020-04-30 23:59:00', '2020-04-30 23:59:06',
               '2020-04-30 23:59:12', '2020-04-30 23:59:18',
               '2020-04-30 23:59:24', '2020-04-30 23:59:30',
               '2020-04-30 23:59:36', '2020-04-30 23:59:42',
               '2020-04-30 23:59:48', '2020-04-30 23:59:54'],
              dtype='datetime64[ns]', name='time', length=100800, freq='6S')

In [9]:
# True labels at exactly the forecast timestamps, as an integer column vector.
ground_truth = (
    total_y_series[pred.time_index]
    .values()
    .reshape(-1, 1)
    .astype(int)
)

# Forecast values as a DataFrame, with the true labels attached as `GT`.
forecasted_df = pred.pd_dataframe()
forecasted_df['GT'] = ground_truth

forecasted_df

component,MELT_TEMP,MOTORSPEED,GT
time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2020-04-24 00:00:00,0.329828,0.067903,0
2020-04-24 00:00:06,0.254201,0.026903,0
2020-04-24 00:00:12,0.328379,0.091429,0
2020-04-24 00:00:18,0.159845,0.109485,0
2020-04-24 00:00:24,0.854719,0.970752,0
...,...,...,...
2020-04-30 23:59:30,0.854896,0.970295,0
2020-04-30 23:59:36,0.159289,0.110638,0
2020-04-30 23:59:42,0.327158,0.090931,0
2020-04-30 23:59:48,0.255575,0.026027,0


## 4. LIGHTGBM + CATBOOST 모델 준비 + 추론

In [10]:
from joblib import load

folder_root = './public/models/tree_detection'

# NOTE(review): joblib artifacts are pickle-based; only load files from a trusted source.
lgbm_path = f'{folder_root}/lgbm.pkl'
cat_path = f'{folder_root}/cat.pkl'

lgbm = load(lgbm_path)
cat = load(cat_path)

In [22]:
# Both tree models score the same two forecast columns.
tree_features = forecasted_df.loc[:, ['MELT_TEMP', 'MOTORSPEED']]

lgbm_pred = lgbm.predict(tree_features)
# The CatBoost output is compared against the string label 'True'.
cat_pred = cat.predict(tree_features) == 'True'

# Store both predictions as 0/1 integer columns.
forecasted_df['LGBM_PRED'] = lgbm_pred.astype(int)
forecasted_df['CAT_PRED'] = cat_pred.astype(int)

In [12]:
# Display the forecast table, now including the LGBM_PRED and CAT_PRED columns.
forecasted_df

component,MELT_TEMP,MOTORSPEED,GT,LGBM_PRED,CAT_PRED
time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2020-04-24 00:00:00,0.329828,0.067903,0,0,0
2020-04-24 00:00:06,0.254201,0.026903,0,1,1
2020-04-24 00:00:12,0.328379,0.091429,0,0,0
2020-04-24 00:00:18,0.159845,0.109485,0,1,1
2020-04-24 00:00:24,0.854719,0.970752,0,0,0
...,...,...,...,...,...
2020-04-30 23:59:30,0.854896,0.970295,0,0,0
2020-04-30 23:59:36,0.159289,0.110638,0,1,1
2020-04-30 23:59:42,0.327158,0.090931,0,0,0
2020-04-30 23:59:48,0.255575,0.026027,0,1,1


## 5. BI-LSTM 모델 준비 + 추론

In [13]:
# Hidden size (per direction) of the LSTM inside MockUpModel.
h_size = 8

# Number of consecutive time steps in one detection window.
detection_window_size = 10

def make_dataset(data, label, window_size):
    """Cut a 2-D sequence into sliding windows paired with the following label.

    Sample ``i`` holds rows ``i .. i + window_size - 1`` of ``data`` and is
    paired with ``label[i + window_size]``, i.e. the label of the row right
    after the window.

    Args:
        data: 2-D array-like indexed as ``data[rows, cols]``.
        label: Sequence of labels aligned row-by-row with ``data``.
        window_size: Number of rows in each window.

    Returns:
        Tuple ``(features, labels)`` of NumPy arrays, each holding
        ``len(data) - window_size`` samples (empty when that is not positive).
    """
    sample_count = len(data) - window_size
    starts = range(sample_count)

    windows = [np.array(data[start:start + window_size, :]) for start in starts]
    targets = [label[start + window_size] for start in starts]

    return np.array(windows), np.array(targets)

class MockUpModel(nn.Module):
    """Bidirectional-LSTM classifier that scores one fixed-length window.

    The sub-modules live in a ``ModuleDict`` under the keys ``lstm``,
    ``linear1``, ``linear2`` and ``sigmoid``; these keys determine the
    state-dict names, so they must stay as they are for saved weights to load.

    Args:
        hidden_size: LSTM hidden size per direction. When ``None`` the
            module-level ``h_size`` is used.
        window_size: Number of time steps in every input window. When ``None``
            the module-level ``detection_window_size`` is used.
    """

    def __init__(self, hidden_size=None, window_size=None):
        super().__init__()
        # Fall back to the notebook-level constants only when no explicit value
        # is given, so the class can also be built without those globals.
        if hidden_size is None:
            hidden_size = h_size
        if window_size is None:
            window_size = detection_window_size

        self.model = nn.ModuleDict({
            'lstm': nn.LSTM(
                input_size=2,
                hidden_size=hidden_size,
                # NOTE(review): PyTorch applies LSTM dropout only between stacked
                # layers, so with num_layers=1 this value has no effect. It is
                # kept so the module matches the configuration it was saved with.
                dropout=0.15,
                num_layers=1,
                batch_first=True,
                bidirectional=True
            ),
            # Both directions are concatenated, hence 2 * hidden_size inputs.
            'linear1': nn.Linear(in_features=hidden_size * 2, out_features=1),
            'linear2': nn.Linear(in_features=window_size, out_features=1),
            'sigmoid': nn.Sigmoid()
        })

    def forward(self, x):
        """Return one sigmoid score per window.

        Args:
            x: Tensor of shape ``(batch, window_size, 2)``.

        Returns:
            Tensor of shape ``(batch, 1)`` with values in ``[0, 1]``.
        """
        out, _ = self.model['lstm'](x)      # (batch, window, 2 * hidden)
        out = self.model['linear1'](out)    # (batch, window, 1)
        out = out[:, :, -1]                 # (batch, window)
        out = self.model['linear2'](out)    # (batch, 1)
        out = self.model['sigmoid'](out)
        return out

In [14]:
# Position of the first forecast step, and of the first warm-up step before it.
# The warm-up length follows detection_window_size (previously a hard-coded 10),
# so the two cannot drift apart if the window size is changed.
forecast_start = len(total_df) - week_size
window_start = forecast_start - detection_window_size

# Observed (scaled) inputs and labels for the warm-up steps just before the forecast.
past_data_for_window = total_x_scaled[window_start:forecast_start].values()
past_y_data_for_window = total_y_series[window_start:forecast_start].values()

forecasted_data = forecasted_df.loc[:, ['MELT_TEMP', 'MOTORSPEED']].values
forecasted_y_data = forecasted_df.loc[:, ['GT']].values

# Prepend the warm-up rows so that every forecast step gets a full window.
input_x_data = np.concatenate([past_data_for_window, forecasted_data])
input_y_data = np.concatenate([past_y_data_for_window, forecasted_y_data])

input_x_data, input_y_data = make_dataset(input_x_data, input_y_data, detection_window_size)

len(input_x_data) == len(input_y_data)

True

In [15]:
# Both loaders use the same batch size and keep the original order so that
# feature and label batches stay aligned.
loader_options = dict(batch_size=2048, shuffle=False)

x_dataloader = DataLoader(dataset=torch.FloatTensor(input_x_data), **loader_options)
y_dataloader = DataLoader(dataset=torch.FloatTensor(input_y_data), **loader_options)

In [16]:
# Use GPU index 4 when the host has it; otherwise fall back to the first GPU,
# or to the CPU when CUDA is unavailable, instead of failing on a missing device.
if not torch.cuda.is_available():
    device = torch.device("cpu")
elif torch.cuda.device_count() > 4:
    device = torch.device("cuda:4")
else:
    device = torch.device("cuda:0")

best_model = MockUpModel().to(device)

# map_location places the saved tensors on the chosen device regardless of the
# device they were saved from.
state_dict = torch.load('./public/models/nn_detection/model.pt', map_location=device)
best_model.load_state_dict(state_dict)
# Switch to inference mode; as the last expression this also prints the module.
best_model.eval()

MockUpModel(
  (model): ModuleDict(
    (lstm): LSTM(2, 8, batch_first=True, dropout=0.15, bidirectional=True)
    (linear1): Linear(in_features=16, out_features=1, bias=True)
    (linear2): Linear(in_features=10, out_features=1, bias=True)
    (sigmoid): Sigmoid()
  )
)

In [17]:
# Score every window with the detector. Only the feature batches are needed, so
# the loop runs over x_dataloader alone; this also lets tqdm read the number of
# batches and show real progress.
batch_predictions = []

with torch.no_grad():
    for x in tqdm(x_dataloader, desc='NN inference'):
        scores = best_model(x.to(device)).cpu().numpy()
        # Threshold the sigmoid score at 0.5 to get a hard 0/1 decision.
        batch_predictions.append((scores >= 0.5).astype(int))

# Stack the per-batch (batch, 1) arrays and flatten to one value per forecast row.
result = np.concatenate(batch_predictions)
forecasted_df['NN_PRED'] = result.ravel()

0it [00:00, ?it/s]

In [18]:
# Majority vote of the three detectors: the mean of the 0/1 votes must reach 0.5.
vote_columns = ['LGBM_PRED', 'CAT_PRED', 'NN_PRED']
vote_total = forecasted_df[vote_columns].sum(axis=1, skipna=False)
vote_share = vote_total / 3

forecasted_df['FINAL'] = (vote_share >= 0.5).astype(int)

forecasted_df

component,MELT_TEMP,MOTORSPEED,GT,LGBM_PRED,CAT_PRED,NN_PRED,FINAL
time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2020-04-24 00:00:00,0.329828,0.067903,0,0,0,1,0
2020-04-24 00:00:06,0.254201,0.026903,0,1,1,1,1
2020-04-24 00:00:12,0.328379,0.091429,0,0,0,1,0
2020-04-24 00:00:18,0.159845,0.109485,0,1,1,1,1
2020-04-24 00:00:24,0.854719,0.970752,0,0,0,0,0
...,...,...,...,...,...,...,...
2020-04-30 23:59:30,0.854896,0.970295,0,0,0,0,0
2020-04-30 23:59:36,0.159289,0.110638,0,1,1,1,1
2020-04-30 23:59:42,0.327158,0.090931,0,0,0,1,0
2020-04-30 23:59:48,0.255575,0.026027,0,1,1,1,1


In [19]:
# Single-column frames holding the true labels and the ensemble decision.
real_df = forecasted_df[['GT']]
pred_df = forecasted_df[['FINAL']]

In [20]:
# Precision, recall, F1 and accuracy of the ensemble decision against the true
# labels, computed in that order.
p, r, f1, acc = (
    metric(real_df, pred_df)
    for metric in (precision_score, recall_score, f1_score, accuracy_score)
)

In [21]:
# Show precision, recall, F1 and accuracy rounded to four decimals.
tuple('%0.4f' % value for value in (p, r, f1, acc))

('0.0000', '0.0000', '0.0000', '0.6000')