# Pipeline

In [32]:
from darts import TimeSeries
from datetime import datetime
from darts.dataprocessing.transformers import Scaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import auc, roc_auc_score, roc_curve 
from sklearn.metrics import f1_score, confusion_matrix, precision_score
from sklearn.metrics import recall_score, f1_score, accuracy_score

from dateutil.parser import parse
from torch.utils.data import DataLoader
from tqdm import tqdm_notebook as tqdm

import os
import torch
import shutil
import warnings
import itertools
import matplotlib
import numpy as np
import pandas as pd
import seaborn as sns
import torch.nn as nn
import torch.optim as optim
import statsmodels.api as sm
import matplotlib.pyplot as plt


%matplotlib inline
warnings.filterwarnings("ignore")
plt.style.use('fivethirtyeight')

## 1. Data Load

In [33]:
use_cols = ['MELT_TEMP', 'MOTORSPEED']

# Read the raw export and discard the columns this pipeline never uses.
df = pd.read_csv('./public/data/raw_data.csv').drop(
    columns=['STD_DT', 'NUM', 'MELT_WEIGHT', 'INSP']
)

# The index is a generated 6-second grid starting 2020-03-04; the final
# point (2020-05-01 00:00:00) is dropped so the grid ends on 2020-04-30.
df.index = pd.date_range(start='3/4/2020', end='5/1/2020', freq='6S')[:-1]

# TAG is True for 'NG' rows; every other column is stored as float32.
df['TAG'] = df['TAG'].eq('NG')
df = df.astype(np.float32).astype({'TAG': bool})

df.info()
df.describe()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 835200 entries, 2020-03-04 00:00:00 to 2020-04-30 23:59:54
Freq: 6S
Data columns (total 3 columns):
 #   Column      Non-Null Count   Dtype  
---  ------      --------------   -----  
 0   MELT_TEMP   835200 non-null  float32
 1   MOTORSPEED  835200 non-null  float32
 2   TAG         835200 non-null  bool   
dtypes: bool(1), float32(2)
memory usage: 13.5 MB


Unnamed: 0,MELT_TEMP,MOTORSPEED
count,835200.0,835200.0
mean,509.200714,459.78302
std,128.277512,639.436401
min,308.0,0.0
25%,430.0,119.0
50%,469.0,168.0
75%,502.0,218.0
max,832.0,1804.0


## 2. Data 준비

In [34]:
# All models were trained and evaluated on March data only.
# To show why the pipeline matters, testing uses only the single week of
# data that immediately follows the training period.

val_start_day = 25

# Boolean row masks over df.index.
seened_index = df.index.month == 3                             # all of March
trained_index = seened_index & (df.index.day < val_start_day)  # March, before day 25
unseened_index = (df.index.month == 4) & (df.index.day <= 7)   # April 1-7

total_df = df.copy(deep=False)

In [35]:
# Shallow copy so that popping TAG leaves total_df's columns untouched.
x_total = total_df.copy(deep=False)
y_total = x_total.pop('TAG').to_frame(name='TAG')

len(x_total)

835200

## 3. 시계열 예측 모델

### 3-1. Scaler 준비

In [36]:
# Wrap the frames as darts series on the 6-second grid.
total_x_series = TimeSeries.from_dataframe(x_total, freq='6S', fill_missing_dates=True)
total_y_series = TimeSeries.from_dataframe(y_total, freq='6S', fill_missing_dates=True)
train_x_series = TimeSeries.from_dataframe(x_total.loc[trained_index], freq='6S', fill_missing_dates=True)

# The scaler is fitted on the training rows only and then applied to the
# full feature series.
series_scaler = Scaler(scaler=MinMaxScaler())
series_scaler.fit(train_x_series)
total_x_scaled = series_scaler.transform(total_x_series)

len(total_x_scaled) == len(total_y_series)

True

### 3-2. 모델 준비

In [37]:
from darts.models import TransformerModel

model_name = 'transformer'

forecast_window_size = 30

# Hyper-parameters of the forecaster, gathered in one place.
transformer_kwargs = dict(
    input_chunk_length=forecast_window_size,
    output_chunk_length=1,
    batch_size=512,
    dropout=0.1,
    d_model=16,
    nhead=4,
    num_encoder_layers=2,
    num_decoder_layers=2,
    dim_feedforward=16,
    model_name=model_name,
    activation="relu",
    random_state=42,
    pl_trainer_kwargs={"accelerator": "gpu", "devices": [4]},
)
model = TransformerModel(**transformer_kwargs)

# The instance built above is replaced by the model stored on disk; its
# model_name is what selects the file to load.
model = model.load(f'./public/models/forecasting/{model.model_name}.pt')

### 3-3. 4월1일부터 4월7일 데이터 추론

In [76]:
# Window sizes are counted in samples of the 6-second grid.
min_size = 10                # samples per minute; also the stride and horizon below
hour_size = min_size * 60
day_size = hour_size * 24
week_size = day_size * 7

# Position of the first sample after March, counted from the mask directly
# instead of materialising a filtered DataFrame just to take its length.
forecast_start = int(np.count_nonzero(seened_index))

forecast = []

# Walk through the first April week one minute at a time: each step
# forecasts the next `min_size` samples from the observed (scaled) history
# that ends just before position i.
for i in range(forecast_start, forecast_start + week_size, min_size):
    pred = model.predict(
        # The horizon must equal the loop stride so that consecutive
        # forecasts tile the week without gaps or overlaps.
        n=min_size,
        # Only the trailing `forecast_window_size` points (the model's
        # input_chunk_length) are passed, which avoids re-slicing the whole
        # growing history on every iteration.
        series=total_x_scaled[i - forecast_window_size:i],
        verbose=False
    )
    forecast.extend(pred.values())
forecast = np.array(forecast)

2022-11-08 18:14:04 pytorch_lightning.accelerators.gpu INFO: LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1,2,3,4,5,6,7]
2022-11-08 18:14:04 pytorch_lightning.accelerators.gpu INFO: LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1,2,3,4,5,6,7]
2022-11-08 18:14:04 pytorch_lightning.accelerators.gpu INFO: LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1,2,3,4,5,6,7]
2022-11-08 18:14:04 pytorch_lightning.accelerators.gpu INFO: LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1,2,3,4,5,6,7]
2022-11-08 18:14:04 pytorch_lightning.accelerators.gpu INFO: LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1,2,3,4,5,6,7]
2022-11-08 18:14:04 pytorch_lightning.accelerators.gpu INFO: LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1,2,3,4,5,6,7]
2022-11-08 18:14:04 pytorch_lightning.accelerators.gpu INFO: LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1,2,3,4,5,6,7]
2022-11-08 18:14:05 pytorch_lightning.accelerators.gpu INFO: LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1,2,3,4,5,6,7]
2022-11-08 18:14:05 pytorch_lightning.accelerators.gpu INFO: LOC

In [81]:
# Only the last expression of a cell is displayed automatically, so the
# shape has to be printed explicitly to be visible.
print(forecast.shape)

total_df[unseened_index]

Unnamed: 0_level_0,MELT_TEMP,MOTORSPEED,TAG
time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2020-04-01 00:00:00,463.0,135.0,False
2020-04-01 00:00:06,412.0,118.0,True
2020-04-01 00:00:12,482.0,178.0,False
2020-04-01 00:00:18,394.0,216.0,True
2020-04-01 00:00:24,721.0,1733.0,False
...,...,...,...
2020-04-07 23:59:30,768.0,1748.0,False
2020-04-07 23:59:36,381.0,202.0,False
2020-04-07 23:59:42,498.0,197.0,False
2020-04-07 23:59:48,471.0,93.0,False


In [82]:
# Labels for the forecast week: the `week_size` samples that directly
# follow the March rows.
seen_count = len(total_df[seened_index])
gt_series = total_y_series[seen_count:seen_count + week_size]
ground_truth = gt_series.values().reshape(-1, 1).astype(int)

# Forecast values next to the true labels.
forecasted_df = pd.DataFrame(forecast, columns=['MELT_TEMP', 'MOTORSPEED'])
forecasted_df['GT'] = ground_truth

forecasted_df

Unnamed: 0,MELT_TEMP,MOTORSPEED,GT
0,0.337518,0.077713,0
1,0.262460,0.033822,1
2,0.334585,0.102676,0
3,0.148706,0.114258,1
4,0.856937,0.975693,0
...,...,...,...
100795,0.854949,0.977351,0
100796,0.150760,0.116037,0
100797,0.331556,0.100623,0
100798,0.264395,0.035412,0


## 4. LIGHTGBM + CATBOOST 모델 준비 + 추론

In [83]:
from joblib import load

folder_root = './public/models/tree_detection'

# Restore both serialized tree-based detectors from the same folder.
lgbm, cat = load(f'{folder_root}/lgbm.pkl'), load(f'{folder_root}/cat.pkl')

In [84]:
# Both detectors score the same two forecast columns.
tree_features = forecasted_df[['MELT_TEMP', 'MOTORSPEED']]

lgbm_pred = lgbm.predict(tree_features)
# The CatBoost output is compared against the string 'True' to get a boolean mask.
cat_pred = cat.predict(tree_features) == 'True'

forecasted_df['LGBM_PRED'] = lgbm_pred.astype(int)
forecasted_df['CAT_PRED'] = cat_pred.astype(int)

In [85]:
# Display the forecast table, which now also carries LGBM_PRED and CAT_PRED.
forecasted_df

Unnamed: 0,MELT_TEMP,MOTORSPEED,GT,LGBM_PRED,CAT_PRED
0,0.337518,0.077713,0,0,0
1,0.262460,0.033822,1,1,1
2,0.334585,0.102676,0,0,0
3,0.148706,0.114258,1,1,1
4,0.856937,0.975693,0,0,0
...,...,...,...,...,...
100795,0.854949,0.977351,0,0,0
100796,0.150760,0.116037,0,1,1
100797,0.331556,0.100623,0,0,0
100798,0.264395,0.035412,0,1,1


## 5. BI-LSTM 모델 준비 + 추론

In [86]:
# Hidden size of the LSTM inside MockUpModel (per direction).
h_size = 8

# Number of consecutive samples per detector input window.
detection_window_size = 10

def make_dataset(data, label, window_size):
    """Cut a 2-D array into overlapping windows paired with the next label.

    Window ``s`` holds rows ``s .. s + window_size - 1`` of ``data`` and is
    paired with ``label[s + window_size]``, i.e. the label of the row right
    after the window.

    Args:
        data: 2-D array indexed as ``data[rows, columns]``.
        label: Sequence with one entry per row of ``data``.
        window_size: Number of rows per window.

    Returns:
        Tuple ``(features, labels)`` of NumPy arrays, each with
        ``len(data) - window_size`` entries.
    """
    starts = range(len(data) - window_size)
    windows = [np.array(data[s:s + window_size, :]) for s in starts]
    targets = [label[s + window_size] for s in starts]
    return np.array(windows), np.array(targets)

class MockUpModel(nn.Module):
    """Bidirectional-LSTM binary classifier over a fixed-length window.

    The network applies a bidirectional LSTM to a window of two-feature
    samples, reduces each time step to one value, combines the per-step
    values with a second linear layer and squashes the result with a
    sigmoid.

    Args:
        hidden_size: LSTM hidden size per direction. ``None`` (default)
            uses the notebook-level ``h_size``.
        window_size: Number of time steps per input window. ``None``
            (default) uses the notebook-level ``detection_window_size``.

    Sub-module names inside ``self.model`` are unchanged, so state dicts
    saved from the argument-less form still load.
    """

    def __init__(self, hidden_size=None, window_size=None):
        super().__init__()
        # Resolve the defaults at construction time so that `MockUpModel()`
        # keeps following the notebook globals exactly as before.
        if hidden_size is None:
            hidden_size = h_size
        if window_size is None:
            window_size = detection_window_size

        self.model = nn.ModuleDict({
            'lstm': nn.LSTM(
                input_size=2,
                hidden_size=hidden_size,
                # Inert with a single layer (PyTorch applies LSTM dropout
                # only between stacked layers); kept so the module repr and
                # configuration stay the same.
                dropout=0.15,
                num_layers=1,
                batch_first=True,
                bidirectional=True
            ),
            # Bidirectional output concatenates both directions.
            'linear1': nn.Linear(in_features=hidden_size*2, out_features=1),
            'linear2': nn.Linear(in_features=window_size, out_features=1),
            'sigmoid': nn.Sigmoid()
        })

    def forward(self, x):
        """Score a batch of windows.

        Args:
            x: Tensor of shape ``(batch, window_size, 2)``.

        Returns:
            Tensor of shape ``(batch, 1)`` with values produced by a sigmoid.
        """
        out, _ = self.model['lstm'](x)       # (batch, window, 2 * hidden)
        out = self.model['linear1'](out)     # (batch, window, 1)
        out = out[:, :, -1]                  # drop the size-1 axis -> (batch, window)
        out = self.model['linear2'](out)     # (batch, 1)
        out = self.model['sigmoid'](out)
        return out

In [87]:
# Position of the first sample after March in the full series.
seen_count = int(np.count_nonzero(seened_index))

# The detector needs `detection_window_size` observed samples ahead of the
# first forecast row. The window length comes from the shared constant
# rather than a literal so the two cannot drift apart.
context = slice(seen_count - detection_window_size, seen_count)

past_data_for_window = total_x_scaled[context].values()
past_y_data_for_window = total_y_series[context].values()

forecasted_data = forecasted_df.loc[:, ['MELT_TEMP', 'MOTORSPEED']].values
forecasted_y_data = forecasted_df.loc[:, ['GT']].values

# Prepend the observed context so that every forecast row gets a full window.
input_x_data = np.concatenate([past_data_for_window, forecasted_data])
input_y_data = np.concatenate([past_y_data_for_window, forecasted_y_data])

input_x_data, input_y_data = make_dataset(input_x_data, input_y_data, detection_window_size)

len(input_x_data) == len(input_y_data)

True

In [88]:
# Both loaders share the same batching and keep the original order, so
# batch k of one lines up with batch k of the other.
loader_options = dict(batch_size=2048, shuffle=False)

x_dataloader = DataLoader(torch.FloatTensor(input_x_data), **loader_options)
y_dataloader = DataLoader(torch.FloatTensor(input_y_data), **loader_options)

In [89]:
# Prefer GPU 4 as before, but fall back to the CPU on hosts where that
# device does not exist instead of failing.
gpu_index = 4
use_gpu = torch.cuda.is_available() and torch.cuda.device_count() > gpu_index
device = torch.device(f"cuda:{gpu_index}" if use_gpu else "cpu")
best_model = MockUpModel().to(device)

# map_location remaps the stored tensors onto `device`, so the checkpoint
# loads even if it was saved from a different device.
state_dict = torch.load('./public/models/nn_detection/model.pt', map_location=device)
best_model.load_state_dict(state_dict)
best_model.eval()

MockUpModel(
  (model): ModuleDict(
    (lstm): LSTM(2, 8, batch_first=True, dropout=0.15, bidirectional=True)
    (linear1): Linear(in_features=16, out_features=1, bias=True)
    (linear2): Linear(in_features=10, out_features=1, bias=True)
    (sigmoid): Sigmoid()
  )
)

In [None]:
batch_preds = []

# Iterate the feature loader directly: labels are not needed for inference,
# and tqdm can now read the number of batches and show real progress.
with torch.no_grad():
    for x in tqdm(x_dataloader):
        prob = best_model(x.to(device)).cpu().numpy()
        # Threshold the sigmoid output at 0.5 to get a 0/1 decision.
        batch_preds.append((prob >= 0.5).astype(int))

# Flatten the per-batch (batch, 1) arrays into one prediction per row.
result = np.concatenate(batch_preds).ravel()
forecasted_df['NN_PRED'] = result

0it [00:00, ?it/s]

In [62]:
# Ensemble decision: positive when the average of the three 0/1 votes is at least 0.5.
detector_votes = forecasted_df[['LGBM_PRED', 'CAT_PRED', 'NN_PRED']]
forecasted_df['FINAL'] = ((detector_votes.sum(axis=1) / 3) >= 0.5).astype(int)

In [63]:
# Single-column frames holding the ensemble output and the true labels.
pred_df = forecasted_df[['FINAL']]
real_df = forecasted_df[['GT']]

In [64]:
# Precision, recall, F1 and accuracy of the ensemble against the true labels.
p, r, f1, acc = (
    metric(real_df, pred_df)
    for metric in (precision_score, recall_score, f1_score, accuracy_score)
)

In [95]:
# Precision, recall, F1, accuracy, each rounded to four decimals.
tuple(f'{score:.4f}' for score in (p, r, f1, acc))

('0.7911', '0.8290', '0.8096', '0.8512')