In [1]:
from darts import TimeSeries
from datetime import datetime

from imblearn.over_sampling import SMOTE
from dateutil.parser import parse
from matplotlib.pylab import rcParams
from tqdm import tqdm_notebook as tqdm
from statsmodels.tsa.stattools import adfuller
from torch.utils.tensorboard import SummaryWriter
from darts.utils.statistics import check_seasonality, plot_acf

import os
import torch
import shutil
import warnings
import itertools
import matplotlib
import numpy as np
import pandas as pd
import seaborn as sns
import torch.nn as nn
import torch.optim as optim
import statsmodels.api as sm
from darts.metrics import mape
import matplotlib.pyplot as plt
import statsmodels.formula.api as smf


%matplotlib inline
warnings.filterwarnings("ignore")
plt.style.use('fivethirtyeight')
rcParams['figure.figsize'] = 15, 5

In [2]:
# Read the raw log and turn the TAG column into a boolean that is True
# exactly where the raw value equals 'NG'.
df = pd.read_csv('./public/data/raw_data.csv')
df = df.assign(TAG=df['TAG'].eq('NG'))

# Replace the index with a regular 6-second timestamp grid starting on
# 2020-03-04; the final grid point (2020-05-01 00:00:00) is dropped.
df.index = pd.date_range(start='3/4/2020', end='5/1/2020', freq='6S')[:-1]

# Drop the columns that are not used further on, store the remaining values
# as float32 and restore TAG to bool after the blanket cast.
df = (
    df
    .drop(columns=['STD_DT', 'NUM', 'MELT_WEIGHT', 'INSP'])
    .astype(np.float32)
    .astype({'TAG': bool})
)

df.info()
df.describe()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 835200 entries, 2020-03-04 00:00:00 to 2020-04-30 23:59:54
Freq: 6S
Data columns (total 3 columns):
 #   Column      Non-Null Count   Dtype  
---  ------      --------------   -----  
 0   MELT_TEMP   835200 non-null  float32
 1   MOTORSPEED  835200 non-null  float32
 2   TAG         835200 non-null  bool   
dtypes: bool(1), float32(2)
memory usage: 13.5 MB


Unnamed: 0,MELT_TEMP,MOTORSPEED
count,835200.0,835200.0
mean,509.200714,459.78302
std,128.277512,639.436401
min,308.0,0.0
25%,430.0,119.0
50%,469.0,168.0
75%,502.0,218.0
max,832.0,1804.0


In [3]:
# Calendar components of the timestamp index, shared by all masks below.
month_of_year = df.index.month
day_of_month = df.index.day

# Fine-tuning split inside April: days before the 17th train,
# days 17 through 23 validate.
train_index = (month_of_year == 4) & (day_of_month < 17)
val_index = (month_of_year == 4) & (day_of_month >= 17) & (day_of_month < 24)

# The data the existing model was trained on must be known in order to
# obtain the scaler: fine-tuning reuses the existing scaler.
existing_df = df[(month_of_year == 3) & (day_of_month < 25)]

train_df, val_df = df[train_index], df[val_index]

train_df, val_df

(                     MELT_TEMP  MOTORSPEED    TAG
 2020-04-01 00:00:00      463.0       135.0  False
 2020-04-01 00:00:06      412.0       118.0   True
 2020-04-01 00:00:12      482.0       178.0  False
 2020-04-01 00:00:18      394.0       216.0   True
 2020-04-01 00:00:24      721.0      1733.0  False
 ...                        ...         ...    ...
 2020-04-16 23:59:30      724.0      1734.0  False
 2020-04-16 23:59:36      413.0       190.0  False
 2020-04-16 23:59:42      500.0       167.0  False
 2020-04-16 23:59:48      450.0        57.0  False
 2020-04-16 23:59:54      485.0       144.0  False
 
 [230400 rows x 3 columns],
                      MELT_TEMP  MOTORSPEED    TAG
 2020-04-17 00:00:00      478.0       121.0  False
 2020-04-17 00:00:06      444.0         0.0  False
 2020-04-17 00:00:12      484.0       165.0  False
 2020-04-17 00:00:18      387.0       194.0  False
 2020-04-17 00:00:24      707.0      1743.0  False
 ...                        ...         ...    ...
 

In [4]:
def _split_features_label(frame):
    """Return ``(features, label)`` where ``label`` is the one-column 'TAG' frame.

    Works on a shallow copy, so ``frame`` itself keeps its 'TAG' column.
    """
    features = frame.copy(deep=False)
    label = pd.DataFrame(features.pop('TAG'), columns=['TAG'])
    return features, label


existing_x_df, existing_y_df = _split_features_label(existing_df)
x_train, y_train = _split_features_label(train_df)
x_val, y_val = _split_features_label(val_df)

x_train, y_val

(                     MELT_TEMP  MOTORSPEED
 2020-04-01 00:00:00      463.0       135.0
 2020-04-01 00:00:06      412.0       118.0
 2020-04-01 00:00:12      482.0       178.0
 2020-04-01 00:00:18      394.0       216.0
 2020-04-01 00:00:24      721.0      1733.0
 ...                        ...         ...
 2020-04-16 23:59:30      724.0      1734.0
 2020-04-16 23:59:36      413.0       190.0
 2020-04-16 23:59:42      500.0       167.0
 2020-04-16 23:59:48      450.0        57.0
 2020-04-16 23:59:54      485.0       144.0
 
 [230400 rows x 2 columns],
                        TAG
 2020-04-17 00:00:00  False
 2020-04-17 00:00:06  False
 2020-04-17 00:00:12  False
 2020-04-17 00:00:18  False
 2020-04-17 00:00:24  False
 ...                    ...
 2020-04-23 23:59:30  False
 2020-04-23 23:59:36  False
 2020-04-23 23:59:42  False
 2020-04-23 23:59:48  False
 2020-04-23 23:59:54  False
 
 [100800 rows x 1 columns])

In [5]:
from sklearn.preprocessing import MinMaxScaler

# The scaler is fitted on the pre-existing feature frame only; the
# fine-tuning train/validation features are transformed with it, not refitted.
scaler = MinMaxScaler().fit(existing_x_df)

train_scaled, val_scaled = (
    scaler.transform(part) for part in (x_train, x_val)
)

val_scaled.shape

(100800, 2)

In [6]:
from imblearn.over_sampling import SMOTE

# Oversample the scaled training rows with SMOTE (fixed seed) using the
# flattened TAG column as the class label.
# NOTE(review): a later cell builds sliding windows over these resampled rows;
# rows generated by SMOTE have no position in the original time order, so
# confirm that windowing across them is intended.
smote = SMOTE(random_state=0)

train_scaled_over, y_train_over = smote.fit_resample(
    train_scaled,
    np.squeeze(y_train.to_numpy()),
)

# Row counts before and after oversampling.
len(y_train), len(y_train_over)

(230400, 336508)

In [8]:
# Wrap the NumPy results back into labelled DataFrames. The new frames get a
# default integer index; x_train / y_train now refer to the oversampled data.
feature_columns = ['MELT_TEMP', 'MOTORSPEED']

x_train = pd.DataFrame(train_scaled_over, columns=feature_columns)
y_train = pd.DataFrame(y_train_over, columns=['TAG'])
x_val = pd.DataFrame(val_scaled, columns=feature_columns)

# y_val is used as it already is; nothing to rebuild for it.

In [9]:
def make_dataset(data, label, window_size):
    """Cut a sequence into overlapping windows paired with next-step labels.

    Window ``i`` holds rows ``i .. i + window_size - 1`` of ``data`` and is
    paired with row ``i + window_size`` of ``label``. The windows are built
    with a vectorised sliding-window view instead of one pandas slice per row.

    Args:
        data: DataFrame, Series or array-like with samples along the first axis.
        label: DataFrame, Series or array-like aligned row-by-row with ``data``.
            Rows beyond ``len(data)`` are ignored.
        window_size: Number of consecutive rows per window; must be >= 1.

    Returns:
        Tuple ``(features, labels)`` of NumPy arrays, where ``features`` has
        shape ``(n, window_size, *data.shape[1:])``, ``labels`` has shape
        ``(n, *label.shape[1:])`` and ``n = max(len(data) - window_size, 0)``.
        Both arrays are independent copies.

    Raises:
        ValueError: If ``window_size`` is smaller than 1.
        IndexError: If at least one window exists and ``label`` has fewer rows
            than ``data``.
    """
    if window_size < 1:
        raise ValueError(f"window_size must be >= 1, got {window_size}")

    values = np.asarray(data)
    targets = np.asarray(label)
    n_windows = len(values) - window_size

    if n_windows <= 0:
        # Not enough rows for a single (window, next-step label) pair: return
        # empty arrays that still carry the trailing dimensions.
        return (
            np.empty((0, window_size) + values.shape[1:], dtype=values.dtype),
            np.empty((0,) + targets.shape[1:], dtype=targets.dtype),
        )

    if len(targets) < len(values):
        raise IndexError(
            f"label has {len(targets)} rows but data has {len(values)}"
        )

    # sliding_window_view puts the window axis last and yields one window more
    # than needed (the last window has no following label row), hence the
    # slice and the axis move.
    windows = np.lib.stride_tricks.sliding_window_view(
        values, window_size, axis=0
    )[:n_windows]
    features = np.moveaxis(windows, -1, 1).copy()
    labels = targets[window_size:len(values)].copy()
    return features, labels

In [10]:
from torch.utils.data import DataLoader, TensorDataset

# Each sample is a window of `window_size` consecutive rows; its target is the
# label of the row immediately after the window.
# This cell rebinds x_train / y_train / x_val / y_val to NumPy arrays, so it
# is not idempotent: re-run the preceding cell before running it again.
window_size = 10

x_train, y_train = make_dataset(data=x_train, label=y_train, window_size=window_size)
x_val, y_val = make_dataset(data=x_val, label=y_val, window_size=window_size)

In [11]:
from tqdm import tqdm
from torch.nn import BCELoss

h_size = 8
class MockUpModel(nn.Module):
    """Bidirectional-LSTM binary classifier over fixed-length windows.

    The LSTM output of every time step is reduced to one value by ``linear1``;
    ``linear2`` then combines the per-step values of the whole window into a
    single logit, which ``sigmoid`` maps into (0, 1).

    Args:
        input_size: Number of features per time step.
        hidden_size: LSTM hidden units per direction. ``None`` falls back to
            the notebook-level ``h_size``.
        seq_len: Number of time steps per window. ``None`` falls back to the
            notebook-level ``window_size``.

    Sub-module names and parameter shapes match the earlier definition, so
    previously saved state dicts still load.
    """

    def __init__(self, input_size=2, hidden_size=None, seq_len=None):
        super().__init__()
        # Resolve the notebook globals only when no explicit size is given, so
        # the class can also be built without them.
        if hidden_size is None:
            hidden_size = h_size
        if seq_len is None:
            seq_len = window_size

        self.model = nn.ModuleDict({
            # No `dropout` argument: nn.LSTM applies dropout only between
            # stacked layers, so with num_layers=1 it had no effect and merely
            # raised a warning.
            'lstm': nn.LSTM(
                input_size=input_size,
                hidden_size=hidden_size,
                num_layers=1,
                batch_first=True,
                bidirectional=True
            ),
            # Both directions are concatenated, hence 2 * hidden_size inputs.
            'linear1': nn.Linear(in_features=hidden_size * 2, out_features=1),
            'linear2': nn.Linear(in_features=seq_len, out_features=1),
            'sigmoid': nn.Sigmoid()
        })

    def forward(self, x):
        """Return one probability per window.

        Args:
            x: Float tensor of shape ``(batch, seq_len, input_size)``.

        Returns:
            Tensor of shape ``(batch, 1)`` with values in (0, 1).
        """
        out, _ = self.model['lstm'](x)        # (batch, seq_len, 2 * hidden)
        out = self.model['linear1'](out)      # (batch, seq_len, 1)
        out = out[:, :, -1]                   # (batch, seq_len)
        out = self.model['linear2'](out)      # (batch, 1)
        out = self.model['sigmoid'](out)
        return out

In [13]:
model = MockUpModel()

# Load the pretrained weights onto the CPU first: without `map_location`,
# torch.load tries to restore each tensor to the device it was saved from and
# fails when that GPU is not present. The model is moved to its training
# device afterwards.
model.load_state_dict(
    torch.load('./public/models/nn_detection/model.pt', map_location='cpu')
)

<All keys matched successfully>

In [15]:
train_bs = 256
val_bs = 1024


def _ordered_loader(array, batch_size):
    """Serve ``array`` as float-tensor batches in its original order."""
    return DataLoader(
        dataset=torch.FloatTensor(array),
        batch_size=batch_size,
        shuffle=False
    )


# Features and labels live in separate loaders; their batches line up only
# because none of the loaders shuffles and each pair shares a batch size.
x_train_dataloader = _ordered_loader(x_train, train_bs)
y_train_dataloader = _ordered_loader(y_train, train_bs)

x_val_dataloader = _ordered_loader(x_val, val_bs)
y_val_dataloader = _ordered_loader(y_val, val_bs)

In [16]:
# Prefer the GPU index this notebook was written for, but fall back to the
# first GPU, or to the CPU, when that device does not exist on this machine.
if torch.cuda.is_available():
    gpu_index = 4 if torch.cuda.device_count() > 4 else 0
    device = torch.device(f"cuda:{gpu_index}")
else:
    device = torch.device("cpu")
model = model.to(device)
lr = 1e-3
optimizer = torch.optim.Adam(model.parameters(), lr=lr)

loss_fn = BCELoss()

# Best (lowest) validation loss seen so far; decides when to checkpoint.
min_valid = 1e9

for i in range(20):
    batch_loss = 0.
    model.train()
    # zip() hides the length from tqdm, so pass the batch count explicitly.
    for x, y in tqdm(zip(x_train_dataloader, y_train_dataloader),
                     total=len(x_train_dataloader)):
        optimizer.zero_grad()
        out = model(x.to(device))
        loss = loss_fn(out, y.to(device))
        loss.backward()
        optimizer.step()
        batch_loss += loss.item()

    model.eval()
    with torch.no_grad():
        valid_loss = sum(
            loss_fn(model(x.to(device)), y.to(device)).item()
            for x, y in zip(x_val_dataloader, y_val_dataloader)
        )

    # Both values are averages of per-batch mean losses.
    batch_loss /= len(x_train_dataloader)
    valid_loss /= len(x_val_dataloader)

    # Keep the weights of the epoch with the lowest validation loss.
    if min_valid >= valid_loss:
        min_valid = valid_loss
        torch.save(model.state_dict(), './public/models/nn_detection/model_fine_tuned.pt')
    print(f'{i}: loss: {batch_loss}, valid: {valid_loss}')

1315it [00:08, 160.75it/s]


0: loss: 0.2093229747340721, valid: 3.203697939111729


1315it [00:08, 161.65it/s]


1: loss: 0.20076241344271306, valid: 2.1976228266051323


1315it [00:08, 161.35it/s]


2: loss: 0.18935682382601704, valid: 0.92845931558898


1315it [00:08, 160.19it/s]


3: loss: 0.18973683794353294, valid: 0.46577090235671614


1315it [00:08, 160.86it/s]


4: loss: 0.1824037749259184, valid: 0.19976710731332953


1315it [00:08, 159.72it/s]


5: loss: 0.18092912802013025, valid: 0.27033448971883217


1315it [00:08, 160.67it/s]


6: loss: 0.17911423093123854, valid: 0.09603725223228185


1315it [00:08, 160.66it/s]


7: loss: 0.1801697689889374, valid: 0.4751879229690089


1315it [00:08, 161.00it/s]


8: loss: 0.17902442394957627, valid: 0.23502185895587457


1315it [00:08, 160.33it/s]


9: loss: 0.18824252109316633, valid: 1.7018498254544807


1315it [00:08, 160.58it/s]


10: loss: 0.18136635032927745, valid: 0.6933872284311237


1315it [00:08, 160.30it/s]


11: loss: 0.1758893746155856, valid: 0.08469169511638507


1315it [00:08, 160.37it/s]


12: loss: 0.18534209412651423, valid: 0.15797473911685173


1315it [00:08, 160.10it/s]


13: loss: 0.1817296272523178, valid: 0.02752704005876575


1315it [00:08, 159.95it/s]


14: loss: 0.18461042687811663, valid: 0.019697529461347695


1315it [00:08, 161.67it/s]


15: loss: 0.1878620256491434, valid: 0.024885726979736126


1315it [00:08, 161.59it/s]


16: loss: 0.19232533421648929, valid: 0.19849141664577252


1315it [00:08, 160.47it/s]


17: loss: 0.19186703911141453, valid: 0.20101696282926232


1315it [00:08, 160.67it/s]


18: loss: 0.19096913397876092, valid: 0.8734353293072094


1315it [00:08, 159.28it/s]


19: loss: 0.17874720264615726, valid: 0.07640407618248102
