In [1]:
from darts import TimeSeries
from datetime import datetime

from imblearn.over_sampling import SMOTE
from dateutil.parser import parse
from matplotlib.pylab import rcParams
from tqdm import tqdm_notebook as tqdm
from statsmodels.tsa.stattools import adfuller
from torch.utils.tensorboard import SummaryWriter
from darts.utils.statistics import check_seasonality, plot_acf

import os
import torch
import shutil
import warnings
import itertools
import matplotlib
import numpy as np
import pandas as pd
import seaborn as sns
import torch.nn as nn
import torch.optim as optim
import statsmodels.api as sm
from darts.metrics import mape
import matplotlib.pyplot as plt
import statsmodels.formula.api as smf


%matplotlib inline
# Suppress every warning category for the rest of the session.
warnings.simplefilter("ignore")

# Plot defaults: fivethirtyeight theme, default figure size of (15, 5).
plt.style.use('fivethirtyeight')
rcParams.update({'figure.figsize': (15, 5)})

In [2]:
# Read the raw log and turn TAG into a boolean label (True where TAG == 'NG').
df = pd.read_csv('./public/data/raw_data.csv')
df['TAG'] = df['TAG'].eq('NG')

# Index the rows on a 6-second grid from 2020-03-04 up to, but excluding,
# 2020-05-01 (the final grid point is sliced off).
df.index = pd.date_range(start='3/4/2020', end='5/1/2020', freq='6S')[:-1]

# Drop the columns that are not used below, cast everything to float32 and
# then restore TAG to bool.
df = (
    df
    .drop(columns=['STD_DT', 'NUM', 'MELT_WEIGHT', 'INSP'])
    .astype(np.float32)
    .astype({'TAG': bool})
)

df.info()
df.describe()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 835200 entries, 2020-03-04 00:00:00 to 2020-04-30 23:59:54
Freq: 6S
Data columns (total 3 columns):
 #   Column      Non-Null Count   Dtype  
---  ------      --------------   -----  
 0   MELT_TEMP   835200 non-null  float32
 1   MOTORSPEED  835200 non-null  float32
 2   TAG         835200 non-null  bool   
dtypes: bool(1), float32(2)
memory usage: 13.5 MB


Unnamed: 0,MELT_TEMP,MOTORSPEED
count,835200.0,835200.0
mean,509.200714,459.78302
std,128.277512,639.436401
min,308.0,0.0
25%,430.0,119.0
50%,469.0,168.0
75%,502.0,218.0
max,832.0,1804.0


In [25]:
in_march = df.index.month == 3
in_april = df.index.month == 4
day_of_month = df.index.day

# Fine-tuning window: 25-31 March for training, 1-4 April for validation.
train_index = in_march & (day_of_month >= 25)
val_index = in_april & (day_of_month < 5)

# The original training data is needed to derive the scaler;
# fine-tuning reuses that existing scaler instead of fitting a new one.
existing_df = df.loc[in_march & (day_of_month < 25)]

train_df = df.loc[train_index]
val_df = df.loc[val_index]

train_df, val_df

(                     MELT_TEMP  MOTORSPEED    TAG
 2020-03-25 00:00:00      478.0       138.0  False
 2020-03-25 00:00:06      408.0        78.0   True
 2020-03-25 00:00:12      473.0       163.0  False
 2020-03-25 00:00:18      407.0       203.0   True
 2020-03-25 00:00:24      763.0      1724.0  False
 ...                        ...         ...    ...
 2020-03-31 23:59:30      755.0      1732.0  False
 2020-03-31 23:59:36      420.0       213.0   True
 2020-03-31 23:59:42      458.0       161.0   True
 2020-03-31 23:59:48      421.0         0.0   True
 2020-03-31 23:59:54      514.0       141.0  False
 
 [100800 rows x 3 columns],
                      MELT_TEMP  MOTORSPEED    TAG
 2020-04-01 00:00:00      463.0       135.0  False
 2020-04-01 00:00:06      412.0       118.0   True
 2020-04-01 00:00:12      482.0       178.0  False
 2020-04-01 00:00:18      394.0       216.0   True
 2020-04-01 00:00:24      721.0      1733.0  False
 ...                        ...         ...    ...
 

In [26]:
def _split_features_label(frame, label_col='TAG'):
    """Return (features, labels): `frame` without `label_col`, and `label_col` as a one-column DataFrame."""
    features = frame.drop(columns=[label_col])
    labels = frame[[label_col]]
    return features, labels


existing_x_df, existing_y_df = _split_features_label(existing_df)
x_train, y_train = _split_features_label(train_df)
x_val, y_val = _split_features_label(val_df)

x_train, y_val

(                     MELT_TEMP  MOTORSPEED
 2020-03-25 00:00:00      478.0       138.0
 2020-03-25 00:00:06      408.0        78.0
 2020-03-25 00:00:12      473.0       163.0
 2020-03-25 00:00:18      407.0       203.0
 2020-03-25 00:00:24      763.0      1724.0
 ...                        ...         ...
 2020-03-31 23:59:30      755.0      1732.0
 2020-03-31 23:59:36      420.0       213.0
 2020-03-31 23:59:42      458.0       161.0
 2020-03-31 23:59:48      421.0         0.0
 2020-03-31 23:59:54      514.0       141.0
 
 [100800 rows x 2 columns],
                        TAG
 2020-04-01 00:00:00  False
 2020-04-01 00:00:06   True
 2020-04-01 00:00:12  False
 2020-04-01 00:00:18   True
 2020-04-01 00:00:24  False
 ...                    ...
 2020-04-04 23:59:30  False
 2020-04-04 23:59:36   True
 2020-04-04 23:59:42  False
 2020-04-04 23:59:48   True
 2020-04-04 23:59:54  False
 
 [57600 rows x 1 columns])

In [27]:
from sklearn.preprocessing import MinMaxScaler

# Fit the min-max scaler on the pre-fine-tuning features only, then apply
# that same scaling to the fine-tuning train and validation features.
scaler = MinMaxScaler().fit(existing_x_df)

train_scaled, val_scaled = (
    scaler.transform(features) for features in (x_train, x_val)
)

val_scaled.shape

(57600, 2)

In [28]:
from imblearn.over_sampling import SMOTE

# Oversample the scaled training rows with SMOTE (fixed seed).
# NOTE(review): the resampled rows are later cut into windows of consecutive
# rows by make_dataset; rows added by oversampling are not part of the
# original time sequence - confirm this ordering is intended.
smote = SMOTE(random_state=0)

train_scaled_over, y_train_over = smote.fit_resample(
    train_scaled,
    y_train.values.squeeze(),
)

# Row counts before and after oversampling.
len(y_train), len(y_train_over)

(100800, 146600)

In [29]:
# Wrap the scaled / oversampled arrays back into labelled DataFrames.
feature_columns = ['MELT_TEMP', 'MOTORSPEED']

x_train = pd.DataFrame(train_scaled_over, columns=feature_columns)
y_train = pd.DataFrame(y_train_over, columns=['TAG'])

x_val = pd.DataFrame(val_scaled, columns=feature_columns)
# y_val is reused as-is from the earlier split; validation labels are not resampled.

In [30]:
def make_dataset(data, label, window_size):
    """Cut `data` into sliding windows, each paired with the label that follows it.

    Window ``i`` holds rows ``i .. i + window_size - 1`` of `data`; its target
    is row ``i + window_size`` of `label`.

    Args:
        data: Features, one row per time step (DataFrame or array-like).
        label: Targets aligned row-for-row with `data` (DataFrame, Series or
            array-like).
        window_size: Number of consecutive rows per window.

    Returns:
        Tuple ``(features, labels)`` of NumPy arrays with
        ``len(data) - window_size`` entries each. `features` has shape
        ``(n, window_size, n_columns)`` for 2-D `data`. When `data` has no more
        rows than `window_size`, both arrays are empty.
    """
    # Convert once up front: slicing a NumPy array is much cheaper than going
    # through ``.iloc`` + ``np.array`` on a DataFrame for every single window,
    # and it lets plain arrays be passed in as well.
    values = np.asarray(data)
    targets = np.asarray(label)

    n_windows = len(values) - window_size
    feature_list = [values[i:i + window_size] for i in range(n_windows)]
    label_list = [targets[i + window_size] for i in range(n_windows)]
    return np.array(feature_list), np.array(label_list)

In [31]:
from torch.utils.data import DataLoader, TensorDataset

# Number of consecutive time steps fed to the model per sample.
window_size = 10

# Replace the row-wise frames with windowed arrays for both splits.
(x_train, y_train), (x_val, y_val) = (
    make_dataset(x_train, y_train, window_size),
    make_dataset(x_val, y_val, window_size),
)

In [32]:
from tqdm import tqdm
from torch.nn import BCELoss

h_size = 8
class MockUpModel(nn.Module):
    """Bidirectional-LSTM binary classifier over fixed-length windows.

    Pipeline: LSTM -> per-time-step linear layer (to one value per step) ->
    linear layer across the time steps -> sigmoid.

    Args:
        hidden_size: LSTM hidden size per direction. Defaults to the
            notebook-level ``h_size``.
        seq_len: Number of time steps in each input window. Defaults to the
            notebook-level ``window_size``.
        input_size: Number of features per time step.

    Calling ``MockUpModel()`` with no arguments builds the same network as
    before, and the sub-module names (hence the ``state_dict`` keys) are
    unchanged.
    """

    def __init__(self, hidden_size=None, seq_len=None, input_size=2):
        super().__init__()
        # Resolve defaults at construction time from the notebook globals.
        if hidden_size is None:
            hidden_size = h_size
        if seq_len is None:
            seq_len = window_size

        self.model = nn.ModuleDict({
            'lstm': nn.LSTM(
                input_size=input_size,
                hidden_size=hidden_size,
                # NOTE: nn.LSTM applies dropout only between stacked layers,
                # so with num_layers=1 this value has no effect. Kept as-is.
                dropout=0.15,
                num_layers=1,
                batch_first=True,
                bidirectional=True
            ),
            # Bidirectional output concatenates both directions: 2 * hidden_size.
            'linear1': nn.Linear(in_features=hidden_size * 2, out_features=1),
            'linear2': nn.Linear(in_features=seq_len, out_features=1),
            'sigmoid': nn.Sigmoid()
        })

    def forward(self, x):
        """Map a batch of windows ``(batch, seq_len, input_size)`` to ``(batch, 1)`` values in [0, 1]."""
        out, _ = self.model['lstm'](x)        # (batch, seq_len, 2 * hidden_size)
        out = self.model['linear1'](out)      # (batch, seq_len, 1)
        out = out.squeeze(-1)                 # (batch, seq_len)
        out = self.model['linear2'](out)      # (batch, 1)
        return self.model['sigmoid'](out)

In [33]:
model = MockUpModel()

# Load the checkpoint tensors onto the CPU (where the freshly built model
# lives), so loading does not depend on the device the checkpoint was saved
# from. The model is moved to the training device later.
model.load_state_dict(
    torch.load('./public/models/nn_detection/model.pt', map_location='cpu')
)

<All keys matched successfully>

In [34]:
train_bs = 256
val_bs = 1024


def _sequential_loader(array, batch_size):
    """Wrap `array` as a float tensor in a DataLoader that keeps the original order."""
    return DataLoader(
        dataset=torch.FloatTensor(array),
        batch_size=batch_size,
        shuffle=False
    )


# Features and labels live in separate loaders. They stay aligned only because
# neither shuffles and each pair shares a batch size.
x_train_dataloader = _sequential_loader(x_train, train_bs)
y_train_dataloader = _sequential_loader(y_train, train_bs)

x_val_dataloader = _sequential_loader(x_val, val_bs)
y_val_dataloader = _sequential_loader(y_val, val_bs)

In [36]:
# Prefer GPU index 5 when the machine has it; otherwise fall back to the
# first GPU, or to the CPU, instead of crashing on a missing device.
if torch.cuda.is_available():
    gpu_index = 5 if torch.cuda.device_count() > 5 else 0
    device = torch.device(f"cuda:{gpu_index}")
else:
    device = torch.device("cpu")
model = model.to(device)
lr = 1e-3
optimizer = torch.optim.Adam(model.parameters(), lr=lr)

loss_fn = BCELoss()

# Best (lowest) mean validation loss seen so far; starts above any real loss.
min_valid = 1e9

for i in range(25):
    # ---- training pass ----
    batch_loss = 0.
    model.train()
    # zip() hides the length from tqdm, so pass it explicitly.
    for x, y in tqdm(zip(x_train_dataloader, y_train_dataloader),
                     total=len(x_train_dataloader)):
        optimizer.zero_grad()
        out = model(x.to(device))
        loss = loss_fn(out, y.to(device))
        loss.backward()
        optimizer.step()
        batch_loss += loss.item()

    # ---- validation pass (no gradients) ----
    model.eval()
    with torch.no_grad():
        valid_loss = sum(
            loss_fn(model(x.to(device)), y.to(device)).item()
            for x, y in zip(x_val_dataloader, y_val_dataloader)
        )

    # Report per-batch mean losses.
    batch_loss /= len(x_train_dataloader)
    valid_loss /= len(x_val_dataloader)

    # Checkpoint whenever validation loss matches or beats the best so far.
    if min_valid >= valid_loss:
        min_valid = valid_loss
        torch.save(model.state_dict(), './public/models/nn_detection/model_fine_tuned.pt')
    print(f'{i}: loss: {batch_loss}, valid: {valid_loss}')

573it [00:03, 160.33it/s]


0: loss: 0.09882287872669955, valid: 0.32395305863597934


573it [00:03, 159.81it/s]


1: loss: 0.09513040010975414, valid: 0.30265646702364873


573it [00:03, 158.99it/s]


2: loss: 0.10347669425986895, valid: 0.30373657873848026


573it [00:03, 160.31it/s]


3: loss: 0.10658753956859057, valid: 0.288511208797756


573it [00:03, 158.77it/s]


4: loss: 0.1091045377205937, valid: 0.291050652662913


573it [00:03, 158.84it/s]


5: loss: 0.09846286973440581, valid: 0.35501290413371306


573it [00:03, 158.92it/s]


6: loss: 0.0919261188467763, valid: 0.2755194722037566


573it [00:03, 158.54it/s]


7: loss: 0.09285414410327572, valid: 0.3108008132691969


573it [00:03, 160.08it/s]


8: loss: 0.11041437000374311, valid: 0.2794652581214905


573it [00:03, 162.19it/s]


9: loss: 0.09803644359521373, valid: 0.31800580913560433


573it [00:03, 160.96it/s]


10: loss: 0.0976030604239719, valid: 0.28612771551860006


573it [00:03, 161.25it/s]


11: loss: 0.09477413526328238, valid: 0.31523684765163223


573it [00:03, 159.85it/s]


12: loss: 0.1101050253764094, valid: 0.3172520367722762


573it [00:03, 158.24it/s]


13: loss: 0.10667464542499278, valid: 0.3867466502022325


573it [00:03, 158.84it/s]


14: loss: 0.10983342433674992, valid: 0.3299620073092611


573it [00:03, 158.26it/s]


15: loss: 0.11309214294473616, valid: 0.4602018683625941


573it [00:03, 158.63it/s]


16: loss: 0.10942516657527875, valid: 0.3886584064416718


573it [00:03, 159.32it/s]


17: loss: 0.1123146101014435, valid: 0.3019476088515499


573it [00:03, 156.64it/s]


18: loss: 0.11159780465292356, valid: 0.4896730508720666


573it [00:03, 159.62it/s]


19: loss: 0.10808395336416224, valid: 0.54828304732055


573it [00:03, 158.04it/s]


20: loss: 0.11262628212969354, valid: 0.300390047462363


573it [00:03, 158.73it/s]


21: loss: 0.10609892015280381, valid: 0.296045111982446


573it [00:03, 160.05it/s]


22: loss: 0.11150505740226395, valid: 0.3238895467498846


573it [00:03, 161.47it/s]


23: loss: 0.1092701419406972, valid: 0.4289547361825642


573it [00:03, 161.18it/s]


24: loss: 0.10380089981542383, valid: 0.3308531579218413


573it [00:03, 161.03it/s]


25: loss: 0.1030565087080496, valid: 0.30199858180263583


573it [00:03, 163.32it/s]


26: loss: 0.10646223624839884, valid: 0.28319417959765386


573it [00:03, 161.71it/s]


27: loss: 0.10843141098256039, valid: 0.3891581777940717


573it [00:03, 160.81it/s]


28: loss: 0.11145124243012423, valid: 0.3465413639419957


573it [00:03, 161.08it/s]


29: loss: 0.11481036336151444, valid: 0.2782643573325977
