In [1]:
from darts import TimeSeries
from datetime import datetime

from imblearn.over_sampling import SMOTE
from dateutil.parser import parse
from matplotlib.pylab import rcParams
from tqdm import tqdm_notebook as tqdm
from statsmodels.tsa.stattools import adfuller
from torch.utils.tensorboard import SummaryWriter
from darts.utils.statistics import check_seasonality, plot_acf

import os
import torch
import shutil
import warnings
import itertools
import matplotlib
import numpy as np
import pandas as pd
import seaborn as sns
import torch.nn as nn
import torch.optim as optim
import statsmodels.api as sm
from darts.metrics import mape
import matplotlib.pyplot as plt
import statsmodels.formula.api as smf


%matplotlib inline
warnings.filterwarnings("ignore")
plt.style.use('fivethirtyeight')
rcParams['figure.figsize'] = 15, 5

In [2]:
df = pd.read_csv('./public/data/raw_data.csv')

df['TAG'] = df['TAG'] == 'NG'

df.index = pd.date_range(start='3/4/2020', end='5/1/2020', freq='6S')[:-1]

df.drop(columns=['STD_DT', 'NUM', 'MELT_WEIGHT', 'INSP'], inplace=True)

df = df.astype(np.float32)
df['TAG'] = df['TAG'].astype(bool)

df.info()
df.describe()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 835200 entries, 2020-03-04 00:00:00 to 2020-04-30 23:59:54
Freq: 6S
Data columns (total 3 columns):
 #   Column      Non-Null Count   Dtype  
---  ------      --------------   -----  
 0   MELT_TEMP   835200 non-null  float32
 1   MOTORSPEED  835200 non-null  float32
 2   TAG         835200 non-null  bool   
dtypes: bool(1), float32(2)
memory usage: 13.5 MB


Unnamed: 0,MELT_TEMP,MOTORSPEED
count,835200.0,835200.0
mean,509.200714,459.78302
std,128.277512,639.436401
min,308.0,0.0
25%,430.0,119.0
50%,469.0,168.0
75%,502.0,218.0
max,832.0,1804.0


In [3]:
val_start_day = 25

train_index = (df.index.month == 3) & (df.index.day < val_start_day)
val_index = (df.index.month == 3) & (df.index.day >= val_start_day)

train_df = df[train_index]
val_df = df[val_index]

train_df, val_df

(                     MELT_TEMP  MOTORSPEED    TAG
 2020-03-04 00:00:00      489.0       116.0  False
 2020-03-04 00:00:06      433.0        78.0  False
 2020-03-04 00:00:12      464.0       154.0  False
 2020-03-04 00:00:18      379.0       212.0  False
 2020-03-04 00:00:24      798.0      1736.0  False
 ...                        ...         ...    ...
 2020-03-24 23:59:30      722.0      1728.0  False
 2020-03-24 23:59:36      400.0       204.0   True
 2020-03-24 23:59:42      452.0       198.0   True
 2020-03-24 23:59:48      455.0        86.0   True
 2020-03-24 23:59:54      472.0       110.0  False
 
 [302400 rows x 3 columns],
                      MELT_TEMP  MOTORSPEED    TAG
 2020-03-25 00:00:00      478.0       138.0  False
 2020-03-25 00:00:06      408.0        78.0   True
 2020-03-25 00:00:12      473.0       163.0  False
 2020-03-25 00:00:18      407.0       203.0   True
 2020-03-25 00:00:24      763.0      1724.0  False
 ...                        ...         ...    ...
 

In [4]:
x_train = train_df.copy(False)
y_train = pd.DataFrame(x_train.pop('TAG'), columns=['TAG'])

x_val = val_df.copy(False)
y_val = pd.DataFrame(x_val.pop('TAG'), columns=['TAG'])

In [5]:
from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler()

train_scaled = scaler.fit_transform(x_train)
val_scaled = scaler.fit_transform(x_val)

train_scaled

array([[0.34541982, 0.06455203],
       [0.23854965, 0.04340568],
       [0.29770994, 0.08569839],
       ...,
       [0.27480918, 0.11018364],
       [0.28053433, 0.04785754],
       [0.31297708, 0.06121314]], dtype=float32)

In [6]:
smote = SMOTE(random_state=0)

train_scaled, train_y_over = smote.fit_resample(
    X=train_scaled,
    y=y_train.values.squeeze()
)

train_scaled_over.shape

y_train = pd.DataFrame(
    train_y_over,
    columns=['TAG']
)

In [7]:
x_train = pd.DataFrame(
    train_scaled,
    columns=['MELT_TEMP', 'MOTORSPEED']
)

x_val = pd.DataFrame(
    val_scaled,
    columns=['MELT_TEMP', 'MOTORSPEED']
)

In [8]:
def make_dataset(data, label, window_size):
    feature_list = []
    label_list = []
    for i in range(len(data) - window_size):
        feature_list.append(np.array(data.iloc[i:i+window_size]))
        label_list.append(label.iloc[i+window_size])
    return np.array(feature_list), np.array(label_list)

In [9]:
from torch.utils.data import DataLoader, TensorDataset

window_size = 10
x_train, y_train = make_dataset(x_train, y_train, window_size)

x_val, y_val = make_dataset(x_val, y_val, window_size)

In [10]:
from tqdm import tqdm
from torch.nn import BCELoss

h_size = 8
class MockUpModel(nn.Module):
    def __init__(self):
        super().__init__()
        self.model = nn.ModuleDict({
            'lstm': nn.LSTM(
                input_size=2,
                hidden_size=h_size,
                dropout=0.15,
                num_layers=1,
                batch_first=True,
                bidirectional=True
            ),
            'linear1': nn.Linear(in_features=h_size*2, out_features=1),
            'linear2': nn.Linear(in_features=window_size, out_features=1),
            'sigmoid': nn.Sigmoid()
        })

    def forward(self, x):
        out, _ = self.model['lstm'](x)
        out = self.model['linear1'](out)
        out = out[:, :, -1]
        out = self.model['linear2'](out)
        out = self.model['sigmoid'](out)
        return out

In [11]:
train_bs = 256
val_bs = 1024

x_train_dataloader = DataLoader(
    dataset=torch.FloatTensor(x_train),
    batch_size=train_bs,
    shuffle=False
)

y_train_dataloader = DataLoader(
    dataset=torch.FloatTensor(y_train),
    batch_size=train_bs,
    shuffle=False
)

x_val_dataloader = DataLoader(
    dataset=torch.FloatTensor(x_val),
    batch_size=val_bs,
    shuffle=False
)

y_val_dataloader = DataLoader(
    dataset=torch.FloatTensor(y_val),
    batch_size=val_bs,
    shuffle=False
)

In [12]:
device = torch.device("cuda:4")
model = MockUpModel().to(device)
lr = 1e-3
optimizer = torch.optim.Adam(model.parameters(), lr=lr)

loss_fn = BCELoss()

min_valid = 1e9

for i in range(20):
    batch_loss = 0.
    model.train()
    for x, y in tqdm(zip(x_train_dataloader, y_train_dataloader)):
        optimizer.zero_grad()
        out = model(x.to(device))
        loss = loss_fn(out, y.to(device))
        loss.backward()
        optimizer.step()
        batch_loss += loss.cpu().item()
    model.eval()
    with torch.no_grad():
        valid_loss = sum(loss_fn(model(x.to(device)), y.to(device)).cpu().item() for x, y in zip(x_val_dataloader, y_val_dataloader))
    batch_loss /= len(x_train_dataloader)
    valid_loss /= len(x_val_dataloader)

    if min_valid >= valid_loss:
        min_valid = valid_loss
        torch.save(model.state_dict(), './detection_models/model_tmp.pt')
    print(f'{i}: loss: {batch_loss}, valid: {valid_loss}')

1182it [00:07, 164.89it/s]


0: loss: 0.30290964555188143, valid: 0.6983747801395378


1182it [00:07, 166.72it/s]


1: loss: 0.19035063229442448, valid: 0.5647675551549353


1182it [00:07, 166.00it/s]


2: loss: 0.12571585613253444, valid: 0.6256111946069833


1182it [00:07, 167.41it/s]


3: loss: 0.12010878377749963, valid: 0.6263729619859445


1182it [00:07, 166.76it/s]


4: loss: 0.12781621341583418, valid: 0.6237851619118392


1182it [00:07, 167.19it/s]


5: loss: 0.1218771447296382, valid: 0.623247349051514


1182it [00:07, 166.92it/s]


6: loss: 0.12056126210327589, valid: 0.5989690671364466


1182it [00:07, 167.64it/s]


7: loss: 0.11040673038208985, valid: 0.6268602046701643


1182it [00:07, 167.63it/s]


8: loss: 0.11893188417746367, valid: 0.6228223035130838


1182it [00:07, 166.53it/s]


9: loss: 0.12180543225711643, valid: 0.6122896904596175


1182it [00:07, 166.51it/s]


10: loss: 0.1153029031755502, valid: 0.6165604257222378


1182it [00:07, 166.73it/s]


11: loss: 0.11856083516916438, valid: 0.6109552601672182


1182it [00:07, 166.26it/s]


12: loss: 0.11890033324410218, valid: 0.612521180600831


1182it [00:07, 166.67it/s]


13: loss: 0.11497608425745515, valid: 0.6188464635851407


1182it [00:07, 164.21it/s]


14: loss: 0.12870321450530442, valid: 0.6250121340607152


1182it [00:07, 167.14it/s]


15: loss: 0.12351334949393163, valid: 0.6117925153236197


1182it [00:07, 167.62it/s]


16: loss: 0.11764502772143368, valid: 0.6205178982380665


1182it [00:07, 166.25it/s]


17: loss: 0.12364139527912912, valid: 0.6118787751354352


1182it [00:07, 167.86it/s]


18: loss: 0.11681796974611237, valid: 0.6156726279343018


1182it [00:07, 166.14it/s]


19: loss: 0.12373595386245678, valid: 0.6222471627924178


In [15]:
best_model = MockUpModel()

best_model.load_state_dict(torch.load('./detection_models/model.pt'))
best_model.eval()

MockUpModel(
  (model): ModuleDict(
    (lstm): LSTM(2, 8, batch_first=True, dropout=0.15, bidirectional=True)
    (linear1): Linear(in_features=16, out_features=1, bias=True)
    (linear2): Linear(in_features=10, out_features=1, bias=True)
    (sigmoid): Sigmoid()
  )
)

In [13]:
test_index = (df.index.month == 4) & (df.index.day <= 7)

test_df = df[test_index]

x_test = test_df.copy(False)
y_test = pd.DataFrame(x_test.pop('TAG'), columns=['TAG'])

test_scaled = scaler.fit_transform(x_test)

x_test = pd.DataFrame(
    test_scaled,
    columns=['MELT_TEMP', 'MOTORSPEED']
)

x_test, y_test = make_dataset(x_test, y_test, window_size)

test_bs = 2048

x_test_dataloader = DataLoader(
    dataset=torch.FloatTensor(x_test),
    batch_size=test_bs,
    shuffle=False
)

y_test_dataloader = DataLoader(
    dataset=torch.FloatTensor(y_test),
    batch_size=test_bs,
    shuffle=False
)

In [16]:
best_model.to(device)

test_pred = pd.DataFrame()

result = []
cnt = 0
for x, y in tqdm(zip(x_test_dataloader, y_test_dataloader)):
    with torch.no_grad():
        res = best_model(x.to(device)).cpu().numpy()
        res = (res >= 0.5).astype(int)
        result.extend(res)
result = np.array(result)

50it [00:00, 119.10it/s]


In [17]:
test_pred = pd.DataFrame(result, columns=['TAG'])
test_real = pd.DataFrame(y_test, columns=['TAG'])

test_pred

Unnamed: 0,TAG
0,1
1,1
2,1
3,1
4,0
...,...
100785,0
100786,1
100787,1
100788,1


In [18]:
from sklearn.metrics import auc, roc_auc_score, roc_curve 
from sklearn.metrics import f1_score, confusion_matrix, precision_score
from sklearn.metrics import recall_score, f1_score, accuracy_score

p = precision_score(test_real, test_pred)
r = recall_score(test_real, test_pred)
f1 = f1_score(test_real, test_pred)
acc = accuracy_score(test_real, test_pred)

In [19]:
'%0.4f' % p, '%0.4f' % r, '%0.4f' % f1, '%0.4f' % acc

('0.4771', '0.9998', '0.6460', '0.5817')