In [27]:
from datetime import datetime
from tqdm import tqdm_notebook as tqdm
from sklearn.metrics import auc, roc_auc_score, roc_curve 
from sklearn.metrics import f1_score, confusion_matrix, precision_score
from sklearn.metrics import recall_score, f1_score, accuracy_score
from sklearn.utils import _safe_indexing
from imblearn.over_sampling import SMOTE

import os
import torch
import shutil
import warnings
import itertools
import matplotlib
import numpy as np
import pandas as pd
import seaborn as sns
import statsmodels.api as sm
import matplotlib.pyplot as plt
import statsmodels.formula.api as smf


%matplotlib inline
warnings.filterwarnings("ignore")
plt.style.use('fivethirtyeight')

In [28]:
df = pd.read_csv('./public/data/raw_data.csv')

df['TAG'] = df['TAG'] == 'NG'

df.index = pd.date_range(start='3/4/2020', end='5/1/2020', freq='6S')[:-1]

df.drop(columns=['STD_DT', 'NUM'], inplace=True)

df = df.astype(np.float32)
df['TAG'] = df['TAG'].astype(bool)

df.info()
df.describe()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 835200 entries, 2020-03-04 00:00:00 to 2020-04-30 23:59:54
Freq: 6S
Data columns (total 5 columns):
 #   Column       Non-Null Count   Dtype  
---  ------       --------------   -----  
 0   MELT_TEMP    835200 non-null  float32
 1   MOTORSPEED   835200 non-null  float32
 2   MELT_WEIGHT  835200 non-null  float32
 3   INSP         835200 non-null  float32
 4   TAG          835200 non-null  bool   
dtypes: bool(1), float32(4)
memory usage: 19.9 MB


Unnamed: 0,MELT_TEMP,MOTORSPEED,MELT_WEIGHT,INSP
count,835200.0,835200.0,835200.0,835200.0
mean,509.200714,459.78302,582.961975,3.194854
std,128.277512,639.436401,1217.604492,0.011822
min,308.0,0.0,0.0,3.17
25%,430.0,119.0,186.0,3.19
50%,469.0,168.0,383.0,3.19
75%,502.0,218.0,583.0,3.2
max,832.0,1804.0,55252.0,3.23


In [29]:
df_ = df.copy(False)

index = (df_.index.month == 3) | (df_.index.day <= 10)

train_df = df_[index]
test_df = df_[~index]

In [30]:
train_y_df = train_df['TAG']
train_x_df = train_df.drop(columns='TAG')

test_y_df = test_df['TAG']
test_x_df = test_df.drop(columns='TAG')

len(train_x_df), len(test_x_df)

(547200, 288000)

In [5]:
from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler()

train_x_df = pd.DataFrame(
    scaler.fit_transform(train_x_df),
    columns=['MELT_TEMP', 'MOTORSPEED', 'MELT_WEIGHT', 'INSP']
)
test_x_df = pd.DataFrame(
    scaler.transform(test_x_df),
    columns=['MELT_TEMP', 'MOTORSPEED', 'MELT_WEIGHT', 'INSP']
)

train_x_df.describe()

Unnamed: 0,MELT_TEMP,MOTORSPEED,MELT_WEIGHT,INSP
count,547200.0,547200.0,547200.0,547200.0
mean,0.383943,0.255865,0.009536,0.414323
std,0.244832,0.355839,0.019297,0.197006
min,0.0,0.0,0.0,0.0
25%,0.232824,0.066221,0.003258,0.333336
50%,0.307252,0.093489,0.006715,0.333336
75%,0.370229,0.121313,0.01028,0.5
max,1.0,1.0,1.0,1.0


In [39]:
smote = SMOTE(random_state=0, k_neighbors=8)

train_x_df, train_y_df = smote.fit_resample(train_x_df, train_y_df)

train_x_df = pd.DataFrame(
    train_x_df,
    columns=['MELT_TEMP', 'MOTORSPEED', 'MELT_WEIGHT', 'INSP'],
)

train_y_df = pd.DataFrame(
    train_y_df,
    columns=['TAG']
)

train_y_df.value_counts()

TAG  
False    383700
True     383700
dtype: int64

In [42]:
from pyod.models.suod import SUOD
from pyod.models.lof import LOF
from pyod.models.iforest import IForest
from pyod.models.copod import COPOD

# initialized a group of outlier detectors for acceleration
detector_list = [LOF(n_neighbors=2),
                 LOF(n_neighbors=5),
                 LOF(n_neighbors=8),
                 IForest(n_estimators=100),
                 IForest(n_estimators=200)]

clf = SUOD(
    base_estimators=detector_list,
    n_jobs=1,
    combination='average'
)

In [43]:
clf.fit(train_x_df, y=train_y_df)




[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:  1.7min finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:  5.4min finished


SUOD(approx_clf=None, approx_clf_list=None, approx_flag_global=True,
   approx_ng_clf_list=None,
   base_estimators=[LOF(algorithm='auto', contamination=0.1, leaf_size=30, metric='minkowski',
  metric_params=None, n_jobs=1, n_neighbors=2, novelty=True, p=2), LOF(algorithm='auto', contamination=0.1, leaf_size=30, metric='minkowski',
  metric_params=None, n_jobs=1, n_neighbors=5, novelty=True, p=2),...features=1.0,
    max_samples='auto', n_estimators=200, n_jobs=1, random_state=None,
    verbose=0)],
   bps_flag=True, combination='average', contamination=0.1,
   cost_forecast_loc_fit=None, cost_forecast_loc_pred=None,
   jl_method='basic', n_jobs=1, rp_clf_list=None, rp_flag_global=True,
   rp_ng_clf_list=None, target_dim_frac=0.5, verbose=False)

In [44]:
prediction = clf.predict(test_x_df)




[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:   27.3s finished


In [45]:
real = test_y_df.values.squeeze().astype(int)
real_df = pd.DataFrame(real, columns=['TAG'])
pred_df = pd.DataFrame(prediction, columns=['TAG'])

In [46]:
p = precision_score(real_df, pred_df)
r = recall_score(real_df, pred_df)
f1 = f1_score(real_df, pred_df)
acc = accuracy_score(real_df, pred_df)

In [47]:
'%0.4f' % p, '%0.4f' % r, '%0.4f' % f1, '%0.4f' % acc

('0.0032', '0.0100', '0.0048', '0.8041')

In [51]:
real_df[((pred_df['TAG'] == 0) & (real_df['TAG'] == 1))]

Unnamed: 0,TAG
2,1
8,1
11,1
18,1
19,1
...,...
51307,1
51351,1
51367,1
51397,1
