In [79]:
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier, Pool, cv
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import auc, roc_auc_score, roc_curve 
from sklearn.metrics import f1_score, confusion_matrix, precision_score
from sklearn.metrics import recall_score, f1_score, accuracy_score

import pandas as pd
import numpy as np
import scipy.stats.distributions as dists

import warnings
import matplotlib.pyplot as plt
warnings.filterwarnings('ignore')

%matplotlib inline
warnings.filterwarnings("ignore")
plt.style.use('fivethirtyeight')

In [80]:
df = pd.read_csv('./public/data/raw_data.csv')
use_cols = ['MELT_TEMP', 'MOTORSPEED']

df['TAG'] = df['TAG'] == 'NG'

df.index = pd.date_range(start='3/4/2020', end='5/1/2020', freq='6S')[:-1]

df.drop(columns=['STD_DT', 'NUM', 'MELT_WEIGHT', 'INSP'], inplace=True)

df = df.astype(np.float32)
df['TAG'] = df['TAG'].astype(bool)

df.info()
df.describe()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 835200 entries, 2020-03-04 00:00:00 to 2020-04-30 23:59:54
Freq: 6S
Data columns (total 3 columns):
 #   Column      Non-Null Count   Dtype  
---  ------      --------------   -----  
 0   MELT_TEMP   835200 non-null  float32
 1   MOTORSPEED  835200 non-null  float32
 2   TAG         835200 non-null  bool   
dtypes: bool(1), float32(2)
memory usage: 13.5 MB


Unnamed: 0,MELT_TEMP,MOTORSPEED
count,835200.0,835200.0
mean,509.200714,459.78302
std,128.277512,639.436401
min,308.0,0.0
25%,430.0,119.0
50%,469.0,168.0
75%,502.0,218.0
max,832.0,1804.0


In [81]:
val_start_day = 25

train_index = (df.index.month == 3) & (df.index.day < val_start_day)
val_index = (df.index.month == 3) & (df.index.day >= val_start_day)

train_df = df[train_index]
val_df = df[val_index]

train_df, val_df

(                     MELT_TEMP  MOTORSPEED    TAG
 2020-03-04 00:00:00      489.0       116.0  False
 2020-03-04 00:00:06      433.0        78.0  False
 2020-03-04 00:00:12      464.0       154.0  False
 2020-03-04 00:00:18      379.0       212.0  False
 2020-03-04 00:00:24      798.0      1736.0  False
 ...                        ...         ...    ...
 2020-03-24 23:59:30      722.0      1728.0  False
 2020-03-24 23:59:36      400.0       204.0   True
 2020-03-24 23:59:42      452.0       198.0   True
 2020-03-24 23:59:48      455.0        86.0   True
 2020-03-24 23:59:54      472.0       110.0  False
 
 [302400 rows x 3 columns],
                      MELT_TEMP  MOTORSPEED    TAG
 2020-03-25 00:00:00      478.0       138.0  False
 2020-03-25 00:00:06      408.0        78.0   True
 2020-03-25 00:00:12      473.0       163.0  False
 2020-03-25 00:00:18      407.0       203.0   True
 2020-03-25 00:00:24      763.0      1724.0  False
 ...                        ...         ...    ...
 

In [82]:
x_train = train_df.copy(False)
y_train = pd.DataFrame(x_train.pop('TAG'), columns=['TAG'])

x_val = val_df.copy(False)
y_val = pd.DataFrame(x_val.pop('TAG'), columns=['TAG'])

x_train

Unnamed: 0,MELT_TEMP,MOTORSPEED
2020-03-04 00:00:00,489.0,116.0
2020-03-04 00:00:06,433.0,78.0
2020-03-04 00:00:12,464.0,154.0
2020-03-04 00:00:18,379.0,212.0
2020-03-04 00:00:24,798.0,1736.0
...,...,...
2020-03-24 23:59:30,722.0,1728.0
2020-03-24 23:59:36,400.0,204.0
2020-03-24 23:59:42,452.0,198.0
2020-03-24 23:59:48,455.0,86.0


In [83]:
from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler()

train_scaled = scaler.fit_transform(x_train)
val_scaled = scaler.fit_transform(x_val)

val_scaled.shape

(100800, 2)

In [84]:
from imblearn.over_sampling import SMOTE

smote = SMOTE(random_state=0)

train_scaled_over, y_train_over = smote.fit_resample(
    X=train_scaled,
    y=y_train.values.squeeze()
)

y_train.shape[0], y_train_over.shape[0]

(302400, 521558)

In [85]:
x_train = pd.DataFrame(
    train_scaled_over,
    columns=use_cols
)

y_train = pd.DataFrame(
    y_train_over,
    columns=['TAG']
)

x_val = pd.DataFrame(
    val_scaled,
    columns=use_cols
)

y_val = y_val

In [86]:
pars = {
        "learning_rate": dists.uniform(0.01, 0.1),
        "boosting_type": ['gbdt', 'dart', 'goss', 'rf'],
        "reg_alpha": dists.uniform(0.1, 1.),
        "reg_lambda": dists.uniform(0.1, 1.),
        "random_state": [0],
    }

lgbm_clf = LGBMClassifier(random_state=0)
lgbm_rcv_ = RandomizedSearchCV(lgbm_clf, param_distributions=pars, n_iter=15, cv=5, refit=True, random_state=0)

lgbm_rcv_.fit(x_train, y_train), 

lgbm_rcv_.best_estimator_

LGBMClassifier(learning_rate=0.07818202991034834, random_state=0,
               reg_alpha=0.459507900573786, reg_lambda=0.5370319537993414)

In [87]:
pars = {'iterations':100,
          'random_seed':0,
          'learning_rate':0.01,
          'loss_function':'Logloss',
          'custom_metric':['Logloss','AUC'],
          'early_stopping_rounds':20,
          'task_type':"GPU",
          'bagging_temperature':1,
          'verbose':False,
              }

train_pool = Pool(x_train, y_train)

cat_clf = CatBoostClassifier(**pars)
cat_clf.fit(train_pool)

Default metric period is 5 because AUC is/are not implemented for GPU


<catboost.core.CatBoostClassifier at 0x27674345160>

In [88]:
lgbm_res = lgbm_rcv_.best_estimator_.predict(x_val)
cat_res = cat_clf.predict(data=x_val)

In [89]:
lgbm_pred_df = pd.DataFrame(lgbm_res.astype(int), columns=['TAG'])
cat_pred_df = pd.DataFrame((cat_res == 'True').astype(int), columns=['TAG'])

In [90]:
p = precision_score(y_val, lgbm_pred_df)
r = recall_score(y_val, lgbm_pred_df)
f1 = f1_score(y_val, lgbm_pred_df)
acc = accuracy_score(y_val, lgbm_pred_df)

In [91]:
'%0.4f' % p, '%0.4f' % r, '%0.4f' % f1, '%0.4f' % acc

('0.9995', '0.6244', '0.7686', '0.7266')

In [92]:
p = precision_score(y_val, cat_pred_df)
r = recall_score(y_val, cat_pred_df)
f1 = f1_score(y_val, cat_pred_df)
acc = accuracy_score(y_val, cat_pred_df)

In [93]:
'%0.4f' % p, '%0.4f' % r, '%0.4f' % f1, '%0.4f' % acc

('0.9995', '0.6221', '0.7669', '0.7250')

In [96]:
final_pred_df = pd.DataFrame()

final_pred_df['lgbm'] = lgbm_pred_df.values.squeeze()
final_pred_df['cat'] = cat_pred_df.values.squeeze()

# final_pred_df['final

final_pred_df['real'] = y_val.values.squeeze()

In [98]:
final_pred_df[(final_pred_df['lgbm'] == 0) & (final_pred_df['cat'] == 0) & (final_pred_df['real'] == 1)]

Unnamed: 0,lgbm,cat,real
2490,0,0,True
3322,0,0,True
4168,0,0,True
4357,0,0,True
4627,0,0,True
...,...,...,...
95199,0,0,True
95737,0,0,True
95769,0,0,True
96132,0,0,True
