In [2]:
# load modules and set configurations
import numpy as np
import pandas as pd

import os, copy, random, pickle, gc
from itertools import product
from tqdm import tqdm

pd.set_option('display.max_columns', None)

import torch

def seed_everything(seed):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

SEED = 42

# 2. ML and anmoaly detection algorithms

In [6]:
from sklearn.neighbors import LocalOutlierFactor
from sklearn.ensemble import IsolationForest
from sklearn.linear_model import LogisticRegression
from sklearn.covariance import EllipticEnvelope

from catboost import CatBoostClassifier
from lightgbm import LGBMClassifier
from xgboost import XGBClassifier
from sklearn.ensemble import RandomForestClassifier
from functools import partial

with open(f'data-dict-for_ml.pkl', 'rb') as f:
    data_dict = pickle.load(f)

In [8]:
ver = 1 # 1, 2, 3
low_esi = 1 # 0, 1, 'all'

data = data_dict[ver][low_esi]

In [9]:
# initialization
mdl = 'xgb' # xgb cat lgb ran lr lof, mahalanobis iso
contamination = data['trn']['y'].sum()/len(data['trn']['y'])
if mdl == 'lof':
    model = LocalOutlierFactor(n_neighbors=3, contamination=contamination, novelty=True)
elif mdl == 'mahalanobis':
    model = EllipticEnvelope(contamination=contamination, random_state=SEED)
elif mdl == 'iso':
    model = IsolationForest(contamination=contamination, random_state=SEED)

elif mdl == 'xgb':
    model = XGBClassifier(random_state=SEED)
elif mdl == 'cat':
    model = CatBoostClassifier(random_state=SEED, verbose=False)
elif mdl == 'lgb':
    model = LGBMClassifier(random_state=SEED)
elif mdl == 'ran':
    model = RandomForestClassifier(random_state=SEED)
elif mdl == 'lr':
    model = LogisticRegression(random_state=SEED)

In [10]:
# training
if mdl in ['mahalanobis', 'iso', 'lof']:
    model.fit(data['trn']['X'])
else: 
    model.fit(data['trn']['X'], data['trn']['y'])

In [12]:
# evaluation data
eval_split = 'val'#tst val
if mdl == 'mahalanobis':
    score = model.mahalanobis(data[eval_split]['X'])

elif mdl in ['iso', 'lof']:
    score = -model.score_samples(data[eval_split]['X'])

else:
    score = model.predict_proba(data[eval_split]['X'])[:, 1]

eval_data = pd.DataFrame()
eval_data['id'] = data[eval_split]['ids']
eval_data['true'] = data[eval_split]['y']
eval_data['score'] = score
eval_data['n_seq'] = data[eval_split]['n_seq']

eval_data.to_csv(f"eval_data-low_esi{ver}-{low_esi}-{mdl}-{eval_split}.csv", index=False)

In [None]:
def conf_mat(true, pred):
    tp = ((pred == 1) & (true == 1)).sum()
    fp = ((pred == 1) & (true == 0)).sum()
    fn = ((pred == 0) & (true == 1)).sum()
    tn = ((pred == 0) & (true == 0)).sum()
    return tp, fp, fn, tn

eval_split = 'val' # val tst
for mdl in ['xgb', 'cat', 'lgb', 'ran', 'lr', 'mahalanobis', 'lof', 'iso']: 
    eval_result = []
    eval_data = pd.read_csv(f'eval_data-low_esi{ver}-{low_esi}-{mdl}-{eval_split}.csv')
    scores = eval_data['score'].unique()

    for s in tqdm(scores):
        eval_data['pred'] = np.where(eval_data['score']>=s, 1, 0)
        tmp = eval_data.groupby('id').agg({'true': lambda x: x.values[0], 'pred': 'max'}).reset_index()
        tp, fp, fn, tn = conf_mat(tmp['true'], tmp['pred'])

        eval_result.append([s, tp/(tp+fn), tp/(tp+fp), 2*tp/(fp+2*tp+fn)])

    eval_result = pd.DataFrame(eval_result, columns=['score', 'rec', 'prec', 'f1'])
    eval_result.to_csv(f'eval_result-low_esi{ver}-{low_esi}-{mdl}-{eval_split}.csv', index=False)

100%|██████████| 2652/2652 [00:29<00:00, 90.81it/s]


In [None]:
def conf_mat(true, pred):
    tp = ((pred == 1) & (true == 1)).sum()
    fp = ((pred == 1) & (true == 0)).sum()
    fn = ((pred == 0) & (true == 1)).sum()
    tn = ((pred == 0) & (true == 0)).sum()
    return tp, fp, fn, tn

eval_split = 'tst' # val tst
for mdl in ['xgb', 'cat', 'lgb', 'ran', 'lr', 'mahalanobis', 'lof', 'iso']: 
    eval_result = []
    eval_data = pd.read_csv(f'eval_data-low_esi{ver}-{low_esi}-{mdl}-{eval_split}.csv')
    scores = eval_data['score'].unique()

    for s in tqdm(scores):
        eval_data['pred'] = np.where(eval_data['score']>=s, 1, 0)
        tmp = eval_data.groupby('id').agg({'true': lambda x: x.values[0], 'pred': 'max'}).reset_index()
        tp, fp, fn, tn = conf_mat(tmp['true'], tmp['pred'])

        eval_result.append([s, tp/(tp+fn), tp/(tp+fp), 2*tp/(fp+2*tp+fn)])

    eval_result = pd.DataFrame(eval_result, columns=['score', 'rec', 'prec', 'f1'])
    eval_result.to_csv(f'eval_result-low_esi{ver}-{low_esi}-{mdl}-{eval_split}.csv', index=False)

In [None]:
prec_cf = 0.7
for mdl in ['xgb', 'cat', 'lgb', 'ran', 'lr', 'mahalanobis', 'lof', 'iso']: #['xgb', 'cat', 'lgb', 'ran', 'lr', 'mahalanobis', 'lof', 'iso']
    eval_result = pd.read_csv(f'eval_result-low_esi{ver}-{low_esi}-{mdl}-{eval_split}.csv')
    print(f"{mdl} recall@prec{prec_cf}: {eval_result.query('prec>=@prec_cf')['rec'].max()}")

xgb recall@prec0.7: 0.8301282051282052


In [None]:
def conf_mat(true, pred):
    tp = ((pred == 1) & (true == 1)).sum()
    fp = ((pred == 1) & (true == 0)).sum()
    fn = ((pred == 0) & (true == 1)).sum()
    tn = ((pred == 0) & (true == 0)).sum()
    return tp, fp, fn, tn
eval_split = 'tst' # val tst
ver = 1
low_esi = 1
for mdl in ['lstm_ae_layer']:
    eval_result = []
    eval_data = pd.read_csv(f'eval_data-low_esi{ver}-{low_esi}-{mdl}-{eval_split}.csv')
    scores = eval_data['loss'].unique()
    for s in tqdm(scores):
        eval_data['pred'] = np.where(eval_data['loss']>=s, 1, 0)
        tmp = eval_data.groupby('id').agg({'true': lambda x: x.values[0], 'pred': 'max'}).reset_index()
        tp, fp, fn, tn = conf_mat(tmp['true'], tmp['pred'])
        eval_result.append([s, tp/(tp+fn), tp/(tp+fp), 2*tp/(fp+2*tp+fn)])
    eval_result = pd.DataFrame(eval_result, columns=['score', 'rec', 'prec', 'f1'])
    eval_result.to_csv(f'eval_result-low_esi{ver}-{low_esi}-{mdl}-{eval_split}.csv', index=False)