In [2]:
import numpy as np, os
import pandas as pd

import matplotlib.pyplot as plt
from tqdm.notebook import tqdm
import seaborn as sns

import tensorflow as tf
from tensorflow import keras

import warnings
warnings.filterwarnings(action='ignore')

#한글설정
import matplotlib.font_manager as fm

font_dirs = ['/usr/share/fonts/truetype/nanum', ]
font_files = fm.findSystemFonts(fontpaths=font_dirs)

for font_file in font_files:
    fm.fontManager.addfont(font_file)
    
# 한글 출력을 위해서 폰트 옵션을 설정합니다.
# "axes.unicode_minus" : 마이너스가 깨질 것을 방지

sns.set(font="NanumBarunGothic", 
        rc={"axes.unicode_minus":False},
        style='darkgrid')

# Method1
일관성, 방향성 고려한 feature 탐지 <br>
**goal : 사망/생존 마커 탐지** <br>
- Score 계산 식

<div align=center> $Score = ReLU(FI^{inverse})*sign(FI^{reverse})$ </div>

- $FI^{reverse}$
  1. X_test의 각 feature의 값(4068개)을 모두 1로 바꾸어 예측
  2. 예측값 리스트의 평균값 계산
  3. 각 feature의 평균값과 기존 baseline의 평균값을 계산 (= x)
    - x = mean(pred) - mean(y_test)
  4. x > 0 이면 1, x <= 0 이면 -1 으로 {1, -1}으로 부호 결정

# 필요한 데이터 로드

In [3]:
fi = pd.read_csv('./data/df_all_best_type.csv')
COLS = list(pd.read_csv('/project/LSH/total_data_7727.csv')['ITEMID'].sort_values().unique())
model = tf.keras.models.load_model('/project/LSH/model/(LSTM_best_4068)seed42-05-0.5029.hdf5')

# FI에 ReLU 적용

In [6]:
#relu 함수 정의
def relu(x):
    if x > 0:
        return x
    else:
        return 0

In [14]:
tmp = fi.copy()
tmp['diff'] = tmp['diff'].apply(lambda x:relu(x))
relu_fi = tmp[['feature', 'diff']].sort_values('diff', ascending=False)

In [15]:
relu_fi

Unnamed: 0,feature,diff
0,409606211,0.059180
1,74606211,0.036668
2,67434504,0.032467
3,54817525,0.029130
4,10019055302,0.027498
...,...,...
3469,78005303,0.000000
3470,17317001801,0.000000
3471,597003212,0.000000
3472,51435,0.000000


# all value 0 to 1 - mean(predict_list)

In [16]:
import random    
seed_num = 42
random.seed(seed_num)
path = '/project/LSH/'
X = np.load(path + 'x_(7727,10,4068).npy')
y = np.load(path + 'y_(7727,1).npy')

idx = list(range(len(X)))
random.shuffle(idx)


i = round(X.shape[0]*0.8)
X_train, y_train = X[idx[:i],:,:], y[idx[:i]]
X_test, y_test = X[idx[i:],:,:], y[idx[i:]]

X_train.shape, y_train.shape, X_test.shape, y_test.shape

((6182, 10, 4068), (6182,), (1545, 10, 4068), (1545,))

In [18]:
base_pred = np.mean(model.predict(X_test))
base_pred

0.63693285

In [21]:
result = []
for i in tqdm(range(X_test.shape[2])):
    save_cols = X_test[:,:,i].copy()
    X_test[:,:,i] = 1
    pred = model.predict(X_test)
    result.append({'feature' : str(COLS[i]), 'value' : np.mean(pred)})
    #값 복원
    X_test[:,:,i] = save_cols

  0%|          | 0/4068 [00:00<?, ?it/s]

In [38]:
sign_fi = pd.DataFrame(result)

In [32]:
sign_fi['baseline'] = base_pred
sign_fi['diff'] = sign_fi['score'] - sign_fi['baseline']

In [34]:
#부호 판별하여 1, -1로 변경하는 함수
def only_sign(x):
    if x >= 0:
        return 1
    else:
        return -1

In [35]:
sign_fi['sign'] = sign_fi['diff'].apply(lambda x:only_sign(x))
sign_fi = sign_fi.sort_values('feature')

In [37]:
sign_fi['sign'].value_counts()

-1    4068
Name: sign, dtype: int64

In [25]:
result = sign_fi.feature.to_frame()
result['score'] = relu_fi['diff'] * sign_fi['value']
result = result.sort_values('score', ascending=False)
result

Unnamed: 0,feature,score
0,0,0.059180
1,50803,0.036668
2,50804,0.032467
3,50805,0.029130
4,50806,0.027498
...,...,...
3996,67457019015,0.000000
3542,58160082011,0.000000
3543,58160082611,0.000000
3544,58160083501,0.000000


In [26]:
# result.to_csv('./data/method1_score.csv', index=False)