# **Library Import**

In [1]:
import os
import numpy as np
import pandas as pd
from sklearn.metrics import roc_auc_score, recall_score, precision_score, confusion_matrix
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import StratifiedKFold

from matplotlib import pyplot as plt
import seaborn as sns
import torch
import torch.nn as nn
from pytorch_tabnet.tab_model import TabNetClassifier, TabNetRegressor

In [2]:
data=pd.read_csv('FDS_MART(20210804).csv',encoding='cp949')

  interactivity=interactivity, compiler=compiler, result=result)


# **새로운 feature 생성 -p28d_acc_addr_count**

In [3]:
from datetime import datetime, timedelta

data['FRST_RCV_DT_dtime'] = 0

def int_to_dt(x):
    x = str(x)
    return datetime(year=int(x[0:4]), month=int(x[4:6]), day=int(x[6:8]))

data['FRST_RCV_DT_dtime'] = data['FRST_RCV_DT'].apply(lambda x : int_to_dt(x) )
data['FRST_RCV_DT_dtime']

d0228 = data['FRST_RCV_DT_dtime'].iloc[-1]
print('    ',d0228)
d28 = timedelta(days = 28)
p28days = (pd.date_range(d0228 - d28, periods=28)).strftime('%Y%m%d')
print('    ',p28days[0])
print('    ',p28days[-1])

     2021-02-28 00:00:00
     20210131
     20210227


In [4]:
from pandarallel import pandarallel

pandarallel.initialize(nb_workers=8)

d28 = timedelta(days = 28)
# d14 = timedelta(days = 14)

def p28d_acc_addr_count(x):
    
    p28days = (pd.date_range(x - d28, periods=28)).strftime('%Y%m%d')
    return len(
        data[(data['FRST_RCV_DT_dtime']>=p28days[0])&(data['FRST_RCV_DT_dtime']<=p28days[-1])]
    )

data['p28d_acc_addr_count'] = 0

data['p28d_acc_addr_count'] = data['FRST_RCV_DT_dtime'].parallel_apply(lambda x :p28d_acc_addr_count(x) )
data['p28d_acc_addr_count']

INFO: Pandarallel will run on 8 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.


0            0
1            0
2            0
3            0
4            0
          ... 
136544    2596
136545    2596
136546    2596
136547    2596
136548    2596
Name: p28d_acc_addr_count, Length: 136549, dtype: int64

In [6]:
raw_data=data

raw_data['cube6_CATH_AMT'] =0

def cube6_transform(x):
    return x**(1/6)

raw_data['cube6_CATH_AMT'] = raw_data['CATH_AMT'].apply(lambda x : cube6_transform(x) )

# target = data['CATH_AMT']**(1/6)

not_na_col_ls = []

for col in raw_data.columns:
    if raw_data[col].isna().any() == False:
        not_na_col_ls.append(col)
        
target_feat = [
    'FDS_REQ',
    'CEMP_REQ',
    'SIU_REQ',
    'SIU_CEMP_REQ',
    'CUST_RANK',
    'RULE_APLY_CNT',
    'RUL_CNT',
    'SIU_CLAS_CD1',
    'SIU_CLAS_CD_TOTAL',
    'BOGUS_ACDNT_CD',
    'PREDICT_MODEL',
    'HYBRID_MODEL',
    'HYBRID_RED',
    'HYBRID_ORANGE',
    'HYBRID_YELLOW',
    'FDS_CATH',
    'CEMP_CATH',
    'SIU_CATH',
    'SIU_CEMP_CATH',
    'CATH_AMT',
    'INSR_MODEL',
    'CONT_MODEL',
    'INJ_MODEL',
    'INJ_POS_MODEL',
    'ALT_GUBUN']

for t_feat in target_feat:
    try:
        not_na_col_ls.remove(t_feat)
    except:
        print(t_feat, 'is not in train_cols')
        pass
    
not_na_col_ls.remove('ACDNT_NO')
not_na_col_ls.remove('POLICY_NO')
not_na_col_ls.remove('FRST_RCV_DT')
not_na_col_ls.remove('cube6_CATH_AMT')

cat_cols = not_na_col_ls

raw_data['cube6_CATH_AMT']

CUST_RANK is not in train_cols
RULE_APLY_CNT is not in train_cols
SIU_CLAS_CD_TOTAL is not in train_cols
INSR_MODEL is not in train_cols
CONT_MODEL is not in train_cols
INJ_MODEL is not in train_cols
INJ_POS_MODEL is not in train_cols


0         0.0
1         0.0
2         0.0
3         0.0
4         0.0
         ... 
136544    0.0
136545    0.0
136546    0.0
136547    0.0
136548    0.0
Name: cube6_CATH_AMT, Length: 136549, dtype: float64

# **train, test split**

In [33]:
cat_cols = ['ACDNT_TOTAL_TEXT_1', 'ACDNT_CLAS_CD2', 'DRUNK_YN', 'MAKER_G', 'DRIVER_LICNS_STAT', 'ACCO_BI_A_MAX_DIST', 'ACCO_BI_A_AVG_DIST', 'APPD_A_MAX_DIST', 'APPD_OCD_AVG_DIST', 'APPD_A_AVG_DIST', 'CAR_TYPE_NM', 'ACC_ADDR1', 'APPD_OCD_MAX_DIST', 'INS_TYPE_CO_VAL', 'APVC_A_AVG_DIST', 'DRIVER_LICNS_AREA', 'APVC_A_MAX_DIST', 'DISPATCH_TOWING', 'DRIVER_VOCTN_TST_GAP_G', 'POLICE_RPRT_CD', 'LIC_2M', 'CURE_YN', 'OAI_FALT_RATE_G', 'DRIVER_LICNS_YEAR', 'LIC_1M', 'FPC', 'ACC_AREA_RANK', 'LICNS_TYPE_CD', 'ADK_HOUSE_TYPE_DTL', 'APVC_OTH_MAX_DIST', 'DRVER_AGE', 'ACDNT_POL_GAP', 'ACDNT_INSPCT_TEXT', 'APVC_OTH_AVG_DIST', 'LIC_1L', 'DRIVER_RANGE_SPCL_105', 'N_APROF_01', 'N_BPROF_05', 'INSRD_RLATN', 'MNG_AREA_CD', 'N_BPROF_09', 'N_PROF_15', 'N_PROF_14', 'N_APROF_31', 'HO_AP_DIST', 'N_BPROF_10', 'N_APROF_30', 'N_APROF_12', 'N_CPROF_05', 'N_PROF_13', 'FDS_CATH','cube6_CATH_AMT']

In [84]:
raw_data['ACC_AREA_RANK'].fillna('A', inplace=True)

In [85]:
train = raw_data[cat_cols].iloc[:118888]
test = raw_data[cat_cols].iloc[118888:]

In [86]:
train=train[train['FDS_CATH']=='Y']
train.reset_index(drop=True, inplace=True)
train.drop('FDS_CATH', axis=1, inplace=True)
test.drop('FDS_CATH', axis=1, inplace=True)

target='cube6_CATH_AMT'

# **기존 모델**

In [62]:
nunique = train.nunique()
types = train.dtypes

categorical_columns=[]
categorical_dims ={}

for col in train.columns:
    if types[col] == 'object' or nunique[col] <200:
        print(col, train[col].nunique())
        l_enc= LabelEncoder()
        train[col]= train[col].fillna("VV_likely")
        train[col]= l_enc.fit_transform(train[col].values)
        #test[col]= l_enc.transform(test[col].values)
        categorical_columns.append(col)
        categorical_dims[col] = len(l_enc.classes_)
    else:
        train.fillna(train.loc[train.index, col].mean(), inplace=True)

ACDNT_TOTAL_TEXT_1 4
ACDNT_CLAS_CD2 4
DRUNK_YN 2
MAKER_G 12
DRIVER_LICNS_STAT 3
CAR_TYPE_NM 7
ACC_ADDR1 17
INS_TYPE_CO_VAL 6
DRIVER_LICNS_AREA 18
DISPATCH_TOWING 4
DRIVER_VOCTN_TST_GAP_G 7
POLICE_RPRT_CD 3
LIC_2M 3
CURE_YN 3
OAI_FALT_RATE_G 6
DRIVER_LICNS_YEAR 55
LIC_1M 3
FPC 3
ACC_AREA_RANK 4
LICNS_TYPE_CD 4
ADK_HOUSE_TYPE_DTL 4
DRVER_AGE 74
ACDNT_INSPCT_TEXT 2
LIC_1L 3
DRIVER_RANGE_SPCL_105 7
INSRD_RLATN 5


In [63]:
features =[col for col in train.columns if col not in [target]]
cat_idxs = [i for i, f in enumerate(features) if f in categorical_columns]
cat_dims = [categorical_dims[f] for i, f in enumerate(features) if f in categorical_columns]

In [64]:
train_x=train[features].values
train_y=train[target].values
train_y=train_y.reshape(-1,1)

In [65]:
#nunique = train.nunique()
#types = train.dtypes

categorical_columns=[]
categorical_dims ={}

for col in test.columns:
    if types[col] == 'object' or nunique[col] <200:
        print(col, test[col].nunique())
        l_enc= LabelEncoder()
        test[col]= test[col].fillna("VV_likely")
        test[col]= l_enc.fit_transform(test[col].values)
        categorical_columns.append(col)
        categorical_dims[col] = len(l_enc.classes_)
    else:
        test.fillna(test.loc[:, col].mean(), inplace=True)
        

test_x=test[features].values
test_y=test[target].values

ACDNT_TOTAL_TEXT_1 4
ACDNT_CLAS_CD2 4
DRUNK_YN 2
MAKER_G 14
DRIVER_LICNS_STAT 3
CAR_TYPE_NM 7
ACC_ADDR1 18
INS_TYPE_CO_VAL 6
DRIVER_LICNS_AREA 18
DISPATCH_TOWING 4
DRIVER_VOCTN_TST_GAP_G 7
POLICE_RPRT_CD 3
LIC_2M 3
CURE_YN 3
OAI_FALT_RATE_G 7
DRIVER_LICNS_YEAR 64
LIC_1M 3
FPC 3
ACC_AREA_RANK 4
LICNS_TYPE_CD 4
ADK_HOUSE_TYPE_DTL 4
DRVER_AGE 75
ACDNT_INSPCT_TEXT 2
LIC_1L 3
DRIVER_RANGE_SPCL_105 9
INSRD_RLATN 5


In [66]:
import optuna
from optuna import Trial, visualization
from sklearn.model_selection import KFold

def Objective(trial):
    mask_type = trial.suggest_categorical("mask_type", ["entmax", "sparsemax"])
    n_da = trial.suggest_int("n_da", 56, 64, step=4)
    n_steps = trial.suggest_int("n_steps", 1, 3, step=1)
    gamma = trial.suggest_float("gamma",  1., 1.4, step=0.2)
    n_shared = trial.suggest_int("n_shared", 1, 3)
    lambda_sparse = trial.suggest_float("lambda_sparse", 1e-6, 1e-3, log=True)
    tabnet_params=dict(n_d=n_da, n_a=n_da, n_steps=n_steps, gamma=gamma,
                      lambda_sparse=lambda_sparse, optimizer_fn=torch.optim.Adam,
                      optimizer_params=dict(lr=2e-2, weight_decay=1e-5),
                      mask_type=mask_type, n_shared=n_shared,
                      scheduler_params=dict(mode="min", patience=trial.suggest_int("patienceScheduler", low=3, high=10),
                                           min_lr=1e-5, 
                                           factor=0.5),
                      scheduler_fn=torch.optim.lr_scheduler.ReduceLROnPlateau,
                      verbose=0
                      )
    
    kf=KFold(n_splits=10, random_state=42, shuffle=True)
    CV_score_array = []
    for train_index, val_index in kf.split(train_x, train_y):
        X_train, X_val = train_x[train_index], train_x[val_index]
        y_train, y_val = train_y[train_index], train_y[val_index]
        reg=TabNetRegressor(**tabnet_params)
        reg.fit(X_train=X_train, y_train=y_train, 
               eval_set=[(X_train, y_train), (X_val, y_val)],
               patience=trial.suggest_int("patience", low=15, high=30), max_epochs=trial.suggest_int('epochs', 1, 100),
               eval_metric=['mse'])
        CV_score_array.append(reg.best_cost)
    avg=np.mean(CV_score_array)
    return avg

In [67]:
torch.set_num_threads(2)

In [68]:
study_nfs=optuna.create_study(direction='minimize', study_name='TabNet optimization')
study_nfs.optimize(Objective, n_trials=100, timeout=180)


Early stopping occurred at epoch 38 with best_epoch = 18 and best_val_1_mse = 5.89588
Best weights from best epoch are automatically used!

Early stopping occurred at epoch 33 with best_epoch = 13 and best_val_1_mse = 7.67662
Best weights from best epoch are automatically used!
Stop training because you reached max_epochs = 86 with best_epoch = 73 and best_val_1_mse = 7.45359
Best weights from best epoch are automatically used!

Early stopping occurred at epoch 32 with best_epoch = 12 and best_val_1_mse = 8.37413
Best weights from best epoch are automatically used!

Early stopping occurred at epoch 57 with best_epoch = 37 and best_val_1_mse = 9.59735
Best weights from best epoch are automatically used!

Early stopping occurred at epoch 32 with best_epoch = 12 and best_val_1_mse = 10.74524
Best weights from best epoch are automatically used!
Stop training because you reached max_epochs = 86 with best_epoch = 73 and best_val_1_mse = 9.77754
Best weights from best epoch are automatically

[I 2021-12-08 13:52:50,296] Finished trial#0 with value: 8.455019596690317 with parameters: {'mask_type': 'entmax', 'n_da': 56, 'n_steps': 1, 'gamma': 1.2, 'n_shared': 1, 'lambda_sparse': 0.00020250441157316552, 'patienceScheduler': 9, 'patience': 20, 'epochs': 86}. Best is trial#0 with value: 8.455019596690317.


In [69]:
TabNet_params = study_nfs.best_params

In [70]:
#TabNet_params ={'mask_type': 'entmax', 'n_da': 64, 'n_steps': 1, 'gamma': 1.4, 'n_shared': 3, 'lambda_sparse': 2.3368609952485577e-06, 'patienceScheduler': 8, 'patience': 18, 'epochs': 35}

In [71]:
print(TabNet_params)

{'mask_type': 'entmax', 'n_da': 56, 'n_steps': 1, 'gamma': 1.2, 'n_shared': 1, 'lambda_sparse': 0.00020250441157316552, 'patienceScheduler': 9, 'patience': 20, 'epochs': 86}


In [72]:
final_params = dict(n_d=TabNet_params['n_da'], n_a=TabNet_params['n_da'], n_steps=TabNet_params['n_steps'], gamma=TabNet_params['gamma'],
                     lambda_sparse=TabNet_params['lambda_sparse'], optimizer_fn=torch.optim.Adam,
                     optimizer_params=dict(lr=2e-2, weight_decay=1e-5),
                     mask_type=TabNet_params['mask_type'], n_shared=TabNet_params['n_shared'],
                     scheduler_params=dict(mode="min",
                                           patience=TabNet_params['patienceScheduler'],
                                           min_lr=1e-5,
                                           factor=0.5,),
                     scheduler_fn=torch.optim.lr_scheduler.ReduceLROnPlateau,
                     verbose=10,
                     )
epochs = TabNet_params['epochs']

In [73]:
regressor = TabNetRegressor(**final_params)
regressor.fit(X_train=train_x, y_train=train_y, 
              patience=TabNet_params['patience'], max_epochs=epochs, eval_metric=['mse'])

Device used : cpu
No early stopping will be performed, last training weights will be used.
epoch 0  | loss: 96.41173|  0:00:00s
epoch 10 | loss: 6.13846 |  0:00:06s
epoch 20 | loss: 3.88524 |  0:00:12s
epoch 30 | loss: 2.43968 |  0:00:18s
epoch 40 | loss: 1.69755 |  0:00:24s
epoch 50 | loss: 1.33387 |  0:00:29s
epoch 60 | loss: 1.06426 |  0:00:35s
epoch 70 | loss: 0.93891 |  0:00:41s
epoch 80 | loss: 0.82511 |  0:00:46s


In [74]:
PATH='/home/users/adk8400153/AXA_FDS/AUTO_FDS/optuna_tabnet_reg.pth'
torch.save(regressor, PATH)

In [75]:
#test 정답 도출
test_prediction=np.zeros((test.shape[0], 2))
model=torch.load(PATH)
test_prediction = model.predict(test_x)
test_prediction=test_prediction**6

In [111]:
#####################################
reg_path = '/home/users/adk8400153/AXA_FDS/AUTO_FDS/optuna_tabnet_reg.pth' #경로 지정 필요
reg_cols = ['ACDNT_TOTAL_TEXT_1', 'ACDNT_CLAS_CD2', 'DRUNK_YN', 'MAKER_G', 'DRIVER_LICNS_STAT', 'ACCO_BI_A_MAX_DIST', 'ACCO_BI_A_AVG_DIST', 'APPD_A_MAX_DIST', 'APPD_OCD_AVG_DIST', 'APPD_A_AVG_DIST', 'CAR_TYPE_NM', 'ACC_ADDR1', 'APPD_OCD_MAX_DIST', 'INS_TYPE_CO_VAL', 'APVC_A_AVG_DIST', 'DRIVER_LICNS_AREA', 'APVC_A_MAX_DIST', 'DISPATCH_TOWING', 'DRIVER_VOCTN_TST_GAP_G', 'POLICE_RPRT_CD', 'LIC_2M', 'CURE_YN', 'OAI_FALT_RATE_G', 'DRIVER_LICNS_YEAR', 'LIC_1M', 'FPC', 'ACC_AREA_RANK', 'LICNS_TYPE_CD', 'ADK_HOUSE_TYPE_DTL', 'APVC_OTH_MAX_DIST', 'DRVER_AGE', 'ACDNT_POL_GAP', 'ACDNT_INSPCT_TEXT', 'APVC_OTH_AVG_DIST', 'LIC_1L', 'DRIVER_RANGE_SPCL_105', 'N_APROF_01', 'N_BPROF_05', 'INSRD_RLATN', 'MNG_AREA_CD', 'N_BPROF_09', 'N_PROF_15', 'N_PROF_14', 'N_APROF_31', 'HO_AP_DIST', 'N_BPROF_10', 'N_APROF_30', 'N_APROF_12', 'N_CPROF_05', 'N_PROF_13']
reg_model = torch.load(reg_path)

reg_prediction = pd.DataFrame(model.predict(test_x)**6 )

In [112]:
reg_prediction

Unnamed: 0,0
0,5.145318e+06
1,5.796030e+05
2,1.476628e+06
3,1.007487e+06
4,9.278599e+00
...,...
17656,2.308355e+05
17657,2.548579e+06
17658,2.019416e+06
17659,8.655658e+05


# **classifier 예측값 도출**

In [105]:
#5. Fraud Detection by Deep Learning 파일 참조!
cat_prediction=pd.read_csv('cat_tabnet_test_prediction.csv')

In [124]:
cat_prediction

Unnamed: 0,p0,p1
0,0.932708,0.067292
1,0.985572,0.014428
2,0.995532,0.004469
3,0.990771,0.009229
4,0.986624,0.013376
...,...,...
17656,0.993602,0.006398
17657,0.987951,0.012049
17658,0.945076,0.054924
17659,0.984969,0.015031


# **결과값 도출**

In [123]:
expected_cath = pd.DataFrame(cat_prediction['p1']*reg_prediction[0]) #classifier의 probability와 regressor의 amount 곱하기
test_target = raw_data.iloc[118888:]['CATH_AMT']
test_target2 = pd.get_dummies(raw_data.iloc[118888:]['FDS_CATH'])['Y']

score = pd.concat([test_target.reset_index(drop=True),test_target2.reset_index(drop=True),pd.DataFrame(expected_cath).reset_index(drop=True)[0]],axis=1)

######캡 씌우기#######
threshold = 18488000


score.loc[score['CATH_AMT']>threshold, 'CATH_AMT']=threshold
score.sort_values(0, ascending=False, inplace=True)
score=score.reset_index().drop('index', axis=1).copy()

####################################################################################
n_group=10
group=['G01','G02','G03','G04','G05','G06','G07','G08','G09','G10']

bins=round(len(score)/n_group) 
bins_1=len(score)-bins*(n_group-1) 

group_list=[]
for i in range(n_group):
    if i <n_group-1: 
        group_list.append(np.repeat(group[i], bins).tolist())
    else: 
        group_list.append(np.repeat(group[i], bins_1).tolist())
    
import itertools

group_list=list(itertools.chain(*group_list))
score['group']=pd.Series(group_list)

function_list=['count','sum','mean']
table2=score.groupby(['group']).agg(function_list).reset_index()

table2

Unnamed: 0_level_0,group,CATH_AMT,CATH_AMT,CATH_AMT,Y,Y,Y,0,0,0
Unnamed: 0_level_1,Unnamed: 1_level_1,count,sum,mean,count,sum,mean,count,sum,mean
0,G01,1766,355565479,201339.455832,1766,84,0.047565,1766,486818600.0,275661.704498
1,G02,1766,96679504,54744.906002,1766,41,0.023216,1766,98043640.0,55517.35029
2,G03,1766,62086031,35156.302945,1766,29,0.016421,1766,59493060.0,33688.031416
3,G04,1766,39226215,22211.899773,1766,16,0.00906,1766,40222900.0,22776.275384
4,G05,1766,37805705,21407.533975,1766,13,0.007361,1766,28756680.0,16283.511633
5,G06,1766,66901189,37882.892978,1766,15,0.008494,1766,20639950.0,11687.398499
6,G07,1766,31521077,17848.854473,1766,17,0.009626,1766,14640780.0,8290.363481
7,G08,1766,3849000,2179.501699,1766,5,0.002831,1766,10110780.0,5725.244564
8,G09,1766,32815613,18581.887316,1766,7,0.003964,1766,5978285.0,3385.212166
9,G10,1767,12492710,7070.011319,1767,2,0.001132,1767,2277343.0,1288.819122
