In [1]:
# for "2. Data Loading"
import pandas as pd
pd.options.display.float_format = '{:.5f}'.format
# for "3-1. Feature Generation"
import numpy as np

# for "3-2. Feature Engineering"
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn.preprocessing import RobustScaler, StandardScaler

# # for 4
# from xgboost import XGBClassifier

# for "5. Modeling with CatBoostRegressor"
import optuna
from optuna import Trial
from optuna.samplers import TPESampler
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import train_test_split, StratifiedKFold

In [2]:
import optuna
from optuna import Trial, visualization
import time
import joblib
from optuna.samplers import TPESampler

In [3]:
import random
import os

def seed_everything(seed: int = 42):
    random.seed(seed)
    np.random.seed(seed)
    os.environ["PYTHONHASHSEED"] = str(seed)
    # torch.manual_seed(seed)
    # torch.cuda.manual_seed(seed)  # type: ignore
    # torch.backends.cudnn.deterministic = True  # type: ignore
    # torch.backends.cudnn.benchmark = True  # type: ignore

seed_everything()

# Data Load

In [4]:
raw_data = pd.read_csv('FDS_MART(20210804).csv',encoding='cp949')
raw_data


Columns (623) have mixed types.Specify dtype option on import or set low_memory=False.



Unnamed: 0,ACDNT_NO,POLICY_NO,FRST_RCV_DT,ACDNT_GAP,ACDNT_7DAY_YN,NIGHT_ACDNT_YN,DAYWEEK_CD,MNG_AREA_TYPE_CD,MNG_AREA_CD,ACDNT_CLAS_CD1,...,FDS_CATH,CEMP_CATH,SIU_CATH,SIU_CEMP_CATH,CATH_AMT,INSR_MODEL,CONT_MODEL,INJ_MODEL,INJ_POS_MODEL,ALT_GUBUN
0,1800000003,A17121107765,20180101,76.90000,N,N,1,G3,5310,30,...,N,N,N,N,0,10.00000,4.00000,,,일반건
1,1800000004,A17040344397,20180101,7.76700,N,Y,2,G3,44990,20,...,N,N,N,N,0,10.00000,2.00000,,,고위험
2,1800000013,A17121028936,20180101,147.85000,N,Y,2,G3,37124,20,...,N,N,N,N,0,1.00000,1.00000,,,고위험
3,1800000020,A17020116294,20180101,5.73300,N,N,2,G3,54084,20,...,N,N,N,N,0,10.00000,10.00000,,,일반건
4,1800000024,A17080728803,20180101,12.83300,N,N,2,G3,28209,20,...,N,N,N,N,0,10.00000,10.00000,,,일반건
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
136544,2100033918,A21010017808,20210228,99.70000,N,N,1,G3,13245,20,...,N,N,N,N,0,10.00000,10.00000,,,일반건
136545,2100033936,A20090799754,20210228,53.28300,N,N,1,G3,37829,10,...,N,N,N,N,0,10.00000,10.00000,,,일반건
136546,2100033938,A21010032320,20210228,12.01700,N,N,1,G3,26153,10,...,N,N,N,N,0,4.00000,8.00000,,,일반건
136547,2100033946,A20040344584,20210228,95.46700,N,N,1,G3,1143,20,...,N,N,N,N,0,10.00000,10.00000,,,일반건


In [5]:
raw_data

Unnamed: 0,ACDNT_NO,POLICY_NO,FRST_RCV_DT,ACDNT_GAP,ACDNT_7DAY_YN,NIGHT_ACDNT_YN,DAYWEEK_CD,MNG_AREA_TYPE_CD,MNG_AREA_CD,ACDNT_CLAS_CD1,...,FDS_CATH,CEMP_CATH,SIU_CATH,SIU_CEMP_CATH,CATH_AMT,INSR_MODEL,CONT_MODEL,INJ_MODEL,INJ_POS_MODEL,ALT_GUBUN
0,1800000003,A17121107765,20180101,76.90000,N,N,1,G3,5310,30,...,N,N,N,N,0,10.00000,4.00000,,,일반건
1,1800000004,A17040344397,20180101,7.76700,N,Y,2,G3,44990,20,...,N,N,N,N,0,10.00000,2.00000,,,고위험
2,1800000013,A17121028936,20180101,147.85000,N,Y,2,G3,37124,20,...,N,N,N,N,0,1.00000,1.00000,,,고위험
3,1800000020,A17020116294,20180101,5.73300,N,N,2,G3,54084,20,...,N,N,N,N,0,10.00000,10.00000,,,일반건
4,1800000024,A17080728803,20180101,12.83300,N,N,2,G3,28209,20,...,N,N,N,N,0,10.00000,10.00000,,,일반건
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
136544,2100033918,A21010017808,20210228,99.70000,N,N,1,G3,13245,20,...,N,N,N,N,0,10.00000,10.00000,,,일반건
136545,2100033936,A20090799754,20210228,53.28300,N,N,1,G3,37829,10,...,N,N,N,N,0,10.00000,10.00000,,,일반건
136546,2100033938,A21010032320,20210228,12.01700,N,N,1,G3,26153,10,...,N,N,N,N,0,4.00000,8.00000,,,일반건
136547,2100033946,A20040344584,20210228,95.46700,N,N,1,G3,1143,20,...,N,N,N,N,0,10.00000,10.00000,,,일반건


In [6]:
def reduce_mem_usage(df):
    """ 
    iterate through all the columns of a dataframe and 
    modify the data type to reduce memory usage.        
    """
    start_mem = df.memory_usage().sum() / 1024**2
    print(f'Memory usage of dataframe is {start_mem:.2f}MB')
    
    for col in df.columns:
        col_type = df[col].dtype
        
        if col_type != object:
            c_min = df[col].min()
            c_max = df[col].max()
            if str(col_type)[:3] == 'int':
                if c_min > np.iinfo(np.int8).min and c_max <\
                  np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max <\
                   np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max <\
                   np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                elif c_min > np.iinfo(np.int64).min and c_max <\
                   np.iinfo(np.int64).max:
                    df[col] = df[col].astype(np.int64)  
            elif str(col_type)[:5] == 'float':
                if c_min > np.finfo(np.float16).min and c_max <\
                   np.finfo(np.float16).max:
                    df[col] = df[col].astype(np.float16)
                elif c_min > np.finfo(np.float32).min and c_max <\
                   np.finfo(np.float32).max:
                    df[col] = df[col].astype(np.float32)
                else:
                    df[col] = df[col].astype(np.float64)
            else:
                pass
        else:
            df[col] = df[col].astype('category')
    end_mem = df.memory_usage().sum() / 1024**2
    print(f'Memory usage after optimization is: {end_mem:.2f}MB')
    print(f'Decreased by {100*((start_mem - end_mem)/start_mem):.1f}%')
    
    return df

## Data Mem Reduce - 1minute 

In [7]:
%%time
raw_data = reduce_mem_usage(raw_data)

Memory usage of dataframe is 666.74MB
Memory usage after optimization is: 152.53MB
Decreased by 77.1%
CPU times: user 26.9 s, sys: 25.9 s, total: 52.8 s
Wall time: 52.9 s


# New feature: previous 28d count

In [8]:
from datetime import datetime, timedelta

raw_data['FRST_RCV_DT_dtime'] = 0

def int_to_dt(x):
    x = str(x)
    return datetime(year=int(x[0:4]), month=int(x[4:6]), day=int(x[6:8]))

raw_data['FRST_RCV_DT_dtime'] = raw_data['FRST_RCV_DT'].apply(lambda x : int_to_dt(x) )

d0228 = raw_data['FRST_RCV_DT_dtime'].iloc[-1]
print('    ',d0228)
d28 = timedelta(days = 28)
p28days = (pd.date_range(d0228 - d28, periods=28)).strftime('%Y%m%d')
print('    ',p28days[0])
print('    ',p28days[-1])

     2021-02-28 00:00:00
     20210131
     20210227


In [9]:
from pandarallel import pandarallel

pandarallel.initialize(nb_workers=8)

d28 = timedelta(days = 28)
# d14 = timedelta(days = 14)

def p28d_acc_addr_count(x):
    
    p28days = (pd.date_range(x - d28, periods=28)).strftime('%Y%m%d')
    return len(
        raw_data[(raw_data['FRST_RCV_DT_dtime']>=p28days[0])&(raw_data['FRST_RCV_DT_dtime']<=p28days[-1])]
    )

raw_data['p28d_acc_addr_count'] = 0

raw_data['p28d_acc_addr_count'] = raw_data['FRST_RCV_DT_dtime'].parallel_apply(lambda x :p28d_acc_addr_count(x) )
raw_data['p28d_acc_addr_count']

INFO: Pandarallel will run on 8 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.


0            0
1            0
2            0
3            0
4            0
          ... 
136544    2596
136545    2596
136546    2596
136547    2596
136548    2596
Name: p28d_acc_addr_count, Length: 136549, dtype: int64

## Target 변형

In [10]:
raw_data['cube6_CATH_AMT'] =0

def cube6_transform(x):
    return x**(1/6)

raw_data['cube6_CATH_AMT'] = raw_data['CATH_AMT'].apply(lambda x : cube6_transform(x) )

# target = data['CATH_AMT']**(1/6)

not_na_col_ls = []

for col in raw_data.columns:
    if raw_data[col].isna().any() == False:
        not_na_col_ls.append(col)
        
target_feat = [
    'FDS_REQ',
    'CEMP_REQ',
    'SIU_REQ',
    'SIU_CEMP_REQ',
    'CUST_RANK',
    'RULE_APLY_CNT',
    'RUL_CNT',
    'SIU_CLAS_CD1',
    'SIU_CLAS_CD_TOTAL',
    'BOGUS_ACDNT_CD',
    'PREDICT_MODEL',
    'HYBRID_MODEL',
    'HYBRID_RED',
    'HYBRID_ORANGE',
    'HYBRID_YELLOW',
    'FDS_CATH',
    'CEMP_CATH',
    'SIU_CATH',
    'SIU_CEMP_CATH',
    'CATH_AMT',
    'INSR_MODEL',
    'CONT_MODEL',
    'INJ_MODEL',
    'INJ_POS_MODEL',
    'ALT_GUBUN']

for t_feat in target_feat:
    try:
        not_na_col_ls.remove(t_feat)
    except:
        print(t_feat, 'is not in train_cols')
        pass
    
not_na_col_ls.remove('ACDNT_NO')
not_na_col_ls.remove('POLICY_NO')
not_na_col_ls.remove('FRST_RCV_DT')
not_na_col_ls.remove('cube6_CATH_AMT')

cat_cols = not_na_col_ls

raw_data['cube6_CATH_AMT']

CUST_RANK is not in train_cols
RULE_APLY_CNT is not in train_cols
SIU_CLAS_CD_TOTAL is not in train_cols
INSR_MODEL is not in train_cols
CONT_MODEL is not in train_cols
INJ_MODEL is not in train_cols
INJ_POS_MODEL is not in train_cols


0        0.00000
1        0.00000
2        0.00000
3        0.00000
4        0.00000
           ...  
136544   0.00000
136545   0.00000
136546   0.00000
136547   0.00000
136548   0.00000
Name: cube6_CATH_AMT, Length: 136549, dtype: float64

# CatB

In [19]:
from catboost import CatBoostClassifier, Pool

## Train, Valid, Test Split - multi SKF

In [20]:
past_6m_data = raw_data.iloc[:118888]
past_6m_data = past_6m_data[past_6m_data['FDS_CATH']=='Y']

recent_6m_data = raw_data.iloc[118888:]
recent_6m_data = recent_6m_data[recent_6m_data['FDS_CATH']=='Y']

In [21]:
data= raw_data[raw_data['FDS_CATH']=='Y']

In [22]:
# Train & Valid

past_6m_data

Unnamed: 0,ACDNT_NO,POLICY_NO,FRST_RCV_DT,ACDNT_GAP,ACDNT_7DAY_YN,NIGHT_ACDNT_YN,DAYWEEK_CD,MNG_AREA_TYPE_CD,MNG_AREA_CD,ACDNT_CLAS_CD1,...,SIU_CEMP_CATH,CATH_AMT,INSR_MODEL,CONT_MODEL,INJ_MODEL,INJ_POS_MODEL,ALT_GUBUN,FRST_RCV_DT_dtime,p28d_acc_addr_count,cube6_CATH_AMT
21,1800000094,A17100881962,20180101,32.56700,N,N,2,G3,28796,20,...,N,300000,10.00000,7.00000,,,일반건,2018-01-01,0,8.18189
32,1800000155,A17020065146,20180101,13.85000,N,N,2,G3,52924,20,...,Y,455600,10.00000,9.00000,,,일반건,2018-01-01,0,8.77197
55,1800000284,A17121070236,20180101,12.56700,N,N,2,G3,4398,20,...,N,3467420,2.00000,2.00000,,,고위험,2018-01-01,0,12.30272
58,1800000297,A17121070236,20180101,13.36700,N,N,2,G3,4398,20,...,N,1000000,10.00000,10.00000,,,일반건,2018-01-01,0,10.00000
101,1800000545,A17111049411,20180102,2377.30005,N,N,1,G3,48498,30,...,Y,814055,9.00000,2.00000,,,고위험,2018-01-02,86,9.66293
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
118304,2000146245,A19090772512,20200826,44694.50000,N,N,1,G3,13367,30,...,N,808115,8.00000,7.00000,,,일반건,2020-08-26,3041,9.65115
118447,2000147126,A20020118629,20200827,445.11700,N,N,5,G2,13007,30,...,Y,5110000,1.00000,1.00000,,,고위험,2020-08-27,3030,13.12412
118467,2000147247,A20020080360,20200827,3301.35010,N,N,3,G3,5827,30,...,Y,9539418,10.00000,10.00000,,,일반건,2020-08-27,3030,14.56309
118678,2000148596,A20040246503,20200830,11.91700,N,N,1,G3,35019,20,...,N,1343540,10.00000,10.00000,,,일반건,2020-08-30,3006,10.50449


In [23]:
cat_cols = ['ACDNT_TOTAL_TEXT_1', 'ACDNT_CLAS_CD2', 'DRUNK_YN', 'MAKER_G', 'DRIVER_LICNS_STAT', 'ACCO_BI_A_MAX_DIST', 'ACCO_BI_A_AVG_DIST', 'APPD_A_MAX_DIST', 'APPD_OCD_AVG_DIST', 'APPD_A_AVG_DIST', 'CAR_TYPE_NM', 'ACC_ADDR1', 'APPD_OCD_MAX_DIST', 'INS_TYPE_CO_VAL', 'APVC_A_AVG_DIST', 'DRIVER_LICNS_AREA', 'APVC_A_MAX_DIST', 'DISPATCH_TOWING', 'DRIVER_VOCTN_TST_GAP_G', 'POLICE_RPRT_CD', 'LIC_2M', 'CURE_YN', 'OAI_FALT_RATE_G', 'DRIVER_LICNS_YEAR', 'LIC_1M', 'FPC', 'ACC_AREA_RANK', 'LICNS_TYPE_CD', 'ADK_HOUSE_TYPE_DTL', 'APVC_OTH_MAX_DIST', 'DRVER_AGE', 'ACDNT_POL_GAP', 'ACDNT_INSPCT_TEXT', 'APVC_OTH_AVG_DIST', 'LIC_1L', 'DRIVER_RANGE_SPCL_105', 'N_APROF_01', 'N_BPROF_05', 'INSRD_RLATN', 'MNG_AREA_CD', 'N_BPROF_09', 'N_PROF_15', 'N_PROF_14', 'N_APROF_31', 'HO_AP_DIST', 'N_BPROF_10', 'N_APROF_30', 'N_APROF_12', 'N_CPROF_05', 'N_PROF_13']
cat_cols

['ACDNT_TOTAL_TEXT_1',
 'ACDNT_CLAS_CD2',
 'DRUNK_YN',
 'MAKER_G',
 'DRIVER_LICNS_STAT',
 'ACCO_BI_A_MAX_DIST',
 'ACCO_BI_A_AVG_DIST',
 'APPD_A_MAX_DIST',
 'APPD_OCD_AVG_DIST',
 'APPD_A_AVG_DIST',
 'CAR_TYPE_NM',
 'ACC_ADDR1',
 'APPD_OCD_MAX_DIST',
 'INS_TYPE_CO_VAL',
 'APVC_A_AVG_DIST',
 'DRIVER_LICNS_AREA',
 'APVC_A_MAX_DIST',
 'DISPATCH_TOWING',
 'DRIVER_VOCTN_TST_GAP_G',
 'POLICE_RPRT_CD',
 'LIC_2M',
 'CURE_YN',
 'OAI_FALT_RATE_G',
 'DRIVER_LICNS_YEAR',
 'LIC_1M',
 'FPC',
 'ACC_AREA_RANK',
 'LICNS_TYPE_CD',
 'ADK_HOUSE_TYPE_DTL',
 'APVC_OTH_MAX_DIST',
 'DRVER_AGE',
 'ACDNT_POL_GAP',
 'ACDNT_INSPCT_TEXT',
 'APVC_OTH_AVG_DIST',
 'LIC_1L',
 'DRIVER_RANGE_SPCL_105',
 'N_APROF_01',
 'N_BPROF_05',
 'INSRD_RLATN',
 'MNG_AREA_CD',
 'N_BPROF_09',
 'N_PROF_15',
 'N_PROF_14',
 'N_APROF_31',
 'HO_AP_DIST',
 'N_BPROF_10',
 'N_APROF_30',
 'N_APROF_12',
 'N_CPROF_05',
 'N_PROF_13']

In [24]:
multi_label = pd.get_dummies(past_6m_data[['ACDNT_TOTAL_TEXT_1','ACDNT_CLAS_CD2','CAR_TYPE_NM','DRUNK_YN']])
multi_label

Unnamed: 0,ACDNT_TOTAL_TEXT_1_NN,ACDNT_TOTAL_TEXT_1_NY,ACDNT_TOTAL_TEXT_1_YN,ACDNT_TOTAL_TEXT_1_YY,ACDNT_CLAS_CD2_G1,ACDNT_CLAS_CD2_G2,ACDNT_CLAS_CD2_G3,ACDNT_CLAS_CD2_G4,CAR_TYPE_NM_CO,CAR_TYPE_NM_ETC,CAR_TYPE_NM_LA,CAR_TYPE_NM_ME,CAR_TYPE_NM_PO,CAR_TYPE_NM_RV,CAR_TYPE_NM_SA,CAR_TYPE_NM_SB,DRUNK_YN_N,DRUNK_YN_Y
21,0,1,0,0,0,1,0,0,0,0,0,1,0,0,0,0,1,0
32,1,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,1,0
55,0,0,0,1,0,1,0,0,0,0,0,0,1,0,0,0,0,1
58,0,1,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,1
101,1,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
118304,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,1,1,0
118447,0,1,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,1
118467,1,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,1,0
118678,1,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,1,0


In [25]:
%%time
# from sklearn.model_selection import KFold
from iterstrat.ml_stratifiers import MultilabelStratifiedKFold

n_splits = 10

skf = MultilabelStratifiedKFold(n_splits=n_splits, random_state=42, shuffle=True)
# MultilabelStratifiedKFold(n_splits=n_splits, random_state=42, shuffle=True)

i =0
for train_idx, test_idx in (skf.split(past_6m_data[cat_cols],multi_label)):
  
  globals()['train_feat_'+str(i+1)] = past_6m_data[cat_cols].iloc[train_idx]
  globals()['train_label_'+str(i+1)] = past_6m_data['cube6_CATH_AMT'].iloc[train_idx]

  globals()['valid_feat_'+str(i+1)] = past_6m_data[cat_cols].iloc[test_idx]
  globals()['valid_label_'+str(i+1)] = past_6m_data['cube6_CATH_AMT'].iloc[test_idx]
  i=i+1


Pass shuffle=True, random_state=42 as keyword args. From version 0.25 passing these as positional arguments will result in an error



CPU times: user 162 ms, sys: 1.29 ms, total: 163 ms
Wall time: 161 ms


In [26]:
for i in range(n_splits):
  print( len(globals()['train_feat_'+str(i+1)]), len(globals()['train_label_'+str(i+1)]),
        len(globals()['valid_feat_'+str(i+1)]), len(globals()['valid_label_'+str(i+1)]) )

3230 3230 359 359
3230 3230 359 359
3230 3230 359 359
3230 3230 359 359
3230 3230 359 359
3230 3230 359 359
3231 3231 358 358
3230 3230 359 359
3230 3230 359 359
3230 3230 359 359


# Fitting 2 : 50f fit

In [None]:
%%time

version='v_feat3c_Hard_50f_Reg_F1_2' # v2 : gamma = 3

from catboost import CatBoostRegressor, Pool

# asd = range(10)
for i in range(1):
#     i = i+1
    print('='*20, str(i+1), '='*20) 
    

    cat_feat_ls = []
    for item in list(data[cat_cols].select_dtypes(include='category').columns):
        cat_feat_ls.append(item)
    
    cat_features = cat_feat_ls
    
    train_data = globals()[f'train_feat_{i+1}']
    train_labels = globals()[f'train_label_{i+1}']
    
    valid_data = globals()[f'valid_feat_{i+1}']
    valid_labels = globals()[f'valid_label_{i+1}']


    globals()[f'CatB_model_{i+1}'] = CatBoostRegressor(iterations=80000,
                                                        depth=12,
                                                        thread_count=1,
                                                        learning_rate=4*1e-3,
                                                        max_ctr_complexity=15,
                                                        loss_function='MAE',
                                                        eval_metric='MAE',
                                                        early_stopping_rounds=2000,
                                                        use_best_model=True,
#                                                         has_time=True,
#                                                         metric_period=50,
                                                        
                                                        verbose=1000)

    # train the model
    globals()[f'CatB_model_{i+1}'].fit(train_data, train_labels, cat_features, eval_set = (valid_data,valid_labels))

    
    
    globals()[f'CatB_model_{i+1}'].save_model(f'CatB_C_Penalty_{version}_fold_{i+1}')  
    print(str(i+1),'fold model saved')

0:	learn: 1.9408602	test: 1.8323054	best: 1.8323054 (0)	total: 534ms	remaining: 11h 52m 18s


In [27]:
version = 'v2_0_2_3_2_feat35' # 분류 모델 버전 명 기입 

model = CatBoostClassifier()  

cat_cols = ['ACDNT_TOTAL_TEXT_1','ACDNT_CLAS_CD2','ACDNT_GAP','HO_AP_DIST','N_PROF_13','OAI_FALT_RATE_G','N_PROF_25','N_PROF_14','CAR_TYPE_NM','MAKER_G','DRIVER_LICNS_AREA','DISPATCH_TOWING',
 'N_PROF_08','ACCO_BI_A_MAX_DIST','N_PROF_18','LICNS_TYPE_CD','ACCO_A_AVG_DIST','ACC_ADDR1','ACCO_BI_OTH_MAX_DIST','DRIVER_VOCTN_TST_GAP_G','LIC_2M','LIC_1M','DRIVER_LICNS_STAT','ACDNT_INSPCT_TEXT',
 'POLICE_RPRT_CD','OCD_PRSN_CNT','DRUNK_YN','NON_FALT_CLAS_CD','OWNER_YN','INDM_EXPC_CD','LIC_1L','CURE_YN','INSR_TYPE_NM','LIC_2MOTO','p28d_acc_addr_count'] # 분류 모델에 사용한 변수 명 입력

fold_to_learn = [0,6]

for i in fold_to_learn:
    print(i+1)
    try:
        globals()[f'CatB_model_{i+1}'] = model.load_model(f'CatB_C_Penalty_{version}_fold_{i+1}')  
    except:
        print(i+1, 'fold loading error')
        
########################################

for i in fold_to_learn:# range(n_splits):
    
    print('='*20, str(i+1), '='*20) 
    test_data = raw_data.iloc[118888:][cat_cols]
    globals()[f'CatB_preds_proba_{i+1}'] = globals()[f'CatB_model_{i+1}'].predict_proba(test_data)
    
#########################################

CatB_preds_proba = np.zeros((17661 ,2))

good_load = fold_to_learn

for i in good_load:
    CatB_preds_proba = CatB_preds_proba + globals()['CatB_preds_proba_'+str(i+1)]


CatB_preds_proba = CatB_preds_proba/len(good_load)

bl_proba = CatB_preds_proba

###############################################################################################
###############################################################################################

version = 'v_feat3c_Hard_50f_Reg_F1_2' # 회귀 모델 버전 입력 
from catboost import CatBoostRegressor, Pool
model = CatBoostRegressor()  

fold_to_learn = [0]
CatB_preds_proba = np.zeros((17661,))

_cat_cols_ = ['ACDNT_TOTAL_TEXT_1', 'ACDNT_CLAS_CD2', 'DRUNK_YN', 'MAKER_G', 'DRIVER_LICNS_STAT', 'ACCO_BI_A_MAX_DIST', 'ACCO_BI_A_AVG_DIST', 'APPD_A_MAX_DIST', 'APPD_OCD_AVG_DIST', 'APPD_A_AVG_DIST', 'CAR_TYPE_NM', 'ACC_ADDR1', 'APPD_OCD_MAX_DIST', 'INS_TYPE_CO_VAL', 'APVC_A_AVG_DIST', 'DRIVER_LICNS_AREA', 'APVC_A_MAX_DIST', 'DISPATCH_TOWING', 'DRIVER_VOCTN_TST_GAP_G', 'POLICE_RPRT_CD', 'LIC_2M', 'CURE_YN', 'OAI_FALT_RATE_G', 'DRIVER_LICNS_YEAR', 'LIC_1M', 'FPC', 'ACC_AREA_RANK', 'LICNS_TYPE_CD', 'ADK_HOUSE_TYPE_DTL', 'APVC_OTH_MAX_DIST', 'DRVER_AGE', 'ACDNT_POL_GAP', 'ACDNT_INSPCT_TEXT', 'APVC_OTH_AVG_DIST', 'LIC_1L', 'DRIVER_RANGE_SPCL_105', 'N_APROF_01', 'N_BPROF_05', 'INSRD_RLATN', 'MNG_AREA_CD', 'N_BPROF_09', 'N_PROF_15', 'N_PROF_14', 'N_APROF_31', 'HO_AP_DIST', 'N_BPROF_10', 'N_APROF_30', 'N_APROF_12', 'N_CPROF_05', 'N_PROF_13'] # 회귀 모델에 사용한 변수 명 입력


for i in fold_to_learn:
    print(i+1)

    globals()[f'CatB_model_{i+1}'] = model.load_model(f'CatB_C_Penalty_{version}_fold_{i+1}')  
    print('='*20, str(i+1), '='*20) 
    test_data = raw_data.iloc[118888:][_cat_cols_]
    test_data['ACC_AREA_RANK'].fillna('A', inplace=True)
    globals()[f'CatB_preds_proba_{i+1}'] = globals()[f'CatB_model_{i+1}'].predict(test_data)
    CatB_preds_proba = CatB_preds_proba + globals()['CatB_preds_proba_'+str(i+1)]
        
        
CatB_preds_proba = CatB_preds_proba/len(fold_to_learn)

reg_12 = CatB_preds_proba

1
7
1


In [32]:
score

Unnamed: 0,CATH_AMT,Y,0,group
0,0,0,3018125.79877,G01
1,4000000,1,2716045.96006,G01
2,0,0,2628426.10290,G01
3,4000000,1,2595746.35666,G01
4,0,0,2569011.30267,G01
...,...,...,...,...
17656,0,0,9001.62784,G10
17657,0,0,9000.23766,G10
17658,0,0,8966.56052,G10
17659,0,0,8793.24790,G10


In [33]:
expected_cath = pd.DataFrame(bl_proba[:,1] * reg_12**6)

test_target = raw_data.iloc[118888:]['CATH_AMT']
test_target2 = pd.get_dummies(raw_data.iloc[118888:]['FDS_CATH'])['Y']

score = pd.concat([test_target.reset_index(drop=True),test_target2.reset_index(drop=True),pd.DataFrame(expected_cath).reset_index(drop=True)[0]],axis=1)

######캡 씌우기#######
threshold = 18488000


score.loc[score['CATH_AMT']>threshold, 'CATH_AMT']=threshold
score.sort_values(0, ascending=False, inplace=True)
score=score.reset_index().drop('index', axis=1).copy()

####################################################################################
n_group=10
group=['G01','G02','G03','G04','G05','G06','G07','G08','G09','G10']

bins=round(len(score)/n_group) 
bins_1=len(score)-bins*(n_group-1) 

group_list=[]
for i in range(n_group):
    if i <n_group-1: 
        group_list.append(np.repeat(group[i], bins).tolist())
    else: 
        group_list.append(np.repeat(group[i], bins_1).tolist())
    
import itertools

group_list=list(itertools.chain(*group_list))
score['group']=pd.Series(group_list)

function_list=['count','sum','mean']
table2=score.groupby(['group']).agg(function_list).reset_index()

table2

Unnamed: 0_level_0,group,CATH_AMT,CATH_AMT,CATH_AMT,Y,Y,Y,0,0,0
Unnamed: 0_level_1,Unnamed: 1_level_1,count,sum,mean,count,sum,mean,count,sum,mean
0,G01,1766,436828684,247354.8607,1766,103,0.05832,1766,656045390.6211,371486.63116
1,G02,1766,65697048,37201.04643,1766,32,0.01812,1766,276142318.48392,156365.97876
2,G03,1766,83705279,47398.23273,1766,27,0.01529,1766,206300110.04995,116817.72936
3,G04,1766,34777908,19693.03964,1766,17,0.00963,1766,166431553.22082,94242.10262
4,G05,1766,52364998,29651.75425,1766,18,0.01019,1766,137097524.49242,77631.66732
5,G06,1766,5627326,3186.48131,1766,11,0.00623,1766,113138161.59937,64064.64417
6,G07,1766,26016654,14731.96716,1766,7,0.00396,1766,93900656.3403,53171.37958
7,G08,1766,19716343,11164.40713,1766,7,0.00396,1766,76508179.48212,43322.86494
8,G09,1766,3727350,2110.61721,1766,4,0.00227,1766,56091075.39965,31761.65085
9,G10,1767,10480933,5931.48444,1767,3,0.0017,1767,28239654.20016,15981.69451


In [34]:
sum(table2['CATH_AMT']['sum'][0:3])/sum(table2['CATH_AMT']['sum'])

0.7933377668130218

In [31]:
sum(table2[0]['sum'])

1809894623.8898053

In [139]:
sum(table2[0]['sum'][0:3])

1210817600.0083194