1:1 비율 down sampling

FE2의 feature

In [1]:
import pandas as pd
from lightgbm import Booster
import lightgbm as lgb
import copy
import time
import json

In [2]:
# Shared workspace root; every path below is located under it.
_workspace = '/home/workspace/user-workspace'
_user_dir = f'{_workspace}/junheon'

# Raw train/test parquet files and the cat_encoder JSON files.
train_path = f'{_workspace}/slim_train.parquet'
test_path = f'{_workspace}/slim_test.parquet'
encoder = f'{_workspace}/cat_encoder.json'
decoder = f'{_workspace}/inverse_cat_encoder.json'

# Per-task working directories. The trailing slash matters: later cells build
# file paths by plain string concatenation.
data_dir = f'{_user_dir}/data/task150/'
model_dir = f'{_user_dir}/model/task150/'
result_dir = f'{_user_dir}/result/task150/'
submission_dir = f'{_workspace}/prediction/'

In [3]:
# Number of training bags: the cells below merge, train on, and save one
# dataset/model per index in range(bagging_size).
bagging_size = 5
# Tag embedded in the file names of the merged datasets, saved models, and
# result JSON written by this notebook.
prefix = "ex12"

In [4]:
# Names of the feature tables to merge. Each name is resolved to
# "{data_dir}{name}_{seed}.parquet" for a training bag and to
# "{data_dir}{name}_test.parquet" for the test set; the tables are indexed by
# their "id" column and concatenated column-wise.
features = [
    "basic_feature",
    "month_day",
    "pay_amt",
    "PCA5",
    "NMF5",
    "age_bin",
    "phone_cnt_wrt_seq",
    "seq_count",
    "is_targeted"
]

In [5]:
# Columns passed to lgb.train as `categorical_feature`.
categorical_features = [
    "COMMC_CLF",
    "NPAY_YN",
    "PAY_MTHD_CD",
    "ARS_AUTHTI_YN",
    "GNDR",
    "FOREI_YN",
    "AUTHTI_CLF_FLG",
    "SVC_CLF_NM",
    "CP_M_CLF_NM",
    "CP_S_CLF_NM",
    "month",
    "day",
    "AGE_bin",
]

In [6]:
# Columns dropped from the merged train/test frames before they are saved.
# NOTE(review): presumably "AGE" is removed because the binned "AGE_bin" column
# is used instead — confirm.
remove_features = ["AGE"]

# merge data

### train

In [7]:
for seed in range(bagging_size):
    # Load this bag's table for every feature name, indexed by "id" so the
    # column-wise concat aligns rows across tables.
    bag_tables = [
        pd.read_parquet(f"{data_dir}{name}_{seed}.parquet").set_index("id")
        for name in features
    ]
    # Merge, remove the unwanted columns, and persist the bag for the modeling section.
    df = pd.concat(bag_tables, axis=1).drop(columns=remove_features)
    df.to_parquet(f"{data_dir}{prefix}_train_{seed}.parquet")

### test

In [8]:
# Build the test matrix the same way as the training bags: one table per
# feature name, indexed by "id" and concatenated column-wise.
test_tables = [
    pd.read_parquet(f"{data_dir}{name}_test.parquet").set_index("id")
    for name in features
]
df = pd.concat(test_tables, axis=1).drop(columns=remove_features)
df.to_parquet(f"{data_dir}{prefix}_test.parquet")

# Modeling

In [9]:
# Booster configuration, passed as the first argument to lgb.train.
model_params = dict(
    # Task definition: gradient-boosted trees, binary target, AUC as the tracked metric.
    boosting_type="gbdt",
    objective="binary",
    metric="auc",
    # Step size and tree-shape limits.
    learning_rate=0.01,
    num_leaves=255,
    max_depth=8,
    min_child_samples=200,
    # Row / column subsampling.
    subsample=0.9,
    subsample_freq=1,
    colsample_bytree=0.5,
    # Remaining split, binning, and regularisation settings.
    min_child_weight=0,
    subsample_for_bin=1000000,
    min_split_gain=0,
    reg_lambda=0,
    verbose=0,
    seed=777,
)

# Unpacked as keyword arguments to lgb.train for the early-stopped fit.
# NOTE(review): LightGBM 4.0 removed the `early_stopping_rounds` keyword of
# lgb.train in favour of callbacks — confirm the installed version.
train_params = dict(
    num_boost_round=2000,
    early_stopping_rounds=30,
)

### train and valid

In [10]:
# Final trained model for each bag, in seed order; reused by the prediction cells.
boosters = []
# One metrics dict per bag (AUCs, best iteration, timing, feature importance);
# dumped to JSON after training.
train_results = []

In [None]:
def _as_lgb_dataset(frame):
    """Wrap a merged frame as a LightGBM Dataset, using its "target" column as the label."""
    return lgb.Dataset(frame.loc[:, frame.columns != "target"], label=frame["target"].values)


# For each bag: (1) fit with the month == 10 rows held out so early stopping
# picks the number of rounds, then (2) refit on the whole bag for exactly that
# many rounds and keep the refit model.
for seed in range(bagging_size):
    start_time = time.time()
    bag_path = f"{data_dir}{prefix}_train_{seed}.parquet"

    # --- Pass 1: early-stopped fit on the non-holdout rows ---
    train_df = pd.read_parquet(bag_path)
    valid_df = train_df[train_df['month'] == 10]
    train_df = train_df[train_df['month'] != 10]
    train_dataset = _as_lgb_dataset(train_df)
    valid_dataset = _as_lgb_dataset(valid_df)
    eval_results = {}

    model: Booster = lgb.train(
        model_params,
        train_dataset,
        categorical_feature=categorical_features,
        valid_sets=[train_dataset, valid_dataset],
        valid_names=['train', 'valid'],
        evals_result=eval_results,
        **train_params,
    )
    best_iteration = model.best_iteration

    # --- Pass 2: refit on every row of the bag, with a fixed round count ---
    # Same training kwargs minus early stopping (there is no validation set now).
    params = {key: value for key, value in train_params.items() if key != "early_stopping_rounds"}
    params["num_boost_round"] = best_iteration

    # The bag is read again from disk because train_df was narrowed above.
    train_df = pd.read_parquet(bag_path)
    train_dataset = _as_lgb_dataset(train_df)
    model = lgb.train(
        model_params,
        train_dataset,
        categorical_feature=categorical_features,
        **params,
    )

    model.save_model(f"{model_dir}{prefix}_model_{seed}.txt")
    boosters.append(model)

    # AUCs come from the pass-1 history (eval_results is 0-indexed, best_iteration
    # is 1-indexed); feature importance comes from the refit model.
    history_index = best_iteration - 1
    result = {
        'train_auc': eval_results['train']['auc'][history_index],
        'valid_auc': eval_results['valid']['auc'][history_index],
        'best_iteration': best_iteration,
        'train_time': time.time() - start_time,
        'feature_importance': {
            name: int(score)
            for name, score in zip(model.feature_name(), model.feature_importance())
        },
    }
    # Order features from most to least important so the JSON dump is readable.
    result['feature_importance'] = dict(
        sorted(result['feature_importance'].items(), key=lambda item: item[1], reverse=True)
    )
    train_results.append(result)

New categorical_feature is ['AGE_bin', 'ARS_AUTHTI_YN', 'AUTHTI_CLF_FLG', 'COMMC_CLF', 'CP_M_CLF_NM', 'CP_S_CLF_NM', 'FOREI_YN', 'GNDR', 'NPAY_YN', 'PAY_MTHD_CD', 'SVC_CLF_NM', 'day', 'month']


You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.




[1]	train's auc: 0.97497	valid's auc: 0.975907
Training until validation scores don't improve for 30 rounds
[2]	train's auc: 0.97727	valid's auc: 0.977723
[3]	train's auc: 0.974851	valid's auc: 0.974909
[4]	train's auc: 0.978165	valid's auc: 0.978101
[5]	train's auc: 0.985248	valid's auc: 0.986293
[6]	train's auc: 0.983063	valid's auc: 0.984229
[7]	train's auc: 0.986479	valid's auc: 0.987029
[8]	train's auc: 0.985621	valid's auc: 0.986446
[9]	train's auc: 0.986283	valid's auc: 0.987015
[10]	train's auc: 0.985861	valid's auc: 0.98673
[11]	train's auc: 0.986134	valid's auc: 0.987046
[12]	train's auc: 0.987488	valid's auc: 0.988125
[13]	train's auc: 0.986835	valid's auc: 0.987429
[14]	train's auc: 0.986305	valid's auc: 0.986857
[15]	train's auc: 0.985969	valid's auc: 0.98672
[16]	train's auc: 0.986211	valid's auc: 0.9869
[17]	train's auc: 0.987175	valid's auc: 0.987685
[18]	train's auc: 0.987492	valid's auc: 0.988019
[19]	train's auc: 0.987592	valid's auc: 0.988087
[20]	train's auc: 0.987

[67]	train's auc: 0.989598	valid's auc: 0.990029
[68]	train's auc: 0.989647	valid's auc: 0.990081
[69]	train's auc: 0.989718	valid's auc: 0.990142
[70]	train's auc: 0.989817	valid's auc: 0.990232
[71]	train's auc: 0.989812	valid's auc: 0.990237
[72]	train's auc: 0.989888	valid's auc: 0.990304
[73]	train's auc: 0.989871	valid's auc: 0.990295
[74]	train's auc: 0.98987	valid's auc: 0.990298
[75]	train's auc: 0.989839	valid's auc: 0.990264
[76]	train's auc: 0.989877	valid's auc: 0.990302
[77]	train's auc: 0.989961	valid's auc: 0.990373
[78]	train's auc: 0.990012	valid's auc: 0.990414
[79]	train's auc: 0.990067	valid's auc: 0.990467
[80]	train's auc: 0.990029	valid's auc: 0.990417
[81]	train's auc: 0.989991	valid's auc: 0.990378
[82]	train's auc: 0.98997	valid's auc: 0.990361
[83]	train's auc: 0.989921	valid's auc: 0.990308
[84]	train's auc: 0.989914	valid's auc: 0.9903
[85]	train's auc: 0.989941	valid's auc: 0.990327
[86]	train's auc: 0.989972	valid's auc: 0.990362
[87]	train's auc: 0.9900

[133]	train's auc: 0.990605	valid's auc: 0.990862
[134]	train's auc: 0.990601	valid's auc: 0.990858
[135]	train's auc: 0.990589	valid's auc: 0.990842
[136]	train's auc: 0.990603	valid's auc: 0.990854
[137]	train's auc: 0.990646	valid's auc: 0.99089
[138]	train's auc: 0.990647	valid's auc: 0.990886
[139]	train's auc: 0.990678	valid's auc: 0.990913
[140]	train's auc: 0.990691	valid's auc: 0.990922
[141]	train's auc: 0.990716	valid's auc: 0.990947
[142]	train's auc: 0.990718	valid's auc: 0.990948
[143]	train's auc: 0.990727	valid's auc: 0.990956
[144]	train's auc: 0.990714	valid's auc: 0.990933
[145]	train's auc: 0.990742	valid's auc: 0.990956
[146]	train's auc: 0.990763	valid's auc: 0.990972
[147]	train's auc: 0.990762	valid's auc: 0.990969
[148]	train's auc: 0.990761	valid's auc: 0.990967
[149]	train's auc: 0.990764	valid's auc: 0.99097
[150]	train's auc: 0.990755	valid's auc: 0.990954
[151]	train's auc: 0.990775	valid's auc: 0.990972
[152]	train's auc: 0.990818	valid's auc: 0.991008
[1

[200]	train's auc: 0.991249	valid's auc: 0.991304
[201]	train's auc: 0.991255	valid's auc: 0.991308
[202]	train's auc: 0.991263	valid's auc: 0.991313
[203]	train's auc: 0.991266	valid's auc: 0.991316
[204]	train's auc: 0.991271	valid's auc: 0.99132
[205]	train's auc: 0.991286	valid's auc: 0.991334
[206]	train's auc: 0.991297	valid's auc: 0.991341
[207]	train's auc: 0.991312	valid's auc: 0.991353
[208]	train's auc: 0.991329	valid's auc: 0.991369
[209]	train's auc: 0.991334	valid's auc: 0.991373
[210]	train's auc: 0.99135	valid's auc: 0.991382
[211]	train's auc: 0.991367	valid's auc: 0.9914
[212]	train's auc: 0.991369	valid's auc: 0.991399
[213]	train's auc: 0.99139	valid's auc: 0.991417
[214]	train's auc: 0.991396	valid's auc: 0.991422
[215]	train's auc: 0.991419	valid's auc: 0.99144
[216]	train's auc: 0.991439	valid's auc: 0.991452
[217]	train's auc: 0.991446	valid's auc: 0.991457
[218]	train's auc: 0.991459	valid's auc: 0.991466
[219]	train's auc: 0.991461	valid's auc: 0.991469
[220]	

[266]	train's auc: 0.991759	valid's auc: 0.991659
[267]	train's auc: 0.991763	valid's auc: 0.991661
[268]	train's auc: 0.991773	valid's auc: 0.991671
[269]	train's auc: 0.991789	valid's auc: 0.991682
[270]	train's auc: 0.991792	valid's auc: 0.991679
[271]	train's auc: 0.9918	valid's auc: 0.991683
[272]	train's auc: 0.991818	valid's auc: 0.991697
[273]	train's auc: 0.991834	valid's auc: 0.991707
[274]	train's auc: 0.991849	valid's auc: 0.991718
[275]	train's auc: 0.991855	valid's auc: 0.991722
[276]	train's auc: 0.991854	valid's auc: 0.991716
[277]	train's auc: 0.991862	valid's auc: 0.991725
[278]	train's auc: 0.991868	valid's auc: 0.991728
[279]	train's auc: 0.991866	valid's auc: 0.991725
[280]	train's auc: 0.991874	valid's auc: 0.991731
[281]	train's auc: 0.99189	valid's auc: 0.991741
[282]	train's auc: 0.991893	valid's auc: 0.991743
[283]	train's auc: 0.991903	valid's auc: 0.991748
[284]	train's auc: 0.991908	valid's auc: 0.991748
[285]	train's auc: 0.991914	valid's auc: 0.991752
[28

[332]	train's auc: 0.992308	valid's auc: 0.991985
[333]	train's auc: 0.992318	valid's auc: 0.991994
[334]	train's auc: 0.992322	valid's auc: 0.991993
[335]	train's auc: 0.992325	valid's auc: 0.991996
[336]	train's auc: 0.99233	valid's auc: 0.991997
[337]	train's auc: 0.992336	valid's auc: 0.991998
[338]	train's auc: 0.992349	valid's auc: 0.992007
[339]	train's auc: 0.992361	valid's auc: 0.992017
[340]	train's auc: 0.992367	valid's auc: 0.992022
[341]	train's auc: 0.992369	valid's auc: 0.99202
[342]	train's auc: 0.992375	valid's auc: 0.992027
[343]	train's auc: 0.992387	valid's auc: 0.992036
[344]	train's auc: 0.992392	valid's auc: 0.992038
[345]	train's auc: 0.992398	valid's auc: 0.992042
[346]	train's auc: 0.992404	valid's auc: 0.992047
[347]	train's auc: 0.992416	valid's auc: 0.992055
[348]	train's auc: 0.992423	valid's auc: 0.992056
[349]	train's auc: 0.992429	valid's auc: 0.99206
[350]	train's auc: 0.992434	valid's auc: 0.992063
[351]	train's auc: 0.99244	valid's auc: 0.992066
[352

[399]	train's auc: 0.99276	valid's auc: 0.99225
[400]	train's auc: 0.992765	valid's auc: 0.992251
[401]	train's auc: 0.992772	valid's auc: 0.992252
[402]	train's auc: 0.992777	valid's auc: 0.992254
[403]	train's auc: 0.992782	valid's auc: 0.992257
[404]	train's auc: 0.992792	valid's auc: 0.992262
[405]	train's auc: 0.992798	valid's auc: 0.992264
[406]	train's auc: 0.992805	valid's auc: 0.992266
[407]	train's auc: 0.992812	valid's auc: 0.992272
[408]	train's auc: 0.992817	valid's auc: 0.992272
[409]	train's auc: 0.992822	valid's auc: 0.992273
[410]	train's auc: 0.992824	valid's auc: 0.992275
[411]	train's auc: 0.992827	valid's auc: 0.992277
[412]	train's auc: 0.992838	valid's auc: 0.992283
[413]	train's auc: 0.992847	valid's auc: 0.992288
[414]	train's auc: 0.992853	valid's auc: 0.992293
[415]	train's auc: 0.992855	valid's auc: 0.992294
[416]	train's auc: 0.99286	valid's auc: 0.992296
[417]	train's auc: 0.992864	valid's auc: 0.992297
[418]	train's auc: 0.992868	valid's auc: 0.992299
[41

[465]	train's auc: 0.99313	valid's auc: 0.992408
[466]	train's auc: 0.993135	valid's auc: 0.992409
[467]	train's auc: 0.993139	valid's auc: 0.992411
[468]	train's auc: 0.993143	valid's auc: 0.992411
[469]	train's auc: 0.993148	valid's auc: 0.992413
[470]	train's auc: 0.993154	valid's auc: 0.992416
[471]	train's auc: 0.993161	valid's auc: 0.99242
[472]	train's auc: 0.993164	valid's auc: 0.992421
[473]	train's auc: 0.99317	valid's auc: 0.992426
[474]	train's auc: 0.993175	valid's auc: 0.992427
[475]	train's auc: 0.993181	valid's auc: 0.992427
[476]	train's auc: 0.993183	valid's auc: 0.992427
[477]	train's auc: 0.993189	valid's auc: 0.992431
[478]	train's auc: 0.993192	valid's auc: 0.992433
[479]	train's auc: 0.993195	valid's auc: 0.992435
[480]	train's auc: 0.993204	valid's auc: 0.992441
[481]	train's auc: 0.993211	valid's auc: 0.992445
[482]	train's auc: 0.993214	valid's auc: 0.992446
[483]	train's auc: 0.993219	valid's auc: 0.992449
[484]	train's auc: 0.993222	valid's auc: 0.99245
[485

[532]	train's auc: 0.993446	valid's auc: 0.992538
[533]	train's auc: 0.993452	valid's auc: 0.992541
[534]	train's auc: 0.993454	valid's auc: 0.992543
[535]	train's auc: 0.993461	valid's auc: 0.992544
[536]	train's auc: 0.993464	valid's auc: 0.992546
[537]	train's auc: 0.993472	valid's auc: 0.992549
[538]	train's auc: 0.993476	valid's auc: 0.992548
[539]	train's auc: 0.99348	valid's auc: 0.99255
[540]	train's auc: 0.993484	valid's auc: 0.992552
[541]	train's auc: 0.993488	valid's auc: 0.992553
[542]	train's auc: 0.993491	valid's auc: 0.992555
[543]	train's auc: 0.993493	valid's auc: 0.992556
[544]	train's auc: 0.993496	valid's auc: 0.992558
[545]	train's auc: 0.9935	valid's auc: 0.992559
[546]	train's auc: 0.993505	valid's auc: 0.992559
[547]	train's auc: 0.993509	valid's auc: 0.99256
[548]	train's auc: 0.993512	valid's auc: 0.992563
[549]	train's auc: 0.993517	valid's auc: 0.992564
[550]	train's auc: 0.99352	valid's auc: 0.992566
[551]	train's auc: 0.993525	valid's auc: 0.992567
[552]	

[599]	train's auc: 0.99374	valid's auc: 0.992642
[600]	train's auc: 0.993742	valid's auc: 0.992642
[601]	train's auc: 0.993748	valid's auc: 0.992642
[602]	train's auc: 0.993751	valid's auc: 0.992643
[603]	train's auc: 0.993756	valid's auc: 0.992645
[604]	train's auc: 0.99376	valid's auc: 0.992647
[605]	train's auc: 0.993764	valid's auc: 0.992649
[606]	train's auc: 0.993767	valid's auc: 0.99265
[607]	train's auc: 0.99377	valid's auc: 0.992651
[608]	train's auc: 0.993776	valid's auc: 0.992652
[609]	train's auc: 0.993778	valid's auc: 0.992652
[610]	train's auc: 0.993782	valid's auc: 0.992653
[611]	train's auc: 0.993787	valid's auc: 0.992655
[612]	train's auc: 0.993788	valid's auc: 0.992655
[613]	train's auc: 0.993792	valid's auc: 0.992656
[614]	train's auc: 0.993795	valid's auc: 0.992656
[615]	train's auc: 0.9938	valid's auc: 0.992657
[616]	train's auc: 0.993806	valid's auc: 0.99266
[617]	train's auc: 0.993809	valid's auc: 0.992662
[618]	train's auc: 0.993812	valid's auc: 0.992662
[619]	t

[665]	train's auc: 0.993982	valid's auc: 0.992707
[666]	train's auc: 0.993986	valid's auc: 0.992708
[667]	train's auc: 0.993989	valid's auc: 0.992709
[668]	train's auc: 0.993991	valid's auc: 0.99271
[669]	train's auc: 0.993996	valid's auc: 0.992712
[670]	train's auc: 0.994002	valid's auc: 0.992713
[671]	train's auc: 0.994006	valid's auc: 0.992712
[672]	train's auc: 0.994011	valid's auc: 0.992714
[673]	train's auc: 0.994014	valid's auc: 0.992714
[674]	train's auc: 0.994017	valid's auc: 0.992715
[675]	train's auc: 0.994023	valid's auc: 0.992715
[676]	train's auc: 0.994027	valid's auc: 0.992716
[677]	train's auc: 0.994028	valid's auc: 0.992716
[678]	train's auc: 0.994034	valid's auc: 0.992717
[679]	train's auc: 0.994036	valid's auc: 0.992718
[680]	train's auc: 0.994039	valid's auc: 0.992719
[681]	train's auc: 0.994043	valid's auc: 0.992719
[682]	train's auc: 0.994049	valid's auc: 0.99272
[683]	train's auc: 0.994051	valid's auc: 0.992722
[684]	train's auc: 0.994055	valid's auc: 0.992723
[6

[732]	train's auc: 0.994223	valid's auc: 0.99277
[733]	train's auc: 0.994226	valid's auc: 0.992771
[734]	train's auc: 0.994229	valid's auc: 0.992771
[735]	train's auc: 0.99423	valid's auc: 0.992771
[736]	train's auc: 0.994235	valid's auc: 0.992772


In [None]:
# Persist the per-bag training metrics. The context manager guarantees the
# file is flushed and closed instead of leaving that to garbage collection.
with open(f"{result_dir}{prefix}.result.json", 'w') as result_file:
    json.dump(train_results, result_file, indent=2)

# Predict & Submission

In [None]:
# Load the merged test features written by the "merge data" section.
test_df = pd.read_parquet(f"{data_dir}{prefix}_test.parquet")

In [None]:
# Start the submission frame from the raw test file, keeping only the TRD_NO column.
# NOTE(review): predictions are later assigned as plain lists (positionally), so
# this assumes the raw test rows are in the same order as the merged test
# features — confirm.
submission = pd.read_parquet(test_path)[['TRD_NO']]

In [None]:
# Per-model prediction lists.
# NOTE(review): the following cell starts with this same initialization, so
# this cell looks redundant.
predictions = []

In [None]:
# Score the test set with every bagged model; each entry of `predictions` is
# one model's output converted to a plain Python list.
predictions = []
for i, booster in enumerate(boosters):
    print(f"prediction {i}")
    scores = booster.predict(test_df)
    predictions.append(scores.tolist())

In [None]:
# Attach each model's scores as its own column: target_0, target_1, ...
for i, predict in enumerate(predictions):
    submission[f"target_{i}"] = predict

In [None]:
# Names of the per-model score columns (target_0 ... target_{n-1}).
columns = [f"target_{i}" for i in range(len(predictions))]

In [None]:
# Display the per-model column names as a quick visual check.
columns

In [None]:
# Ensemble by averaging the per-model scores row-wise into `target`, then
# discard the per-model columns.
submission = (
    submission
    .assign(target=submission[columns].mean(axis=1))
    .drop(columns=columns)
)

In [None]:
# Write the final predictions to the Feather file that the submission cell uploads.
submission.to_feather(f"{submission_dir}prediction.feather")

In [None]:
# Display the final submission frame for a visual check.
submission

In [None]:
# Upload the prediction file through the nipa submission helper.
from nipa.taskSubmit import nipa_submit
import os

team_id = "1390"
task_no= "150"
prediction_path = f"{submission_dir}prediction.feather"
# Check that the prediction file exists (printed only; the submit call below
# runs regardless of the result).
print("is file: ", os.path.isfile(prediction_path))

# Submit the prediction file (original note: "submission succeeded").
nipa_submit(team_id=team_id,
            task_no=task_no,
            result=prediction_path
           )
