In [1]:
from sklearn.model_selection import KFold
import lightgbm

import multiprocessing # distributes work across multiple workers (CPUs)
from multiprocessing import Pool 
from functools import partial # fixes some of a function's arguments to build a new, derived function
from data_loader_v2 import data_loader_v2
import os
import pandas as pd
import numpy as np
import joblib

# Only the test folder is configured here; the train paths below are disabled
# because the training data is read from a preprocessed feather file instead.
# train_folder = 'gdrive/My Drive/DACON/data/train/'
test_folder = 'data/test/'
# train_label_path = 'gdrive/My Drive/DACON/data/train_label.csv'

# Data loading

In [2]:
# Load the preprocessed training table and index it by source file name.
train = pd.read_feather('data/train_preprocessed.ftr').set_index('file_name', drop=True)
# Split off the target: pop() removes the 'label' column from train and returns it.
y = train.pop('label')

train.head()

Unnamed: 0_level_0,V0000,V0001,V0002,V0003,V0004,V0005,V0006,V0007,V0008,V0009,...,V5080,V5084,V5085,V5086,V5087,V5088,V5089,V5090,V5118,V5119
file_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,30.469574,8.722739,8.686953,8.677701,8.696935,215.779134,148.857105,-8.951266e-20,0.0,-0.000694,...,43.192021,110.922023,110.958197,-0.298096,-0.234462,-0.24142,-0.164439,43.197957,-3e-06,85.4
0,30.471422,8.843733,8.724614,8.736648,8.724141,189.935527,186.819255,5.018471e-19,0.0,0.001233,...,43.205571,110.918918,110.930774,-0.311288,-0.229316,-0.230339,-0.174198,43.19491,1.1e-05,85.4
0,30.465795,8.639923,8.69343,8.706842,8.698667,167.172015,227.642581,5.601811e-19,0.0,2.9e-05,...,43.196173,110.923194,110.877289,-0.27529,-0.247143,-0.175712,-0.17952,43.206854,1e-06,85.4
0,30.451257,8.643156,8.7211,8.677412,8.69736,190.645984,181.005102,4.3282759999999996e-19,0.0,-0.001779,...,43.199503,110.822725,110.848307,-0.28678,-0.240937,-0.212229,-0.157415,43.198779,4e-06,85.4
0,30.469449,8.786702,8.718487,8.632532,8.686938,208.447021,202.666961,3.8357149999999997e-19,0.0,-1.9e-05,...,43.207392,110.810634,110.926518,-0.301789,-0.237373,-0.201726,-0.188566,43.204087,2e-06,85.4


In [3]:
# train_list = os.listdir(train_folder)
# os.listdir returns entries in arbitrary order; sort them so the file order
# (and therefore the row order of the loaded test frame) is the same on every run.
test_list = sorted(os.listdir(test_folder))
# train_label = pd.read_csv(train_label_path, index_col=0)


# Assumption: the point at which the state changes to state_B is the same for
# every csv file (a single event_time is applied to all files).
# In practice, the transition point may differ between individual csv files.
def data_loader_all_v2(func, files, folder='', train_label=None, event_time=10, nrows=60):
    """Load every file with ``func`` and concatenate the results into one frame.

    Parameters
    ----------
    func : callable
        Per-file loader. It is called as
        ``func(file, folder=..., train_label=..., event_time=..., nrows=...)``
        and must return an object accepted by ``pd.concat``.
    files : iterable
        File names, passed one at a time to ``func``.
    folder, train_label, event_time, nrows
        Forwarded unchanged to every ``func`` call.

    Returns
    -------
    The result of ``pd.concat`` over the per-file results, in ``files`` order.

    Raises
    ------
    ValueError
        If ``files`` is empty (there is nothing to concatenate).
    """
    files = list(files)
    if not files:
        raise ValueError('data_loader_all_v2: `files` is empty, nothing to load')

    func_fixed = partial(func, folder=folder, train_label=train_label, event_time=event_time, nrows=nrows)

    if __name__ == '__main__':
        # The context manager guarantees the workers are released even when a
        # loader raises; close()/join() keep the graceful shutdown on success.
        with Pool(processes=multiprocessing.cpu_count()) as pool:
            df_list = list(pool.imap(func_fixed, files))
            pool.close()
            pool.join()
    else:
        # Not running as the main module (e.g. imported): do not spawn a pool,
        # but still produce a result instead of leaving df_list unbound.
        df_list = [func_fixed(file) for file in files]

    return pd.concat(df_list)

# Load every test file and stack the per-file frames into one table.
# No labels are passed for the test set, and event_time is set to 20 here.
test = data_loader_all_v2(
    data_loader_v2,
    test_list,
    folder=test_folder,
    train_label=None,
    event_time=20,
    nrows=60,
)

# Model Tuning & Evaluation


In [4]:
# Keyword arguments passed to every LGBMClassifier trained below.
parms = dict(
    learning_rate=0.06,
    num_leaves=400,
    n_estimators=300,
    max_depth=-1,
    min_child_weight=3,
    subsample=0.8,
    colsample_bytree=0.5,
    objective='multiclass',
    n_jobs=-1,
)


# 4FOLD, 3SEED ENSEMBLE
# Predictions are made by averaging a total of 12 models.

# exist_ok=True creates the model output directory only when it is missing,
# without a separate (race-prone) existence check.
os.makedirs('../2_Code_pred', exist_ok=True)

# One KFold split and one set of fold models is produced per seed.
lucky_seed=[4885,1992,1022]

# Train one model per (seed, fold) pair and save each to ../2_Code_pred.
# NOTE(review): `num` is never used inside the loop.
for num,rs in enumerate(lucky_seed):

    # Shuffled 4-fold split, seeded with the current seed.
    kfold = KFold(n_splits=4, random_state = rs, shuffle = True)

    # dacon code
    # Out-of-fold probability buffer (one row per training row, 198 columns —
    # presumably the number of classes; TODO confirm).
    # NOTE(review): cv is re-created for every seed and is not read anywhere in
    # the visible cells, so the out-of-fold predictions are never scored.
    cv=np.zeros((train.shape[0],198))

    for n, (train_idx, validation_idx) in enumerate(kfold.split(train)):

        x_train, x_validation = train.iloc[train_idx], train.iloc[validation_idx]
        y_train, y_validation = y.iloc[train_idx], y.iloc[validation_idx]

        # The same seed drives both the fold split and the model.
        model = lightgbm.LGBMClassifier(**parms, random_state=rs)

        # NOTE(review): early_stopping_rounds/verbose as fit() arguments appear to be
        # replaced by callbacks in newer LightGBM releases — confirm against the
        # installed version before upgrading.
        model.fit(x_train, y_train, eval_set=[(x_validation, y_validation)], early_stopping_rounds= 30,
                  verbose=100) 
        # File name pattern: <fold index>_fold_model_<seed>.pkl
        joblib.dump(model, '../2_Code_pred/%s_fold_model_%s.pkl'%(n,rs))

        # CROSS-VALIDATION , EVALUATE CV
        # Store this fold's validation probabilities in the out-of-fold buffer.
        cv[validation_idx,:] = model.predict_proba(x_validation)

Training until validation scores don't improve for 30 rounds
[100]	valid_0's multi_logloss: 0.335388
Early stopping, best iteration is:
[153]	valid_0's multi_logloss: 0.321477
Training until validation scores don't improve for 30 rounds
[100]	valid_0's multi_logloss: 0.366665
Early stopping, best iteration is:
[148]	valid_0's multi_logloss: 0.353237
Training until validation scores don't improve for 30 rounds
[100]	valid_0's multi_logloss: 0.335919
Early stopping, best iteration is:
[147]	valid_0's multi_logloss: 0.322287
Training until validation scores don't improve for 30 rounds
[100]	valid_0's multi_logloss: 0.330895
Early stopping, best iteration is:
[147]	valid_0's multi_logloss: 0.318645
Training until validation scores don't improve for 30 rounds
[100]	valid_0's multi_logloss: 0.365928
Early stopping, best iteration is:
[148]	valid_0's multi_logloss: 0.352863
Training until validation scores don't improve for 30 rounds
[100]	valid_0's multi_logloss: 0.331495
Early stopping, bes

In [8]:
# Force a garbage-collection pass after training; the value displayed below
# this cell is the return value of gc.collect().
import gc
gc.collect()

116

In [10]:
# Keep only the training feature columns, in the training column order.
test = test.loc[:, train.columns]

In [11]:
# MODEL LOAD & TEST PREDICT
# Average the predicted probabilities of the 12 saved models.
models = os.listdir('../2_Code_pred/')
models_list = list(filter(lambda fname: fname.endswith(".pkl"), models))
# Guard against missing or stale model files in the output directory.
assert len(models_list) == 12
temp_predictions = np.zeros((test.shape[0], 198))

for m in models_list:
    model = joblib.load(os.path.join('../2_Code_pred/', m))
    predict_proba = model.predict_proba(test)
    # Each model contributes an equal 1/12 share of the final probabilities.
    temp_predictions += predict_proba / 12

In [12]:
# dacon code
# Build the submission frame directly from the averaged probabilities.
# Index.rename returns a new Index, so naming the submission index 'id' cannot
# rename test.index as a side effect (assigning test.index and then setting
# .name on it may alter the Index object shared with `test`).
submission = pd.DataFrame(temp_predictions, index=test.index.rename('id'))

# Rows that share the same id are averaged into a single row per id.
submission = submission.sort_index()
submission = submission.groupby('id').mean()
submission.to_csv('submission.csv', index=True)
submission.head()

Unnamed: 0_level_0,0,1,2,3,4,5,6,7,8,9,...,188,189,190,191,192,193,194,195,196,197
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
828,2.5e-05,2.8e-05,2.9e-05,3.2e-05,2.8e-05,2.8e-05,2.8e-05,2.9e-05,3.1e-05,2.5e-05,...,3e-05,2.9e-05,3.1e-05,3e-05,2.6e-05,2.1e-05,3.6e-05,2.1e-05,2.1e-05,0.00032
829,0.000427,0.000364,0.000525,0.000608,0.000437,0.000445,0.000446,0.00047,0.000494,0.000286,...,0.000467,0.000452,0.000491,0.000469,0.000405,0.000298,0.000677,0.000715,0.000347,0.001569
830,3.8e-05,4.4e-05,4.1e-05,3.9e-05,4.2e-05,4.3e-05,4.3e-05,4.4e-05,5.2e-05,3.3e-05,...,4.5e-05,4.3e-05,4.7e-05,4.5e-05,3.9e-05,2.7e-05,4.2e-05,6.5e-05,0.000192,0.000122
831,0.000291,0.000313,0.000521,0.000311,0.024578,0.01849,0.007826,0.010957,0.000516,0.000201,...,0.000238,0.000258,0.000241,0.000363,0.000206,0.000184,0.000349,0.000183,0.000164,0.000388
832,9.7e-05,9.7e-05,9.6e-05,0.000142,4.2e-05,4.4e-05,3.7e-05,4.2e-05,4e-05,0.000296,...,3.8e-05,3.7e-05,4e-05,3.8e-05,3.3e-05,3.1e-05,6.8e-05,3.5e-05,6.2e-05,6.3e-05
