In [1]:
import gc
import glob
import multiprocessing # 여러 개의 일꾼 (cpu)들에게 작업을 분산시키는 역할
import os
from functools import partial # 함수가 받는 인자들 중 몇개를 고정 시켜서 새롭게 파생된 함수를 형성하는 역할
from multiprocessing import Pool 

import joblib # 모델을 저장하고 불러오는 역할
import numpy as np
import pandas as pd 
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV

from data_loader_v2 import data_loader_v2 # 자체적으로 만든 data loader version 2.0 ([데이콘 15회 대회] 데이터 설명 및 데이터 불러오기 영상 참조)


In [2]:
# Locations of the raw data, relative to the notebook's working directory.
train_folder = 'data/train/'
test_folder = 'data/test/'
train_label_path = 'data/train_label.csv'

# Bare file names (directory part removed) of every CSV in each folder.
# os.path.basename uses the path rules of the running OS, so this works on
# Windows as well as on POSIX systems; splitting on '\\' only worked on Windows.
# sorted() makes the file order (and therefore the row order of the loaded
# frame) independent of the order in which the filesystem lists the files.
train_list = sorted(os.path.basename(f_dir) for f_dir in glob.glob(train_folder + '*.csv'))
test_list = sorted(os.path.basename(f_dir) for f_dir in glob.glob(test_folder + '*.csv'))

# Label table; its first CSV column becomes the index.
train_label = pd.read_csv(train_label_path, index_col=0)

## Load data

In [3]:
%%time

def data_loader_all_v2(func, files, folder='', train_label=None, event_time=15, nrows=65, processes=None):
    """Load many files in parallel with ``func`` and stack the results.

    ``func`` is called once per entry of ``files`` as
    ``func(file, folder=..., train_label=..., event_time=..., nrows=...)`` in a
    pool of worker processes; the returned frames are concatenated in the
    same order as ``files``.

    Parameters
    ----------
    func : callable
        Per-file loader. It is sent to worker processes, so it must be
        picklable (e.g. a function imported from a module).
    files : iterable
        Items passed one by one as the first positional argument of ``func``.
    folder : str, optional
        Forwarded to ``func`` unchanged.
    train_label : object, optional
        Forwarded to ``func`` unchanged.
    event_time : int, optional
        Forwarded to ``func`` unchanged. The default encodes the assumption
        that every CSV file switches to state B at the same moment (15 seconds).
    nrows : int, optional
        Forwarded to ``func`` unchanged.
    processes : int, optional
        Number of worker processes. Defaults to ``multiprocessing.cpu_count()``.

    Returns
    -------
    pandas.DataFrame
        ``pd.concat`` of the per-file results.

    Raises
    ------
    ValueError
        If ``files`` is empty (there would be nothing to concatenate).

    Notes
    -----
    When this function is called from a plain script on a platform that
    spawns worker processes, protect the *call site* with
    ``if __name__ == '__main__':``. A guard inside the function body would
    leave the result undefined whenever the function lives in an imported
    module.
    """
    files = list(files)
    if not files:
        # Fail early with a clear message instead of starting a pool for nothing.
        raise ValueError('data_loader_all_v2: `files` is empty, nothing to load')

    func_fixed = partial(func, folder=folder, train_label=train_label, event_time=event_time, nrows=nrows)

    if processes is None:
        processes = multiprocessing.cpu_count()

    # The context manager releases the workers even if a loader raises.
    # All results are materialised by list() before the pool is torn down.
    with Pool(processes=processes) as pool:
        df_list = list(pool.imap(func_fixed, files))

    return pd.concat(df_list)


# Load every training file in parallel and stack the per-file frames.
train = data_loader_all_v2(
    data_loader_v2,
    train_list,
    folder=train_folder,
    train_label=train_label,
)

# Keep only the columns whose count of distinct values differs from 1,
# i.e. drop the columns that hold a single constant value.
train = train.loc[:, lambda frame: frame.nunique() != 1]

# Separate the target column from the feature matrix.
y_train = train['label']
X_train = train.drop(columns=['label'])

X_train.shape, y_train.shape

Wall time: 3min 52s


((41350, 3331), (41350,))

## Train model (RandomForestClassifier)

In [4]:
# Run a full garbage-collection pass before training; this was added to
# resolve a memory-related error. The cell output is gc.collect()'s return value.
gc.collect()

22

In [5]:
# Random forest with hand-picked hyper-parameters. random_state fixes the
# seed, verbose=1 prints training progress, and n_jobs=-1 lets the trees be
# built in parallel.
model = RandomForestClassifier(
    n_estimators=300,
    max_depth=20,
    min_samples_leaf=5,
    n_jobs=-1,
    random_state=0,
    verbose=1,
)
model.fit(X_train, y_train)

[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:  1.4min
[Parallel(n_jobs=-1)]: Done 192 tasks      | elapsed:  6.0min
[Parallel(n_jobs=-1)]: Done 300 out of 300 | elapsed:  9.4min finished


RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=20, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=5, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=300,
                       n_jobs=-1, oob_score=False, random_state=0, verbose=1,
                       warm_start=False)

In [6]:
# GridSearchCV hyper-parameter search - kept for reference but not run (it ran out of memory)

# params = {
#     'n_estimators':[100],
#     'max_depth':[8, 12, 50],
#     'min_samples_leaf':[8, 18, 100],
#     'min_samples_split':[8, 20, 100]
# }

# model = RandomForestClassifier(random_state=0, verbose=1, n_jobs=-1)
# grid_cv = GridSearchCV(model, param_grid=params, cv=5, n_jobs=-1)
# grid_cv.fit(X_train, y_train)

# print('best parameter set :\n', grid_cv.best_params_)
# print(f'best accuracy : {grid_cv.best_score_:.4f}')

## Check feature importance

In [7]:
# Per-feature importances reported by the fitted forest, labelled with the
# matching column names of the training matrix.
ftr_importances_val = model.feature_importances_
ftr_importances = pd.Series(data=ftr_importances_val, index=X_train.columns)

# Names of the columns whose importance is exactly 0 (572 columns in the recorded run)
ftr_importances.index[ftr_importances_val == 0]

Index(['V0046', 'V0050', 'V0220', 'V0225', 'V0226', 'V0230', 'V0233', 'V0237',
       'V0238', 'V0244',
       ...
       'V4743', 'V4746', 'V4747', 'V4748', 'V4752', 'V4757', 'V4763', 'V4775',
       'V4857', 'V4858'],
      dtype='object', length=572)

In [8]:
# The ten columns with the highest feature importance, largest first
ftr_top10 = ftr_importances.sort_values(ascending=False).iloc[:10]

print(ftr_top10)

V2212    0.009842
V2208    0.009131
V2213    0.008624
V2215    0.008470
V2586    0.007850
V2587    0.007525
V2216    0.007508
V2079    0.007224
V2211    0.007189
V2185    0.007071
dtype: float64


## Preprocess data & Save the results

In [9]:
# Keep only the features with a non-zero importance
# (this removes the 572 zero-importance columns found above).
output_df = X_train.loc[:, ftr_importances.index[ftr_importances.to_numpy() != 0]]

# Re-attach the target so the saved frame contains features and label together.
output_df['label'] = y_train
output_df

Unnamed: 0,V0000,V0001,V0002,V0003,V0004,V0005,V0006,V0007,V0008,V0009,...,V5085,V5086,V5087,V5088,V5089,V5090,V5115,V5118,V5119,label
0,30.469574,8.722739,8.686953,8.677701,8.696935,215.779134,148.857105,-8.951266e-20,0.0,-0.000694,...,110.958197,-0.298096,-0.234462,-0.241420,-0.164439,43.197957,60.0,-0.000003,85.4,110
0,30.471422,8.843733,8.724614,8.736648,8.724141,189.935527,186.819255,5.018471e-19,0.0,0.001233,...,110.930774,-0.311288,-0.229316,-0.230339,-0.174198,43.194910,60.0,0.000011,85.4,110
0,30.465795,8.639923,8.693430,8.706842,8.698667,167.172015,227.642581,5.601811e-19,0.0,0.000029,...,110.877289,-0.275290,-0.247143,-0.175712,-0.179520,43.206854,60.0,0.000001,85.4,110
0,30.451257,8.643156,8.721100,8.677412,8.697360,190.645984,181.005102,4.328276e-19,0.0,-0.001779,...,110.848307,-0.286780,-0.240937,-0.212229,-0.157415,43.198779,60.0,0.000004,85.4,110
0,30.469449,8.786702,8.718487,8.632532,8.686938,208.447021,202.666961,3.835715e-19,0.0,-0.000019,...,110.926518,-0.301789,-0.237373,-0.201726,-0.188566,43.204087,60.0,0.000002,85.4,110
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
99,30.495335,8.628012,8.693398,8.727568,8.680550,229.455141,211.719048,3.435239e-19,0.0,-0.001508,...,110.894654,-0.299185,-0.233951,-0.190047,-0.154182,43.205959,60.0,-0.000004,85.4,156
99,30.472037,8.554523,8.689969,8.683387,8.657360,220.367669,219.374620,-6.818763e-21,0.0,0.000018,...,110.853331,-0.290297,-0.237829,-0.195119,-0.171931,43.205621,60.0,0.000007,85.4,156
99,30.476295,8.681108,8.686948,8.685183,8.669547,241.194914,191.160434,-8.764614e-20,0.0,-0.001748,...,110.824375,-0.287311,-0.232160,-0.207292,-0.140508,43.205920,60.0,0.000003,85.4,156
99,30.493136,8.623526,8.713625,8.736684,8.714882,219.724120,221.304790,1.353404e-19,0.0,-0.001416,...,110.951080,-0.288463,-0.234679,-0.180021,-0.145725,43.204470,60.0,-0.000019,85.4,156


In [10]:
# Write the reduced training set (selected features plus label) to disk.
# With to_csv's default settings the index is written as well.
output_df.to_csv('train_preprocessed.csv')  # about 1.7GB

In [11]:
# Save the importance of every feature (zero-importance ones included),
# indexed by column name.
ftr_importances.to_csv('train_feature_importances.csv')