In [1]:
import os
import datetime, time
import pandas as pd 
import numpy as np
import multiprocessing # 여러 개의 일꾼 (cpu)들에게 작업을 분산시키는 역할
from multiprocessing import Pool 
from functools import partial # 함수가 받는 인자들 중 몇개를 고정 시켜서 새롭게 파생된 함수를 형성하는 역할
from utils import data_loader_v2 # 자체적으로 만든 data loader version 2.0 ([데이콘 15회 대회] 데이터 설명 및 데이터 불러오기 영상 참조)


from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
import joblib # 모델을 저장하고 불러오는 역할
import xgboost as xgb

submit = True
model_name = "XGBClassifier"
# RandomForestClassifier, XGBClassifier

## Set Path

In [2]:
train_folder = '../data/train_all/'
test_folder = '../data/test_all/'
train_label_path = '../data/train_label.csv'
model_path = '../model/'+model_name+'_model.pkl'
submission_folder = '../submission/'

## Load Files

In [3]:
train_list = os.listdir(train_folder)
test_list = os.listdir(test_folder)
train_label = pd.read_csv(train_label_path, index_col=0)

In [5]:
# 모든 csv 파일의 상태_B로 변화는 시점이 같다라고 가정
# 하지만, 개별 csv파일의 상태_B로 변화는 시점은 상이할 수 있음
def data_loader_all_v2(func, files, folder='', train_label=None, event_time=10, nrows=60):   
    func_fixed = partial(func, folder=folder, train_label=train_label, event_time=event_time, nrows=nrows)     
    if __name__ == '__main__':
        pool = Pool(processes=multiprocessing.cpu_count()) 
        df_list = list(pool.imap(func_fixed, files)) 
        pool.close()
        pool.join()        
    combined_df = pd.concat(df_list)    
    return combined_df

## Train

In [None]:
train = data_loader_all_v2(data_loader_v2, train_list, folder=train_folder, train_label=train_label, event_time=10, nrows=60)

In [73]:
now = datetime.datetime.now()
X_train = train.drop(['label'], axis=1)
y_train = train['label']
if model == "RandomForestClassifier":
    model = RandomForestClassifier(random_state=0, verbose=1, n_jobs=-1)
    model.fit(X_train, y_train)
elif model == "XGBClassifier":
    model = xgb.XGBClassifier(objective="multi:softprob", eval_metric="logloss", random_state=42, tree_method='gpu_hist')
    print(y_train)
    model.fit(X_train, y_train)

joblib.dump(model, model_path)

print("train time:" + str(datetime.datetime.now()-now))

train time:0:00:00.970406


## Prediction

In [74]:
test = data_loader_all_v2(data_loader_v2, test_list, folder=test_folder, train_label=None, event_time=10, nrows=60)

In [75]:
model = joblib.load(model_path) 
if submit:
    pred = model.predict_proba(test)
else:
    pred = model.predict_proba(X_train)
    accuracy_score(X_train, pred)

[Parallel(n_jobs=6)]: Using backend ThreadingBackend with 6 concurrent workers.
[Parallel(n_jobs=6)]: Done  10 out of  10 | elapsed:    0.1s remaining:    0.0s
[Parallel(n_jobs=6)]: Done  10 out of  10 | elapsed:    0.1s finished


## Submission

In [76]:
def return_now_runtime():
    now = time.localtime()
    return "%04d-%02d-%02d %02d-%02d-%02d" % (now.tm_year, now.tm_mon, now.tm_mday, now.tm_hour, now.tm_min, now.tm_sec)

In [77]:
submission = pd.DataFrame(data=pred)
submission.index = test.index
submission.index.name = 'id'
submission = submission.sort_index()
submission = submission.groupby('id').mean()
submission.to_csv('../submission_'+model_name+'_'+return_now_runtime()+'.csv', index=True) #제출 파일 만들기

In [78]:
model

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=None, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=-1,
                       oob_score=False, random_state=0, verbose=1,
                       warm_start=False)