In [1]:
import os
import datetime, time
import pandas as pd 
import numpy as np
import multiprocessing # 여러 개의 일꾼 (cpu)들에게 작업을 분산시키는 역할
from multiprocessing import Pool 
from functools import partial # 함수가 받는 인자들 중 몇개를 고정 시켜서 새롭게 파생된 함수를 형성하는 역할
from utils import data_loader_v2 # 자체적으로 만든 data loader version 2.0 ([데이콘 15회 대회] 데이터 설명 및 데이터 불러오기 영상 참조)
from utils import data_loader_v2_2

from sklearn.linear_model import SGDClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import KFold, GridSearchCV
import joblib # 모델을 저장하고 불러오는 역할
import xgboost as xgb

# When True the prediction cell scores the test set; otherwise it scores the
# training features instead.
submit = True
# Name of the classifier to build in the training cell; it is also embedded in
# the saved-model and submission file names.
model_name = "XGBClassifier"
# RandomForestClassifier, XGBClassifier, SVC, SGDClassifier

## Set Path

In [2]:
# Input locations (relative to the notebook's working directory).
train_folder = '../data/train_all/'
test_folder = '../data/test_all/'
train_label_path = '../data/train_label.csv'

# Output locations; the model file name embeds the selected model_name.
model_path = '../model/{}_model_v2.pkl'.format(model_name)
submission_folder = '../submission/'

## Load Files

In [3]:
# os.listdir returns entries in arbitrary order; sort them so the row order of
# the loaded data is the same on every run and on every machine.
train_list = sorted(os.listdir(train_folder))
test_list = sorted(os.listdir(test_folder))
# The first column of the label CSV becomes the index of the label table.
train_label = pd.read_csv(train_label_path, index_col=0)

In [4]:
def _accepts_keyword(func, name):
    """Return True when ``func`` can be called with the keyword argument ``name``."""
    import inspect  # local import: only this helper needs it

    try:
        parameters = inspect.signature(func).parameters.values()
    except (TypeError, ValueError):
        # No introspectable signature: do not risk passing an unknown keyword.
        return False
    keyword_kinds = (inspect.Parameter.POSITIONAL_OR_KEYWORD,
                     inspect.Parameter.KEYWORD_ONLY)
    for parameter in parameters:
        if parameter.kind is inspect.Parameter.VAR_KEYWORD:
            return True
        if parameter.name == name and parameter.kind in keyword_kinds:
            return True
    return False


def data_loader_all_v2(func, 
                       files, 
                       folder='', 
                       train_label=None, 
                       event_time=10, 
                       nrows=60,
                       skiprows=0):   
    """Load every file with ``func`` and stack the resulting frames row-wise.

    Author's note (translated): the loader assumes that every csv file changes
    to state B at the same time point (``event_time``), although the actual
    change point may differ from file to file.

    Parameters
    ----------
    func : callable
        Per-file loader, called as ``func(file, folder=..., train_label=...,
        event_time=..., nrows=...)`` and expected to return a DataFrame.
    files : iterable
        File names handed to ``func`` one at a time.
    folder, train_label, event_time, nrows
        Passed through unchanged to every ``func`` call.
    skiprows : int
        Passed through to ``func`` only when ``func`` accepts a ``skiprows``
        keyword, so loaders without that parameter keep working.
        NOTE(review): the meaning of ``skiprows`` is defined by the loader in
        ``utils`` -- confirm it matches the callers' expectation.

    Returns
    -------
    pandas.DataFrame
        All per-file frames concatenated along axis 0.

    Raises
    ------
    ValueError
        If ``files`` is empty, since there is nothing to concatenate.
    """
    fixed_kwargs = dict(folder=folder,
                        train_label=train_label,
                        event_time=event_time,
                        nrows=nrows)
    # Previously `skiprows` was accepted but silently dropped; forward it now.
    if _accepts_keyword(func, 'skiprows'):
        fixed_kwargs['skiprows'] = skiprows
    func_fixed = partial(func, **fixed_kwargs)

    if __name__ == '__main__':
        # The context manager releases the worker processes even when a
        # loader raises; imap keeps the results in the order of `files`.
        with Pool(processes=multiprocessing.cpu_count()) as pool:
            df_list = list(pool.imap(func_fixed, files))
    else:
        # Outside the main module no pool is created; load serially instead
        # of failing on an unbound `df_list`.
        df_list = [func_fixed(file_name) for file_name in files]

    if not df_list:
        raise ValueError("data_loader_all_v2 received no files to load")

    for d in df_list:
        print(d.shape)
    combined_df = pd.concat(df_list, axis=0)
    return combined_df

## Train

In [None]:
# Build the training frame by running data_loader_v2 over every training file.
train = data_loader_all_v2(
    data_loader_v2, train_list,
    folder=train_folder, train_label=train_label,
    event_time=15, nrows=20,
)

In [6]:
# Show the (rows, columns) shape of the combined training frame.
train.shape

(4135, 5122)

In [None]:
# Wall-clock start, used for the "train time" report at the end of the cell.
now = datetime.datetime.now()

# Split the combined frame into the feature matrix and the target column.
X_train = train.drop(['label'], axis=1)
y_train = train['label']

# Build the estimator selected by model_name.
if model_name == "RandomForestClassifier":
    model = RandomForestClassifier(random_state=0, verbose=1, n_jobs=-1)
elif model_name == "XGBClassifier":
    # "mlogloss" is the multi-class log loss that matches the multi:softprob
    # objective ("logloss" is the binary metric).
    model = xgb.XGBClassifier(objective="multi:softprob", 
                              eval_metric="mlogloss", 
                              random_state=42, 
                              tree_method='gpu_hist')
#     model = xgb.XGBClassifier()
#     param_grid = {'max_depth':[5,8],
#                   'min_child_weight':[3,5],
#                   'gamma':[3],
#                   'tree_method':['gpu_hist'],
#                   'eval_metric':['logloss'],
#                   'objective':['multi:softprob']}
#     cv=KFold(n_splits=6, shuffle=True)
#     gcv=GridSearchCV(model, param_grid=param_grid, cv=cv, scoring='neg_log_loss', n_jobs=6)
#     gcv.fit(X_train, y_train)
#     print('final params', gcv.best_params_)
#     print('best score', gcv.best_score_)
elif model_name == "SVC":
    # probability=True is required because the prediction cell calls
    # predict_proba on the fitted model.
    model = SVC(probability=True, random_state=0)
elif model_name == "SGDClassifier":
    model = SGDClassifier(loss='log', max_iter=100, n_jobs=-1)
else:
    # Fail with a clear message instead of a NameError on `model` below.
    raise ValueError("Unsupported model_name: " + repr(model_name))
    
model.fit(X_train, y_train)

# Make sure the target directory exists before persisting the fitted model.
model_dir = os.path.dirname(model_path)
if model_dir:
    os.makedirs(model_dir, exist_ok=True)
joblib.dump(model, model_path)

print("train time:" + str(datetime.datetime.now()-now))

In [None]:
# Draw one random integer in [0, 560) and hand it to the loader as skiprows.
random_offset = np.random.randint(560)
train = data_loader_all_v2(
    data_loader_v2_2, train_list,
    folder=train_folder, train_label=train_label,
    event_time=15, nrows=20,
    skiprows=random_offset,
)

(6, 5122)
(6, 5122)
(6, 5122)
(6, 5122)
(6, 5122)
(6, 5122)
(6, 5122)
(6, 5122)
(6, 5122)
(6, 5122)
(6, 5122)
(6, 5122)
(6, 5122)
(6, 5122)
(6, 5122)
(6, 5122)
(6, 5122)
(6, 5122)
(6, 5122)
(6, 5122)
(6, 5122)
(6, 5122)
(6, 5122)
(6, 5122)
(6, 5122)
(6, 5122)
(6, 5122)
(6, 5122)
(6, 5122)
(6, 5122)
(6, 5122)
(6, 5122)
(6, 5122)
(6, 5122)
(6, 5122)
(6, 5122)
(6, 5122)
(6, 5122)
(6, 5122)
(6, 5122)
(6, 5122)
(6, 5122)
(6, 5122)
(6, 5122)
(6, 5122)
(6, 5122)
(6, 5122)
(6, 5122)
(6, 5122)
(6, 5122)
(6, 5122)
(6, 5122)
(6, 5122)
(6, 5122)
(6, 5122)
(6, 5122)
(6, 5122)
(6, 5122)
(6, 5122)
(6, 5122)
(6, 5122)
(6, 5122)
(6, 5122)
(6, 5122)
(6, 5122)
(6, 5122)
(6, 5122)
(6, 5122)
(6, 5122)
(6, 5122)
(6, 5122)
(6, 5122)
(6, 5122)
(6, 5122)
(6, 5122)
(6, 5122)
(6, 5122)
(6, 5122)
(6, 5122)
(6, 5122)
(6, 5122)
(6, 5122)
(6, 5122)
(6, 5122)
(6, 5122)
(6, 5122)
(6, 5122)
(6, 5122)
(6, 5122)
(6, 5122)
(6, 5122)
(6, 5122)
(6, 5122)
(6, 5122)
(6, 5122)
(6, 5122)
(6, 5122)
(6, 5122)
(6, 5122)
(6, 5122)


In [19]:
# Show the (rows, columns) shape of the frame loaded with data_loader_v2_2.
train.shape

(60, 23779)

In [None]:
N_RESAMPLES = 5       # number of randomly offset samples to draw
MAX_SKIPROWS = 560    # exclusive upper bound for the random skiprows value

# "mlogloss" is the multi-class log loss that matches multi:softprob.
model = xgb.XGBClassifier(objective="multi:softprob", 
                          eval_metric="mlogloss", 
                          random_state=42, 
                          tree_method='gpu_hist')

# Calling model.fit once per sample would retrain from scratch each time and
# keep only the last sample's model, so collect every sample first and fit
# once on all of them.
sampled_frames = []
for i in range(N_RESAMPLES):
    sampled = data_loader_all_v2(data_loader_v2_2, 
                                 train_list, 
                                 folder=train_folder, 
                                 train_label=train_label, 
                                 event_time=15, 
                                 nrows=35,
                                 skiprows=np.random.randint(MAX_SKIPROWS))
    print(sampled.shape)
    sampled_frames.append(sampled)
    print(i, "end")

train = pd.concat(sampled_frames, axis=0)
X_train = train.drop(['label'], axis=1)
y_train = train['label']
model.fit(X_train, y_train)
# NOTE(review): this model is not written to model_path, and the prediction
# cell reloads the model from model_path -- confirm whether it should be saved.

## Prediction

In [8]:
# Load every test file; no labels are supplied at prediction time.
test = data_loader_all_v2(
    data_loader_v2,
    test_list,
    folder=test_folder,
    train_label=None,
    event_time=10,
    nrows=60,
)

In [9]:
# joblib.load unpickles the file, so only load model files this project wrote.
model = joblib.load(model_path) 
if submit:
    # Class probabilities for the test set, used to build the submission.
    pred = model.predict_proba(test)
else:
    pred = model.predict_proba(X_train)
    # accuracy_score compares true labels with predicted labels, so score
    # y_train against predict(), not the features against probabilities.
    train_accuracy = accuracy_score(y_train, model.predict(X_train))
    print("train accuracy:", train_accuracy)

## Submission

In [10]:
def return_now_runtime():
    """Return the current local time as 'YYYY-MM-DD HH-MM-SS'.

    The time fields are separated by dashes rather than colons.
    """
    return time.strftime("%Y-%m-%d %H-%M-%S", time.localtime())

In [11]:
# Wrap the predictions in a frame that shares the test frame's index.
submission = pd.DataFrame(pred)
submission.index = test.index
submission.index.name = 'id'

# Average all rows that belong to the same id.
submission = submission.sort_index().groupby('id').mean()

# Create the submission file; its name embeds the model name and a timestamp.
# NOTE(review): submission_folder is defined earlier but not used here --
# confirm the intended output location.
submission_path = '../submission_{}_{}.csv'.format(model_name, return_now_runtime())
submission.to_csv(submission_path, index=True)

In [12]:
# Display the repr of the estimator currently bound to `model`.
model

SGDClassifier(alpha=0.0001, average=False, class_weight=None,
              early_stopping=False, epsilon=0.1, eta0=0.0, fit_intercept=True,
              l1_ratio=0.15, learning_rate='optimal', loss='log', max_iter=100,
              n_iter_no_change=5, n_jobs=-1, penalty='l2', power_t=0.5,
              random_state=None, shuffle=True, tol=0.001,
              validation_fraction=0.1, verbose=0, warm_start=False)