### 사전설치

In [None]:
# !pip install pycaret

### DATA LOAD 

In [9]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from pycaret.classification import * #pycaret 중 분류에 필요한 모듈

In [5]:
path = './data/'
train = pd.read_csv(path +'train.csv')
test = pd.read_csv(path + 'test_x.csv')
submission = pd.read_csv(path + 'sample_submission.csv')

In [6]:
print(train.shape)
print(test.shape)
print(submission.shape)

(45532, 78)
(11383, 77)
(11383, 2)


### 실험환경구축
PyCaret에서는 모델 학습 전 실험 환경을 구축 해주어야 합니다. setup 함수를 통해 환경을 구축할 수 있습니다.  
setup 단계에서는 PyCaret이 자동으로 컬럼 형태를 인식합니다. 그 후 사용자에게 제대로 인식되었는지 확인을 받게 됩니다. 그 때 enter를 눌러주시면 됩니다.  
또한 주어진 데이터의 얼마를 사용하여 train / validation을 구축할지 묻게 되는데, 전체 데이터를 사용하고 싶다면 enter 눌러주시면 됩니다.  

In [10]:
# 'voted' 컬럼이 예측 대상이므로 target 인자에 명시
clf = setup(data = train, target = 'voted')

Setup Succesfully Completed!


Unnamed: 0,Description,Value
0,session_id,6329
1,Target Type,Binary
2,Label Encoded,"1: 0, 2: 1"
3,Original Data,"(45532, 78)"
4,Missing Values,False
5,Numeric Features,42
6,Categorical Features,35
7,Ordinal Features,False
8,High Cardinality Features,False
9,High Cardinality Method,


### 모델 학습 및 비교 (Train models and compare)
환경 구축을 했으니 PyCaret에서 제공하는 기본 모델에 대해 학습하고 비교해보겠습니다.  
compared_models 함수를 통해 15개의 기본 모델을 학습하고 성능을 비교할 수 있습니다.  
AUC 기준으로 성능이 가장 좋은 3개의 모델을 추려내어 저장해보겠습니다. 본 대회 평가지표가 AUC이기 때문에 AUC 기준으로 모델을 선정합니다.  

In [11]:
best_3 = compare_models(sort = 'AUC', n_select = 3)

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
0,Light Gradient Boosting Machine,0.6933,0.7637,0.6429,0.7593,0.6962,0.3908,0.3963,0.6109
1,CatBoost Classifier,0.692,0.7632,0.6552,0.7499,0.6993,0.3865,0.3902,16.2719
2,Gradient Boosting Classifier,0.6922,0.7622,0.6377,0.7607,0.6938,0.3891,0.3953,21.7696
3,Linear Discriminant Analysis,0.6903,0.7597,0.6591,0.7452,0.6995,0.3825,0.3856,0.8446
4,Extra Trees Classifier,0.6901,0.7582,0.6447,0.7531,0.6946,0.384,0.3888,1.322
5,Ada Boost Classifier,0.6887,0.7558,0.6538,0.7456,0.6966,0.3797,0.3832,5.4131
6,Extreme Gradient Boosting,0.6765,0.7448,0.6616,0.7233,0.691,0.353,0.3546,3.2873
7,Random Forest Classifier,0.6553,0.7113,0.6059,0.7195,0.6578,0.3156,0.3204,0.2293
8,Decision Tree Classifier,0.6099,0.6061,0.6465,0.6424,0.6444,0.2124,0.2124,1.6256
9,Naive Bayes,0.4555,0.5274,0.0134,0.589,0.0262,0.0021,0.0098,0.1431


### 모델 앙상블 (Model Ensemble)
학습된 3개의 모델을 앙상블 시키도록 하겠습니다. 본 대회는 score 최적화를 위해 확률 값을 예측해야 하므로 soft vote ensemble을 진행하겠습니다.

In [12]:
blended = blend_models(estimator_list = best_3, fold = 5, method = 'soft')

Unnamed: 0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,0.6941,0.7649,0.6322,0.7674,0.6933,0.3939,0.4012
1,0.6985,0.7716,0.6572,0.7591,0.7045,0.4,0.4043
2,0.689,0.7612,0.6348,0.7574,0.6907,0.3829,0.389
3,0.6878,0.7588,0.6367,0.754,0.6904,0.38,0.3856
4,0.6939,0.7729,0.6508,0.7555,0.6992,0.3911,0.3957
Mean,0.6927,0.7659,0.6424,0.7587,0.6956,0.3896,0.3951
SD,0.0039,0.0056,0.0098,0.0047,0.0054,0.0073,0.0071


### 모델 예측 (Prediction)
구축된 앙상블 모델을 통해 예측을 해보겠습니다.  
setup 환경에 이미 hold-out set이 존재하므로 해당 데이터에 대해 예측을 하여 모델 성능을 확인하겠습니다.

In [13]:
pred_holdout = predict_model(blended)

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,Voting Classifier,0.702,0.7718,0.6557,0.7658,0.7065,0.4076,0.4127


### 전체 데이터에 대한 재학습 (Re-training the model on whole data)
현재까지 실험은 주어진 train 데이터를 다시 한 번 train / validation으로 나눠서 실험을 한 것이므로, 전체 train 데이터에 학습되어 있지 않습니다.  
최적의 성능을 위해 전체 데이터에 학습을 시켜주도록 하겠습니다.

In [14]:
final_model = finalize_model(blended)

### 대회용 test set에 대한 예측 (Predicting on test set for the competition)
predict_model 함수를 통해 재학습된 모델을 대회용 test set에 대해 예측해보겠습니다.

In [15]:
predictions = predict_model(final_model, data = test)
predictions

Unnamed: 0,index,QaA,QaE,QbA,QbE,QcA,QcE,QdA,QdE,QeA,...,wr_06,wr_07,wr_08,wr_09,wr_10,wr_11,wr_12,wr_13,Label,Score
0,0,3.0,736,2.0,2941,3.0,4621,1.0,4857,2.0,...,0,0,1,0,1,0,1,1,2,0.6481
1,1,3.0,514,2.0,1952,3.0,1552,3.0,821,4.0,...,0,0,0,0,0,0,0,0,2,0.8790
2,2,3.0,500,2.0,2507,4.0,480,2.0,614,2.0,...,0,1,1,0,1,0,1,1,1,0.4690
3,3,1.0,669,1.0,1050,5.0,1435,2.0,2252,5.0,...,1,1,1,1,1,1,1,1,1,0.2066
4,4,2.0,499,1.0,1243,5.0,845,2.0,1666,2.0,...,0,1,1,0,1,1,1,1,2,0.7522
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11378,11378,5.0,427,5.0,1066,5.0,588,1.0,560,2.0,...,0,1,1,0,1,0,1,1,1,0.3822
11379,11379,1.0,314,5.0,554,5.0,230,1.0,956,2.0,...,1,1,1,1,1,1,1,1,2,0.8842
11380,11380,1.0,627,2.0,799,1.0,739,2.0,1123,1.0,...,0,1,1,0,1,0,1,1,1,0.2244
11381,11381,2.0,539,1.0,2090,2.0,4642,1.0,673,2.0,...,0,1,1,0,1,1,1,0,1,0.3184


In [17]:
submission['voted'] = predictions['Score']

In [18]:
submission.to_csv('submission_AutoML.csv', index = False)