# Library

In [None]:
# Install PyCaret into the current runtime before anything imports it.
# NOTE(review): the version is not pinned, and a later cell reads
# get_config("prep_pipe"), which presumably ties this notebook to a specific
# PyCaret release -- confirm the intended version and pin it here.
!pip install pycaret

In [None]:
# Imports shared by the whole notebook.
import warnings
# Silence every warning for the rest of the session. This keeps cell output
# short, but it also hides deprecation notices raised by the libraries below.
warnings.filterwarnings('ignore')
import pandas as pd
import numpy as np
# Star import: brings the PyCaret classification helpers called in later cells
# (setup, add_metric, compare_models, blend_models, predict_model,
# finalize_model, get_config) into the notebook namespace.
from pycaret.classification import *
# Score function registered as the custom 'logloss' metric further down.
from sklearn.metrics import log_loss
import random

# Data Load & Preprocessing

In [None]:
# Mount Google Drive at /content/drive so the CSV files referenced through
# PATH in the following cells can be read and the submission written back.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Folder on the mounted Drive that holds the competition CSV files.
PATH = "/content/drive/MyDrive/dacon/credit_card"

# Training data: drop the 'index' column and replace every missing value
# with the placeholder string 'NAN'.
train = (
    pd.read_csv(PATH+'/train.csv')
    .drop(columns=['index'])
    .fillna('NAN')
)

# Test data: identical treatment so both frames share the same columns
# and the same missing-value placeholder.
test = (
    pd.read_csv(PATH+'/test.csv')
    .drop(columns=['index'])
    .fillna('NAN')
)

# Submission template; its probability columns are filled in at the end.
submit = pd.read_csv(PATH+'/sample_submission.csv')

# Pycaret

In [None]:
# These two imports repeat the ones in the Library cell at the top.
from pycaret.classification import *
from sklearn.metrics import log_loss

# Initialise the PyCaret experiment on the training frame with 'credit' as the
# label; train_size=0.85 leaves the remaining 15% as PyCaret's hold-out set.
# session_id fixes the random seed used for the split, the CV folds and the
# estimators, so the tables below can be reproduced on a fresh run
# (without it every run draws a new seed).
clf = setup(train, target='credit', train_size=0.85, session_id=42)

In [None]:
# Register log loss as a custom metric so models can be ranked by it; it is
# scored on predicted probabilities because the submission is built from
# predict_proba output.
add_metric(
    id='logloss',
    name='LogLoss',
    score_func=log_loss,
    target='pred_proba',
    greater_is_better=False,
)

Name                                                           LogLoss
Display Name                                                   LogLoss
Score Function                   <function log_loss at 0x7f4d38f7fd40>
Scorer               make_scorer(log_loss, greater_is_better=False,...
Target                                                      pred_proba
Args                                                                {}
Greater is Better                                                False
Multiclass                                                        True
Custom                                                            True
Name: logloss, dtype: object

In [None]:
# Rank the candidate models by the custom log-loss metric with 5-fold CV and
# keep the best five. svm and ridge are left out because they do not support
# predict_proba.
best5 = compare_models(
    exclude=['svm', 'ridge'],
    sort='logloss',
    n_select=5,
    fold=5,
)

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,LogLoss,TT (Sec)
lightgbm,Light Gradient Boosting Machine,0.6966,0.7127,0.4275,0.6885,0.6211,0.2395,0.3199,0.762,0.884
gbc,Gradient Boosting Classifier,0.6908,0.6607,0.4101,0.6625,0.6054,0.2109,0.3017,0.7957,10.274
lda,Linear Discriminant Analysis,0.6403,0.6089,0.338,0.5232,0.5085,0.0135,0.0475,0.8648,0.272
nb,Naive Bayes,0.6402,0.6064,0.334,0.5406,0.5003,0.0023,0.0258,0.8669,0.088
lr,Logistic Regression,0.6397,0.5727,0.3333,0.4092,0.4992,0.0,0.0,0.8752,1.116
rf,Random Forest Classifier,0.6986,0.7492,0.5423,0.6781,0.6837,0.3751,0.3802,1.0073,3.392
ada,Ada Boost Classifier,0.6893,0.6261,0.4043,0.618,0.5994,0.1987,0.2995,1.0807,1.018
et,Extra Trees Classifier,0.6685,0.7067,0.5198,0.649,0.6552,0.318,0.3216,2.6379,3.544
knn,K Neighbors Classifier,0.6221,0.6681,0.4768,0.6039,0.6109,0.2336,0.2353,3.9125,0.83
dt,Decision Tree Classifier,0.6005,0.6345,0.4884,0.612,0.6058,0.2495,0.2498,13.381,0.288


In [None]:
# Blend the five automatically selected models into one voting ensemble,
# evaluated with 5-fold CV and optimised for the custom log-loss metric.
blended = blend_models(
    estimator_list=best5,
    optimize='logloss',
    fold=5,
)

# Score the blend on the hold-out split.
# NOTE(review): the hold-out table recorded with this cell shows AUC and
# LogLoss as 0 for this blend -- worth checking why the probability-based
# metrics were not computed.
pred_holdout = predict_model(blended)

Unnamed: 0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,LogLoss
0,0.6888,0.6674,0.4032,0.6194,0.5979,0.1955,0.2988,0.8154
1,0.6934,0.6637,0.4097,0.6293,0.6054,0.2099,0.3155,0.8126
2,0.691,0.6749,0.4061,0.6217,0.6014,0.203,0.3064,0.8114
3,0.6882,0.6736,0.402,0.6137,0.597,0.195,0.2957,0.8167
4,0.6887,0.6584,0.4022,0.6205,0.5973,0.1933,0.2982,0.8211
Mean,0.69,0.6676,0.4046,0.6209,0.5998,0.1993,0.3029,0.8154
SD,0.002,0.0062,0.0029,0.005,0.0032,0.0063,0.0073,0.0034


Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,LogLoss
0,Voting Classifier,0.6989,0,0.4056,0.6273,0.6102,0.2008,0.3049,0


In [None]:
# lda and nb have a good log loss but comparatively poor accuracy and AUC,
# so a hand-picked model list is built here instead of the automatic top 5.
# NOTE(review): lda and nb are still in the include list and n_select=6 keeps
# all six models, while the prediction section describes the custom blend as
# the models ranking in the top 4 on Accuracy, AUC and LogLoss -- confirm
# whether lda/nb were meant to be dropped.
custom_model = compare_models(
    include=['lightgbm', 'gbc', 'rf', 'ada', 'lda', 'nb'],
    sort='logloss',
    n_select=6,
    fold=5,
)

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,LogLoss,TT (Sec)
lightgbm,Light Gradient Boosting Machine,0.6966,0.7127,0.4275,0.6885,0.6211,0.2395,0.3199,0.762,0.904
gbc,Gradient Boosting Classifier,0.6908,0.6607,0.4101,0.6625,0.6054,0.2109,0.3017,0.7957,10.334
lda,Linear Discriminant Analysis,0.6403,0.6089,0.338,0.5232,0.5085,0.0135,0.0475,0.8648,0.272
nb,Naive Bayes,0.6402,0.6064,0.334,0.5406,0.5003,0.0023,0.0258,0.8669,0.088
rf,Random Forest Classifier,0.6986,0.7492,0.5423,0.6781,0.6837,0.3751,0.3802,1.0073,3.422
ada,Ada Boost Classifier,0.6893,0.6261,0.4043,0.618,0.5994,0.1987,0.2995,1.0807,1.044


In [None]:
# Blend the hand-picked models into a voting ensemble, again with 5-fold CV
# and the custom log-loss metric as the optimisation target.
blended_custom = blend_models(
    estimator_list=custom_model,
    optimize='logloss',
    fold=5,
)

# Score the custom blend on the hold-out split.
pred_holdout_custom = predict_model(blended_custom)

Unnamed: 0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,LogLoss
0,0.6914,0.7432,0.4107,0.7085,0.6057,0.2106,0.305,0.7853
1,0.6992,0.7419,0.422,0.7306,0.6176,0.2335,0.3326,0.7823
2,0.695,0.749,0.4151,0.6932,0.6109,0.2227,0.3168,0.7825
3,0.6911,0.7424,0.4089,0.733,0.6044,0.2107,0.3028,0.7857
4,0.6942,0.7306,0.412,0.7432,0.6082,0.2166,0.3144,0.7888
Mean,0.6942,0.7414,0.4137,0.7217,0.6093,0.2188,0.3143,0.7849
SD,0.0029,0.006,0.0046,0.0182,0.0047,0.0086,0.0106,0.0024


Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,LogLoss
0,Voting Classifier,0.7045,0.7493,0.4146,0.6629,0.6199,0.2226,0.3232,0.7725


# Prediction

In [None]:
# Finalise both blends, in this order:
#   final_model        - blend of the five automatically selected models
#   final_model_custom - blend described as the models in the top 4 on
#                        Accuracy, AUC and LogLoss
final_model, final_model_custom = (
    finalize_model(ensemble) for ensemble in (blended, blended_custom)
)

In [None]:
# Predict class probabilities for the test set with the custom blend
# (described as the models in the top 4 on Accuracy, AUC and LogLoss).
#
# The fitted preprocessing steps from the experiment are chained with the
# finalised model in a NEW pipeline. Appending to prep_pipe.steps directly
# would mutate PyCaret's own pipeline object, so every re-run of this cell
# would stack another 'trained_model' step onto it.
# Imported locally under an alias so the star import from pycaret cannot
# shadow it.
from sklearn.pipeline import Pipeline as SkPipeline

prep_pipe = get_config("prep_pipe")
inference_pipe = SkPipeline(
    steps=[*prep_pipe.steps, ('trained_model', final_model_custom)]
)

# One row per test sample, one probability column per credit class.
prections = inference_pipe.predict_proba(test)
prections

array([[0.1376445 , 0.2152681 , 0.6470874 ],
       [0.30306087, 0.19635073, 0.5005884 ],
       [0.13676892, 0.1755953 , 0.68763579],
       ...,
       [0.11744462, 0.15375159, 0.72880379],
       [0.2601363 , 0.23328373, 0.50657997],
       [0.15837601, 0.2577552 , 0.58386879]])

In [None]:
# Split the probability matrix column-wise: i, j and k hold the first, second
# and third value of every prediction row, as plain lists.
i, j, k = ([row[col] for row in prections] for col in range(3))

# Sanity check: all three lists must have one entry per test row.
len(i), len(j), len(k)

(10000, 10000, 10000)

In [None]:
# Write the three probability lists into the submission columns '0', '1', '2'
# (presumably one column per credit class -- confirm against the sample file).
for class_label, class_probs in zip(('0', '1', '2'), (i, j, k)):
    submit[class_label] = class_probs

# Preview the first rows before saving.
submit.head(10)

Unnamed: 0,index,0,1,2
0,26457,0.137644,0.215268,0.647087
1,26458,0.303061,0.196351,0.500588
2,26459,0.136769,0.175595,0.687636
3,26460,0.138452,0.164291,0.697257
4,26461,0.157905,0.222059,0.620036
5,26462,0.148207,0.200451,0.651343
6,26463,0.346082,0.58858,0.065338
7,26464,0.154869,0.185122,0.660009
8,26465,0.131738,0.191057,0.677205
9,26466,0.131462,0.2677,0.600837


In [None]:
# Save the submission next to the input files, without the DataFrame index.
# The path separator must be '/': the previous "\submisison.csv" literal used a
# backslash (an invalid escape sequence), which on Colab/Linux becomes part of
# the file name instead of separating the folder from the file. The misspelled
# file name is corrected at the same time.
submit.to_csv(PATH + "/submission.csv", index=False)