In [1]:
# Third-party numerical, data-frame and plotting libraries.
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
# Suppress every warning for the rest of the session. Note that this also
# hides deprecation / future warnings emitted by the libraries imported above.
warnings.filterwarnings('ignore')

In [2]:
# Display every column when a DataFrame is rendered (no column truncation).
pd.set_option('display.max_columns', None)

# Load the training and test sets from the local data directory.
train = pd.read_csv('./data/train.csv')
test = pd.read_csv('./data/test.csv')

In [3]:
# Remove fully duplicated rows, keeping the last occurrence of each.
# Reassigning (instead of inplace=True) makes the data lineage explicit and
# avoids silently mutating an object another name may still refer to.
train = train.drop_duplicates(keep='last')

In [4]:
# Handle missing values in the survey items.
# Decision: fill each gap with the answer given by the previous respondent
# (the preceding row), i.e. a forward fill.
# DataFrame.ffill() is the supported spelling; fillna(method='ffill') is
# deprecated in recent pandas releases and the warning would be hidden by the
# global warnings filter set at the top of the notebook.
# Note: a missing value in the very first row has no predecessor and stays NaN.
train = train.ffill()

In [5]:
# Columns kept for modelling: the survey items Q1..Q26 followed by the
# target column 'nerdiness'.
valid_columns = ['Q' + str(number) for number in range(1, 27)] + ['nerdiness']

In [6]:
# Restrict the training frame to the columns listed in valid_columns.
train = train.loc[:, valid_columns]

In [7]:
# Per-column count of missing values remaining in the training frame.
train.isna().sum()

Q1           0
Q2           0
Q3           0
Q4           0
Q5           0
Q6           0
Q7           0
Q8           0
Q9           0
Q10          0
Q11          0
Q12          0
Q13          0
Q14          0
Q15          0
Q16          0
Q17          0
Q18          0
Q19          0
Q20          0
Q21          0
Q22          0
Q23          0
Q24          0
Q25          0
Q26          0
nerdiness    0
dtype: int64

In [8]:
# Star import brings setup() into scope; presumably it also supplies the other
# PyCaret classification helpers called further down the notebook.
from pycaret.classification import *

# Initialise the PyCaret classification experiment on the prepared frame,
# with 'nerdiness' as the target and a fixed session id for reproducibility.
model = setup(
    data=train,
    target="nerdiness",
    train_size=0.95,
    session_id=42,
    use_gpu=True,
)

Unnamed: 0,Description,Value
0,session_id,42
1,Target,nerdiness
2,Target Type,Binary
3,Label Encoded,
4,Original Data,"(15000, 27)"
5,Missing Values,False
6,Numeric Features,26
7,Categorical Features,0
8,Ordinal Features,False
9,High Cardinality Features,False


In [9]:
# Compare the candidate classifiers (XGBoost excluded), rank them by AUC and
# keep the best ones.
# NOTE(review): the variable is named top_5_model but n_select is 3, so only
# three models are returned — confirm which of the two is intended.
top_5_model = compare_models(
    exclude=['xgboost'],
    n_select=3,
    sort="AUC",
)

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
et,Extra Trees Classifier,0.7881,0.8794,0.8373,0.7928,0.8144,0.5678,0.5692,0.758
rf,Random Forest Classifier,0.7867,0.8716,0.8347,0.7925,0.813,0.5653,0.5665,0.589
lightgbm,Light Gradient Boosting Machine,0.7385,0.8103,0.8028,0.746,0.7732,0.4654,0.4674,0.417
gbc,Gradient Boosting Classifier,0.7299,0.7987,0.7959,0.7384,0.766,0.4478,0.4497,1.351
lr,Logistic Regression,0.7201,0.7912,0.7916,0.7281,0.7585,0.4269,0.4291,0.071
lda,Linear Discriminant Analysis,0.7202,0.791,0.8025,0.7238,0.7611,0.4257,0.429,0.042
ada,Ada Boost Classifier,0.7208,0.7883,0.7803,0.7339,0.7564,0.4302,0.4314,0.324
nb,Naive Bayes,0.6998,0.767,0.7605,0.7166,0.7378,0.3875,0.3886,0.011
knn,K Neighbors Classifier,0.7013,0.7571,0.7806,0.7105,0.7438,0.3876,0.3902,0.715
qda,Quadratic Discriminant Analysis,0.686,0.7546,0.761,0.6999,0.7291,0.3572,0.359,0.048


In [10]:
# Combine the selected models into a soft-voting ensemble, evaluated with
# 5-fold cross-validation.
blended = blend_models(
    estimator_list=top_5_model,
    method='soft',
    fold=5,
)

Unnamed: 0_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,0.7811,0.8688,0.8274,0.7886,0.8075,0.554,0.5549
1,0.7842,0.8615,0.8489,0.7813,0.8137,0.5584,0.5611
2,0.7832,0.8691,0.8395,0.785,0.8114,0.5571,0.5588
3,0.7782,0.8691,0.825,0.7863,0.8052,0.5482,0.549
4,0.7733,0.8522,0.8375,0.773,0.804,0.5361,0.5385
Mean,0.78,0.8642,0.8357,0.7828,0.8084,0.5507,0.5525
Std,0.0039,0.0066,0.0087,0.0054,0.0037,0.0081,0.0081


In [11]:
# Score the blended model; with no data argument PyCaret evaluates on the
# holdout split reserved by setup().
pred_holdout = predict_model(estimator=blended)

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,Voting Classifier,0.7936,0.8757,0.8619,0.7694,0.813,0.5844,0.5891


In [12]:
# Produce the final model from the blended ensemble for use on the test set.
final_model = finalize_model(estimator=blended)

In [14]:
# Drop the target from the column list and keep only the feature columns of
# the test frame.
# The list is rebuilt rather than mutated with list.remove(): remove() raises
# ValueError once 'nerdiness' is already gone, so the original cell could not
# be executed twice. This form leaves valid_columns in the same end state and
# is safe to re-run.
valid_columns = [column for column in valid_columns if column != 'nerdiness']
# NOTE(review): unlike train, test is not forward-filled in the visible cells —
# confirm that missing values in test are handled as intended.
test = test[valid_columns]
test

Unnamed: 0,Q1,Q2,Q3,Q4,Q5,Q6,Q7,Q8,Q9,Q10,...,Q17,Q18,Q19,Q20,Q21,Q22,Q23,Q24,Q25,Q26
0,4.0,4.0,3.0,5.0,5.0,5.0,3.0,5.0,4.0,5.0,...,5.0,4.0,4.0,3.0,5.0,4.0,5.0,5.0,4.0,4.0
1,4.0,5.0,4.0,4.0,5.0,4.0,5.0,5.0,5.0,4.0,...,5.0,3.0,5.0,4.0,4.0,1.0,5.0,5.0,4.0,5.0
2,5.0,5.0,5.0,5.0,4.0,5.0,5.0,5.0,5.0,4.0,...,4.0,5.0,2.0,3.0,5.0,4.0,5.0,5.0,1.0,5.0
3,5.0,4.0,3.0,4.0,5.0,4.0,5.0,4.0,4.0,5.0,...,4.0,5.0,5.0,4.0,1.0,1.0,4.0,4.0,4.0,4.0
4,5.0,5.0,5.0,5.0,5.0,3.0,5.0,5.0,5.0,5.0,...,5.0,4.0,1.0,5.0,2.0,3.0,5.0,5.0,3.0,3.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
35447,4.0,5.0,5.0,3.0,3.0,4.0,5.0,5.0,4.0,3.0,...,5.0,4.0,4.0,5.0,4.0,3.0,4.0,4.0,5.0,3.0
35448,5.0,5.0,5.0,5.0,5.0,4.0,5.0,3.0,5.0,3.0,...,5.0,5.0,5.0,5.0,5.0,2.0,4.0,4.0,3.0,5.0
35449,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,...,5.0,1.0,5.0,3.0,5.0,1.0,5.0,5.0,4.0,5.0
35450,5.0,5.0,4.0,5.0,5.0,1.0,5.0,1.0,5.0,5.0,...,5.0,1.0,5.0,1.0,5.0,5.0,5.0,5.0,5.0,5.0


In [15]:
# Apply the finalized model to the test features.
predictions = predict_model(estimator=final_model, data=test)