In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')

In [2]:
# 데이터 불러오기
train  = pd.read_csv('./data/train.csv')
test  = pd.read_csv('./data/test.csv')

pd.set_option('display.max_columns', None)

In [3]:
# # 설문조사 항목 결측치 처리
# # 앞 순번 설문조사한 사람의 숫자를 채워넣기로 함
# train = train.fillna(method='ffill')
# test = test.fillna(method='ffill')

In [4]:
train.drop_duplicates(inplace=True, keep='last')

In [5]:
# 값이 너무 커서 log 처리
train['intro_log'] = np.log(train['introelapse'])
train['test_log'] = np.log(train['testelapse'])
train['survey_log'] = np.log(train['surveyelapse'])

test['intro_log'] = np.log(test['introelapse'])
test['test_log'] = np.log(test['testelapse'])
test['survey_log'] = np.log(test['surveyelapse'])

In [6]:
train.drop(['country', 'index'], axis=1, inplace=True)
test.drop(['country', 'index'], axis=1, inplace=True)

In [7]:
for i,j in train['age'].iteritems():
    if j>100:
        train.loc[i, 'age'] = 100
        
for i,j in test['age'].iteritems():
    if j>100:
        test.loc[i, 'age'] = 100
        
for i,j in train['familysize'].iteritems():
    if j>10:
        train.loc[i, 'familysize'] = 10
        
for i,j in test['familysize'].iteritems():
    if j>10:
        test.loc[i, 'familysize'] = 10

In [8]:
train.isnull().sum().sum(), test.isnull().sum().sum()

(3513, 7731)

In [9]:
val_col = train.drop('nerdiness', axis=1).columns

In [10]:
from sklearn.preprocessing import MinMaxScaler

train[:] = np.nan_to_num(train)
test[:] = np.nan_to_num(test)

scaler = MinMaxScaler()

train[val_col] = scaler.fit_transform(train[val_col])
test[val_col] = scaler.transform(test[val_col])

In [31]:
from pycaret.classification import *

model = setup(
    data = train,
    target = "nerdiness",
    use_gpu = True,
    train_size=0.90,
    session_id = 2022
)

Unnamed: 0,Description,Value
0,session_id,2022
1,Target,nerdiness
2,Target Type,Binary
3,Label Encoded,"0.0: 0, 1.0: 1"
4,Original Data,"(15000, 71)"
5,Missing Values,False
6,Numeric Features,54
7,Categorical Features,16
8,Ordinal Features,False
9,High Cardinality Features,False


In [30]:
# val 5%
top_3_model = compare_models(
                sort="AUC",
                n_select=3, 
                exclude=['xgboost'])

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
et,Extra Trees Classifier,0.7924,0.8849,0.8444,0.7935,0.8181,0.5769,0.5785,0.835
rf,Random Forest Classifier,0.7911,0.8794,0.8423,0.7929,0.8168,0.5743,0.5758,0.662
lightgbm,Light Gradient Boosting Machine,0.7532,0.825,0.8114,0.7591,0.7843,0.4968,0.4984,0.474
gbc,Gradient Boosting Classifier,0.7332,0.8057,0.7965,0.7407,0.7675,0.4554,0.4572,3.469
lr,Logistic Regression,0.7255,0.7946,0.7923,0.7329,0.7613,0.4393,0.4414,0.351
lda,Linear Discriminant Analysis,0.7248,0.7942,0.8027,0.7277,0.7633,0.4364,0.4396,0.085
ada,Ada Boost Classifier,0.7229,0.7941,0.7784,0.7359,0.7565,0.4357,0.4367,0.694
nb,Naive Bayes,0.6525,0.7405,0.5391,0.7635,0.6308,0.3209,0.3389,0.013
knn,K Neighbors Classifier,0.6838,0.7368,0.7724,0.6917,0.7297,0.3516,0.3549,0.851
qda,Quadratic Discriminant Analysis,0.5821,0.7189,0.3049,0.8364,0.4424,0.2141,0.2847,0.057


In [33]:
blended = blend_models(estimator_list = top_3_model, fold = 5, method = 'soft')

Unnamed: 0_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,0.7881,0.8633,0.8336,0.7931,0.8128,0.5691,0.5701
1,0.7748,0.8541,0.8242,0.7802,0.8016,0.5417,0.5428
2,0.7967,0.8695,0.8577,0.7913,0.8232,0.5849,0.5875
3,0.7707,0.8498,0.8341,0.7695,0.8005,0.5321,0.5345
4,0.7696,0.8537,0.8428,0.7638,0.8014,0.5287,0.5324
Mean,0.78,0.8581,0.8385,0.7796,0.8079,0.5513,0.5534
Std,0.0106,0.0072,0.0113,0.0116,0.0089,0.022,0.0217


In [34]:
pred_holdout = predict_model(blended)

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,Voting Classifier,0.7833,0.8551,0.8374,0.7938,0.815,0.554,0.5552


In [35]:
final_model = finalize_model(blended)

In [21]:
predictions = predict_model(final_model, data = test)

In [22]:
submission = pd.read_csv('./data/sample_submission.csv')

In [27]:
submission['nerdiness'] = predictions['Label']


In [28]:
submission.to_csv('전처리두번째.csv', index=False)