In [1]:
import pandas as pd
import numpy as np
from scipy.stats import norm, skew
import matplotlib.pyplot as plt
import seaborn as sns

import warnings
# Silence all warnings for the session. The previous version installed an
# 'always' filter immediately before this one; the 'ignore' filter is inserted
# in front of it and matches every warning first, so that call had no effect.
warnings.filterwarnings('ignore')

# Raise pandas' display limits to 100 columns and 1000 rows.
pd.set_option('display.max_columns', 100)
pd.set_option('display.max_rows', 1000)

In [2]:
# Load the data: training set, test set and the sample submission file.
import pandas as pd

train, test, submission = (
    pd.read_csv(csv_name)
    for csv_name in ("train.csv", "test.csv", "sample_submission.csv")
)

In [3]:
# Show the column labels of the freshly loaded training frame.
train.columns

Index(['index', 'Q1', 'Q2', 'Q3', 'Q4', 'Q5', 'Q6', 'Q7', 'Q8', 'Q9', 'Q10',
       'Q11', 'Q12', 'Q13', 'Q14', 'Q15', 'Q16', 'Q17', 'Q18', 'Q19', 'Q20',
       'Q21', 'Q22', 'Q23', 'Q24', 'Q25', 'Q26', 'country', 'introelapse',
       'testelapse', 'surveyelapse', 'TIPI1', 'TIPI2', 'TIPI3', 'TIPI4',
       'TIPI5', 'TIPI6', 'TIPI7', 'TIPI8', 'TIPI9', 'TIPI10', 'VCL1', 'VCL2',
       'VCL3', 'VCL4', 'VCL5', 'VCL6', 'VCL7', 'VCL8', 'VCL9', 'VCL10',
       'VCL11', 'VCL12', 'VCL13', 'VCL14', 'VCL15', 'VCL16', 'education',
       'urban', 'gender', 'engnat', 'age', 'hand', 'religion', 'orientation',
       'voted', 'married', 'familysize', 'ASD', 'nerdiness'],
      dtype='object')

In [4]:
# Basic preprocessing: remove the 'index' column from both frames.
train = train.drop(columns='index')
test = test.drop(columns='index')

In [5]:
# Per-column non-null counts and dtypes of train (reveals the missing values).
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 15000 entries, 0 to 14999
Data columns (total 69 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   Q1            14959 non-null  float64
 1   Q2            14931 non-null  float64
 2   Q3            14950 non-null  float64
 3   Q4            14929 non-null  float64
 4   Q5            14962 non-null  float64
 5   Q6            14952 non-null  float64
 6   Q7            14924 non-null  float64
 7   Q8            14952 non-null  float64
 8   Q9            14944 non-null  float64
 9   Q10           14928 non-null  float64
 10  Q11           14941 non-null  float64
 11  Q12           14933 non-null  float64
 12  Q13           14960 non-null  float64
 13  Q14           14964 non-null  float64
 14  Q15           14955 non-null  float64
 15  Q16           14967 non-null  float64
 16  Q17           14963 non-null  float64
 17  Q18           14937 non-null  float64
 18  Q19           14947 non-nu

In [6]:
# Per-column non-null counts and dtypes of test.
test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 35452 entries, 0 to 35451
Data columns (total 68 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   Q1            35333 non-null  float64
 1   Q2            35305 non-null  float64
 2   Q3            35322 non-null  float64
 3   Q4            35327 non-null  float64
 4   Q5            35362 non-null  float64
 5   Q6            35320 non-null  float64
 6   Q7            35310 non-null  float64
 7   Q8            35344 non-null  float64
 8   Q9            35348 non-null  float64
 9   Q10           35232 non-null  float64
 10  Q11           35333 non-null  float64
 11  Q12           35303 non-null  float64
 12  Q13           35356 non-null  float64
 13  Q14           35350 non-null  float64
 14  Q15           35345 non-null  float64
 15  Q16           35367 non-null  float64
 16  Q17           35373 non-null  float64
 17  Q18           35305 non-null  float64
 18  Q19           35355 non-nu

# elapse 관련 변수는 skew가 심하므로 로그변환 처리

In [7]:
# Log transform: overwrite each *elapse column with log(1 + value) in both frames.
# Note that re-running this cell transforms the already-transformed columns again.
for _frame in (train, test):
    for _elapse in ('introelapse', 'testelapse', 'surveyelapse'):
        _frame[_elapse] = np.log1p(_frame[_elapse])

# question 답변 안함과 nerdiness는 크게 상관이 없어보임

## 결론: Tipi를 답변하지 않을 수록 nerd가 아닐 확률이 높다...?

# << missing 처리 기준 >>
# question은 최빈값으로 채우기
# TIPI는 0으로 새로운 응답변수 생성

In [8]:
# Mode(s) of every questionnaire item Q1..Q26, computed on train only.
# Each name is bound to the Series returned by .mode(); the imputation cell
# below reads element [0] of each.
(Q1, Q2, Q3, Q4, Q5, Q6, Q7, Q8, Q9, Q10, Q11, Q12, Q13,
 Q14, Q15, Q16, Q17, Q18, Q19, Q20, Q21, Q22, Q23, Q24, Q25, Q26) = (
    train[f'Q{_number}'].mode() for _number in range(1, 27)
)

In [9]:
# Fill missing Q1..Q26 answers in train and test with the first train-set mode
# of the corresponding question (test never contributes to the fill value).
_question_modes = (
    Q1, Q2, Q3, Q4, Q5, Q6, Q7, Q8, Q9, Q10, Q11, Q12, Q13,
    Q14, Q15, Q16, Q17, Q18, Q19, Q20, Q21, Q22, Q23, Q24, Q25, Q26,
)
for _frame in (train, test):
    for _number, _mode in enumerate(_question_modes, start=1):
        _question = f'Q{_number}'
        _frame[_question] = _frame[_question].fillna(_mode[0])

In [10]:
# Missing TIPI answers are filled with 0 so that "no answer" behaves like an
# extra response level of its own.
tipi = [f'TIPI{_k}' for _k in range(1, 11)]
for _frame in (train, test):
    _frame[tipi] = _frame[tipi].fillna(0)

# 나이, 가족수 이상치 처리

#### 나이 답변을 이상하게 한 사람은 nerd일까?

크게 상관없는듯 -> 결측치, 이상치(2999?)는 평균으로 채우기로 결정

In [11]:
def age_outlier(x, fill_value=None, upper=100):
    """Replace an implausibly large value with a fill value.

    Parameters
    ----------
    x : scalar
        Value to check (an age in this notebook).
    fill_value : scalar, optional
        Replacement returned when ``x > upper``. When omitted, the rounded
        mean of the plausible ages (``<= upper``) in the global ``train``
        frame is used. That default is derived from ``train.age``, so pass
        ``fill_value`` explicitly when cleaning any other column.
    upper : number, default 100
        Largest value still considered plausible.

    Returns
    -------
    scalar
        ``fill_value`` (or the default described above) if ``x > upper``,
        otherwise ``x`` unchanged. NaN compares false and is returned as is.
    """
    if x > upper:
        if fill_value is None:
            # Average only plausible ages: including the outliers themselves
            # (e.g. 2999) inflated the mean, and made the fill used for train
            # differ from the one used for test after train had been cleaned.
            plausible_ages = train.age[train.age <= upper]
            fill_value = round(plausible_ages.mean())
        return fill_value
    return x

In [12]:
# Clean implausible ages element-wise; train is processed before test.
for _frame in (train, test):
    _frame['age'] = _frame['age'].apply(age_outlier)

In [13]:
def age_grouping(x):
    """Bucket an age into the upper bound of its decade.

    Values up to 10 map to 10, values in (10, 20] map to 20, and so on up to
    100. Anything that is not ``<= 100`` (larger values, or NaN) is returned
    unchanged.
    """
    for decade_upper in range(10, 101, 10):
        if x <= decade_upper:
            return decade_upper
    return x

In [14]:
# Replace each age by its decade bucket in both frames.
for _frame in (train, test):
    _frame['age'] = _frame['age'].apply(age_grouping)

In [15]:
# familysize

In [16]:
# Replace implausible family sizes (> 100) in both frames.
# This used to go through age_outlier(), whose default replacement is derived
# from train.age, so oversized families were overwritten with an age-based
# number. The fill value is now the rounded mean of the plausible family
# sizes in train, computed once and shared by train and test.
_familysize_cap = 100
_familysize_outlier_fill = round(
    train.loc[train['familysize'] <= _familysize_cap, 'familysize'].mean()
)
# NaN compares false against the cap, so missing values are left for the
# NaN-imputation cell that follows.
train['familysize'] = train['familysize'].mask(
    train['familysize'] > _familysize_cap, _familysize_outlier_fill
)
test['familysize'] = test['familysize'].mask(
    test['familysize'] > _familysize_cap, _familysize_outlier_fill
)

In [17]:
# Fill missing family sizes in both frames with the rounded train mean.
# Filling train with that rounded value pulls its mean towards the same
# integer, so computing the value once gives the same fill for test as
# recomputing it after train has been filled.
_familysize_na_fill = round(train['familysize'].mean())
train['familysize'] = train['familysize'].fillna(_familysize_na_fill)
test['familysize'] = test['familysize'].fillna(_familysize_na_fill)

# 신상정보를 답변하지 않은 것과 너드는 크게 상관이 없어보임 -> 최빈값으로 대체 처리 

# 신상정보(?) 변수 전처리

신상정보는 최빈값으로 넣음

In [18]:
# Train-set mode(s) of each demographic column; every name holds the Series
# returned by .mode().
edu, gen, eng, han, ori, vot, mar, asd, rel = (
    train[_demographic].mode()
    for _demographic in (
        'education', 'gender', 'engnat', 'hand', 'orientation',
        'voted', 'married', 'ASD', 'religion',
    )
)

In [19]:
# Fill missing demographic answers in train and test with the first train-set
# mode of the matching column.
_demographic_modes = {
    'education': edu,
    'gender': gen,
    'engnat': eng,
    'hand': han,
    'orientation': ori,
    'voted': vot,
    'married': mar,
    'ASD': asd,
    'religion': rel,
}
for _frame in (train, test):
    for _demographic, _mode in _demographic_modes.items():
        _frame[_demographic] = _frame[_demographic].fillna(_mode[0])

In [20]:
# Re-inspect train's non-null counts after the imputation cells above.
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 15000 entries, 0 to 14999
Data columns (total 69 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   Q1            15000 non-null  float64
 1   Q2            15000 non-null  float64
 2   Q3            15000 non-null  float64
 3   Q4            15000 non-null  float64
 4   Q5            15000 non-null  float64
 5   Q6            15000 non-null  float64
 6   Q7            15000 non-null  float64
 7   Q8            15000 non-null  float64
 8   Q9            15000 non-null  float64
 9   Q10           15000 non-null  float64
 10  Q11           15000 non-null  float64
 11  Q12           15000 non-null  float64
 12  Q13           15000 non-null  float64
 13  Q14           15000 non-null  float64
 14  Q15           15000 non-null  float64
 15  Q16           15000 non-null  float64
 16  Q17           15000 non-null  float64
 17  Q18           15000 non-null  float64
 18  Q19           15000 non-nu

# nerdiness랑 시각화

# 나라별로 조금씩 편차가 있는 것을 확인

# country

In [21]:
# Number of training rows per country, most frequent first.
train.country.value_counts()

USA    7419
GBR    1109
CAN     915
AUS     525
DEU     473
PHL     261
BRA     240
IND     233
POL     210
FRA     208
ITA     186
NLD     175
SWE     152
MEX     136
FIN     133
NZL     124
MYS     121
IDN     110
NOR     106
SGP     102
ESP      94
DNK      91
ROU      87
IRL      79
ARG      74
PRT      73
TUR      70
RUS      63
CZE      62
HUN      56
BEL      55
GRC      54
ZAF      52
CHL      49
AUT      48
CHE      47
ISR      44
SRB      38
HRV      35
JPN      32
HKG      31
THA      30
BGR      30
ARE      28
PAK      27
UKR      22
SVN      21
KOR      21
LTU      20
LVA      20
SVK      19
VEN      17
COL      16
PRI      15
KEN      15
PER      15
URY      14
CRI      14
SAU      12
EST      12
VNM      12
IRN      11
BIH      11
TWN      11
EGY      10
JOR      10
HND       9
KWT       9
ALB       8
NGA       8
CHN       8
ISL       8
QAT       8
JAM       7
TTO       7
DOM       6
MKD       5
BGD       5
BHS       4
BHR       4
GEO       4
GHA       4
GUM       4
ECU 

In [22]:
# Keep the 40 most frequent train countries and relabel everything else as '기타'.
# NOTE(review): the original comment said "100 or fewer -> other", but the
# code selects the top 40 by count regardless of how many rows each has —
# confirm which rule is intended.
country_list = train['country'].value_counts().iloc[:40]


def func(country):
    """Return ``country`` when it is one of the kept labels, else '기타'.

    Membership is checked against the index of ``country_list`` (the country
    codes), not against its count values.
    """
    return country if country in country_list.index else '기타'


train['country'] = train['country'].apply(func)
test['country'] = test['country'].apply(func)

In [23]:
# Non-null counts and dtypes of train after the country regrouping.
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 15000 entries, 0 to 14999
Data columns (total 69 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   Q1            15000 non-null  float64
 1   Q2            15000 non-null  float64
 2   Q3            15000 non-null  float64
 3   Q4            15000 non-null  float64
 4   Q5            15000 non-null  float64
 5   Q6            15000 non-null  float64
 6   Q7            15000 non-null  float64
 7   Q8            15000 non-null  float64
 8   Q9            15000 non-null  float64
 9   Q10           15000 non-null  float64
 10  Q11           15000 non-null  float64
 11  Q12           15000 non-null  float64
 12  Q13           15000 non-null  float64
 13  Q14           15000 non-null  float64
 14  Q15           15000 non-null  float64
 15  Q16           15000 non-null  float64
 16  Q17           15000 non-null  float64
 17  Q18           15000 non-null  float64
 18  Q19           15000 non-nu

In [24]:
# Non-null counts and dtypes of test after imputation and country regrouping.
test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 35452 entries, 0 to 35451
Data columns (total 68 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   Q1            35452 non-null  float64
 1   Q2            35452 non-null  float64
 2   Q3            35452 non-null  float64
 3   Q4            35452 non-null  float64
 4   Q5            35452 non-null  float64
 5   Q6            35452 non-null  float64
 6   Q7            35452 non-null  float64
 7   Q8            35452 non-null  float64
 8   Q9            35452 non-null  float64
 9   Q10           35452 non-null  float64
 10  Q11           35452 non-null  float64
 11  Q12           35452 non-null  float64
 12  Q13           35452 non-null  float64
 13  Q14           35452 non-null  float64
 14  Q15           35452 non-null  float64
 15  Q16           35452 non-null  float64
 16  Q17           35452 non-null  float64
 17  Q18           35452 non-null  float64
 18  Q19           35452 non-nu

In [25]:
# Column labels of train now that 'index' has been dropped.
train.columns

Index(['Q1', 'Q2', 'Q3', 'Q4', 'Q5', 'Q6', 'Q7', 'Q8', 'Q9', 'Q10', 'Q11',
       'Q12', 'Q13', 'Q14', 'Q15', 'Q16', 'Q17', 'Q18', 'Q19', 'Q20', 'Q21',
       'Q22', 'Q23', 'Q24', 'Q25', 'Q26', 'country', 'introelapse',
       'testelapse', 'surveyelapse', 'TIPI1', 'TIPI2', 'TIPI3', 'TIPI4',
       'TIPI5', 'TIPI6', 'TIPI7', 'TIPI8', 'TIPI9', 'TIPI10', 'VCL1', 'VCL2',
       'VCL3', 'VCL4', 'VCL5', 'VCL6', 'VCL7', 'VCL8', 'VCL9', 'VCL10',
       'VCL11', 'VCL12', 'VCL13', 'VCL14', 'VCL15', 'VCL16', 'education',
       'urban', 'gender', 'engnat', 'age', 'hand', 'religion', 'orientation',
       'voted', 'married', 'familysize', 'ASD', 'nerdiness'],
      dtype='object')

In [26]:
# Display the preprocessed training frame (pandas truncates the rendering).
train

Unnamed: 0,Q1,Q2,Q3,Q4,Q5,Q6,Q7,Q8,Q9,Q10,Q11,Q12,Q13,Q14,Q15,Q16,Q17,Q18,Q19,Q20,Q21,Q22,Q23,Q24,Q25,Q26,country,introelapse,testelapse,surveyelapse,TIPI1,TIPI2,TIPI3,TIPI4,TIPI5,TIPI6,TIPI7,TIPI8,TIPI9,TIPI10,VCL1,VCL2,VCL3,VCL4,VCL5,VCL6,VCL7,VCL8,VCL9,VCL10,VCL11,VCL12,VCL13,VCL14,VCL15,VCL16,education,urban,gender,engnat,age,hand,religion,orientation,voted,married,familysize,ASD,nerdiness
0,1.0,5.0,5.0,5.0,1.0,4.0,5.0,5.0,1.0,3.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,1.0,5.0,5.0,1.0,5.0,1.0,5.0,1.0,1.0,USA,1.386294,6.317165,1.945910,4.0,3.0,5.0,1.0,3.0,5.0,5.0,3.0,5.0,3.0,1,1,0,1,1,0,0,0,0,1,0,0,0,1,1,1,2.0,1,3.0,1.0,20,2.0,12.0,4.0,2.0,1.0,4.0,2.0,1
1,4.0,4.0,4.0,4.0,4.0,5.0,4.0,4.0,3.0,3.0,1.0,4.0,5.0,3.0,1.0,2.0,4.0,5.0,1.0,3.0,1.0,1.0,5.0,3.0,2.0,5.0,USA,1.791759,4.454347,4.795791,4.0,2.0,3.0,5.0,3.0,2.0,5.0,1.0,2.0,2.0,1,1,1,1,1,0,1,0,0,1,0,0,1,1,1,1,4.0,2,2.0,1.0,50,1.0,2.0,1.0,1.0,2.0,4.0,2.0,1
2,4.0,5.0,5.0,4.0,3.0,5.0,5.0,5.0,4.0,4.0,2.0,5.0,5.0,5.0,1.0,3.0,5.0,3.0,5.0,2.0,2.0,1.0,2.0,4.0,2.0,5.0,NLD,2.302585,4.691348,4.615121,1.0,2.0,3.0,1.0,5.0,5.0,3.0,4.0,5.0,2.0,1,1,0,1,1,0,1,1,0,1,0,0,1,1,1,1,2.0,1,1.0,2.0,50,1.0,2.0,2.0,2.0,3.0,4.0,2.0,1
3,4.0,4.0,4.0,2.0,4.0,3.0,3.0,5.0,3.0,4.0,5.0,2.0,2.0,4.0,4.0,2.0,4.0,5.0,4.0,3.0,3.0,4.0,3.0,4.0,4.0,2.0,USA,1.098612,4.804021,4.941642,3.0,3.0,3.0,4.0,5.0,3.0,4.0,4.0,3.0,3.0,1,1,0,1,1,0,0,0,0,1,0,0,1,1,1,1,1.0,3,1.0,1.0,20,2.0,1.0,1.0,2.0,1.0,2.0,2.0,1
4,4.0,4.0,4.0,4.0,3.0,3.0,4.0,2.0,3.0,4.0,4.0,4.0,3.0,5.0,5.0,2.0,4.0,1.0,4.0,2.0,4.0,2.0,3.0,4.0,4.0,4.0,ITA,1.386294,6.463029,5.379897,3.0,3.0,4.0,4.0,4.0,4.0,3.0,4.0,3.0,2.0,1,1,0,1,1,0,0,1,0,1,0,0,0,1,0,1,1.0,2,2.0,2.0,20,2.0,12.0,1.0,2.0,1.0,1.0,2.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14995,2.0,5.0,4.0,3.0,3.0,4.0,4.0,4.0,3.0,4.0,1.0,4.0,4.0,3.0,4.0,4.0,2.0,5.0,2.0,4.0,1.0,2.0,5.0,4.0,2.0,4.0,USA,2.564949,4.644391,5.087596,2.0,2.0,4.0,3.0,3.0,5.0,3.0,3.0,3.0,3.0,1,1,1,1,1,0,0,0,0,1,0,1,1,1,1,1,2.0,2,2.0,1.0,20,1.0,1.0,3.0,2.0,1.0,3.0,2.0,0
14996,5.0,4.0,5.0,4.0,4.0,5.0,5.0,4.0,4.0,5.0,1.0,4.0,4.0,4.0,2.0,5.0,5.0,4.0,1.0,5.0,3.0,4.0,4.0,5.0,4.0,5.0,USA,3.465736,4.672829,5.192957,3.0,2.0,4.0,5.0,4.0,3.0,4.0,1.0,2.0,2.0,1,1,0,1,1,0,0,0,0,1,0,0,0,1,1,1,4.0,1,2.0,2.0,50,1.0,3.0,1.0,1.0,2.0,3.0,2.0,1
14997,4.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,4.0,5.0,4.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,4.0,5.0,5.0,2.0,5.0,5.0,3.0,4.0,USA,2.890372,4.644391,5.129899,1.0,3.0,2.0,5.0,1.0,5.0,3.0,3.0,1.0,1.0,1,1,0,1,1,0,0,0,0,1,0,0,1,1,1,1,2.0,2,2.0,1.0,20,1.0,1.0,2.0,1.0,1.0,3.0,1.0,1
14998,5.0,5.0,4.0,5.0,5.0,5.0,5.0,1.0,5.0,5.0,3.0,5.0,4.0,4.0,1.0,5.0,4.0,5.0,5.0,2.0,5.0,3.0,5.0,3.0,3.0,5.0,USA,2.708050,4.234107,4.700480,1.0,1.0,3.0,5.0,4.0,5.0,5.0,4.0,2.0,1.0,1,1,1,1,1,0,0,1,0,1,0,1,1,1,1,1,3.0,2,2.0,1.0,30,1.0,12.0,4.0,2.0,2.0,2.0,1.0,0


In [27]:
# Distribution of engnat before recoding (the recorded output shows codes 1.0 and 2.0).
train.engnat.value_counts()

1.0    9826
2.0    5174
Name: engnat, dtype: int64

In [28]:
# engnat / voted / ASD: shift the codes down by one in both frames
# (engnat goes from 1/2 to 0/1 according to the recorded outputs).
# Re-running this cell subtracts one again.
for _frame in (train, test):
    for _flag in ('engnat', 'voted', 'ASD'):
        _frame[_flag] = _frame[_flag] - 1

In [29]:
# Distribution of engnat after subtracting one from its codes.
train.engnat.value_counts()

0.0    9826
1.0    5174
Name: engnat, dtype: int64

In [30]:
# Column labels of train just before one-hot encoding.
train.columns

Index(['Q1', 'Q2', 'Q3', 'Q4', 'Q5', 'Q6', 'Q7', 'Q8', 'Q9', 'Q10', 'Q11',
       'Q12', 'Q13', 'Q14', 'Q15', 'Q16', 'Q17', 'Q18', 'Q19', 'Q20', 'Q21',
       'Q22', 'Q23', 'Q24', 'Q25', 'Q26', 'country', 'introelapse',
       'testelapse', 'surveyelapse', 'TIPI1', 'TIPI2', 'TIPI3', 'TIPI4',
       'TIPI5', 'TIPI6', 'TIPI7', 'TIPI8', 'TIPI9', 'TIPI10', 'VCL1', 'VCL2',
       'VCL3', 'VCL4', 'VCL5', 'VCL6', 'VCL7', 'VCL8', 'VCL9', 'VCL10',
       'VCL11', 'VCL12', 'VCL13', 'VCL14', 'VCL15', 'VCL16', 'education',
       'urban', 'gender', 'engnat', 'age', 'hand', 'religion', 'orientation',
       'voted', 'married', 'familysize', 'ASD', 'nerdiness'],
      dtype='object')

In [31]:
# Columns to one-hot encode, in the order their dummy columns should appear:
# the 26 questionnaire items, country, the 10 TIPI items, then the
# categorical demographics.
_question_cols = [f'Q{_n}' for _n in range(1, 27)]
_tipi_cols = [f'TIPI{_n}' for _n in range(1, 11)]
_demographic_cols = ['education', 'urban', 'gender', 'age', 'hand',
                     'religion', 'orientation', 'married']
col = _question_cols + ['country'] + _tipi_cols + _demographic_cols

In [32]:
# One-hot encode the categorical columns listed in `col`.
train = pd.get_dummies(columns = col , data = train)
test = pd.get_dummies(columns = col, data = test)

# The two frames are encoded independently, so a level that occurs in only one
# of them produces a dummy column the other lacks. Align test to train's
# feature columns (everything except the 'nerdiness' target): dummies missing
# from test are added as all-zero columns, dummies for levels never seen in
# train are dropped, and the column order matches train.
_feature_columns = [_name for _name in train.columns if _name != 'nerdiness']
test = test.reindex(columns=_feature_columns, fill_value=0)

In [33]:
# Column labels of train after one-hot encoding.
train.columns

Index(['introelapse', 'testelapse', 'surveyelapse', 'VCL1', 'VCL2', 'VCL3',
       'VCL4', 'VCL5', 'VCL6', 'VCL7',
       ...
       'religion_11.0', 'religion_12.0', 'orientation_1.0', 'orientation_2.0',
       'orientation_3.0', 'orientation_4.0', 'orientation_5.0', 'married_1.0',
       'married_2.0', 'married_3.0'],
      dtype='object', length=298)

In [34]:
# 분류 작업에 필요한 함수 불러오기 
import jinja2
from pycaret.classification import*

# 실험환경 구축 (setup the environment)
- pycaret에서는 모델 학습 전 실험환경을 구축해야한다. Setup 함수를 통해 환경을 구축할 수 있다. 
setup 단계에서는 Pycaret이 자동으로 컬럼 형태를 인식한다. 그후 사용자에게 제대로 인식되었는지 
확인을 받게된다. 그때 enter를 눌러주면 된다. 
또한 주어진 데이터의 얼마를 사용하여 train/validation을 구축할지 묻게되는데, 
전체데이터를 사용하고 싶다면 enter를 눌러주면된다. 

In [121]:
# Initialise the PyCaret experiment on `train` with 'nerdiness' as the target,
# a fixed session_id, and these preprocessing options switched on: normalize,
# transformation ('yeo-johnson'), ignore_low_variance, combine_rare_levels
# (threshold 0.1) and remove_multicollinearity (threshold 0.80).
# NOTE(review): the recorded run of this cell ended with "AttributeError:
# 'Simple_Imputer' object has no attribute 'fill_value_categorical'" —
# possibly a PyCaret / scikit-learn version mismatch; confirm the environment.
# NOTE(review): the execution count (121) is higher than that of the modelling
# cells below, so it is unclear which setup() those results were produced with.
clf = setup(data = train, target='nerdiness',fold_shuffle=True,
            session_id = 20210302,
            normalize = True,
             transformation = True, transformation_method='yeo-johnson',
            ignore_low_variance = True,
            combine_rare_levels = True, rare_level_threshold = 0.1,
            remove_multicollinearity = True, multicollinearity_threshold = 0.80,
           )

Unnamed: 0,Description,Value
0,session_id,20210302
1,Target,nerdiness
2,Target Type,Binary
3,Label Encoded,"0: 0, 1: 1"
4,Original Data,"(15000, 298)"
5,Missing Values,False
6,Numeric Features,278
7,Categorical Features,19
8,Ordinal Features,False
9,High Cardinality Features,False


AttributeError: 'Simple_Imputer' object has no attribute 'fill_value_categorical'

In [35]:
# 'nerdiness' is the prediction target, so it is passed as `target`
# (the earlier comment named 'voted', which is only a feature here).
# session_id is pinned to the value PyCaret reported for the recorded run
# (7693) so that a Restart & Run All repeats the same seeded experiment
# instead of drawing a new random seed each time.
# NOTE(review): the recorded run of this cell also ended with an
# AttributeError from Simple_Imputer — confirm the PyCaret environment.
clf = setup(data = train, target='nerdiness', fold_shuffle=True,
            session_id = 7693)

Unnamed: 0,Description,Value
0,session_id,7693
1,Target,nerdiness
2,Target Type,Binary
3,Label Encoded,"0: 0, 1: 1"
4,Original Data,"(15000, 298)"
5,Missing Values,False
6,Numeric Features,278
7,Categorical Features,19
8,Ordinal Features,False
9,High Cardinality Features,False


AttributeError: 'Simple_Imputer' object has no attribute 'fill_value_categorical'

# 모델 학습 및 비교 
- 환경을 구축했으니 pycaret에서 제공하는 기본 모델에 대해 학습하고 비교해보겠다 
compare_models 함수를 통해 15개의 기본 모델을 학습하고 성능을 비교할 수 있다. 
auc가 기준으로 성능이 가장 좋은 3개의 모델을 추려내어 저장해보겠다. 
본 대회 평가 지표가 auc이기 때문에 auc 기준으로 모델을 선정합니다. 

In [112]:
# Compare the candidate models ranked by AUC and keep the top 3 in best_3.
# NOTE(review): the same comparison is re-run in several later cells with
# only n_select changed; each call repeats the whole comparison.
best_3 = compare_models(sort='AUC',n_select=3)

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
et,Extra Trees Classifier,0.7627,0.8569,0.8193,0.7658,0.7915,0.517,0.5189,0.434
rf,Random Forest Classifier,0.767,0.8541,0.8281,0.7669,0.7962,0.5252,0.5276,0.233
catboost,CatBoost Classifier,0.751,0.8269,0.806,0.7571,0.7806,0.4934,0.495,0.93
xgboost,Extreme Gradient Boosting,0.7501,0.8221,0.7994,0.759,0.7786,0.4921,0.4932,4.889
lightgbm,Light Gradient Boosting Machine,0.7431,0.8193,0.8032,0.7482,0.7746,0.4768,0.4787,3.638
gbc,Gradient Boosting Classifier,0.7281,0.7998,0.7914,0.7346,0.7618,0.446,0.4479,0.535
lr,Logistic Regression,0.725,0.7965,0.7815,0.7351,0.7575,0.4406,0.4418,0.226
lda,Linear Discriminant Analysis,0.7238,0.7956,0.7833,0.7328,0.7571,0.4377,0.4392,0.143
ada,Ada Boost Classifier,0.7179,0.7892,0.7742,0.7294,0.751,0.4262,0.4274,0.13
nb,Naive Bayes,0.6799,0.7383,0.7749,0.6852,0.7269,0.3436,0.3483,0.043


In [113]:
# Same AUC-ranked comparison, keeping only the single best model.
# NOTE(review): with n_select=1 the result may be one estimator rather than a
# list — verify before passing best_1 where a list of models is expected.
best_1 = compare_models(sort='AUC',n_select=1)

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
et,Extra Trees Classifier,0.7627,0.8569,0.8193,0.7658,0.7915,0.517,0.5189,0.31
rf,Random Forest Classifier,0.767,0.8541,0.8281,0.7669,0.7962,0.5252,0.5276,0.242
catboost,CatBoost Classifier,0.751,0.8269,0.806,0.7571,0.7806,0.4934,0.495,0.907
xgboost,Extreme Gradient Boosting,0.7501,0.8221,0.7994,0.759,0.7786,0.4921,0.4932,3.836
lightgbm,Light Gradient Boosting Machine,0.7431,0.8193,0.8032,0.7482,0.7746,0.4768,0.4787,2.767
gbc,Gradient Boosting Classifier,0.7281,0.7998,0.7914,0.7346,0.7618,0.446,0.4479,0.623
lr,Logistic Regression,0.725,0.7965,0.7815,0.7351,0.7575,0.4406,0.4418,0.231
lda,Linear Discriminant Analysis,0.7238,0.7956,0.7833,0.7328,0.7571,0.4377,0.4392,0.105
ada,Ada Boost Classifier,0.7179,0.7892,0.7742,0.7294,0.751,0.4262,0.4274,0.157
nb,Naive Bayes,0.6799,0.7383,0.7749,0.6852,0.7269,0.3436,0.3483,0.054


In [114]:
# Same AUC-ranked comparison, keeping the top 2 models.
best_2 = compare_models(sort='AUC',n_select=2)

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
et,Extra Trees Classifier,0.7627,0.8569,0.8193,0.7658,0.7915,0.517,0.5189,0.289
rf,Random Forest Classifier,0.767,0.8541,0.8281,0.7669,0.7962,0.5252,0.5276,0.22
catboost,CatBoost Classifier,0.751,0.8269,0.806,0.7571,0.7806,0.4934,0.495,0.93
xgboost,Extreme Gradient Boosting,0.7501,0.8221,0.7994,0.759,0.7786,0.4921,0.4932,3.894
lightgbm,Light Gradient Boosting Machine,0.7431,0.8193,0.8032,0.7482,0.7746,0.4768,0.4787,22.964
gbc,Gradient Boosting Classifier,0.7281,0.7998,0.7914,0.7346,0.7618,0.446,0.4479,0.475
lr,Logistic Regression,0.725,0.7965,0.7815,0.7351,0.7575,0.4406,0.4418,0.213
lda,Linear Discriminant Analysis,0.7238,0.7956,0.7833,0.7328,0.7571,0.4377,0.4392,0.098
ada,Ada Boost Classifier,0.7179,0.7892,0.7742,0.7294,0.751,0.4262,0.4274,0.12
nb,Naive Bayes,0.6799,0.7383,0.7749,0.6852,0.7269,0.3436,0.3483,0.039


In [115]:
# Re-runs the AUC-ranked comparison and rebinds best_3 to the top 3 models.
best_3 = compare_models(sort='AUC',n_select=3)

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
et,Extra Trees Classifier,0.7627,0.8569,0.8193,0.7658,0.7915,0.517,0.5189,0.305
rf,Random Forest Classifier,0.767,0.8541,0.8281,0.7669,0.7962,0.5252,0.5276,0.254
catboost,CatBoost Classifier,0.751,0.8269,0.806,0.7571,0.7806,0.4934,0.495,1.011
xgboost,Extreme Gradient Boosting,0.7501,0.8221,0.7994,0.759,0.7786,0.4921,0.4932,3.68
lightgbm,Light Gradient Boosting Machine,0.7431,0.8193,0.8032,0.7482,0.7746,0.4768,0.4787,3.744
gbc,Gradient Boosting Classifier,0.7281,0.7998,0.7914,0.7346,0.7618,0.446,0.4479,0.542
lr,Logistic Regression,0.725,0.7965,0.7815,0.7351,0.7575,0.4406,0.4418,0.252
lda,Linear Discriminant Analysis,0.7238,0.7956,0.7833,0.7328,0.7571,0.4377,0.4392,0.111
ada,Ada Boost Classifier,0.7179,0.7892,0.7742,0.7294,0.751,0.4262,0.4274,0.128
nb,Naive Bayes,0.6799,0.7383,0.7749,0.6852,0.7269,0.3436,0.3483,0.047


In [116]:
# Same AUC-ranked comparison, keeping the top 4 models.
best_4 = compare_models(sort='AUC',n_select=4)

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
et,Extra Trees Classifier,0.7627,0.8569,0.8193,0.7658,0.7915,0.517,0.5189,0.318
rf,Random Forest Classifier,0.767,0.8541,0.8281,0.7669,0.7962,0.5252,0.5276,0.281
catboost,CatBoost Classifier,0.751,0.8269,0.806,0.7571,0.7806,0.4934,0.495,1.0
xgboost,Extreme Gradient Boosting,0.7501,0.8221,0.7994,0.759,0.7786,0.4921,0.4932,5.031
lightgbm,Light Gradient Boosting Machine,0.7431,0.8193,0.8032,0.7482,0.7746,0.4768,0.4787,3.244
gbc,Gradient Boosting Classifier,0.7281,0.7998,0.7914,0.7346,0.7618,0.446,0.4479,0.501
lr,Logistic Regression,0.725,0.7965,0.7815,0.7351,0.7575,0.4406,0.4418,0.228
lda,Linear Discriminant Analysis,0.7238,0.7956,0.7833,0.7328,0.7571,0.4377,0.4392,0.1
ada,Ada Boost Classifier,0.7179,0.7892,0.7742,0.7294,0.751,0.4262,0.4274,0.147
nb,Naive Bayes,0.6799,0.7383,0.7749,0.6852,0.7269,0.3436,0.3483,0.045


In [41]:
# Train an Extra Trees model ('et') with 5-fold cross-validation, then tune it
# with AUC as the optimisation metric and rebind model_et to the result.
# NOTE(review): choose_better=True presumably keeps the untuned model when
# tuning does not improve the metric — confirm against the PyCaret docs.
model_et = create_model('et', fold = 5)
model_et = tune_model(model_et, fold=5, optimize = 'AUC', choose_better = True)

Unnamed: 0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,0.7305,0.8018,0.8478,0.7153,0.7759,0.4438,0.454
1,0.7405,0.8171,0.8408,0.7292,0.781,0.4664,0.4736
2,0.7257,0.7869,0.8382,0.7135,0.7709,0.4347,0.4435
3,0.7314,0.804,0.8365,0.7206,0.7742,0.4473,0.455
4,0.7213,0.779,0.8225,0.7143,0.7646,0.4272,0.4336
Mean,0.7299,0.7978,0.8372,0.7186,0.7733,0.4439,0.4519
SD,0.0064,0.0134,0.0083,0.0058,0.0055,0.0133,0.0133


In [43]:
# Train a Random Forest model ('rf') with 5-fold cross-validation, then tune it
# with AUC as the optimisation metric and rebind model_rf to the result.
# NOTE(review): choose_better=True presumably keeps the untuned model when
# tuning does not improve the metric — confirm against the PyCaret docs.
model_rf = create_model('rf', fold = 5)
model_rf = tune_model(model_rf, fold=5, optimize = 'AUC', choose_better = True)

Unnamed: 0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,0.7338,0.8054,0.8555,0.7161,0.7797,0.4501,0.4616
1,0.74,0.8186,0.8417,0.7283,0.7809,0.4653,0.4727
2,0.7276,0.7878,0.84,0.715,0.7725,0.4386,0.4475
3,0.7329,0.8042,0.8356,0.7225,0.775,0.4506,0.4578
4,0.7189,0.781,0.8208,0.7122,0.7627,0.4222,0.4287
Mean,0.7306,0.7994,0.8387,0.7188,0.7741,0.4454,0.4536
SD,0.0071,0.0134,0.0112,0.0058,0.0065,0.0143,0.0149


In [46]:
# Ensemble model_et with 10 estimators, evaluated over 5 folds.
# NOTE(review): no `method` is given; the bag_ prefix suggests the default is
# bagging — confirm.
bag_et_10 = ensemble_model(model_et, n_estimators = 10, fold=5, optimize = 'AUC')

Unnamed: 0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,0.7757,0.8632,0.8547,0.7653,0.8075,0.5408,0.5455
1,0.7771,0.8647,0.8426,0.773,0.8063,0.5451,0.5479
2,0.7695,0.8473,0.8443,0.7625,0.8013,0.5286,0.5325
3,0.7686,0.8552,0.833,0.7667,0.7985,0.5278,0.5303
4,0.7661,0.8399,0.8433,0.7586,0.7987,0.5214,0.5256
Mean,0.7714,0.8541,0.8436,0.7652,0.8025,0.5327,0.5364
SD,0.0043,0.0094,0.0069,0.0048,0.0038,0.0088,0.0088


In [45]:
# Same ensemble of model_et as bag_et_10, but with 50 estimators.
bag_et_50 = ensemble_model(model_et, n_estimators = 50, fold=5, optimize = 'AUC')

Unnamed: 0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,0.7786,0.8649,0.8538,0.7693,0.8093,0.547,0.5512
1,0.7776,0.8665,0.8469,0.7715,0.8074,0.5457,0.549
2,0.7743,0.8505,0.8503,0.7656,0.8057,0.5382,0.5424
3,0.7662,0.8592,0.8304,0.7649,0.7964,0.523,0.5254
4,0.7675,0.8419,0.8433,0.7603,0.7997,0.5245,0.5285
Mean,0.7728,0.8566,0.845,0.7663,0.8037,0.5357,0.5393
SD,0.0051,0.0092,0.0081,0.0038,0.0049,0.0102,0.0105


In [47]:
# Ensemble model_et with method='Boosting', evaluated over 5 folds.
boo_et = ensemble_model(model_et, method = 'Boosting', fold=5, optimize='AUC')

Unnamed: 0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,0.7624,0.8582,0.827,0.7618,0.793,0.5152,0.5176
1,0.7667,0.8592,0.8244,0.7685,0.7955,0.5246,0.5264
2,0.7586,0.8458,0.8261,0.7573,0.7902,0.5071,0.5098
3,0.77,0.8553,0.8149,0.7779,0.7959,0.5328,0.5335
4,0.7527,0.8385,0.813,0.756,0.7835,0.4962,0.498
Mean,0.7621,0.8514,0.8211,0.7643,0.7916,0.5152,0.5171
SD,0.0061,0.008,0.0059,0.0081,0.0046,0.0128,0.0125


In [50]:
# Ensemble model_rf with 10 estimators, evaluated over 5 folds.
# NOTE(review): no `method` is given; the bag_ prefix suggests the default is
# bagging — confirm.
bag_rf_10 = ensemble_model(model_rf, n_estimators = 10, fold=5, optimize = 'AUC')

Unnamed: 0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,0.769,0.8532,0.8495,0.7595,0.802,0.527,0.5318
1,0.78,0.8591,0.8469,0.7745,0.8091,0.5508,0.5538
2,0.7686,0.838,0.8452,0.7609,0.8008,0.5265,0.5306
3,0.769,0.8481,0.8408,0.7636,0.8003,0.528,0.5314
4,0.758,0.8282,0.8364,0.7518,0.7918,0.5048,0.5089
Mean,0.7689,0.8453,0.8437,0.762,0.8008,0.5274,0.5313
SD,0.007,0.011,0.0046,0.0074,0.0055,0.0145,0.0142


In [49]:
# Same ensemble of model_rf as bag_rf_10, but with 50 estimators.
bag_rf_50 = ensemble_model(model_rf, n_estimators = 50, fold=5, optimize = 'AUC')

Unnamed: 0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,0.7786,0.8559,0.8564,0.768,0.8098,0.5467,0.5514
1,0.7762,0.8607,0.8503,0.768,0.8071,0.5423,0.5463
2,0.7719,0.8394,0.8478,0.7638,0.8036,0.5334,0.5375
3,0.7705,0.8506,0.8391,0.7662,0.801,0.5312,0.5343
4,0.7642,0.8306,0.8416,0.757,0.797,0.5175,0.5216
Mean,0.7723,0.8474,0.847,0.7646,0.8037,0.5342,0.5382
SD,0.005,0.011,0.0062,0.0041,0.0045,0.0101,0.0103


In [48]:
# Ensemble model_rf with method='Boosting', evaluated over 5 folds.
boo_rf = ensemble_model(model_rf, method = 'Boosting', fold=5, optimize='AUC')

Unnamed: 0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,0.771,0.8581,0.8469,0.7631,0.8028,0.5314,0.5355
1,0.7762,0.8625,0.8322,0.7771,0.8037,0.5441,0.5459
2,0.7614,0.8477,0.8339,0.7573,0.7937,0.5124,0.5158
3,0.7643,0.8512,0.8166,0.7694,0.7923,0.5204,0.5216
4,0.7627,0.8333,0.8225,0.7643,0.7923,0.5165,0.5184
Mean,0.7671,0.8506,0.8304,0.7662,0.797,0.525,0.5274
SD,0.0056,0.0101,0.0104,0.0067,0.0052,0.0115,0.0115


In [53]:
# Blend the tuned Random Forest and Extra Trees models with method='soft',
# evaluated over 5 folds.
blend_2_soft = blend_models(estimator_list=[model_rf, model_et], method='soft', fold=5, optimize='AUC')

Unnamed: 0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,0.7762,0.8649,0.8495,0.7684,0.8069,0.5423,0.5462
1,0.7714,0.8631,0.8391,0.7674,0.8017,0.5333,0.5362
2,0.7762,0.8537,0.8434,0.7714,0.8058,0.543,0.546
3,0.77,0.8569,0.827,0.7716,0.7983,0.5315,0.5332
4,0.7685,0.8447,0.8381,0.764,0.7993,0.5271,0.5302
Mean,0.7725,0.8567,0.8394,0.7686,0.8024,0.5354,0.5384
SD,0.0032,0.0072,0.0074,0.0028,0.0034,0.0062,0.0066


In [57]:
# Stack the models in best_3 using model_et as the meta-model (5 folds, AUC).
# NOTE(review): the result is named stack_2_best although the input list is
# best_3 — confirm which list was intended.
# NOTE(review): the recorded run of this cell crashed with
# "OMP: Error #131" / TerminatedWorkerError, so no stacked model was produced.
stack_2_best = stack_models(estimator_list= best_3,
                         meta_model=model_et,
                           fold = 5,
                           optimize = 'AUC',
                           choose_better= True)

IntProgress(value=0, description='Processing: ', max=6)

Unnamed: 0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC


OMP: Error #131: Thread identifier invalid.


TerminatedWorkerError: A worker process managed by the executor was unexpectedly terminated. This could be caused by a segmentation fault while calling the function or by an excessive memory usage causing the Operating System to kill the worker.

The exit codes of the workers are {SIGABRT(-6)}

In [58]:
# Re-runs the AUC-ranked comparison and rebinds best_2 to the top 2 models.
# The recorded metrics differ from the earlier best_2 cell, so this run used a
# different experiment state.
best_2 = compare_models(sort='AUC',n_select=2)

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
et,Extra Trees Classifier,0.7675,0.8594,0.8271,0.7684,0.7966,0.5261,0.5283,0.331
rf,Random Forest Classifier,0.7704,0.8552,0.8334,0.769,0.7998,0.5316,0.5341,0.254
xgboost,Extreme Gradient Boosting,0.7622,0.8327,0.8093,0.7703,0.7893,0.5167,0.5177,3.897
catboost,CatBoost Classifier,0.7604,0.8317,0.8148,0.7652,0.7892,0.5122,0.5138,1.018
lightgbm,Light Gradient Boosting Machine,0.755,0.8259,0.8116,0.7599,0.7848,0.5012,0.5028,4.022
gbc,Gradient Boosting Classifier,0.738,0.8064,0.8024,0.7426,0.7712,0.4657,0.4678,0.559
lr,Logistic Regression,0.73,0.8015,0.782,0.7417,0.7612,0.451,0.452,0.675
lda,Linear Discriminant Analysis,0.7296,0.8005,0.7875,0.7387,0.7622,0.4495,0.4509,0.139
ada,Ada Boost Classifier,0.7273,0.7953,0.7782,0.74,0.7585,0.4458,0.4466,0.128
knn,K Neighbors Classifier,0.6828,0.7354,0.836,0.67,0.7437,0.3407,0.3554,0.248


In [73]:
# Rank the candidate models by AUC and keep the top three.
best_3 = compare_models(
    n_select=3,
    sort='AUC',
)

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
et,Extra Trees Classifier,0.7669,0.8565,0.8188,0.7714,0.7943,0.526,0.5274,0.355
rf,Random Forest Classifier,0.7656,0.8521,0.8245,0.7669,0.7946,0.5225,0.5245,0.313
catboost,CatBoost Classifier,0.7529,0.8266,0.8075,0.7588,0.7823,0.4973,0.4987,0.924
xgboost,Extreme Gradient Boosting,0.7556,0.8234,0.802,0.765,0.7829,0.5036,0.5046,3.981
lightgbm,Light Gradient Boosting Machine,0.7458,0.818,0.8023,0.752,0.7763,0.4826,0.4842,3.186
gbc,Gradient Boosting Classifier,0.7256,0.7988,0.785,0.7343,0.7587,0.4414,0.443,0.59
lr,Logistic Regression,0.7248,0.7964,0.7807,0.7352,0.7572,0.4403,0.4415,0.312
lda,Linear Discriminant Analysis,0.7232,0.796,0.7808,0.7332,0.7562,0.4368,0.4381,0.095
ada,Ada Boost Classifier,0.7151,0.7872,0.7701,0.7277,0.7482,0.4208,0.4218,0.118
nb,Naive Bayes,0.6943,0.7643,0.7235,0.7213,0.7223,0.3822,0.3823,0.034


In [107]:
# Soft-voting blend of the best_3 estimators, cross-validated on 5 folds.
blended = blend_models(
    estimator_list=best_3,
    method='soft',
    fold=5,
)

Unnamed: 0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,0.7605,0.8492,0.8042,0.7701,0.7868,0.5138,0.5145
1,0.761,0.8481,0.8362,0.7551,0.7936,0.5114,0.5152
2,0.7633,0.8549,0.8052,0.7737,0.7891,0.5197,0.5203
3,0.7829,0.8635,0.8459,0.7785,0.8108,0.5571,0.5597
4,0.7737,0.8529,0.8354,0.7718,0.8023,0.5387,0.541
Mean,0.7683,0.8537,0.8254,0.7698,0.7965,0.5281,0.5301
SD,0.0087,0.0055,0.0173,0.0079,0.0089,0.0173,0.0177


In [117]:
# Soft-voting blend built from best_1, cross-validated on 5 folds.
# NOTE(review): best_1 is not assigned in any nearby cell; confirm where it is defined.
blended_1 = blend_models(
    estimator_list=best_1,
    method='soft',
    fold=5,
)

# No data argument is passed, so this presumably scores the hold-out split; confirm.
pred_holdout = predict_model(estimator=blended_1)

Unnamed: 0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,0.761,0.8484,0.8016,0.7721,0.7866,0.5151,0.5156
1,0.7557,0.8444,0.825,0.7538,0.7878,0.5014,0.5042
2,0.7557,0.8468,0.7965,0.7679,0.782,0.5044,0.5049
3,0.7605,0.8555,0.826,0.7596,0.7914,0.5114,0.5138
4,0.7613,0.8497,0.825,0.761,0.7917,0.5133,0.5156
Mean,0.7588,0.849,0.8148,0.7629,0.7879,0.5091,0.5108
SD,0.0026,0.0037,0.013,0.0065,0.0036,0.0053,0.0052


Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,Voting Classifier,0.7852,0.8649,0.8424,0.7896,0.8151,0.5593,0.561


In [118]:
# Soft-voting blend of the best_2 estimators, cross-validated on 5 folds.
blended_2 = blend_models(
    estimator_list=best_2,
    method='soft',
    fold=5,
)

# No data argument is passed, so this presumably scores the hold-out split; confirm.
pred_holdout = predict_model(estimator=blended_2)

Unnamed: 0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,0.76,0.8511,0.8076,0.7677,0.7872,0.5125,0.5133
1,0.7633,0.8485,0.838,0.7572,0.7956,0.5163,0.52
2,0.7576,0.8498,0.8052,0.7661,0.7851,0.5075,0.5084
3,0.7648,0.8594,0.8355,0.7604,0.7962,0.5195,0.5227
4,0.7661,0.853,0.8362,0.7616,0.7972,0.5223,0.5255
Mean,0.7624,0.8524,0.8245,0.7626,0.7923,0.5156,0.518
SD,0.0031,0.0038,0.0148,0.0038,0.005,0.0052,0.0063


Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,Voting Classifier,0.7852,0.8662,0.8451,0.7881,0.8156,0.559,0.561


In [122]:
# Soft-voting blend of the best_3 estimators, cross-validated on 5 folds.
# This is the blend that is later passed to finalize_model().
blended_3 = blend_models(
    estimator_list=best_3,
    method='soft',
    fold=5,
)

# No data argument is passed, so this presumably scores the hold-out split; confirm.
pred_holdout = predict_model(estimator=blended_3)

Unnamed: 0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,0.7605,0.8492,0.8042,0.7701,0.7868,0.5138,0.5145
1,0.761,0.8481,0.8362,0.7551,0.7936,0.5114,0.5152
2,0.7633,0.8549,0.8052,0.7737,0.7891,0.5197,0.5203
3,0.7829,0.8635,0.8459,0.7785,0.8108,0.5571,0.5597
4,0.7737,0.8529,0.8354,0.7718,0.8023,0.5387,0.541
Mean,0.7683,0.8537,0.8254,0.7698,0.7965,0.5281,0.5301
SD,0.0087,0.0055,0.0173,0.0079,0.0089,0.0173,0.0177


Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,Voting Classifier,0.7907,0.8683,0.8451,0.7955,0.8195,0.571,0.5725


In [120]:
# Soft-voting blend built from best_4, cross-validated on 5 folds.
# NOTE(review): best_4 is not assigned in any nearby cell; confirm where it is defined.
blended_4 = blend_models(
    estimator_list=best_4,
    method='soft',
    fold=5,
)

# No data argument is passed, so this presumably scores the hold-out split; confirm.
pred_holdout = predict_model(estimator=blended_4)

Unnamed: 0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,0.7605,0.8481,0.799,0.7728,0.7857,0.5144,0.5148
1,0.7633,0.8472,0.8302,0.7609,0.794,0.5171,0.5199
2,0.7581,0.8505,0.8104,0.7641,0.7866,0.508,0.5092
3,0.7829,0.862,0.845,0.7789,0.8106,0.5572,0.5597
4,0.7804,0.8515,0.838,0.7792,0.8075,0.5526,0.5546
Mean,0.769,0.8519,0.8245,0.7712,0.7969,0.5299,0.5316
SD,0.0104,0.0053,0.0172,0.0075,0.0104,0.0207,0.0212


Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,Voting Classifier,0.786,0.8644,0.8345,0.7952,0.8143,0.5623,0.5632


In [108]:
# Score the `blended` ensemble; with no data argument this presumably uses the
# hold-out split. Overwrites pred_holdout from the earlier blend cells.
pred_holdout = predict_model(estimator=blended)

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,Voting Classifier,0.7907,0.8683,0.8451,0.7955,0.8195,0.571,0.5725


In [None]:
# Score note, kept verbatim from the original cell: 0.8683 = 0.80
# The bare text is not valid Python (it assigns to a literal), so it is stored as a
# comment to keep Restart & Run All from stopping here with a SyntaxError.
# 0.8683 equals the hold-out AUC reported above for the best_3 blend; what 0.80 stands
# for is not recorded in the notebook (presumably the submission score; TODO confirm).

In [None]:
# Score note, kept verbatim from the original cell: 0.8671 = 0.95
# The bare text is not valid Python (it assigns to a literal), so it is stored as a
# comment to keep Restart & Run All from stopping here with a SyntaxError.
# NOTE(review): neither number is explained in the notebook, and 0.8671 does not match
# any hold-out AUC shown in the nearby outputs; TODO confirm what this records.

# 전체 데이터에 대한 재학습
지금까지의 실험은 주어진 train 데이터를 다시 train/validation으로 나누어 진행한 것이므로, 모델이 전체 train 데이터에 대해 학습되어 있지 않습니다. 
최적의 성능을 위해 전체 train 데이터로 모델을 다시 학습시키겠습니다. 

In [123]:
# Finalize the best_3 soft-voting blend before predicting on the test set.
final_model = finalize_model(estimator=blended_3)

In [124]:
# Run the finalized model on the competition test frame.
predictions = predict_model(
    estimator=final_model,
    data=test,
)

In [125]:
# Display the prediction frame; per the saved output it holds the model features
# followed by the 'Label' and 'Score' columns.
predictions

Unnamed: 0,introelapse,testelapse,surveyelapse,VCL1,VCL2,VCL3,VCL4,VCL5,VCL6,VCL7,...,orientation_1.0,orientation_2.0,orientation_3.0,orientation_4.0,orientation_5.0,married_1.0,married_2.0,married_3.0,Label,Score
0,2.302585,4.744932,5.497168,1,1,1,1,1,1,1,...,0,0,0,1,0,1,0,0,0,0.7224
1,2.944439,4.682131,5.332719,1,1,1,1,1,0,0,...,0,0,0,0,1,1,0,0,1,0.7828
2,1.609438,4.477337,5.257495,1,1,0,1,1,0,0,...,0,0,0,0,1,1,0,0,1,0.8371
3,7.787382,5.049856,5.556828,1,1,1,1,1,1,1,...,0,1,0,0,0,1,0,0,1,0.6574
4,3.218876,5.099866,5.669881,1,0,0,0,0,0,0,...,0,0,0,0,1,1,0,0,1,0.8100
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
35447,2.397895,5.303305,5.541264,0,0,0,0,0,0,0,...,0,0,0,1,0,1,0,0,1,0.8596
35448,6.224558,4.919981,5.049856,1,0,0,1,1,0,0,...,0,0,0,0,1,1,0,0,1,0.8043
35449,3.332205,4.804021,5.257495,1,1,1,1,1,0,0,...,0,0,0,0,1,1,0,0,1,0.9145
35450,1.386294,4.077537,4.663439,1,1,0,1,1,1,0,...,1,0,0,0,0,1,0,0,0,0.8163


In [126]:
# Copy the predicted class labels into the submission frame. The assignment aligns on
# the row index, so an index mismatch between `predictions` and `submission` would
# silently leave NaN rows instead of raising.
submission['nerdiness'] = predictions['Label']

# Fail loudly before writing a file that contains unfilled rows.
if submission['nerdiness'].isna().any():
    raise ValueError(
        "Some submission rows received no prediction; check that `predictions` "
        "and `submission` share the same index."
    )

# NOTE(review): 'Label' is the hard 0/1 prediction. In the displayed predictions frame,
# 'Score' is above 0.5 even on rows labelled 0, so it is presumably the probability of
# the predicted class rather than of class 1. If the competition is scored on AUC, a
# class-1 probability column may be needed instead; TODO confirm.
submission.to_csv('submission_220820_3.csv', index=False)

In [127]:
# Display the submission frame (columns: index, nerdiness) as a visual sanity check.
submission

Unnamed: 0,index,nerdiness
0,0,0
1,1,1
2,2,1
3,3,1
4,4,1
...,...,...
35447,35447,1
35448,35448,1
35449,35449,1
35450,35450,0


In [None]:
# TODO: try PCA on the Q items alone -> re-run the models using only the reduced set of Q features
