In [75]:
import pandas as pd
from sklearn.model_selection import train_test_split
import lightgbm as lgbm
from sklearn.metrics import confusion_matrix, accuracy_score

In [22]:
# Load the raw survey export. Despite the .csv extension, fields are tab-separated.
data = pd.read_csv('data.csv', delimiter='\t')
# Print (n_rows, n_columns) to confirm the file was parsed as expected.
print(data.shape)

(73489, 105)


In [23]:
# Preview the first rows to see the column layout.
data.head()

Unnamed: 0,Q1A,Q1I,Q1E,Q2A,Q2I,Q2E,Q3A,Q3I,Q3E,Q4A,...,screenw,screenh,hand,religion,orientation,race,voted,married,familysize,major
0,3.0,6.0,21017.0,3.0,7.0,18600.0,5.0,20.0,14957.0,2.0,...,1440.0,900.0,1,7,1,30,1,2,5,Marketing
1,5.0,17.0,3818.0,5.0,9.0,7850.0,1.0,16.0,5902.0,3.0,...,1536.0,864.0,1,1,1,60,2,1,2,mathematics
2,5.0,16.0,4186.0,5.0,12.0,2900.0,1.0,2.0,7160.0,1.0,...,375.0,667.0,1,2,2,10,2,1,2,Chemistry
3,2.0,12.0,9373.0,4.0,1.0,10171.0,2.0,7.0,10117.0,1.0,...,1280.0,720.0,1,6,1,60,1,3,2,international relations
4,5.0,13.0,9465.0,5.0,7.0,5284.0,2.0,19.0,8872.0,1.0,...,360.0,640.0,1,4,3,60,1,1,2,Management


In [7]:
# Q_A : the respondent's answer to the question
# Q_I : the position of the item
# Q_E : the response time for the question


In [65]:
# Build the modelling frame in one pass:
#   * keep only respondents whose country is 'US'
#   * drop rows where voted == 0, which is treated as an outlier code
#     (1 = yes, 2 = no)
#   * remove the 'country' and 'major' columns
data_pre = (
    data
    .loc[lambda frame: frame['country'] == 'US']
    .loc[lambda frame: frame['voted'] != 0]
    .drop(columns=['country', 'major'])
)

In [66]:
# Check how many rows and columns remain after the filtering above.
data_pre.shape

(33705, 103)

In [14]:
# Names of every column of `data` that does not end in "I"
# (this leaves out the Q<n>I item-position columns).
models_list = list(filter(lambda name: not name.endswith("I"), data.columns))
models_list

['Q1A',
 'Q1E',
 'Q2A',
 'Q2E',
 'Q3A',
 'Q3E',
 'Q4A',
 'Q4E',
 'Q5A',
 'Q5E',
 'Q6A',
 'Q6E',
 'Q7A',
 'Q7E',
 'Q8A',
 'Q8E',
 'Q9A',
 'Q9E',
 'Q10A',
 'Q10E',
 'Q11A',
 'Q11E',
 'Q12A',
 'Q12E',
 'Q13A',
 'Q13E',
 'Q14A',
 'Q14E',
 'Q15A',
 'Q15E',
 'Q16A',
 'Q16E',
 'Q17A',
 'Q17E',
 'Q18A',
 'Q18E',
 'Q19A',
 'Q19E',
 'Q20A',
 'Q20E',
 'country',
 'introelapse',
 'testelapse',
 'surveyelapse',
 'TIPI1',
 'TIPI2',
 'TIPI3',
 'TIPI4',
 'TIPI5',
 'TIPI6',
 'TIPI7',
 'TIPI8',
 'TIPI9',
 'TIPI10',
 'VCL1',
 'VCL2',
 'VCL3',
 'VCL4',
 'VCL5',
 'VCL6',
 'VCL7',
 'VCL8',
 'VCL9',
 'VCL10',
 'VCL11',
 'VCL12',
 'VCL13',
 'VCL14',
 'VCL15',
 'VCL16',
 'education',
 'urban',
 'gender',
 'engnat',
 'age',
 'screenw',
 'screenh',
 'hand',
 'religion',
 'orientation',
 'race',
 'voted',
 'married',
 'familysize',
 'major']

In [18]:
# Count respondents per country in the unfiltered data.
data['country'].value_counts()

US    33959
GB     5487
CA     4761
AU     2734
IN     1621
      ...  
GP        1
ML        1
MQ        1
CI        1
TD        1
Name: country, Length: 187, dtype: int64

In [67]:
# Experiment: check whether 'voted' is usable as the response (target) variable.
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split repeatable.
train, test = train_test_split(
    data_pre,
    test_size=0.2,
    random_state=8,
)

In [68]:
# Separate the target column ('voted') from the features for each split.
train_x, train_y = train.drop(columns='voted'), train['voted']
test_x, test_y = test.drop(columns='voted'), test['voted']

In [69]:
# Display the training feature frame for a visual check.
train_x

Unnamed: 0,Q1A,Q1I,Q1E,Q2A,Q2I,Q2E,Q3A,Q3I,Q3E,Q4A,...,engnat,age,screenw,screenh,hand,religion,orientation,race,married,familysize
13689,1.0,12.0,10001.0,1.0,7.0,5853.0,4.0,6.0,16224.0,4.0,...,1,34,1920.0,1080.0,1,5,1,60,2,2
36109,1.0,9.0,7883.0,3.0,12.0,8421.0,4.0,5.0,9066.0,5.0,...,1,44,1024.0,768.0,1,6,1,60,2,2
20715,3.0,18.0,2829.0,1.0,13.0,4527.0,4.0,4.0,5696.0,2.0,...,1,24,1366.0,768.0,3,2,2,60,1,2
23154,2.0,5.0,5967.0,4.0,18.0,6928.0,5.0,6.0,4852.0,4.0,...,1,32,1280.0,800.0,1,1,1,70,2,1
25202,1.0,15.0,3576.0,2.0,12.0,11189.0,2.0,14.0,9826.0,1.0,...,1,15,1366.0,768.0,1,9,1,60,1,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11700,1.0,9.0,10217.0,2.0,13.0,5649.0,1.0,10.0,9587.0,4.0,...,1,57,1366.0,768.0,2,1,4,60,1,1
19778,3.0,16.0,8700.0,5.0,6.0,3902.0,3.0,8.0,46254.0,2.0,...,1,41,320.0,568.0,1,4,1,60,2,4
37103,5.0,14.0,12457.0,4.0,16.0,9237.0,5.0,2.0,15495.0,2.0,...,1,39,414.0,736.0,1,1,1,60,2,2
54398,4.0,15.0,5807.0,4.0,13.0,4010.0,2.0,4.0,5040.0,3.0,...,1,18,375.0,667.0,3,12,2,60,1,2


In [70]:
# Analyse with LightGBM: fit a classifier with n_estimators=500 on the training split.
model = lgbm.LGBMClassifier(n_estimators=500)
# Keep fit() as the last expression so the cell still displays the fitted estimator.
model.fit(X=train_x, y=train_y)

LGBMClassifier(n_estimators=500)

In [72]:
# Predict class labels for the held-out rows.
pred_y = model.predict(X=test_x)

In [74]:
# Confusion matrix on the held-out rows; true labels are passed first, predictions second.
confusion_matrix(y_true=test_y, y_pred=pred_y)

array([[2510,  620],
       [1226, 2385]], dtype=int64)

In [76]:
# Fraction of held-out rows whose predicted label matches the true label.
accuracy_score(y_true=test_y, y_pred=pred_y)

0.7261533897047916