In [1]:
import os
import random

import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
from sklearn.model_selection import train_test_split


In [2]:
# Load the raw data; the file is tab-separated despite its .csv extension.
data = pd.read_csv(
    'raw_data.csv',
    delimiter='\t',
)

In [3]:
# Keep only rows whose 'country' is one of the 18 codes below, then drop
# the 'country' column itself.
# NOTE(review): the previous comment on this step said "use only US", but
# the filter keeps every country in this list — confirm which is intended.
kept_countries = [
    'US', 'GB', 'CA', 'AU', 'IN', 'HU',
    'DE', 'PH', 'NL', 'CN', 'PL', 'FR',
    'ID', 'RO', 'ZA', 'NO', 'MY', 'SG',
]
in_kept_country = data['country'].isin(kept_countries)
data_pre = data.loc[in_kept_country].drop(columns='country')

# Remove rows where 'voted' is 0 (marked as an outlier in the original
# note, which also records the remaining codes as 1 = yes, 2 = no).
has_vote = data_pre['voted'].ne(0)
data_pre = data_pre.loc[has_vote]

# Drop the major, screenw, screenh, introelapse, testelapse, surveyelapse
# and orientation variables.
unused_columns = [
    'major', 'screenw', 'screenh',
    'introelapse', 'testelapse', 'surveyelapse',
    'orientation',
]
data_pre = data_pre.drop(columns=unused_columns)

# Drop gender code 3 ("other") and replace the remaining numeric codes with
# text labels. Any code that is not a key of gender_name maps to NaN.
gender_name = {1: 'Male', 2: 'Female'}
data_pre = (
    data_pre
    .loc[data_pre['gender'].ne(3)]
    .assign(gender=lambda frame: frame['gender'].map(gender_name))
)

# Drop the Q*I columns (per the original note, these record the position of
# each question).
# The match requires the "Q" prefix as well as the "I" suffix — the same
# pattern this notebook uses to pick out the Q*A / Q*E columns — so that an
# unrelated column which merely ends in "I" is never dropped by accident.
not_remove_list = [
    x for x in data_pre.columns
    if not (x.startswith("Q") and x.endswith("I"))
]
data_pre = data_pre[not_remove_list]

# Bucket 'age' into decade groups, then drop the raw 'age' column.
# Only ages strictly between 10 and 120 are kept.
data_pre = data_pre[(data_pre['age'] < 120) & (data_pre['age'] > 10)]

# right=False makes every bin closed on the left: [10, 20), [20, 30), ...,
# [70, 120). With pd.cut's default (right=True) the bins are (10, 20],
# (20, 30], ..., which labels a 20-year-old as '10s', a 30-year-old as
# '20s', and a 70-year-old as '60s'.
age_bins = [10, 20, 30, 40, 50, 60, 70, 120]
age_labels = ['10s', '20s', '30s', '40s', '50s', '60s', '+70s']
data_pre['age_group'] = pd.cut(
    data_pre['age'],
    bins=age_bins,
    labels=age_labels,
    right=False,
)
data_pre = data_pre.drop('age', axis=1)

# Replace the numeric religion and race codes with text labels.
# A code that is not a key of the corresponding mapping becomes NaN.
religion_name = {
    1: 'Agnostic',
    2: 'Atheist',
    3: 'Buddhist',
    4: 'Christian_Catholic',
    5: 'Christian_Mormon',
    6: 'Christian_Protestant',
    7: 'Christian_Other',
    8: 'Hindu',
    9: 'Jewish',
    10: 'Muslim',
    11: 'Sikh',
    12: 'Other',
}
race_name = {
    10: 'Asian',
    20: 'Arab',
    30: 'Black',
    40: 'Indigenous Australian',
    50: 'Native American',
    60: 'White',
    70: 'Other',
}
data_pre = data_pre.assign(
    religion=data_pre['religion'].map(religion_name),
    race=data_pre['race'].map(race_name),
)

# Reverse the TIPI scale (each value x becomes 7 - x, so 1..7 turns into
# 6..0) and rename TIPI1..TIPI10 to tp01..tp10.
tipi_list = ['TIPI{}'.format(n) for n in range(1, 11)]
tipi_new0 = ['tp{:02d}'.format(n) for n in range(1, 11)]

data_pre[tipi_list] = 7 - data_pre[tipi_list]
data_pre = data_pre.rename(columns=dict(zip(tipi_list, tipi_new0)))


# Rename the VCL columns (per the original note, the real variable names
# are not disclosed).
## words that exist : wr_01 .. wr_13
## fictitious words : wf_01 .. wf_03
# The k-th VCL column, in current column order, takes the k-th name below.
col_names = data_pre.columns.tolist()
VCL_list = [name for name in col_names if name.startswith("VCL")]
FK_list = [
    'wr_02', 'wr_05', 'wr_11', 'wr_10',
    'wr_13', 'wf_03', 'wr_03', 'wr_06',
    'wf_01', 'wr_12', 'wr_09', 'wf_02',
    'wr_01', 'wr_04', 'wr_07', 'wr_08',
]
vcl_renames = {old: new for old, new in zip(VCL_list, FK_list)}
data_pre = data_pre.rename(columns=vcl_renames)

# Rescale the Q*E columns: divide by 7, then truncate to an integer.
# Missing values become 0 and negative values are raised to 0 before the
# cast, so the integer truncation is a plain floor.
# DataFrame.applymap is deprecated in recent pandas; the vectorised
# .clip(lower=0) produces the same values as the per-cell max(x, 0).
E_names = [x for x in data.columns if x.startswith("Q") and x.endswith('E')]
data_pre[E_names] = (
    (data_pre[E_names] / 7)
    .fillna(0)
    .clip(lower=0)
    .astype(int)
)


# New names (and hence order) for the Q*A and Q*E columns:
## the numbers 1..20 are replaced by the letters a..t in a seeded shuffle.
random.seed(8)
alphabet_20 = list('abcdefghijklmnopqrst')
random.shuffle(alphabet_20)
print(alphabet_20)

# One 'Q<letter>A' and one 'Q<letter>E' name per shuffled letter.
A_list = ['Q{}A'.format(letter) for letter in alphabet_20]
E_list = ['Q{}E'.format(letter) for letter in alphabet_20]

# Apply the shuffled names: the k-th Q*A / Q*E column, in the raw file's
# column order, is renamed to the k-th entry of A_list / E_list.
A_names = [col for col in data.columns if col.startswith("Q") and col.endswith('A')]
E_names = [col for col in data.columns if col.startswith("Q") and col.endswith('E')]

for old_names, new_names in ((A_names, A_list), (E_names, E_list)):
    data_pre = data_pre.rename(columns=dict(zip(old_names, new_names)))

# Drop every row that still contains a missing value, then put the columns
# in sorted-name order.
ordered_columns = sorted(data_pre.columns)
data_pre = data_pre.dropna(axis='index').reindex(columns=ordered_columns)

# The shuffled letter list printed by this cell must equal:
#['t', 'o', 'r', 'q', 'j', 'n', 'f', 'p', 'k', 'i', 'd', 'c', 'b', 'a', 's', 'g', 'e', 'm', 'l', 'h']
print('preprocessing is done')

['t', 'o', 'r', 'q', 'j', 'n', 'f', 'p', 'k', 'i', 'd', 'c', 'b', 'a', 's', 'g', 'e', 'm', 'l', 'h']
preprocessing is done


In [4]:
# Build the final datasets: hold out 20% of the rows as the test set,
# with a fixed random_state so the split is repeatable.
split_options = {'test_size': 0.2, 'random_state': 8}
train, test = train_test_split(data_pre, **split_options)

In [5]:
# Renumber the rows of each split from 0, then turn that numbering into an
# explicit 'index' column.
train, test = (
    frame.reset_index(drop=True).reset_index()
    for frame in (train, test)
)

In [6]:
# Show the shape of each split as "<train shape>/<test shape>".
print('/'.join(str(frame.shape) for frame in (train, test)))

(45532, 78)/(11383, 78)


In [7]:
# Separate the test features from the labels, then cut the labels into a
# public part (first 5000 rows) and a private part (the remaining rows).
test_x = test.drop(columns='voted')
test_answer = test.loc[:, ['index', 'voted']]
public_y = test_answer.iloc[:5000]
private_y = test_answer.iloc[5000:]

In [8]:
# Write every output table as UTF-8 CSV, without the DataFrame index, under
# data2/.
# The directory is created first: DataFrame.to_csv does not create missing
# parent directories, so without this the notebook would fail on its very
# last step when data2/ is absent.
output_dir = 'data2'
os.makedirs(output_dir, exist_ok=True)

outputs = {
    'train.csv': train,
    'test_x.csv': test_x,
    'test_answer.csv': test_answer,
    'public_y.csv': public_y,
    'private_y.csv': private_y,
}
for file_name, frame in outputs.items():
    frame.to_csv('{}/{}'.format(output_dir, file_name), encoding='utf-8', index=False)