In [2]:
# Load the source file into a DataFrame.

import pandas as pd

d_df = pd.read_csv('data.csv', sep=',')

# Report the size (rows, columns) and the dtype of every column.
# A bare `d_df.head()` used to sit here; a notebook only renders the LAST
# expression of a cell, so its result was silently discarded. It was removed.
print(d_df.shape)
print(d_df.dtypes)

(3176, 14)
age                     int64
job                    object
contact                object
food                   object
beverage               object
number_yoy              int64
favorite_continent     object
prev_destination       object
season                 object
destination            object
member                float64
period                  int64
transport              object
budget                  int64
dtype: object


In [3]:
# Remove the rows of the sparsely-missing column: drop every row whose
# 'member' value is absent.

has_member = d_df['member'].notna()
d_df = d_df[has_member]

print(d_df.shape)

(3143, 14)


In [4]:
# Check how many missing values each column contains.

missing_per_column = d_df.isna().sum()
print(missing_per_column)

age                     0
job                     0
contact                 0
food                  141
beverage                0
number_yoy              0
favorite_continent      0
prev_destination        0
season                  0
destination           234
member                  0
period                  0
transport               0
budget                  0
dtype: int64


In [6]:
# Fill the missing values of 'food' and 'destination' with the placeholder
# string 'unknown' (both columns in a single fillna call).

d_df = d_df.fillna({'food': 'unknown', 'destination': 'unknown'})

# Re-check the per-column missing-value counts after filling.
print(d_df.isnull().sum(axis=0))

age                   0
job                   0
contact               0
food                  0
beverage              0
number_yoy            0
favorite_continent    0
prev_destination      0
season                0
destination           0
member                0
period                0
transport             0
budget                0
dtype: int64


In [7]:
# Keep only the rows with 17 <= age < 100; every other row is dropped.
age_in_range = (d_df['age'] >= 17) & (d_df['age'] < 100)
d_df = d_df[age_in_range]

# Check the number of rows and columns.
print(d_df.shape)

(2871, 14)


In [9]:
# Add the target variable y: 1 when the previous destination is
# 'Hawaii of USA' (i.e. the Hawaii product was purchased), 0 otherwise.
#
# The flag is built directly as an integer column. The earlier approach
# wrote "y"/"n" strings and then called d_df.replace('y', 1) /
# d_df.replace('n', 0) on the WHOLE frame, which would also rewrite any
# cell in any other column whose value happens to be exactly 'y' or 'n'.
# assign() additionally returns a new frame instead of writing a column
# into the row-filtered d_df.

is_hawaii = d_df['prev_destination'] == 'Hawaii of USA'
d_df = d_df.assign(y=is_hawaii.astype('int64'))

d_df.head()

Unnamed: 0,age,job,contact,food,beverage,number_yoy,favorite_continent,prev_destination,season,destination,member,period,transport,budget,y
90,20,student,phone,sushi,Apple,8,North America,Hawaii of USA,summer,LA,1.0,6,ship,350000,1
91,20,student,phone,sushi,Apple,10,North America,Hawaii of USA,summer,Sydney,2.0,5,walking,350000,1
92,20,student,phone,sushi,Apple,17,North America,Hawaii of USA,summer,Prague,1.0,4,plane,350000,1
93,20,student,phone,sushi,Apple,17,South America,Hawaii of USA,summer,Prague,2.0,1,bicycle,300000,1
94,20,student,email,sushi,Apple,19,South America,Hawaii of USA,summer,Vancouver,1.0,2,ship,310000,1


In [10]:
# Convert each string (categorical) column into dummy / indicator columns.
# The targets are unpacked in the same order as the column names below.
#
# NOTE(review): no prefix is passed to get_dummies, so the indicator columns
# are named after the raw category values only. A value shared by two source
# columns (e.g. the 'unknown' placeholder filled into both 'food' and
# 'destination') therefore yields identically named dummy columns.
# Adding prefix=<column> would fix this but changes the column names of the
# exported data, so confirm with downstream consumers first.
categorical_columns = (
    'job',
    'contact',
    'food',
    'beverage',
    'favorite_continent',
    'season',
    'destination',
    'transport',
)
(
    d_df_job,
    d_df_contact,
    d_df_food,
    d_df_beverage,
    d_df_favorite_continent,
    d_df_season,
    d_df_destination,
    d_df_transport,
) = (pd.get_dummies(d_df[column]) for column in categorical_columns)

# Show the first five rows.
d_df_contact.head()

Unnamed: 0,email,letter,phone
90,0,0,1
91,0,0,1
92,0,0,1
93,0,0,1
94,1,0,0


In [11]:
# Extract the numeric columns (including the target y).
numeric_columns = ['age', 'number_yoy', 'member', 'period', 'budget', 'y']
tmp1 = d_df[numeric_columns]

# Show the first five rows.
tmp1.head()

Unnamed: 0,age,number_yoy,member,period,budget,y
90,20,8,1.0,6,350000,1
91,20,10,2.0,5,350000,1
92,20,17,1.0,4,350000,1
93,20,17,2.0,1,300000,1
94,20,19,1.0,2,310000,1


In [12]:
# Combine the data: place the numeric columns and every dummy-variable frame
# side by side (axis=1) in one concat call. The column order is unchanged:
# numeric, destination, job, contact, food, beverage, favorite_continent,
# season, transport.

frames_in_order = [
    tmp1,
    d_df_destination,
    d_df_job,
    d_df_contact,
    d_df_food,
    d_df_beverage,
    d_df_favorite_continent,
    d_df_season,
    d_df_transport,
]
d_df_new = pd.concat(frames_in_order, axis=1)

# Show the first five rows.
d_df_new.head()

Unnamed: 0,age,number_yoy,member,period,budget,y,Hawaii,Ho Chi Minh,Hong Kong,Jeju,...,South America,autumn,fall,spring,summer,winter,bicycle,plane,ship,walking
90,20,8,1.0,6,350000,1,0,0,0,0,...,0,0,0,0,1,0,0,0,1,0
91,20,10,2.0,5,350000,1,0,0,0,0,...,0,0,0,0,1,0,0,0,0,1
92,20,17,1.0,4,350000,1,0,0,0,0,...,0,0,0,0,1,0,0,1,0,0
93,20,17,2.0,1,300000,1,0,0,0,0,...,1,0,0,0,1,0,1,0,0,0
94,20,19,1.0,2,310000,1,0,0,0,0,...,1,0,0,0,1,0,0,0,1,0


In [13]:
# Write the preprocessed result to a CSV file, omitting the row index.

d_df_new.to_csv(path_or_buf='b-prep.csv', index=False)