# 데이터 전처리

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [2]:
# Load the listings CSV into the working DataFrame used by every later cell.
data = pd.read_csv(filepath_or_buffer='listings_Denvor.csv')

In [3]:
# Preview the first five rows of the raw listings.
data.head(n=5)

Unnamed: 0,id,name,host_id,host_name,host_since,host_location,host_response_time,host_response_rate,host_is_superhost,neighbourhood_cleansed,...,extra_people,minimum_nights,maximum_nights,availability_365,number_of_reviews,number_of_reviews_ltm,first_review,last_review,review_scores_rating,reviews_per_month
0,360,LoHi Secret garden at the Chickadee Cottage,666,Jennifer & Giovanni,7/8/2008,"Denver, Colorado, United States",within an hour,100%,t,Highland,...,$15.00,1,29,319,82,68,8/13/2018,9/20/2019,100.0,5.94
1,590,Comfortable - and a great value!,933,Jill,7/21/2008,"Denver, Colorado, United States",within an hour,100%,t,North Park Hill,...,$5.00,1,300,64,585,37,3/29/2009,9/29/2019,97.0,4.57
2,592,private,933,Jill,7/21/2008,"Denver, Colorado, United States",within an hour,100%,t,North Park Hill,...,$5.00,30,365,130,176,8,2/21/2009,9/6/2019,97.0,1.36
3,1940,Baker Studio Close to EVERYTHING,2150,Joanne,8/16/2008,"Denver, Colorado, United States",within an hour,100%,t,Baker,...,$100.00,2,120,137,41,19,1/24/2017,9/21/2019,99.0,1.26
4,2086,Garden Level Condo,2284,Katy,8/19/2008,"Denver, Colorado, United States",within an hour,100%,f,Hale,...,$25.00,180,1125,358,12,1,3/11/2018,10/21/2018,96.0,0.63


In [13]:
# Count the missing values in each column.
missing_mask = data.isnull()
missing_mask.sum(axis=0)

id                        0
name                      0
host_id                   0
host_name                 0
host_since                0
host_is_superhost         4
neighbourhood_cleansed    0
city                      0
market                    0
smart_location            0
country_code              0
country                   0
latitude                  0
longitude                 0
property_type             0
room_type                 0
accommodates              0
bathrooms                 0
bedrooms                  0
beds                      0
price                     0
extra_people              0
minimum_nights            0
maximum_nights            0
availability_365          0
number_of_reviews         0
number_of_reviews_ltm     0
review_scores_rating      0
reviews_per_month         0
dtype: int64

- host_name 은 'dummy name'으로 채우기
- host_since는 first_review로 채우고 둘다 null이면 삭제, DL 용이하도록 2019년에서 빼서 운영 기간으로 변환
- square_feet는 거의 전체가 null
- first_review와 last_review는 부정확한 데이터를 채워가며 복원하기엔 중요도가 떨어짐
- review_scores_rating과 reviews_per_month는 중요한 데이터인데 missing인 경우, 추천이 힘드므로, 복원하기보단 이 열들이 null인 행을 삭제
- cleaning_fee는 삭제
- host_is_superhost는 딥러닝하여 추론(다른 파일에서 할 것임)
- zipcode도 별로 중요치 않으므로 삭제
- host_response 관련 데이터는 숙소 추천에 아주 중요하지는 않으므로 삭제, host_name도 중요도 떨어지므로 삭제
- market 결손치는 city로 채우기
- city 는 host_location 잘라서 채우기
- state는 CO 밖에 없으니까 삭제
- host_location은 US, CO 까지 모두 동일하며 도시 정보는 city에 있으므로 삭제
- bathrooms는 중요할 수 있고, 해당 데이터가 없는 행이 얼마 없으므로 행을 삭제

In [5]:
# Replace missing host names with a fixed placeholder string.
host_names = data['host_name']
data['host_name'] = host_names.fillna(value='dummy name')

In [6]:
# Where host_since is missing, fall back to the listing's first_review value.
# Rows where both are missing stay null after this step.
data['host_since'] = data['host_since'].fillna(data['first_review'])

# Where market is missing, fall back to the city value of the same row.
# NOTE(review): city's own gaps are only filled in a later cell (from
# host_location), so a row missing both market and city keeps a null market
# here — confirm whether the city fill should run first.
data['market'] = data['market'].fillna(data['city'])

In [7]:
# Parse host_since into datetimes. pd.to_datetime is used instead of
# astype('datetime64') because a unit-less 'datetime64' dtype is rejected by
# recent pandas versions; missing values become NaT.
host_since_dates = pd.to_datetime(data["host_since"])

# Convert the date into a count of years relative to 2019 (the reference year
# chosen in the notes above); NaT rows yield NaN.
data["host_since"] = 2019 - host_since_dates.dt.year

In [8]:
# Fill missing city values with the text before the first comma of
# host_location (e.g. "Denver, Colorado, United States" -> "Denver").
city_missing = data['city'].isnull()
# The vectorized .str accessor propagates NaN, so a row whose host_location is
# also missing stays null instead of raising AttributeError as a plain
# str.split call on a float NaN would.
data.loc[city_missing, 'city'] = (
    data.loc[city_missing, 'host_location'].str.split(',').str[0]
)
# Display the row for listing id 8402415 (presumably a spot check of the fill
# — confirm).
data[data['id'] == 8402415]

Unnamed: 0,id,name,host_id,host_name,host_since,host_location,host_response_time,host_response_rate,host_is_superhost,neighbourhood_cleansed,...,extra_people,minimum_nights,maximum_nights,availability_365,number_of_reviews,number_of_reviews_ltm,first_review,last_review,review_scores_rating,reviews_per_month
476,8402415,Beautiful Victorian in S. Broadway,4306842,Eric,7,"Denver, Colorado, United States",within a few hours,100%,t,Speer,...,$0.00,2,1125,322,127,15,10/1/2015,8/19/2019,100.0,2.61


In [9]:
# Columns the notes above decided to discard: mostly-null, redundant, or
# low-value fields for the recommendation task.
columns_to_drop = [
    'square_feet',
    'first_review',
    'last_review',
    'host_response_time',
    'host_response_rate',
    'cleaning_fee',
    'zipcode',
    'state',
    'host_location',
]
# Drop them all in one call; a missing column still raises KeyError.
data = data.drop(columns=columns_to_drop)

In [10]:
# Drop rows that are missing any value the later steps rely on.
# host_since is included because the preprocessing notes above state that rows
# with neither host_since nor first_review should be deleted; such rows are
# still null in host_since at this point and were previously never removed.
required_columns = [
    'host_since',
    'review_scores_rating',
    'reviews_per_month',
    'bathrooms',
]
data = data.dropna(subset=required_columns)

In [11]:
# Preview the first five rows after cleaning.
data.head(n=5)

Unnamed: 0,id,name,host_id,host_name,host_since,host_is_superhost,neighbourhood_cleansed,city,market,smart_location,...,beds,price,extra_people,minimum_nights,maximum_nights,availability_365,number_of_reviews,number_of_reviews_ltm,review_scores_rating,reviews_per_month
0,360,LoHi Secret garden at the Chickadee Cottage,666,Jennifer & Giovanni,11,t,Highland,Denver,Denver,"Denver, CO",...,3,$140.00,$15.00,1,29,319,82,68,100.0,5.94
1,590,Comfortable - and a great value!,933,Jill,11,t,North Park Hill,Denver,Denver,"Denver, CO",...,1,$61.00,$5.00,1,300,64,585,37,97.0,4.57
2,592,private,933,Jill,11,t,North Park Hill,Denver,Denver,"Denver, CO",...,1,$42.00,$5.00,30,365,130,176,8,97.0,1.36
3,1940,Baker Studio Close to EVERYTHING,2150,Joanne,11,t,Baker,Denver,Denver,"Denver, CO",...,1,$95.00,$100.00,2,120,137,41,19,99.0,1.26
4,2086,Garden Level Condo,2284,Katy,11,f,Hale,Denver,Denver,"Denver, CO",...,1,$76.00,$25.00,180,1125,358,12,1,96.0,0.63
