# 데이터 전처리

In [1]:
import tensorflow as tf
import tensorflow.keras
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [2]:
def bar_chart(feature):
    t = data[data['host_is_superhost']==1][feature].value_counts()
    f = data[data['host_is_superhost']==0][feature].value_counts()
    df=pd.DataFrame([t, f])
    df.index = ['super', 'not super']
    df.plot(kind='bar', stacked=True, figsize=(10,5))

In [3]:
data = pd.read_csv('listings_Denvor_1.csv')

In [4]:
data.head()

Unnamed: 0.1,Unnamed: 0,id,name,host_id,host_name,host_since,host_is_superhost,neighbourhood_cleansed,city,market,...,beds,price,extra_people,minimum_nights,maximum_nights,availability_365,number_of_reviews,number_of_reviews_ltm,review_scores_rating,reviews_per_month
0,0,360,LoHi Secret garden at the Chickadee Cottage,666,Jennifer & Giovanni,11,t,Highland,Denver,Denver,...,3,$140.00,$15.00,1,29,319,82,68,100.0,5.94
1,1,590,Comfortable - and a great value!,933,Jill,11,t,North Park Hill,Denver,Denver,...,1,$61.00,$5.00,1,300,64,585,37,97.0,4.57
2,2,592,private,933,Jill,11,t,North Park Hill,Denver,Denver,...,1,$42.00,$5.00,30,365,130,176,8,97.0,1.36
3,3,1940,Baker Studio Close to EVERYTHING,2150,Joanne,11,t,Baker,Denver,Denver,...,1,$95.00,$100.00,2,120,137,41,19,99.0,1.26
4,4,2086,Garden Level Condo,2284,Katy,11,f,Hale,Denver,Denver,...,1,$76.00,$25.00,180,1125,358,12,1,96.0,0.63


# 데이터 전처리 (신경망 삽입을 위해)
슈퍼호스트 예측에 필요한 정보 외에는 싸그리 삭제할 것임.

Unnamed:0, id, name, host_id, host_name 등 숙소 기본 정보 삭제

위치 관련 정보는 대동소이하나, neighbourhood 가 diverse하고 상세하므로 이것만 남기고 삭제

상세 좌표 삭제

In [5]:
data = data.drop('Unnamed: 0', axis=1)
data = data.drop('id', axis=1)
data = data.drop('name', axis=1)
data = data.drop('host_id', axis=1)
data = data.drop('host_name', axis=1)

In [6]:
data = data.drop('city', axis=1)
data = data.drop('market', axis=1)
data = data.drop('smart_location', axis=1)
data = data.drop('country_code', axis=1)
data = data.drop('country', axis=1)
data = data.drop('latitude', axis=1)
data = data.drop('longitude', axis=1)

In [7]:
data.head()

Unnamed: 0,host_since,host_is_superhost,neighbourhood_cleansed,property_type,room_type,accommodates,bathrooms,bedrooms,beds,price,extra_people,minimum_nights,maximum_nights,availability_365,number_of_reviews,number_of_reviews_ltm,review_scores_rating,reviews_per_month
0,11,t,Highland,Guesthouse,Entire home/apt,4,1.0,2,3,$140.00,$15.00,1,29,319,82,68,100.0,5.94
1,11,t,North Park Hill,House,Private room,3,1.0,1,1,$61.00,$5.00,1,300,64,585,37,97.0,4.57
2,11,t,North Park Hill,House,Private room,2,1.0,1,1,$42.00,$5.00,30,365,130,176,8,97.0,1.36
3,11,t,Baker,Guesthouse,Entire home/apt,2,1.0,0,1,$95.00,$100.00,2,120,137,41,19,99.0,1.26
4,11,f,Hale,Apartment,Entire home/apt,2,1.0,1,1,$76.00,$25.00,180,1125,358,12,1,96.0,0.63


price와 extra_people에서 달러 표기 제거

In [8]:
data['price'] = data['price'].replace('[\$,]', '', regex=True).astype(float)
data['extra_people'] = data['extra_people'].replace('[\$,]', '', regex=True).astype(float)


- neighbourhood_cleansed 중 빈도가 50회 이하인 값은 Other로 치환
    - 이때, 인접지역으로 치환할 수 있지만 이건 귀찮...
- property_type, room_type은 비슷한 것끼리 묶어서 편하게 분류하고 50회 이하는 Other 치환
    - Hotel room 은 Entire home 과 비슷하고, shared room 은 적지만 남겨야함

이 때, Other가 신경망의 추론 결과에 영향을 주지 않도록 Other를 0으로 하는 벡터화

In [9]:
data = data.apply(lambda x: x.mask(x.map(x.value_counts())<50, 'Other') if x.name=='neighbourhood_cleansed' else x)

data['property_type'] = data['property_type'].replace(['Aparthotel', 'Serviced apartment'], 'Apartment')
data['property_type'] = data['property_type'].replace('Villa', 'Bungalow')
data['property_type'] = data['property_type'].replace('Bungalow', 'Tiny house')
data['property_type'] = data['property_type'].replace('Castle', 'Other')
data = data.apply(lambda x: x.mask(x.map(x.value_counts())<50, 'Other') if x.name=='property_type' else x)

data['room_type'] = data['room_type'].replace('Hotel room', 'Entire home/apt')

data['room_type'].value_counts()

Entire home/apt    3095
Private room        972
Shared room          57
Name: room_type, dtype: int64

In [10]:
data = pd.get_dummies(data) #원-핫 인코딩

data = data.drop('neighbourhood_cleansed_Other', axis=1)
data = data.drop('property_type_Other', axis=1)

data.head()

Unnamed: 0,host_since,accommodates,bathrooms,bedrooms,beds,price,extra_people,minimum_nights,maximum_nights,availability_365,...,property_type_Condominium,property_type_Guest suite,property_type_Guesthouse,property_type_House,property_type_Loft,property_type_Tiny house,property_type_Townhouse,room_type_Entire home/apt,room_type_Private room,room_type_Shared room
0,11,4,1.0,2,3,140.0,15.0,1,29,319,...,0,0,1,0,0,0,0,1,0,0
1,11,3,1.0,1,1,61.0,5.0,1,300,64,...,0,0,0,1,0,0,0,0,1,0
2,11,2,1.0,1,1,42.0,5.0,30,365,130,...,0,0,0,1,0,0,0,0,1,0
3,11,2,1.0,0,1,95.0,100.0,2,120,137,...,0,0,1,0,0,0,0,1,0,0
4,11,2,1.0,1,1,76.0,25.0,180,1125,358,...,0,0,0,0,0,0,0,1,0,0


데이터 중 너무 값이 큰 열들 0~1로 범위 축소

In [11]:
def normalize(col):
    data[col] = data[col]/data[col].max()

normalize('host_since')
normalize('accommodates')
normalize('bathrooms')
normalize('bedrooms')
normalize('beds')
normalize('price')
normalize('extra_people')
normalize('minimum_nights')
normalize('maximum_nights')
normalize('availability_365')
normalize('number_of_reviews')
normalize('number_of_reviews_ltm')
normalize('review_scores_rating')
normalize('reviews_per_month')

data.describe()

Unnamed: 0,host_since,accommodates,bathrooms,bedrooms,beds,price,extra_people,minimum_nights,maximum_nights,availability_365,...,property_type_Condominium,property_type_Guest suite,property_type_Guesthouse,property_type_House,property_type_Loft,property_type_Tiny house,property_type_Townhouse,room_type_Entire home/apt,room_type_Private room,room_type_Shared room
count,4124.0,4124.0,4124.0,4124.0,4124.0,4124.0,4124.0,4124.0,4124.0,4124.0,...,4124.0,4124.0,4124.0,4124.0,4124.0,4124.0,4124.0,4124.0,4124.0,4124.0
mean,0.33725,0.23735,0.085276,0.185904,0.153215,0.016406,0.043323,0.005495,0.309453,0.36272,...,0.082687,0.098691,0.040737,0.384336,0.022551,0.049709,0.125606,0.750485,0.235694,0.013822
std,0.194473,0.154684,0.052092,0.121171,0.112619,0.044069,0.072783,0.025947,0.293052,0.334472,...,0.275441,0.298282,0.197705,0.486497,0.148485,0.217369,0.331445,0.432785,0.424483,0.116764
min,0.0,0.055556,0.0,0.0,0.0,0.001,0.0,0.00089,0.000548,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.181818,0.111111,0.058824,0.111111,0.071429,0.007201,0.0,0.00089,0.015342,0.060274,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
50%,0.363636,0.222222,0.058824,0.111111,0.142857,0.010001,0.016667,0.001779,0.2,0.238356,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
75%,0.454545,0.333333,0.117647,0.222222,0.214286,0.015802,0.066667,0.001779,0.616438,0.638356,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0
max,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [12]:
unknown_data = data[data['host_is_superhost_t']==0]
unknown_data = unknown_data[unknown_data['host_is_superhost_f']==0]
unknown_data = unknown_data.drop('host_is_superhost_t', axis=1)
unknown_data = unknown_data.drop('host_is_superhost_f', axis=1)
data = data.drop('host_is_superhost_f', axis=1)

#데이터 4124 개 중, unknown이 4개, train 3500개, vali 200개, test 420개 사용
train_x = data[:3700]
train_y = train_x['host_is_superhost_t']
train_x = train_x.drop('host_is_superhost_t', axis=1)

test_x = data[3700:]
test_y = test_x['host_is_superhost_t']
test_x = test_x.drop('host_is_superhost_t', axis=1)

print(train_x.shape)

(3700, 51)


# 학습

In [13]:
from tensorflow.keras.layers import *
from tensorflow.keras.models import Model

In [14]:
def new_model():
    input_layer = Input((train_x.shape[1]))
    
    x = Dense(64)(input_layer)
    x = ReLU()(x)
    x = Dense(64)(x)
    x = ReLU()(x)
    x = Dense(64)(x)
    x = ReLU()(x)
    x = Dropout(0.5)(x)
    
    x = Dense(1)(x)
    output_layer = Activation('sigmoid')(x)
    
    model = Model(input_layer, output_layer)
    model.compile(loss='binary_crossentropy', optimizer='Adam', metrics=['accuracy'])
    return model

model = new_model()
model.summary()

Model: "model"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         [(None, 51)]              0         
_________________________________________________________________
dense (Dense)                (None, 64)                3328      
_________________________________________________________________
re_lu (ReLU)                 (None, 64)                0         
_________________________________________________________________
dense_1 (Dense)              (None, 64)                4160      
_________________________________________________________________
re_lu_1 (ReLU)               (None, 64)                0         
_________________________________________________________________
dense_2 (Dense)              (None, 64)                4160      
_________________________________________________________________
re_lu_2 (ReLU)               (None, 64)                0     

In [15]:
model.fit(train_x, train_y, batch_size=500, epochs=30, shuffle=True, verbose=0)
model.evaluate(test_x, test_y, verbose=2)

424/424 - 0s - loss: 0.6233 - accuracy: 0.6745


[0.6233011315453727, 0.6745283]

In [16]:
print(train_x.shape)
print(unknown_data.shape)

(3700, 51)
(4, 51)


In [35]:
preds = model.predict(unknown_data)
print(preds)

[[0.31229997]
 [0.1284486 ]
 [0.25432885]
 [0.45966718]]


예측한 정보를 csv에 저장

In [80]:
data = pd.read_csv('listings_Denvor_1.csv')
data = data.drop('Unnamed: 0', axis=1)

label = []

for pred in preds:
    flag = 't'
    if pred < 0.5:
        flag = 'f'
    label.append(flag)

data.loc[data['host_is_superhost'].isnull(),'host_is_superhost'] = label
data[data['id'] == 184529]

data.to_csv("listings_Denvor_2.csv")