# 데이터 전처리

In [1]:
import tensorflow as tf
import tensorflow.keras
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [2]:
data = pd.read_csv('listings_Denvor_1.csv')

In [3]:
data.head()

Unnamed: 0.1,Unnamed: 0,id,name,host_id,host_name,host_since,host_is_superhost,neighbourhood_cleansed,city,market,...,beds,price,extra_people,minimum_nights,maximum_nights,availability_365,number_of_reviews,number_of_reviews_ltm,review_scores_rating,reviews_per_month
0,0,360,LoHi Secret garden at the Chickadee Cottage,666,Jennifer & Giovanni,11,t,Highland,Denver,Denver,...,3,$140.00,$15.00,1,29,319,82,68,100.0,5.94
1,1,590,Comfortable - and a great value!,933,Jill,11,t,North Park Hill,Denver,Denver,...,1,$61.00,$5.00,1,300,64,585,37,97.0,4.57
2,2,592,private,933,Jill,11,t,North Park Hill,Denver,Denver,...,1,$42.00,$5.00,30,365,130,176,8,97.0,1.36
3,3,1940,Baker Studio Close to EVERYTHING,2150,Joanne,11,t,Baker,Denver,Denver,...,1,$95.00,$100.00,2,120,137,41,19,99.0,1.26
4,4,2086,Garden Level Condo,2284,Katy,11,f,Hale,Denver,Denver,...,1,$76.00,$25.00,180,1125,358,12,1,96.0,0.63


# 데이터 전처리 (신경망 삽입을 위해)
슈퍼호스트 예측에 필요한 정보 외에는 싸그리 삭제할 것임.

Unnamed:0, id, name, host_id, host_name 등 숙소 기본 정보 삭제

위치 관련 정보는 대동소이하나, neighbourhood 가 diverse하고 상세하므로 이것만 남기고 삭제

상세 좌표 삭제

In [4]:
data = data.drop('Unnamed: 0', axis=1)
data = data.drop('id', axis=1)
data = data.drop('name', axis=1)
data = data.drop('host_id', axis=1)
data = data.drop('host_name', axis=1)

In [5]:
data = data.drop('city', axis=1)
data = data.drop('market', axis=1)
data = data.drop('smart_location', axis=1)
data = data.drop('country_code', axis=1)
data = data.drop('country', axis=1)
data = data.drop('latitude', axis=1)
data = data.drop('longitude', axis=1)

In [6]:
data.head()

Unnamed: 0,host_since,host_is_superhost,neighbourhood_cleansed,property_type,room_type,accommodates,bathrooms,bedrooms,beds,price,extra_people,minimum_nights,maximum_nights,availability_365,number_of_reviews,number_of_reviews_ltm,review_scores_rating,reviews_per_month
0,11,t,Highland,Guesthouse,Entire home/apt,4,1.0,2,3,$140.00,$15.00,1,29,319,82,68,100.0,5.94
1,11,t,North Park Hill,House,Private room,3,1.0,1,1,$61.00,$5.00,1,300,64,585,37,97.0,4.57
2,11,t,North Park Hill,House,Private room,2,1.0,1,1,$42.00,$5.00,30,365,130,176,8,97.0,1.36
3,11,t,Baker,Guesthouse,Entire home/apt,2,1.0,0,1,$95.00,$100.00,2,120,137,41,19,99.0,1.26
4,11,f,Hale,Apartment,Entire home/apt,2,1.0,1,1,$76.00,$25.00,180,1125,358,12,1,96.0,0.63


neighbourhood_cleansed, property_type, room_type 을 정수형으로 매핑

In [7]:
data['neighbourhood_cleansed'].describe()

count            4124
unique             78
top       Five Points
freq              324
Name: neighbourhood_cleansed, dtype: object

In [8]:
def map_str_to_int(col_name):
    map_dict = {}
    j = 0

    for i in data[col_name]:
        if i not in map_dict.keys():
            map_dict[i] = j
            j+=1

    data[col_name] = data[col_name].map(map_dict) / j#normalize까지 한번에

In [9]:
map_str_to_int('neighbourhood_cleansed')
map_str_to_int('property_type')
map_str_to_int('room_type')

price와 extra_people에서 달러 표기 제거

In [10]:
data['price'] = data['price'].replace('[\$,]', '', regex=True).astype(float)
data['extra_people'] = data['extra_people'].replace('[\$,]', '', regex=True).astype(float)

In [11]:
data.head()

Unnamed: 0,host_since,host_is_superhost,neighbourhood_cleansed,property_type,room_type,accommodates,bathrooms,bedrooms,beds,price,extra_people,minimum_nights,maximum_nights,availability_365,number_of_reviews,number_of_reviews_ltm,review_scores_rating,reviews_per_month
0,11,t,0.0,0.0,0.0,4,1.0,2,3,140.0,15.0,1,29,319,82,68,100.0,5.94
1,11,t,0.012821,0.055556,0.25,3,1.0,1,1,61.0,5.0,1,300,64,585,37,97.0,4.57
2,11,t,0.012821,0.055556,0.25,2,1.0,1,1,42.0,5.0,30,365,130,176,8,97.0,1.36
3,11,t,0.025641,0.0,0.0,2,1.0,0,1,95.0,100.0,2,120,137,41,19,99.0,1.26
4,11,f,0.038462,0.111111,0.0,2,1.0,1,1,76.0,25.0,180,1125,358,12,1,96.0,0.63


딥러닝 용이하도록 값 전부 평준화

In [12]:
def normalize_val(col_name):
    max_val = data[col_name].max()
    
    data[col_name] = data[col_name] / max_val

In [13]:
normalize_val('host_since')
normalize_val('accommodates')
normalize_val('bathrooms')
normalize_val('bedrooms')
normalize_val('beds')
normalize_val('price')
normalize_val('extra_people')
normalize_val('minimum_nights')
normalize_val('maximum_nights')
normalize_val('availability_365')
normalize_val('number_of_reviews')
normalize_val('number_of_reviews_ltm')
normalize_val('review_scores_rating')
normalize_val('reviews_per_month')

In [14]:
data.head()

Unnamed: 0,host_since,host_is_superhost,neighbourhood_cleansed,property_type,room_type,accommodates,bathrooms,bedrooms,beds,price,extra_people,minimum_nights,maximum_nights,availability_365,number_of_reviews,number_of_reviews_ltm,review_scores_rating,reviews_per_month
0,1.0,t,0.0,0.0,0.0,0.222222,0.058824,0.222222,0.214286,0.014001,0.05,0.00089,0.01589,0.873973,0.102628,0.322275,1.0,0.320561
1,1.0,t,0.012821,0.055556,0.25,0.166667,0.058824,0.111111,0.071429,0.006101,0.016667,0.00089,0.164384,0.175342,0.732165,0.175355,0.97,0.246627
2,1.0,t,0.012821,0.055556,0.25,0.111111,0.058824,0.111111,0.071429,0.0042,0.016667,0.02669,0.2,0.356164,0.220275,0.037915,0.97,0.073394
3,1.0,t,0.025641,0.0,0.0,0.111111,0.058824,0.0,0.071429,0.009501,0.333333,0.001779,0.065753,0.375342,0.051314,0.090047,0.99,0.067998
4,1.0,f,0.038462,0.111111,0.0,0.111111,0.058824,0.111111,0.071429,0.007601,0.083333,0.160142,0.616438,0.980822,0.015019,0.004739,0.96,0.033999


In [15]:
# host_is_superhost 매핑하고 train, validate, test 분리
map_dict = {'t':1, 'f':0}

data['host_is_superhost'] = data['host_is_superhost'].map(map_dict)

unknown_data = data[data['host_is_superhost'] == np.nan]
data = data.dropna(subset=['host_is_superhost'])

#데이터 4124 개 중, unknown이 4개, train 3500개, vali 200개, test 420개 사용
train_x = data[:3500]
train_y = train_x['host_is_superhost']
train_x = train_x.drop('host_is_superhost', axis=1)

vali_x = data[3500:3700]
vali_y = vali_x['host_is_superhost']
vali_x = vali_x.drop('host_is_superhost', axis=1)

test_x = data[3700:]
test_y = test_x['host_is_superhost']
test_x = test_x.drop('host_is_superhost', axis=1)

print(train_x.shape())

TypeError: 'tuple' object is not callable

In [None]:
network = tf.keras.models.Sequential()
network.add(tf.keras.layers.Dense(255, input_shape=(train_x.shape()[0],), activation='relu'))
network.add(tf.keras.layers.Dense(255, activation='relu'))
network.add(tf.keras.layers.Dense((1), activation='sigmoid'))
network.compile(loss='mse', optimizer='Adam', metrics=['accuracy'])
network.summary()