# Airbnb House Analysis and Recommender Application

##### Jie Bao, Kuangyi Zhang, Lanny Xu
##### Dr. Bamshad Mobasher, Spring 2018

## Import Library

In [2]:
%pylab inline
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn import *
from sklearn import neighbors, tree, naive_bayes, cross_validation
from sklearn.cross_validation import KFold
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.linear_model import LinearRegression, Lasso, Ridge, ElasticNet, SGDRegressor
from sklearn import preprocessing
import operator

Populating the interactive namespace from numpy and matplotlib




## Read in data and preprocessing

In [3]:
listings = pd.read_table("listings_edited.csv", header='infer', delimiter=",")
print listings.shape
listings.head(5)

(5207, 39)


Unnamed: 0,id,listing_url,name,summary,picture_url,host_response_time,host_response_rate,host_is_superhost,host_identity_verified,neighbourhood_cleansed,...,review_scores_rating,review_scores_accuracy,review_scores_cleanliness,review_scores_checkin,review_scores_communication,review_scores_location,review_scores_value,instant_bookable,cancellation_policy,reviews_per_month
0,13824783,https://www.airbnb.com/rooms/13824783,Full floor of a city cottage (up to 4 guests),This 3rd-floor garrett apartment (built-out at...,https://a0.muscache.com/im/pictures/510d45f8-e...,within an hour,100%,t,t,Lincoln Square,...,99.0,10.0,10.0,10.0,10.0,10.0,10.0,t,moderate,2.57
1,16740225,https://www.airbnb.com/rooms/16740225,Guest Room of Two Bedroom Condo,Warm and secure room in a Two Bedroom Condo lo...,https://a0.muscache.com/im/pictures/697182d3-d...,within an hour,100%,f,t,Lincoln Square,...,,,,,,,,t,strict,
2,18125245,https://www.airbnb.com/rooms/18125245,"Cozy, spacious 2 flat in Lincoln Square!","This cute, spacious 2 flat in Lincoln Square i...",https://a0.muscache.com/im/pictures/bf761217-c...,within an hour,100%,f,t,Lincoln Square,...,95.0,10.0,10.0,10.0,10.0,10.0,10.0,f,moderate,4.0
3,8362570,https://www.airbnb.com/rooms/8362570,Lincoln Square Ravenswood,"Lincoln square, Ravenswood and Andersonville g...",https://a0.muscache.com/im/pictures/224a38c0-a...,within a few hours,100%,t,t,Lincoln Square,...,98.0,10.0,10.0,10.0,10.0,10.0,10.0,f,flexible,0.87
4,789867,https://www.airbnb.com/rooms/789867,Cozy Private Room in a Classic Chicago Appartm...,$40 OFF COUPON FOR ALL NEW AIRBNB GUESTS - VIS...,https://a0.muscache.com/im/pictures/72031963/0...,within an hour,100%,f,t,Lincoln Square,...,94.0,9.0,9.0,10.0,10.0,10.0,9.0,t,strict,1.96


In [4]:
listings.describe(include="all").T

Unnamed: 0,count,unique,top,freq,mean,std,min,25%,50%,75%,max
id,5207,,,,11491000.0,5320380.0,2384.0,7242070.0,13083500.0,15773300.0,18656800.0
listing_url,5207,5207.0,https://www.airbnb.com/rooms/6558625,1.0,,,,,,,
name,5206,5169.0,Wells St Suites - Old Town 3 Bed,3.0,,,,,,,
summary,5125,4841.0,This property offers luxury in Chicago’s trend...,23.0,,,,,,,
picture_url,5207,5202.0,https://a0.muscache.com/im/pictures/70087089/b...,3.0,,,,,,,
host_response_time,4951,4.0,within an hour,3257.0,,,,,,,
host_response_rate,4951,55.0,100%,3873.0,,,,,,,
host_is_superhost,5207,2.0,f,3840.0,,,,,,,
host_identity_verified,5207,2.0,t,4093.0,,,,,,,
neighbourhood_cleansed,5207,72.0,West Town,729.0,,,,,,,


In [5]:
listings.dtypes

id                               int64
listing_url                     object
name                            object
summary                         object
picture_url                     object
host_response_time              object
host_response_rate              object
host_is_superhost               object
host_identity_verified          object
neighbourhood_cleansed          object
state                           object
city                            object
zipcode                         object
property_type                   object
room_type                       object
accommodates                     int64
bathrooms                      float64
bedrooms                       float64
beds                           float64
bed_type                        object
amenities                       object
price                           object
security_deposit                object
cleaning_fee                    object
guests_included                  int64
extra_people             

In [6]:
# Id, Listing_url,Name, Summary, Picture_url
# Host_response_time, Host_response_rate, Host_identity_verified, Neighbourhood_cleansed
# City, State, Zipcode
# Property_type, Room_type, Accommodates, Bathrooms, bedrooms, Beds, Bed_type,
# Amenities: TV, wireless internet, air condition, heating, pets, washer, dryer
# Price, Security_deposit, Cleaning_fee, Guests_included, extra_people, minimum_nights, maximum_nights
# Number_of_reviews, Review_scores_rating, Instant_bookable, Cancellation_policy, reviews_per_month

In [109]:
# drop Listing_url, Name, Summary, Picture_url, neighbourhood_cleansed, state, city, zipcode
listings_edit = listings.drop(['listing_url', 'name', 'summary', 'picture_url', 'state', 'city', 'zipcode', 'neighbourhood_cleansed', 'reviews_per_month'], axis=1)

In [110]:
# Handle missing values. review_scores_rating review_scores_accuracy/_cleanliness/_checkin/_communication/_location/_value
listings_edit = listings_edit.dropna(subset=['review_scores_rating', 'review_scores_accuracy', 'review_scores_cleanliness', 'review_scores_checkin', 'review_scores_communication', 'review_scores_location', 'review_scores_value'])

In [111]:
listings_edit.shape

(4454, 30)

In [112]:
#only deal with the objects now

obj_listings_edit = listings_edit.select_dtypes(include=['object']).copy()
obj_listings_edit.head()

Unnamed: 0,host_response_time,host_response_rate,host_is_superhost,host_identity_verified,property_type,room_type,bed_type,amenities,price,security_deposit,cleaning_fee,extra_people,instant_bookable,cancellation_policy
0,within an hour,100%,t,t,House,Private room,Real Bed,"{TV,""Cable TV"",""Wireless Internet"",""Air condit...",$56.00,,$30.00,$15.00,t,moderate
2,within an hour,100%,f,t,Townhouse,Private room,Real Bed,"{TV,""Cable TV"",Internet,""Wireless Internet"",""A...",$80.00,$150.00,$65.00,$15.00,f,moderate
3,within a few hours,100%,t,t,Condominium,Private room,Real Bed,"{TV,""Cable TV"",Internet,""Wireless Internet"",""A...",$80.00,,$10.00,$10.00,f,flexible
4,within an hour,100%,f,t,Apartment,Private room,Real Bed,"{TV,""Cable TV"",Internet,""Wireless Internet"",""A...",$20.00,,,$10.00,t,strict
5,within an hour,100%,f,t,Condominium,Private room,Real Bed,"{TV,""Wireless Internet"",""Air conditioning"",Kit...",$28.00,,$35.00,$20.00,t,strict


In [113]:
# 方法1 手动输入
# host_response_time fill nan to none
listings_edit = listings_edit.fillna({"host_response_time": "none"})

In [114]:
# check how many types are there
listings_edit["host_response_time"].value_counts()

within an hour        2865
within a few hours     802
within a day           580
none                   181
a few days or more      26
Name: host_response_time, dtype: int64

In [115]:
# 手动给值
response_time_num = {"host_response_time":     {"within an hour": 1, "within a few hours": 2, 
                    "within a day": 3, "a few days or more": 4, "none": 5}}

In [116]:
# 直接在表里替换掉
listings_edit.replace(response_time_num, inplace=True)

In [117]:
# 检查看看
listings_edit["host_response_time"].value_counts()

1    2865
2     802
3     580
5     181
4      26
Name: host_response_time, dtype: int64

In [118]:
listings_edit = listings_edit.fillna({"host_is_superhost": "none"})
listings_edit["host_is_superhost"].value_counts()

f    3154
t    1300
Name: host_is_superhost, dtype: int64

In [119]:
superhost_num = {"host_is_superhost": {"t": 1, "f": 2}}
listings_edit.replace(superhost_num, inplace=True)

In [120]:
listings_edit["host_is_superhost"].value_counts()

2    3154
1    1300
Name: host_is_superhost, dtype: int64

In [121]:
# 方法2： 用sklearn的library种label encoder
# 直接在原始表格后添加一列新生成的0 或 1， 我个人更倾向于第二种和第三种
from sklearn.preprocessing import LabelEncoder  

In [122]:
le = LabelEncoder()
listings_edit["host_id_verified"] = le.fit_transform(listings_edit["host_identity_verified"])
listings_edit[["host_identity_verified", "host_id_verified"]].head(11)

Unnamed: 0,host_identity_verified,host_id_verified
0,t,1
2,t,1
3,t,1
4,t,1
5,t,1
6,t,1
7,t,1
10,t,1
11,t,1
12,t,1


In [123]:
listings_edit["property_type_code"] = le.fit_transform(listings_edit["property_type"])
listings_edit[["property_type", "property_type_code"]].head(11)

Unnamed: 0,property_type,property_type_code
0,House,10
2,Townhouse,16
3,Condominium,5
4,Apartment,0
5,Condominium,5
6,Apartment,0
7,Apartment,0
10,Apartment,0
11,House,10
12,Apartment,0


In [124]:
listings_edit["room_type_code"] = le.fit_transform(listings_edit["room_type"])
listings_edit[["room_type", "room_type_code"]].head(11)

Unnamed: 0,room_type,room_type_code
0,Private room,1
2,Private room,1
3,Private room,1
4,Private room,1
5,Private room,1
6,Private room,1
7,Private room,1
10,Entire home/apt,0
11,Entire home/apt,0
12,Entire home/apt,0


In [125]:
listings_edit["bed_type_code"] = le.fit_transform(listings_edit["bed_type"])
listings_edit[["bed_type", "bed_type_code"]].head(11)

Unnamed: 0,bed_type,bed_type_code
0,Real Bed,4
2,Real Bed,4
3,Real Bed,4
4,Real Bed,4
5,Real Bed,4
6,Futon,2
7,Real Bed,4
10,Real Bed,4
11,Real Bed,4
12,Futon,2


In [126]:
# 方法3：用panda的Label Encoding
# 同样是加在原始表格后面，不影响原数值

listings_edit["instant_bookable"] = listings_edit["instant_bookable"].astype('category')
listings_edit["instant_bookable_cat"] = listings_edit["instant_bookable"].cat.codes
listings_edit.head()

Unnamed: 0,id,host_response_time,host_response_rate,host_is_superhost,host_identity_verified,property_type,room_type,accommodates,bathrooms,bedrooms,...,review_scores_communication,review_scores_location,review_scores_value,instant_bookable,cancellation_policy,host_id_verified,property_type_code,room_type_code,bed_type_code,instant_bookable_cat
0,13824783,1,100%,1,t,House,Private room,4,1.0,1.0,...,10.0,10.0,10.0,t,moderate,1,10,1,4,1
2,18125245,1,100%,2,t,Townhouse,Private room,6,1.5,2.0,...,10.0,10.0,10.0,f,moderate,1,16,1,4,0
3,8362570,2,100%,1,t,Condominium,Private room,2,1.0,1.0,...,10.0,10.0,10.0,f,flexible,1,5,1,4,0
4,789867,1,100%,2,t,Apartment,Private room,3,1.0,1.0,...,10.0,10.0,9.0,t,strict,1,0,1,4,1
5,16701336,1,100%,2,t,Condominium,Private room,2,1.0,1.0,...,10.0,10.0,10.0,t,strict,1,5,1,4,1


http://pbpython.com/categorical-encoding.html
还有很多其他方法，我这里只选了3种

In [127]:
### host_response_rate Transfer to integers.

listings_edit = listings_edit.fillna({"host_response_rate": "0%"})

In [128]:
listings_edit['host_response_rate'] = listings_edit['host_response_rate'].str[:-1].astype('int')

In [107]:
listings_edit.head()

Unnamed: 0,id,host_response_time,host_response_rate,host_is_superhost,host_identity_verified,property_type,room_type,accommodates,bathrooms,bedrooms,...,review_scores_communication,review_scores_location,review_scores_value,instant_bookable,cancellation_policy,host_id_verified,property_type_code,room_type_code,bed_type_code,instant_bookable_cat
0,13824783,1,1.0,1,t,House,Private room,4,1.0,1.0,...,10.0,10.0,10.0,t,moderate,1,10,1,4,1
2,18125245,1,1.0,2,t,Townhouse,Private room,6,1.5,2.0,...,10.0,10.0,10.0,f,moderate,1,16,1,4,0
3,8362570,2,1.0,1,t,Condominium,Private room,2,1.0,1.0,...,10.0,10.0,10.0,f,flexible,1,5,1,4,0
4,789867,1,1.0,2,t,Apartment,Private room,3,1.0,1.0,...,10.0,10.0,9.0,t,strict,1,0,1,4,1
5,16701336,1,1.0,2,t,Condominium,Private room,2,1.0,1.0,...,10.0,10.0,10.0,t,strict,1,5,1,4,1


In [130]:
### price Transfer to integers.
listings_edit = listings_edit.fillna({"price": "$0"})

In [132]:
listings_edit['price'] = listings_edit['price'].str.rstrip('$').astype('float')

ValueError: could not convert string to float: $78.00