# Airbnb Pricing Prediction: Final
**James Gearheart**<br>
**Danny Zhuang**<br>
**Bob Saludo**<br>
**Ryan Wallace**<br><br>
**Harvard University**<br>
**Fall 2016**<br>
**TF: Christine Hwang**<br>

# TODO
encoding of categoricals

transformations of cts features

duplicate listings so all properties have 365 observations  (I realized doing this will remove the bias towards dates that appear more frequently, but replace it with bias towards houses that are offered less. I can't see any way around this)

ridge regression and tuning

lasso regression and tuning

random forest regression and tuning

prediction intervals around predictions

---

In [33]:
# import necessary libraries
import csv
import datetime
import operator
import random
from mpl_toolkits.mplot3d import Axes3D
import numpy as np
import pandas as pd
import statsmodels.api as sm
import matplotlib
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression as LinReg
from sklearn import preprocessing
from sklearn.cluster import KMeans
%matplotlib inline

In [2]:
# read the three cleaned datasets
listings_df = pd.read_csv('datasets/listings_final.csv', delimiter=',', index_col=0)
reviews_df = pd.read_csv('datasets/reviews_final.csv', delimiter=',', index_col=0)
calendar_df = pd.read_csv('datasets/calendar_final.csv', index_col=0)

# log transform prices in calendar
calendar_df['price_log'] = np.log(calendar_df['price'])

  interactivity=interactivity, compiler=compiler, result=result)


In [3]:
calendar_df.drop(['available'], axis=1, inplace=True, errors='ignore')
listings_df.drop(['scrape_id', 'last_scraped', 'name', 'picture_url', 'host_id', 'host_name', 'host_since',
                  'host_picture_url', 'weekly_price', 'monthly_price', 'calendar_last_scraped', 'calendar_updated',
                  'street', 'market', 'extra_people'], axis=1, inplace=True, errors='ignore')
print calendar_df.columns.values
print listings_df.columns.values

['listing_id' 'date' 'price' 'diff_mean' 'price_log']
['listing_id' 'neighbourhood' 'neighbourhood_cleansed' 'city' 'zipcode'
 'market' 'latitude' 'longitude' 'is_location_exact' 'property_type'
 'room_type' 'accommodates' 'bathrooms' 'bedrooms' 'beds' 'bed_type'
 'price' 'guests_included' 'extra_people' 'minimum_nights' 'maximum_nights'
 'availability_30' 'availability_60' 'availability_90' 'availability_365'
 'number_of_reviews' 'review_scores_rating' 'review_scores_accuracy'
 'review_scores_cleanliness' 'review_scores_checkin'
 'review_scores_communication' 'review_scores_location'
 'review_scores_value' 'host_listing_count' 'price_log']


In [4]:
# create calendar with listings data added
calendar_expanded_df = calendar_df.merge(listings_df, on='listing_id', how='left', suffixes=['_calendar', '_listings'])

In [5]:
# strip errant NaN's and infinites from data import errors and log transformation
print 'calendar entries before stripping:', calendar_expanded_df.shape
calendar_expanded_df = calendar_expanded_df.replace([np.inf, -np.inf], np.nan)
calendar_expanded_df = calendar_expanded_df.dropna()
print 'calendar entries after stripping:', calendar_expanded_df.shape

calendar entries before stripping: (7201883, 39)
calendar entries after stripping: (7201153, 39)


In [6]:
# method to convert date to day of week
def get_day(date):
    return datetime.datetime.strptime(date, '%Y-%m-%d').strftime('%A')

In [7]:
# create indicators for time variables
# weekend
dates = np.array(calendar_expanded_df['date'])
days = [get_day(date) for date in dates]
weekend = [1 if day == 'Friday' or day == 'Saturday' else 0 for day in days]
calendar_expanded_df['weekend'] = pd.Series(np.array(weekend), index=calendar_expanded_df.index)

# major holidys (around New Years)
holiday_dates = ['2015-01-01', '2015-01-02', '2015-01-03']
holiday = [1 if date in holiday_dates else 0 for date in dates]
calendar_expanded_df['holiday'] = pd.Series(np.array(holiday), index=calendar_expanded_df.index)

# not January (excluding Holidays) or February
slump_dates = []
for d in range(4, 10):
    slump_dates.append('2015-01-0' + str(d))
for d in range(10, 32):
    slump_dates.append('2015-01-' + str(d))
for d in range(1, 10):
    slump_dates.append('2015-02-0' + str(d))
for d in range(10, 29):
    slump_dates.append('2015-01-' + str(d))
slump = [1 if date in slump_dates else 0 for date in dates]
calendar_expanded_df['slump'] = pd.Series(np.array(slump), index=calendar_expanded_df.index)

# Jan (after Jan 3), Feb, and March each appear to have different values
jan_dates = []
feb_dates = []
march_dates = []
for d in range(4, 10):
    jan_dates.append('2015-01-0' + str(d))
for d in range(10, 32):
    jan_dates.append('2015-01-' + str(d))
for d in range(1, 10):
    feb_dates.append('2015-02-0' + str(d))
for d in range(10, 29):
    feb_dates.append('2015-01-' + str(d))
for d in range(1, 10):
    march_dates.append('2015-01-0' + str(d))
for d in range(10, 32):
    march_dates.append('2015-01-' + str(d))
jan = [1 if date in jan_dates else 0 for date in dates]
feb = [1 if date in feb_dates else 0 for date in dates]
march = [1 if date in march_dates else 0 for date in dates]

calendar_expanded_df['jan'] = pd.Series(np.array(jan), index=calendar_expanded_df.index)
calendar_expanded_df['feb'] = pd.Series(np.array(feb), index=calendar_expanded_df.index)
calendar_expanded_df['march'] = pd.Series(np.array(march), index=calendar_expanded_df.index)


In [8]:
# find means by zipcode and group into zipcodes
neighborhoods = calendar_expanded_df['neighbourhood'].unique()
zipcodes = calendar_expanded_df['zipcode'].unique()

neighborhood_prices = []
for neighborhood in neighborhoods:
    neighborhood_prices.append((neighborhood, np.mean(np.array(listings_df[listings_df['neighbourhood'] == neighborhood]['price']))))

zipcode_prices = []
for zipcode in zipcodes:
    zipcode_prices.append((zipcode, np.mean(np.array(listings_df[listings_df['zipcode'] == zipcode]['price']))))
    
# group zipcodes and neighborhoods into quartiles by average
neighborhood_prices.sort(key=operator.itemgetter(1), reverse=True)
zipcode_prices.sort(key=operator.itemgetter(1), reverse=True)

# find size of quartiles
neighborhood_quartile_size = int(np.round(len(neighborhood_prices)*0.25))
zipcode_quartile_size = int(np.round(len(zipcode_prices)*0.25))

# break up neighboorhood and zipcodes by quartile
neighborhood_1 = neighborhood_prices[:neighborhood_quartile_size]
neighborhood_2 = neighborhood_prices[neighborhood_quartile_size:2*neighborhood_quartile_size]
neighborhood_3 = neighborhood_prices[2*neighborhood_quartile_size:3*neighborhood_quartile_size]
neighborhood_4 = neighborhood_prices[3*neighborhood_quartile_size:]

zipcode_1 = zipcode_prices[:zipcode_quartile_size]
zipcode_2 = zipcode_prices[zipcode_quartile_size:2*zipcode_quartile_size]
zipcode_3 = zipcode_prices[2*zipcode_quartile_size:3*zipcode_quartile_size]
zipcode_4 = zipcode_prices[3*zipcode_quartile_size:]

# create new indicators for each quartile
neighborhoods = np.array(calendar_expanded_df['neighbourhood'])
zipcodes = np.array(calendar_expanded_df['zipcode'])

neighborhood_q1 = []
neighborhood_q2 = []
neighborhood_q3 = []
neighborhood_q4 = []

zipcode_q1 = []
zipcode_q2 = []
zipcode_q3 = []
zipcode_q4 = []

for neighborhood in neighborhoods:
    if neighborhood in neighborhood_1:
        neighborhood_q1.append(1)
    else:
        neighborhood_q1.append(0)
    
    if neighborhood in neighborhood_2:
        neighborhood_q2.append(1)
    else:
        neighborhood_q2.append(0)
    
    if neighborhood in neighborhood_3:
        neighborhood_q3.append(1)
    else:
        neighborhood_q3.append(0)
        
    if neighborhood in neighborhood_4:
        neighborhood_q4.append(1)
    else:
        neighborhood_q4.append(0)
        
for zipcode in zipcodes:
    if zipcode in zipcode_1:
        zipcode_q1.append(1)
    else:
        zipcode_q1.append(0)
    
    if zipcode in zipcode_2:
        zipcode_q2.append(1)
    else:
        zipcode_q2.append(0)
    
    if zipcode in zipcode_3:
        zipcode_q3.append(1)
    else:
        zipcode_q3.append(0)
        
    if zipcode in zipcode_4:
        zipcode_q4.append(1)
    else:
        zipcode_q4.append(0)

# convert zipcode, neighborhood lists to np arrays
calendar_expanded_df['zipcode_q1'] = pd.Series(np.resize(np.array(zipcode_q1), (len(zipcode_q1), 1)))
calendar_expanded_df['zipcode_q2'] = pd.Series(np.resize(np.array(zipcode_q2), (len(zipcode_q2), 1)))
calendar_expanded_df['zipcode_q3'] = pd.Series(np.resize(np.array(zipcode_q3), (len(zipcode_q3), 1)))
calendar_expanded_df['zipcode_q4'] = pd.Series(np.resize(np.array(zipcode_q4), (len(zipcode_q4), 1)))

calendar_expanded_df['neighborhood_q1'] = pd.Series(np.resize(np.array(neighborhood_q1), (len(neighborhood_q1), 1)))
calendar_expanded_df['neighborhood_q2'] = pd.Series(np.resize(np.array(neighborhood_q2), (len(neighborhood_q2), 1)))
calendar_expanded_df['neighborhood_q3'] = pd.Series(np.resize(np.array(neighborhood_q3), (len(neighborhood_q3), 1)))
calendar_expanded_df['neighborhood_q4'] = pd.Series(np.resize(np.array(neighborhood_q4), (len(neighborhood_q4), 1)))

In [26]:
# binarize categorical variables
property_type_dummies = pd.get_dummies(calendar_expanded_df['property_type'])
property_type_cols = list(property_type_dummies.columns.values)

is_location_exact_dummies = pd.get_dummies(calendar_expanded_df['is_location_exact'])
is_location_exact_cols = list(is_location_exact_dummies.columns.values)

room_type_dummies = pd.get_dummies(calendar_expanded_df['room_type'])
room_type_cols = list(room_type_dummies.columns.values)

bed_type_dummies = pd.get_dummies(calendar_expanded_df['bed_type'])
bed_type_cols = list(bed_type_dummies.columns.values)


In [27]:
print 'property_type_cols: ', property_type_cols
print 'is_location_exact_cols: ', is_location_exact_cols
print 'room_type_cols: ', room_type_cols
print 'bed_type_cols: ', bed_type_cols

property_type_cols:  ['Apartment', 'Bed & Breakfast', 'Boat', 'Cabin', 'Camper/RV', 'Castle', 'Cave', 'Chalet', 'Dorm', 'Earth House', 'House', 'Hut', 'Lighthouse', 'Loft', 'Other', 'Tent', 'Treehouse', 'Villa']
is_location_exact_cols:  ['f', 't']
room_type_cols:  ['Entire home/apt', 'Private room', 'Shared room']
bed_type_cols:  ['Airbed', 'Couch', 'Futon', 'Pull-out Sofa', 'Real Bed']


In [None]:
# visualize continusous random variables against response variable
# to determine if any transformations of the predictors would be helpful
cts_vars_to_plot = ['latitude', 'longitude',
                    'accommodates', 'bathrooms', 'bedrooms', 'beds', 
                    'guests_included', 'minimum_nights', 'maximum_nights',
                    'availability_30', 'availability_60', 'availability_90', 'availability_365',
                    'number_of_reviews', 'review_scores_rating', 'review_scores_accuracy',
                    'review_scores_cleanliness', 'review_scores_checkin',
                    'review_scores_communication', 'review_scores_location',
                    'review_scores_value', 'host_listing_count']

fig, ax = plt.subplots(1, len(cts_vars_to_plot), figsize=(20, 5*len(cts_vars_to_plot)))
for i, var in enumerate(cts_vars_to_plot):
    ax[i].scatter(calendar_expanded_df[var], calendar_expanded_df['price_log_calendar'])
    ax[i].set_title('Log Price against ' + var)
    ax[i].set_xlabel(var)
    ax[i].set_ylabel('Log Price')

plt.tight_layout()
plt.show()

In [28]:
# extract relevant feature listing
relevant_vars1 = ['bathrooms', 'bedrooms', 'beds', 'accommodates', 'longitude', 
                  'weekend', 'holiday', 'slump']

cts_vars = ['latitude', 'longitude', 
            'accommodates', 'bathrooms', 'bedrooms', 'beds', 
            'guests_included', 'minimum_nights', 'maximum_nights',
            'availability_30', 'availability_60', 'availability_90', 'availability_365',
            'number_of_reviews', 'review_scores_rating', 'review_scores_accuracy',
            'review_scores_cleanliness', 'review_scores_checkin',
            'review_scores_communication', 'review_scores_location',
            'review_scores_value', 'host_listing_count']

categorical_vars = ['zipcode_q1', 'zipcode_q2', 'zipcode_q3', 'zipcode_q4',
                   'neighborhood_q1', 'neighborhood_q2', 'neighborhood_q3', 'neighborhood_q4',
                   'weekend', 'holiday', 'slump', 'jan', 'feb', 'march'] + \
                    property_type_cols + is_location_exact_cols + room_type_cols + bed_type_cols

X_df = calendar_expanded_df[cts_vars, categorical_vars].copy()
y_df = calendar_expanded_df[['price_log_calendar']].copy()

# numpy for sklearn
X = X_df.as_matrix()
y = y_df.as_matrix()

['zipcode_q1', 'zipcode_q2', 'zipcode_q3', 'zipcode_q4', 'neighborhood_q1', 'neighborhood_q2', 'neighborhood_q3', 'neighborhood_q4', 'Apartment', 'Bed & Breakfast', 'Boat', 'Cabin', 'Camper/RV', 'Castle', 'Cave', 'Chalet', 'Dorm', 'Earth House', 'House', 'Hut', 'Lighthouse', 'Loft', 'Other', 'Tent', 'Treehouse', 'Villa', 'f', 't', 'Entire home/apt', 'Private room', 'Shared room', 'Airbed', 'Couch', 'Futon', 'Pull-out Sofa', 'Real Bed']


In [11]:
# Split into training and testing
Xy = np.concatenate((X, y), axis=1)

# use 75% for training, the rest for testing
num_train = int(np.round(Xy.shape[0]*0.75))

# shuffle for random selection
random.shuffle(Xy)

# pull out sets
X_train = Xy[:num_train,:(-1)]
X_test = Xy[num_train:,:(-1)]
y_train = Xy[:num_train, -1]
y_test = Xy[num_train:, -1]

X_train = np.nan_to_num(X_train)
X_test = np.nan_to_num(X_test)
y_train = np.nan_to_num(y_train)
y_test = np.nan_to_num(y_test)

In [12]:
# fit simple linear regression
linear_model = LinReg()
linear_model.fit(X_train, y_train)
print 'R^2 in test: ', linear_model.score(X_test, y_test)

R^2 in test:  0.378520858319
