In [1]:
import pandas as pd
import numpy as np
import pickle
from django.db.models import F, ExpressionWrapper, Expression, DurationField
from django.contrib.gis.db.models.functions import AsGeoJSON
from ast import literal_eval
from sklearn import preprocessing

In [2]:
# Load Census block-group-level data (joined to listings on `block_group_id` later).
# NOTE(review): pickle.load can execute arbitrary code -- only load pickles
# produced by this project / a trusted source.
# The `with` block guarantees the file handle is closed after loading.
with open('../pickles/census_data_block_groups_gap_filled_ml.p', 'rb') as census_file:
    BLOCK_GROUP_DATA = pickle.load(census_file)

In [3]:
# Load pre-saved extracted listing data.
# NOTE(review): pickle.load can execute arbitrary code -- only load trusted pickles.
# Each file is opened in a `with` block so its handle is closed once loaded.
with open('../pickles/listings_dataframe.p', 'rb') as listings_file:
    listing_df = pickle.load(listings_file)
with open('../pickles/listings_extra_df.p', 'rb') as extras_file:
    extra_listing_data_df = pickle.load(extras_file)

# Merge the listing data into one frame (left join: every row of the base
# listing frame is kept even when it has no match in the extra frame)
listing_df = pd.merge(listing_df, extra_listing_data_df, on='id', how='left')

# Drop the description column, since we aren't using text features now.
# Reassigning (rather than inplace=True) keeps each step an explicit new frame.
listing_df = listing_df.drop('description', axis=1)

# Let's drop some more columns to see if we need them or not
drop_cols = ['review_scores_rating', 'review_scores_accuracy',
             'review_scores_cleanliness', 'review_scores_checkin',
             'review_scores_communication', 'review_scores_location',
             'review_scores_value', 'require_guest_phone_verification',
             'require_guest_profile_picture', 'instant_bookable',
             'host_is_superhost', 'host_identity_verified',
             'is_english', 'guests_included', 'extra_people',]
listing_df = listing_df.drop(drop_cols, axis=1)

# listing_df = pd.merge(listing_df, listing_topic_df, on='id', how='left')
listing_df.shape # Should be 26048 length

(26048, 62)

In [5]:
# Select the subset of census block group columns used as model features:
# the join key, four named census variables, and every `percent_*` column.
percent_vars = [name for name in BLOCK_GROUP_DATA.columns
                if name.startswith('percent_')]
base_vars = ('block_group_id', 'B25064_001E', 'B19301_001E',
             'B01003_001E', 'B25001_001E')
census_vars = [name for name in BLOCK_GROUP_DATA.columns
               if name in base_vars or name in percent_vars]

# Work on a copy so the loaded census frame itself is never modified
block_group_df = BLOCK_GROUP_DATA[census_vars].copy()

# Missing values in the percent columns are replaced with 0
percent_cols = list(percent_vars)
block_group_df[percent_cols] = block_group_df[percent_cols].fillna(value=0.0)

In [53]:
# Left-join the census features onto every listing, then discard any row
# that still contains a null value in any column
merged = pd.merge(
    listing_df, block_group_df, on='block_group_id', how='left'
).dropna(axis=0)

# Row filters, all of which must hold for a listing to be kept:
#   * price <= 1000        -- drop price outliers (author's note: massive
#                             boost in model accuracy)
#   * review_count > 0     -- listings with no reviews have prices that
#                             haven't been validated by the market
#   * availability_365 > 0 -- drop listings with 0 days of availability
keep_rows = (
    (merged.price <= 1000)
    & (merged.review_count > 0)
    & (merged.availability_365 > 0)
)

# review_count was only needed for filtering, so it is removed afterwards
merged = merged[keep_rows].drop(['review_count'], axis=1)

merged.shape

(18262, 77)

In [58]:
from sklearn import preprocessing
from sklearn.model_selection import train_test_split

# Feature matrix: everything in `merged` except identifier columns, the
# revenue / reviews-per-month columns, and the target itself (`price`)
non_feature_cols = ['id', 'estimated_revenue_per_month', 'reviews_per_month',
                    'block_group_id', 'tract_id', 'zipcode_id', 'neighborhood_id',
                    'price']
X = merged.drop(non_feature_cols, axis=1).copy()

# Target vector: the listing price
y = merged.price.copy()

# Every non-categorical feature is cast to float64
category_cols = ['room_type', 'property_type', 'bed_type', 'cancellation_policy']
float_cols = [name for name in X.columns if name not in category_cols]
X[float_cols] = X[float_cols].astype(np.float64)

# Fit one LabelEncoder per categorical column. The fitted encoders are kept
# under their own names so the same mapping can be applied to future
# prediction data.
encoders = {name: preprocessing.LabelEncoder().fit(X[name])
            for name in category_cols}
room_type_le = encoders['room_type']
property_type_le = encoders['property_type']
bed_type_le = encoders['bed_type']
cancellation_policy_le = encoders['cancellation_policy']

# Replace each categorical column with its integer-encoded values
for name in category_cols:
    X[name] = encoders[name].transform(X[name])

# Hold out 25% of the rows for evaluation; the seed fixes the split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=31)

In [63]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score, median_absolute_error, mean_absolute_error

# Train a 20-tree random forest on the training split.
# max_features=None lets every split consider all features; n_jobs=-1 uses
# all available cores. random_state is fixed (same value as the train/test
# split seed) so the fitted forest -- and therefore the metrics printed
# below -- are reproducible across kernel restarts.
model = RandomForestRegressor(n_estimators=20, max_features=None, n_jobs=-1,
                              verbose=1, random_state=31)
model.fit(X_train, y_train)

# Predict on the held-out split and report error metrics
y_predict = model.predict(X_test)
print('r^2: ', r2_score(y_test, y_predict))
print('median absolute error: ', median_absolute_error(y_test, y_predict))
print('mean absolute error: ', mean_absolute_error(y_test, y_predict))

r^2:  0.699118953756
median absolute error:  19.15
mean absolute error:  37.5511607534


[Parallel(n_jobs=-1)]: Done  20 out of  20 | elapsed:    1.4s finished
[Parallel(n_jobs=4)]: Done  20 out of  20 | elapsed:    0.0s finished


In [None]:
# Fit the model on the whole data, and pickle it.
# NOTE: this re-fits the same `model` object, so any cell that reads `model`
# after this one sees the full-data fit, not the train-split fit.
model.fit(X, y)

# Bundle the fitted model with the fitted LabelEncoders so categorical
# inputs can be encoded the same way at prediction time.
model_with_extras = {
    'model': model,
    'room_type_le': room_type_le,
    'property_type_le': property_type_le,
    'bed_type_le': bed_type_le,
    'cancellation_policy_le': cancellation_policy_le
}

# The `with` block guarantees the pickle is flushed and the handle closed
with open('../pickles/bg_price_model_with_extras.p', 'wb') as model_file:
    pickle.dump(model_with_extras, model_file)

In [52]:
# Spot-check predictions against actual values: every 50th test row,
# starting at position 3 and stopping before position 1000.
# Each printed line is "<predicted>, <actual>", rounded to whole numbers.

for row in range(3, 1000, 50):
    predicted_price = y_predict[row]
    actual_price = y_test.iloc[row]
    print('%.0f,' % predicted_price, '%.0f' % actual_price)

117, 29
74, 80
155, 199
136, 140
79, 85
49, 49
69, 70
64, 69
214, 125
177, 159
398, 260
136, 134
262, 239
120, 115
148, 115
282, 299
209, 98
179, 125
110, 85
95, 99


In [None]:
# Show which features were important to the model.
# The frame is built column-wise from a dict so `importance` keeps its
# numeric dtype. (Building it from a list of rows and transposing mixes
# strings and floats, which leaves both columns as object dtype.)
feat_imp = pd.DataFrame(
    {
        'variable': list(X.columns),
        'importance': model.feature_importances_,
    },
    columns=['variable', 'importance'],
)

# Temporarily lift the row display limit so every feature is printed,
# most important first
with pd.option_context('display.max_rows', None):
    print(feat_imp.sort_values('importance', ascending=False))



In [62]:
# Side-by-side table of actual vs. predicted values for the test split,
# with the signed error (predict - actual) and its absolute value
compare = pd.DataFrame(
    np.column_stack([np.asarray(y_test), np.asarray(y_predict)]),
    columns=['actual', 'predict'],
)
compare['err'] = compare['predict'] - compare['actual']
compare['abs_err'] = compare['err'].abs()

# Summary statistics of all four columns
compare.describe()

Unnamed: 0,actual,predict,err,abs_err
count,4566.0,4566.0,4566.0,4566.0
mean,145.260403,147.948581,2.688178,41.415584
std,124.929805,105.831281,71.502265,58.345312
min,16.0,37.644603,-822.247834,0.002323
25%,70.0,74.782752,-14.952624,12.0656
50%,105.0,112.688402,9.753591,23.99173
75%,175.0,163.88145,30.688402,48.180715
max,1000.0,747.002786,360.71786,822.247834


In [41]:
# Summary statistics for the listings whose `availability_365` is exactly 0
# (the rows that the modelling filter `availability_365 > 0` excludes)
listing_df.loc[listing_df['availability_365'].eq(0)].describe()

Unnamed: 0,accommodates,availability_365,bathrooms,bedrooms,block_group_id,estimated_revenue_per_month,host_experience_days,id,latitude,longitude,minimum_nights,neighborhood_id,price,tract_id,zipcode_id,review_count,reviews_per_month
count,3038.0,3038.0,3038.0,3038.0,3038.0,3038.0,3038.0,3038.0,3038.0,3038.0,3038.0,3038.0,3038.0,3038.0,3038.0,3038.0,3038.0
mean,2.833772,0.0,1.278802,1.270902,11123.257735,364.513608,777.702436,8548736.0,34.056291,-118.344903,3.379526,146.925938,156.312377,4169.233377,16154.199803,2.997038,0.372367
std,1.780428,0.0,0.653088,0.856909,6722.040458,821.199479,540.711686,3629676.0,0.092219,0.118607,9.307384,74.904651,301.497838,2409.132327,9436.241424,11.485642,0.806893
min,1.0,0.0,0.0,0.0,6.0,0.0,8.0,24345.0,33.340521,-118.840689,1.0,2.0,0.0,94.0,42.0,0.0,0.0
25%,2.0,0.0,1.0,1.0,5393.0,0.0,346.0,6156894.0,34.023507,-118.424883,1.0,78.0,70.0,1936.0,10290.0,0.0,0.0
50%,2.0,0.0,1.0,1.0,10738.5,45.443573,679.0,8799094.0,34.067566,-118.354025,2.0,147.0,100.0,5027.0,16826.0,1.0,0.070007
75%,4.0,0.0,1.0,1.0,15913.0,368.514404,1128.5,11777950.0,34.102358,-118.281531,3.0,214.0,150.0,6219.0,23780.75,2.0,0.379883
max,16.0,0.0,8.0,6.0,23194.0,17100.0,2774.0,14284300.0,34.675951,-117.705117,365.0,272.0,10000.0,8007.0,32389.0,322.0,8.929688
