In [103]:
import pandas as pd
import numpy as np
import pickle
from django.db.models import F, ExpressionWrapper, Expression, DurationField
from django.contrib.gis.db.models.functions import AsGeoJSON
from ast import literal_eval
from sklearn import preprocessing

In [104]:
# Load Census tract-level data.
# NOTE(review): pickle.load can execute arbitrary code while unpickling; only
# load this file if it comes from a trusted source.
# The `with` block closes the file handle as soon as the object is loaded.
with open('../pickles/census_data_ml.p', 'rb') as tract_file:
    TRACT_DATA = pickle.load(tract_file)

In [105]:
# Load pre-saved extracted listing data.
# NOTE(review): pickle.load can execute arbitrary code while unpickling; only
# load these files if they come from a trusted source.
# Each file is opened in a `with` block so its handle is closed right after loading.
with open('../pickles/listings_dataframe.p', 'rb') as listings_file:
    listing_df = pickle.load(listings_file)
with open('../pickles/listings_extra_df.p', 'rb') as extras_file:
    extra_listing_data_df = pickle.load(extras_file)
# listing_topic_df = pickle.load(open('../pickles/listing_topic_df.p', 'rb'))

# Merge the listing data into one frame (left join on 'id')
listing_df = pd.merge(listing_df, extra_listing_data_df, on='id', how='left')

# Drop the description column, since we aren't using text features now
listing_df = listing_df.drop('description', axis=1)

# Let's drop some more columns to see if we need them or not
drop_cols = ['review_scores_rating', 'review_scores_accuracy',
             'review_scores_cleanliness', 'review_scores_checkin',
             'review_scores_communication', 'review_scores_location',
             'review_scores_value', 'require_guest_phone_verification',
             'require_guest_profile_picture', 'instant_bookable',
             'host_is_superhost', 'host_identity_verified',
             'is_english', 'guests_included', 'extra_people',]
listing_df = listing_df.drop(drop_cols, axis=1)

# listing_df = pd.merge(listing_df, listing_topic_df, on='id', how='left')
listing_df.shape # Should be 26048 length

(26048, 62)

In [106]:
# Choose which census columns get joined onto the listings: 'tract_id', four
# 'B*_001E' columns, and every column whose name starts with 'percent_'.
percent_vars = [name for name in TRACT_DATA.columns if name.startswith('percent_')]
always_kept = ('tract_id', 'B25064_001E', 'B19301_001E', 'B01003_001E', 'B25001_001E')
census_vars = [name for name in TRACT_DATA.columns
               if name in always_kept or name in percent_vars]

# Work on a copy so TRACT_DATA itself is left untouched
tract_df = TRACT_DATA[census_vars].copy()

# A missing value in a percent column is replaced with 0
percent_cols = list(percent_vars)
tract_df[percent_cols] = tract_df[percent_cols].fillna(value=0.0)

In [107]:
# Left-join the tract features onto the listings by 'tract_id', then discard
# every row that still has a null in any column.
merged = (
    pd.merge(listing_df, tract_df, on='tract_id', how='left')
    .dropna(axis=0)
)

# Drop price outliers (anything above 1000): massive boost in model accuracy
within_price_cap = merged.price <= 1000
merged = merged[within_price_cap]

# Drop listings with no reviews: their prices haven't been
# validated by the market
has_reviews = merged.review_count > 0
merged = merged[has_reviews]

# review_count was only needed for the filter above, so remove it
merged = merged.drop(['review_count'], axis=1)

merged.shape

(19838, 77)

In [116]:
from sklearn import preprocessing
from sklearn.model_selection import train_test_split

# Feature matrix: everything in `merged` except the columns listed here.
# Target vector: the listing price.
excluded_cols = ['id', 'estimated_revenue_per_month', 'reviews_per_month',
                 'block_group_id', 'tract_id', 'zipcode_id', 'neighborhood_id',
                 'price']
X = merged.drop(excluded_cols, axis=1).copy()
y = merged.price.copy()

# Every non-categorical feature is cast to float64
category_cols = ['room_type', 'property_type', 'bed_type', 'cancellation_policy']
float_cols = [name for name in X.columns if name not in category_cols]
X[float_cols] = X[float_cols].astype(np.float64)

# Fit one LabelEncoder per categorical column. The fitted encoders are kept
# under their own names so the same mapping can be applied to future
# prediction data.
fitted_encoders = {name: preprocessing.LabelEncoder().fit(X[name])
                   for name in category_cols}
room_type_le = fitted_encoders['room_type']
property_type_le = fitted_encoders['property_type']
bed_type_le = fitted_encoders['bed_type']
cancellation_policy_le = fitted_encoders['cancellation_policy']

# Swap each categorical column for its integer-encoded values
for name in category_cols:
    X[name] = fitted_encoders[name].transform(X[name])

# Hold out 20% of the rows for evaluation; fixed seed for a repeatable split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [121]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score, median_absolute_error, mean_absolute_error

# Train a random forest on the training split only
forest_params = dict(n_estimators=12, max_features=None, n_jobs=-1, verbose=1, random_state=42)
model = RandomForestRegressor(**forest_params).fit(X_train, y_train)

# Score the held-out split with each metric in turn
y_predict = model.predict(X_test)
for label, metric in (('r^2: ', r2_score),
                      ('median absolute error: ', median_absolute_error),
                      ('mean absolute error: ', mean_absolute_error)):
    print(label, metric(y_test, y_predict))

r^2:  0.708146216538
median absolute error:  18.875
mean absolute error:  37.0067204301


[Parallel(n_jobs=-1)]: Done  12 out of  12 | elapsed:    0.9s finished
[Parallel(n_jobs=4)]: Done  12 out of  12 | elapsed:    0.0s finished


In [122]:
# Fit the model on the whole data, and pickle it.
# NOTE: after this call `model` has been trained on rows that include the
# held-out test split (X_test came from X), so test-set metrics computed from
# this point on would no longer be an out-of-sample estimate.
model.fit(X, y)

# Bundle the model with the fitted label encoders so that whatever loads the
# pickle can encode the categorical columns the same way before predicting.
model_with_extras = {
    'model': model,
    'room_type_le': room_type_le,
    'property_type_le': property_type_le,
    'bed_type_le': bed_type_le,
    'cancellation_policy_le': cancellation_policy_le
}

# Write inside a `with` block so the file is flushed and closed even if
# pickling raises part-way through.
with open('../pickles/price_model_with_extras.p', 'wb') as model_file:
    pickle.dump(model_with_extras, model_file)

[Parallel(n_jobs=-1)]: Done  12 out of  12 | elapsed:    1.1s finished


In [101]:
# Take a look at some predictions vs. actual values

# Sample every 50th test row starting at position 1, stopping before position
# 1000. The upper bound is capped at the number of predictions so a test split
# with fewer than 1000 rows doesn't raise an IndexError.
last_position = min(1000, len(y_predict))
for i in range(1, last_position, 50):
    predict = y_predict[i]
    # y_test keeps its original index labels, so look it up by position
    actual = y_test.iloc[i]
    print('%.0f,' % predict, '%.0f' % actual)

54, 45
246, 99
95, 125
623, 450
29, 35
30, 32
157, 150
68, 96
66, 60
117, 50
98, 80
88, 65
85, 45
67, 45
103, 97
391, 450
104, 100
254, 199
26, 25
71, 49


In [102]:
# Show which features were important to the model
feat_imp = pd.DataFrame([X.columns, model.feature_importances_]).transpose()
feat_imp.columns = ['variable', 'importance']

with pd.option_context('display.max_rows', None):
    print(feat_imp.sort_values('importance', ascending=False))



                       variable   importance
45                     bedrooms     0.427693
43                    bathrooms     0.139894
54                  B19301_001E    0.0603381
51                    room_type    0.0398591
42             availability_365    0.0318574
48                    longitude    0.0253574
0                  accommodates    0.0214416
47                     latitude    0.0189149
46         host_experience_days    0.0165984
56                  B25064_001E    0.0160242
67         percent_homes_vacant    0.0144172
63  percent_bachelors_or_higher    0.0100832
58  percent_professional_degree   0.00968757
57             percent_age_0_17   0.00968297
49               minimum_nights   0.00850992
62            percent_age_65_up   0.00829772
65     percent_bachelors_degree   0.00823718
61      percent_doctoral_degree   0.00819261
60            percent_age_50_64   0.00802859
66     percent_associate_degree   0.00784407
68            percent_age_35_49   0.00746169
64        

In [63]:
# Put actual and predicted test prices side by side, one row per test listing
paired_rows = [list(y_test), list(y_predict)]
compare = pd.DataFrame(paired_rows).T
compare.columns = ['actual', 'predict']

# Signed error (positive means the model over-predicted) and its magnitude
compare['err'] = compare['predict'] - compare['actual']
compare['abs_err'] = compare['err'].abs()
compare.describe()

Unnamed: 0,actual,predict,err,abs_err
count,4565.0,4565.0,4565.0,4565.0
mean,145.423439,148.132377,2.708938,36.867733
std,128.930469,110.320577,66.971418,55.973058
min,16.0,21.75,-577.8,0.0
25%,73.0,76.45,-11.75,7.5
50%,105.0,115.35,4.75,19.1
75%,170.0,178.65,24.95,40.9
max,1000.0,852.35,440.85,577.8


In [65]:
# Summary statistics for the merged listing/tract table
merged.describe()

Unnamed: 0,accommodates,availability_365,bathrooms,bedrooms,block_group_id,estimated_revenue_per_month,host_experience_days,id,latitude,longitude,...,percent_masters_degree,percent_age_50_64,percent_doctoral_degree,percent_age_65_up,percent_bachelors_or_higher,percent_age_18_34,percent_bachelors_degree,percent_associate_degree,percent_homes_vacant,percent_age_35_49
count,18257.0,18257.0,18257.0,18257.0,18257.0,18257.0,18257.0,18257.0,18257.0,18257.0,...,18257.0,18257.0,18257.0,18257.0,18257.0,18257.0,18257.0,18257.0,18257.0,18257.0
mean,3.409049,250.868708,1.317632,1.288821,11285.662759,1798.038122,876.000548,7483476.0,34.053736,-118.340734,...,0.111028,0.178958,0.022958,0.12961,0.496144,0.298901,0.31702,0.061838,0.082896,0.237141
std,2.296079,121.232367,0.695195,0.888752,6853.458432,1833.998483,574.953243,4303384.0,0.09813,0.144673,...,0.054891,0.049648,0.021623,0.052874,0.188278,0.110232,0.111319,0.02573,0.05118,0.046658
min,1.0,1.0,0.0,0.0,6.0,10.799561,2.0,109.0,33.339159,-118.911026,...,0.0,0.003424,0.0,0.0,0.0,0.054551,0.0,0.0,0.0,0.002634
25%,2.0,146.0,1.0,1.0,5374.0,579.660645,402.0,3869396.0,34.007836,-118.434717,...,0.068221,0.14591,0.006805,0.092082,0.360018,0.223575,0.246933,0.042936,0.049498,0.206462
50%,2.0,315.0,1.0,1.0,11087.0,1322.753906,795.0,7636406.0,34.066943,-118.349569,...,0.117669,0.174187,0.016737,0.123522,0.546922,0.289034,0.336176,0.059944,0.07426,0.23692
75%,4.0,348.0,1.5,2.0,17446.0,2394.0,1276.0,11493530.0,34.102632,-118.273889,...,0.150465,0.208707,0.034355,0.156115,0.649463,0.36449,0.403974,0.078321,0.107186,0.267069
max,16.0,365.0,8.0,10.0,23194.0,21000.0,3070.0,14319080.0,34.7327,-117.655471,...,0.287096,0.433079,0.213798,0.511981,0.842186,0.978667,0.590269,0.196335,0.435424,0.469223
