In [37]:
import pandas as pd
import numpy as np
import pickle
from django.db.models import F, ExpressionWrapper, Expression, DurationField
from django.contrib.gis.db.models.functions import AsGeoJSON
from ast import literal_eval
from sklearn import preprocessing

In [38]:
# Load Census tract-level data.
# The context manager closes the file handle as soon as the pickle is read.
# NOTE: pickle.load can execute arbitrary code -- only load trusted files.
with open('../pickles/census_data_ml.p', 'rb') as census_file:
    TRACT_DATA = pickle.load(census_file)

In [66]:
# Load pre-saved extracted listing data.
# Context managers close each pickle file once it has been read.
# NOTE: pickle.load can execute arbitrary code -- only load trusted files.
with open('../pickles/listings_dataframe.p', 'rb') as listings_file:
    listing_df = pickle.load(listings_file)
with open('../pickles/listings_extra_df.p', 'rb') as extras_file:
    extra_listing_data_df = pickle.load(extras_file)
# listing_topic_df = pickle.load(open('../pickles/listing_topic_df.p', 'rb'))

# Merge the listing data into one frame
listing_df = pd.merge(listing_df, extra_listing_data_df, on='id', how='left')

# Columns removed before modelling:
#  - 'description' is dropped since we aren't using text features now
#  - the rest are dropped to see if we need them or not
drop_cols = ['review_scores_rating', 'review_scores_accuracy',
             'review_scores_cleanliness', 'review_scores_checkin',
             'review_scores_communication', 'review_scores_location',
             'review_scores_value', 'require_guest_phone_verification',
             'require_guest_profile_picture', 'instant_bookable',
             'host_is_superhost', 'host_identity_verified',
             'is_english', 'guests_included', 'extra_people'
            ]
# Single non-mutating drop instead of two in-place drops.
listing_df = listing_df.drop(['description'] + drop_cols, axis=1)

# listing_df = pd.merge(listing_df, listing_topic_df, on='id', how='left')
listing_df.shape # Should be 26048 length

(26048, 62)

In [67]:
# Select the census tract-level features to join onto the listings:
# the tract key, four named ACS columns, and every 'percent_*' column.
base_census_cols = ('tract_id', 'B25064_001E', 'B19301_001E', 'B01003_001E', 'B25001_001E')
percent_vars = [name for name in TRACT_DATA.columns if name.startswith('percent_')]
census_vars = [name for name in TRACT_DATA.columns
               if name in base_census_cols or name.startswith('percent_')]

tract_df = TRACT_DATA[census_vars].copy()

# Missing values in the percent columns are replaced with 0
percent_cols = list(percent_vars)
tract_df[percent_cols] = tract_df[percent_cols].fillna(value=0.0)

In [68]:
# Attach the tract features to every listing (left join on tract_id),
# then discard any row that still has a null value.
merged = pd.merge(listing_df, tract_df, on='tract_id', how='left').dropna(axis=0)

# Keep only rows that pass both filters:
#  - price <= 1000: dropping price outliers gave a massive boost in model accuracy
#  - review_count > 0: prices of unreviewed listings haven't been
#    validated by the market
within_price_cap = merged.price <= 1000
has_reviews = merged.review_count > 0
merged = merged[within_price_cap & has_reviews]

merged.shape

(19838, 78)

In [69]:
from sklearn import preprocessing
from sklearn.model_selection import train_test_split

# Columns removed from `merged` to form the feature matrix; `price` is the target.
non_feature_cols = ['id', 'estimated_revenue_per_month', 'reviews_per_month',
                    'block_group_id', 'tract_id', 'zipcode_id', 'neighborhood_id',
                    'price']
X = merged.drop(non_feature_cols, axis=1).copy()
y = merged.price.copy()

# Everything that is not one of the categorical columns is cast to float64
category_cols = ['room_type', 'property_type', 'bed_type', 'cancellation_policy']
float_cols = [name for name in X.columns if name not in category_cols]
X[float_cols] = X[float_cols].astype(np.float64)

# One fitted LabelEncoder per categorical column; they are kept under their own
# names so the same encoding can be applied to future prediction data.
room_type_le = preprocessing.LabelEncoder().fit(X.room_type)
property_type_le = preprocessing.LabelEncoder().fit(X.property_type)
bed_type_le = preprocessing.LabelEncoder().fit(X.bed_type)
cancellation_policy_le = preprocessing.LabelEncoder().fit(X.cancellation_policy)

# Swap each categorical column for its encoded values
for column, encoder in (('room_type', room_type_le),
                        ('property_type', property_type_le),
                        ('bed_type', bed_type_le),
                        ('cancellation_policy', cancellation_policy_le)):
    X[column] = encoder.transform(X[column])

# 80/20 train/test split with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=6)

In [70]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score, median_absolute_error, mean_absolute_error

# Train a 10-tree random forest on the training split.
# random_state is pinned (same seed as the train/test split) so the fitted
# forest -- and therefore the metrics printed below -- are reproducible across
# runs; without it every re-run yields different trees and scores.
model = RandomForestRegressor(n_estimators=10, max_features=None, n_jobs=-1,
                              verbose=1, random_state=6)
model.fit(X_train, y_train)

# Predict on the held-out split and report fit quality
y_predict = model.predict(X_test)
print('r^2: ', r2_score(y_test, y_predict))
print('median absolute error: ', median_absolute_error(y_test, y_predict))
print('mean absolute error: ', mean_absolute_error(y_test, y_predict))

r^2:  0.71627961207
median absolute error:  19.75
mean absolute error:  37.7282762097


[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed:    1.0s finished
[Parallel(n_jobs=4)]: Done  10 out of  10 | elapsed:    0.0s finished


In [71]:
# Fit the model on the whole data, and pickle it.
# Note: this refits the same `model` object in place, so from here on `model`
# is the full-data forest, not the one evaluated on the test split.
model.fit(X, y)

# Bundle the model with the fitted label encoders so that prediction code can
# encode categorical inputs the same way as the training data.
model_with_extras = {
    'model': model,
    'room_type_le': room_type_le,
    'property_type_le': property_type_le,
    'bed_type_le': bed_type_le,
    'cancellation_policy_le': cancellation_policy_le
}

# The context manager guarantees the file is flushed and closed, so the pickle
# on disk is complete as soon as this cell finishes.
with open('../pickles/price_model_with_extras.p', 'wb') as model_file:
    pickle.dump(model_with_extras, model_file)

[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed:    1.2s finished


In [21]:
# Spot-check every 50th test row (positions 1, 51, ..., 951):
# print the predicted price next to the actual price.
for row in range(1, 1000, 50):
    print('%.0f,' % y_predict[row], '%.0f' % y_test.iloc[row])

85, 60
725, 1000
386, 300
65, 90
184, 214
119, 115
146, 99
103, 57
74, 70
126, 199
408, 225
49, 46
147, 99
78, 60
419, 350
112, 100
68, 65
206, 175
52, 50
112, 119


In [206]:
# Pair every feature name with the importance the fitted model assigned to it
importance_rows = [X.columns, model.feature_importances_]
feat_imp = pd.DataFrame(importance_rows).T
feat_imp.columns = ['variable', 'importance']

# Most important features first; lift the row limit so the full table prints
ranked_importances = feat_imp.sort_values('importance', ascending=False)
with pd.option_context('display.max_rows', None):
    print(ranked_importances)



                            variable   importance
45                          bedrooms     0.436066
43                         bathrooms     0.125492
70                       B19301_001E    0.0618857
53                         room_type    0.0398884
42                  availability_365    0.0281551
50                         longitude    0.0225835
0                       accommodates    0.0203928
57                      extra_people     0.019021
49                          latitude    0.0133106
79       percent_bachelors_or_higher    0.0132785
72                       B25064_001E    0.0123709
46              host_experience_days    0.0122693
83              percent_homes_vacant    0.0119839
58              review_scores_rating   0.00928815
54                      review_count   0.00912898
74       percent_professional_degree   0.00845524
73                  percent_age_0_17   0.00814184
78                 percent_age_65_up   0.00802194
56                   guests_included   0.00754688


In [163]:
# Side-by-side table of actual vs. predicted prices on the test split,
# with the signed error (predict - actual) and its absolute value.
compare = pd.DataFrame([list(y_test), list(y_predict)]).T
compare.columns = ['actual', 'predict']
compare['err'] = compare['predict'] - compare['actual']
compare['abs_err'] = compare['err'].abs()
compare.describe()

Unnamed: 0,actual,predict,err,abs_err
count,3968.0,3968.0,3968.0,3968.0
mean,143.414567,143.961769,0.547203,35.229486
std,122.678432,105.145599,63.956642,53.379041
min,0.0,20.7,-647.75,0.0
25%,75.0,76.4,-13.4625,7.45
50%,107.0,112.35,3.45,17.825
75%,169.0,171.5375,21.85,39.7125
max,1000.0,814.25,426.25,647.75


In [22]:
# Sample listing used to try out the prediction pipeline.
new_listing = {
    'address': 'compton, ca',
    'availability_365': 365,
    'room_type': 'Private room',
    'property_type': 'Apartment',
    'bed_type': 'Real Bed',
    'accommodates': 1,
    'guests_included': 1,
    'bedrooms': 1,
    'bathrooms': 1,
    'host_experience_days': 720,
    'extra_people': 0,
    'cancellation_policy': 'Flexible',
    'minimum_nights': 1,
}
# Amenity flags amenity_1 .. amenity_44, in order; only 6 and 30 are True.
new_listing.update(
    ('amenity_%d' % number, number in (6, 30)) for number in range(1, 45)
)

from django.contrib.gis.geos import Point
import geocoder
def prep_prediction_features(listing_attrs):
    """Build the one-row feature frame used to predict a new listing's price.

    The listing's address is geocoded and matched to the tract whose polygon
    contains that point. The tract id and coordinates are added to the listing
    attributes, missing listing columns are filled with the medians of
    `listing_df`, the tract-level columns of `tract_df` are joined on, the
    categorical columns are label-encoded with the fitted encoders, and the
    same non-feature columns that are removed from the training matrix are
    dropped.

    Parameters
    ----------
    listing_attrs : dict
        Listing attributes keyed by column name. Must contain 'address'.
        Keys that are not columns of `listing_df` are ignored. The dict is
        not modified.

    Returns
    -------
    pandas.DataFrame
        A single-row frame of model features.

    Raises
    ------
    ValueError
        If the address cannot be geocoded, if no tract contains the geocoded
        point, or if the matching tract has no neighborhood.
    """
    # NOTE(review): `Tract`, `listing_df`, `tract_df` and the `*_le` encoders are
    # read from the notebook's global scope; `Tract` is not imported in any of
    # the visible cells -- confirm where it comes from.
    address = listing_attrs['address']
    g = geocoder.google(address)
    if not g.ok:
        # Previously fell through and failed later while building the Point.
        raise ValueError('Could not geocode address: %r' % (address,))

    point = Point(x=g.lng, y=g.lat, srid=4326)
    # .first() returns None when no tract matches, so one query covers both
    # the existence check and the fetch.
    tract = Tract.objects.filter(mpoly__contains=point).first()
    if tract is None:
        raise ValueError('No census tract contains address: %r' % (address,))
    if not tract.neighborhood_id:
        # A tract without a neighborhood is treated as outside the supported
        # area (the original placeholder comment called this "LA only").
        raise ValueError('Address is outside the supported area: %r' % (address,))

    # Fill in the tract, lat & lon on a copy so the caller's dict is untouched
    attrs = dict(listing_attrs)
    attrs['tract_id'] = tract.id
    attrs['latitude'] = g.lat
    attrs['longitude'] = g.lng

    df = pd.DataFrame(data=[attrs], columns=listing_df.columns)
    # numeric_only=True: medians are only defined for the numeric columns, and
    # current pandas raises if non-numeric columns are included.
    df = df.fillna(listing_df.median(numeric_only=True))
    df = pd.merge(df, tract_df, on='tract_id', how='left')

    # Encode categoricals with the encoders fitted on the training data
    df['property_type'] = property_type_le.transform(df.property_type)
    df['room_type'] = room_type_le.transform(df.room_type)
    df['bed_type'] = bed_type_le.transform(df.bed_type)
    df['cancellation_policy'] = cancellation_policy_le.transform(df.cancellation_policy)

    # Remove the columns that are not model features
    return df.drop(['block_group_id', 'estimated_revenue_per_month',
                    'id', 'neighborhood_id', 'price', 'reviews_per_month',
                    'tract_id', 'zipcode_id'], axis=1)

In [29]:
# Earlier in-notebook approach, kept for reference (disabled):
# from treeinterpreter import treeinterpreter as ti
# df=prep_prediction_features(new_listing)
# prediction, bias, contributions = ti.predict(model, df.iloc[0].values.reshape(1,-1))
# print(prediction, bias)
# sorted(zip(df.columns, contributions[0]), key=lambda x: -abs(x[1]))

from api.predict import predict_price

# Run the packaged predictor on the sample listing and keep only the
# 'decomposition' entry of its result.
price_result = predict_price(new_listing)
contrib = price_result['decomposition']


In [31]:
# First element of every decomposition entry (the feature name)
[entry[0] for entry in contrib]

['bedrooms',
 'room_type',
 'availability_365',
 'B19301_001E',
 'accommodates',
 'percent_homes_vacant',
 'percent_masters_degree',
 'longitude',
 'percent_age_65_up',
 'percent_bachelors_degree',
 'latitude',
 'minimum_nights',
 'extra_people',
 'percent_age_0_17',
 'amenity_2',
 'review_count',
 'percent_age_18_34',
 'B25001_001E',
 'percent_age_35_49',
 'amenity_12',
 'amenity_15',
 'amenity_1',
 'amenity_17',
 'amenity_8',
 'percent_associate_degree',
 'guests_included',
 'B25064_001E',
 'amenity_4',
 'cancellation_policy',
 'amenity_9',
 'percent_age_50_64',
 'amenity_29',
 'percent_bachelors_or_higher',
 'host_experience_days',
 'amenity_38',
 'percent_doctoral_degree',
 'bed_type',
 'percent_professional_degree',
 'bathrooms',
 'amenity_37',
 'amenity_32',
 'amenity_3',
 'B01003_001E',
 'amenity_18',
 'amenity_40',
 'amenity_10',
 'amenity_11',
 'amenity_13',
 'amenity_14',
 'amenity_16',
 'amenity_19',
 'amenity_20',
 'amenity_21',
 'amenity_22',
 'amenity_23',
 'amenity_24',


In [55]:
# Print each distinct guests_included value in ascending order.
# NOTE(review): 'guests_included' is in the drop list of the loading cell, so
# this only runs while listing_df still has that column.
for guest_count in sorted(listing_df['guests_included'].unique()):
    print(guest_count)

0.0
1.0
2.0
3.0
4.0
5.0
6.0
7.0
8.0
9.0
10.0
11.0
12.0
13.0
14.0
15.0
16.0


In [58]:
# Listings whose guests_included value is 0
listing_df.loc[listing_df['guests_included'] == 0]

AttributeError: 'DataFrame' object has no attribute 'description'

In [75]:
# Load the saved list of listing columns; the context manager closes the file.
# NOTE: pickle.load can execute arbitrary code -- only load trusted files.
with open('../pickles/listing_columns_ml.p', 'rb') as columns_file:
    LISTING_COLUMNS = pickle.load(columns_file)

In [77]:
# Remove the two columns that are no longer used from the column list
excluded_columns = ('extra_people', 'guests_included')
LISTING_COLUMNS = [name for name in LISTING_COLUMNS if name not in excluded_columns]