In [82]:
import pandas as pd
import numpy as np
import pickle
from django.db.models import F, ExpressionWrapper, Expression, DurationField
from django.contrib.gis.db.models.functions import AsGeoJSON
from ast import literal_eval
from sklearn import preprocessing

In [83]:
# Load Census tract-level data. The context manager closes the file handle as
# soon as the pickle has been read instead of leaving it open until GC.
# NOTE(review): pickle.load can execute arbitrary code; only load pickles
# produced by this project.
with open('../pickles/census_data_ml.p', 'rb') as tract_file:
    TRACT_DATA = pickle.load(tract_file)

In [84]:
# Load pre-saved extracted listing data. Each file is opened in a context
# manager so its handle is closed as soon as the pickle has been read.
# NOTE(review): pickle.load can execute arbitrary code; only load pickles
# produced by this project.
with open('../pickles/listings_dataframe.p', 'rb') as listings_file:
    listing_df = pickle.load(listings_file)
with open('../pickles/listings_extra_df.p', 'rb') as extras_file:
    extra_listing_data_df = pickle.load(extras_file)
# listing_topic_df = pickle.load(open('../pickles/listing_topic_df.p', 'rb'))

# Merge the listing data into one frame
listing_df = pd.merge(listing_df, extra_listing_data_df, on='id', how='left')

# Drop the description column, since we aren't using text features now.
# Reassigning (rather than inplace=True) leaves the unpickled frame untouched.
listing_df = listing_df.drop('description', axis=1)

# listing_df = pd.merge(listing_df, listing_topic_df, on='id', how='left')
listing_df.shape # Should be 26048 length

(26048, 81)

In [85]:
# Pick the tract-level features to attach to each listing: the tract key, four
# raw Census variables, and every derived "percent_" column.
percent_vars = [name for name in TRACT_DATA.columns if name.startswith('percent_')]
census_vars = [
    name for name in TRACT_DATA.columns
    if name in percent_vars
    or name in ('tract_id', 'B25064_001E', 'B19301_001E', 'B01003_001E', 'B25001_001E')
]

# Work on a copy so the unpickled Census frame is left untouched
tract_df = TRACT_DATA[census_vars].copy()

# A missing percentage is treated as 0
percent_cols = list(percent_vars)
tract_df[percent_cols] = tract_df[percent_cols].fillna(0.0)

In [86]:
# Attach the tract features to every listing, then discard any row that still
# has a null value.
merged = (
    listing_df
    .merge(tract_df, on='tract_id', how='left')
    .dropna(axis=0)
)

# Keep only listings priced at 1000 or less: removing these price outliers
# gives a massive boost in model accuracy.
merged = merged.loc[merged['price'] <= 1000]

merged.shape

(25538, 97)

In [87]:
from sklearn import preprocessing
from sklearn.model_selection import train_test_split

# Target vector: the listing price
y = merged['price'].copy()

# Feature matrix: everything except identifiers, the target itself, and the
# revenue / reviews-per-month columns
X = merged.drop(
    [
        'id', 'estimated_revenue_per_month', 'reviews_per_month',
        'block_group_id', 'tract_id', 'zipcode_id', 'neighborhood_id',
        'price',
    ],
    axis=1,
).copy()

# Every non-categorical column is cast to float64
category_cols = ['room_type', 'property_type', 'bed_type']
float_cols = [name for name in X.columns if name not in category_cols]
X[float_cols] = X[float_cols].astype(np.float64)

# One LabelEncoder per categorical column. The fitted encoders are kept as
# named variables so the same mapping can be applied to future prediction data.
room_type_le = preprocessing.LabelEncoder()
property_type_le = preprocessing.LabelEncoder()
bed_type_le = preprocessing.LabelEncoder()

# Fit each encoder on its column and swap the strings for integer codes
X['room_type'] = room_type_le.fit_transform(X['room_type'])
X['property_type'] = property_type_le.fit_transform(X['property_type'])
X['bed_type'] = bed_type_le.fit_transform(X['bed_type'])

# Hold out a third of the rows for evaluation (fixed seed for a stable split)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.33, random_state=42
)

In [88]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score, median_absolute_error, mean_absolute_error

# Train. The forest is seeded (same seed as the train/test split) so that the
# metrics below and the model pickled later are reproducible across
# Restart & Run All; without random_state every run bootstraps differently.
model = RandomForestRegressor(n_estimators=30, max_features=None, n_jobs=-1,
                              verbose=1, random_state=42)
model.fit(X_train, y_train)

# Predict/evaluate on the held-out third
y_predict = model.predict(X_test)
print('r^2: ', r2_score(y_test, y_predict))
print('median absolute error: ', median_absolute_error(y_test, y_predict))
print('mean absolute error: ', mean_absolute_error(y_test, y_predict))

r^2:  0.684573682439
median absolute error:  20.45
mean absolute error:  40.509776934


[Parallel(n_jobs=-1)]: Done  30 out of  30 | elapsed:    3.5s finished
[Parallel(n_jobs=4)]: Done  30 out of  30 | elapsed:    0.0s finished


In [89]:
# Fit the model on the whole data, and pickle it together with the fitted
# label encoders needed to encode future prediction inputs.
# Note: this refits `model` in place, so from here on it has seen the test rows.
model.fit(X, y)
model_with_extras = {
    'model': model,
    'room_type_le': room_type_le,
    'property_type_le': property_type_le,
    'bed_type_le': bed_type_le
}

# The context manager guarantees the file is flushed and closed once the dump
# finishes, rather than relying on garbage collection of an anonymous handle.
with open('../pickles/price_model_with_extras.p', 'wb') as model_file:
    pickle.dump(model_with_extras, model_file)

[Parallel(n_jobs=-1)]: Done  30 out of  30 | elapsed:    5.1s finished


In [94]:
# Spot-check predicted vs. actual price for every 50th held-out row among the
# first 1000 (positions 1, 51, 101, ...)
for row in range(1, 1000, 50):
    print('%.0f,' % y_predict[row], '%.0f' % y_test.iloc[row])

186, 145
136, 250
198, 199
171, 120
68, 75
83, 56
128, 80
111, 110
511, 275
98, 99
180, 180
63, 65
133, 95
111, 150
70, 49
110, 85
144, 125
158, 75
76, 75
406, 330


In [None]:
# Show which features were important to the model.
# The frame is built column-wise from a dict so `importance` keeps its float
# dtype; stacking the two sequences as rows and transposing would leave both
# columns as generic objects. `columns=` pins the column order explicitly.
feat_imp = pd.DataFrame(
    {
        'variable': list(X.columns),
        'importance': model.feature_importances_,
    },
    columns=['variable', 'importance'],
)

# Lift the row-display limit only for this printout so every feature is listed
with pd.option_context('display.max_rows', None):
    print(feat_imp.sort_values('importance', ascending=False))



In [97]:
# Put actual and predicted test prices side by side, then summarise the signed
# error (predict - actual) and its absolute value
compare = pd.DataFrame([list(y_test), list(y_predict)]).T
compare.columns = ['actual', 'predict']
compare['err'] = compare['predict'] - compare['actual']
compare['abs_err'] = compare['err'].abs()
compare.describe()

Unnamed: 0,actual,predict,err,abs_err
count,8428.0,8428.0,8428.0,8428.0
mean,147.185335,149.663827,2.478492,40.394589
std,132.446583,113.289574,74.216397,62.308112
min,14.0,24.033333,-894.066667,0.0
25%,70.0,78.7,-12.933333,9.0
50%,102.0,113.466667,6.8,20.933333
75%,175.0,174.908333,26.033333,45.133333
max,1000.0,910.633333,568.6,894.066667


In [80]:
# Sample listing used to exercise the prediction path
new_listing = {
    'address': 'Pacific Promenade, Playa Vista, CA',
    'availability_365': 365,
    'room_type': 'Private room',
    'property_type': 'Apartment',
    'bed_type': 'Real Bed',
    'accommodates': 2,
    'guests_included': 2,
    'host_experience_days': 360,
    'extra_people': 0,
    'review_count': 0,
    'minimum_nights': 1,
}
# Amenity flags amenity_1 .. amenity_44, appended in order; only amenity_6 is set
new_listing.update(('amenity_%d' % n, n == 6) for n in range(1, 45))

from django.contrib.gis.geos import Point
import geocoder
def prep_prediction_features(listing_attrs):
    """Build the one-row feature frame for predicting the price of a new listing.

    Geocodes ``listing_attrs['address']``, finds the tract whose polygon
    contains that point, fills attributes that were not supplied with the
    numeric medians of ``listing_df``, joins the tract-level columns from
    ``tract_df`` and label-encodes the three categorical columns with the
    encoders fitted on the training data.

    Args:
        listing_attrs: dict of listing attributes. Must contain 'address'; it
            should also carry 'room_type', 'property_type' and 'bed_type',
            since those are passed to the label encoders. The dict itself is
            not modified.

    Returns:
        A pandas DataFrame with the identifier, price, revenue and
        reviews-per-month columns removed (the same columns that are dropped
        when the training features are built).

    Raises:
        ValueError: if the address cannot be geocoded, if no tract contains
            the geocoded point, or if the tract has no neighborhood. The label
            encoders may also raise for categories they were not fitted on.
    """
    # NOTE(review): `Tract` is not imported in the visible notebook cells;
    # presumably a Django model available in the kernel - confirm.
    address = listing_attrs['address']

    g = geocoder.google(address)
    if not g.ok:
        raise ValueError('Could not geocode address: %r' % (address,))

    point = Point(x=g.lng, y=g.lat, srid=4326)
    # .first() yields None when nothing matches, so one query covers both the
    # existence check and the lookup.
    tract = Tract.objects.filter(mpoly__contains=point).first()
    if tract is None:
        raise ValueError('No tract contains address: %r' % (address,))
    if not tract.neighborhood_id:
        # Per the original placeholder comment, this case means the address
        # is outside the supported (LA-only) area.
        raise ValueError('Address is outside the supported area: %r' % (address,))

    # Fill in the tract, lat & lon on a copy so the caller's dict is untouched
    attrs = dict(listing_attrs)
    attrs['tract_id'] = tract.id
    attrs['latitude'] = g.lat
    attrs['longitude'] = g.lng

    df = pd.DataFrame(data=[attrs], columns=listing_df.columns)
    # numeric_only=True: listing_df also holds string columns, for which a
    # median is undefined (recent pandas raises instead of skipping them).
    df = df.fillna(listing_df.median(numeric_only=True))
    df = pd.merge(df, tract_df, on='tract_id', how='left')

    # Encode categoricals with the encoders fitted on the training data
    df['property_type'] = property_type_le.transform(df.property_type)
    df['room_type'] = room_type_le.transform(df.room_type)
    df['bed_type'] = bed_type_le.transform(df.bed_type)

    # Drop the same non-feature columns that are removed from the training set
    return df.drop(['block_group_id', 'estimated_revenue_per_month',
                    'id', 'neighborhood_id', 'price', 'reviews_per_month',
                    'tract_id', 'zipcode_id'], axis=1)

In [81]:
# Earlier in-notebook approach, kept for reference: build the feature row with
# prep_prediction_features and call the in-memory model directly.
# df=prep_prediction_features(new_listing)
# model.predict(df.iloc[0].values.reshape(1,-1))[0]

# Run the sample listing through the project's API helper instead.
# NOTE(review): presumably predict_price loads the pickled model bundle and
# performs the same feature preparation - confirm in api/predict.py.
from api.predict import predict_price
predict_price(new_listing)


[Parallel(n_jobs=4)]: Done  30 out of  30 | elapsed:    0.0s finished


{'predicted_price': 127.36666666666666}

In [None]:
"""
from sklearn import preprocessing
from sklearn.model_selection import train_test_split

# Build feature and target vectors
X = merged.drop(['id', 'estimated_revenue_per_month', 'reviews_per_month',
                 'block_group_id', 'tract_id', 'zipcode_id', 'neighborhood_id',
                 'price'], axis=1).copy()

y = merged.price.copy()

# Convert numeric types to floats
category_cols = ['room_type', 'property_type', 'bed_type',]
float_cols = [col for col in list(X.columns) if col not in category_cols]
X[float_cols] = X[float_cols].astype(np.float64)

# Transform 

X = pd.get_dummies(X, columns=category_cols)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)
"""