In [378]:
import pandas as pd
import numpy as np
import pickle
from django.db.models import F, ExpressionWrapper, Expression, DurationField
from django.contrib.gis.db.models.functions import AsGeoJSON
from ast import literal_eval
from sklearn import preprocessing

In [388]:
# Unpack 1D vector series into columns
def unpack_vectors(vector_series, vector_length=300):
    """Expand a Series of vectors into one column per vector component.

    Parameters
    ----------
    vector_series : pandas.Series
        Each element must be an indexable sequence holding at least
        ``vector_length`` entries.
    vector_length : int, default 300
        Number of leading components to unpack from every vector.

    Returns
    -------
    pandas.DataFrame
        Shares the index of ``vector_series`` and has the columns
        ``description_vec_0`` ... ``description_vec_<vector_length - 1>``.

    Raises
    ------
    IndexError
        If a list/array element is shorter than ``vector_length``.
    """
    column_names = ['description_vec_%d' % i for i in range(vector_length)]
    # Collect every row first and construct the frame once, instead of
    # inserting columns one at a time with a full pass over the Series each.
    rows = [[vector[i] for i in range(vector_length)] for vector in vector_series]
    return pd.DataFrame(rows, columns=column_names, index=vector_series.index)

In [389]:
# Load Census tract-level data.
# NOTE: pickle.load can execute arbitrary code while deserializing, so only
# load pickle files that were produced by this project.
# The context manager guarantees the file handle is closed after loading.
with open('../pickles/census_data_ml.p', 'rb') as tract_file:
    TRACT_DATA = pickle.load(tract_file)

In [441]:
# Load pre-saved extracted listing data.
# NOTE: pickle.load can execute arbitrary code while deserializing, so only
# load pickle files that were produced by this project.
# The context managers guarantee both file handles are closed after loading.
with open('../pickles/listings_dataframe.p', 'rb') as listings_file:
    listing_df = pickle.load(listings_file)
with open('../pickles/listings_extra_df.p', 'rb') as extras_file:
    extra_listing_data_df = pickle.load(extras_file)
# listing_topic_df = pickle.load(open('../pickles/listing_topic_df.p', 'rb'))

# Merge the listing data into one frame (left join keeps every base listing)
listing_df = pd.merge(listing_df, extra_listing_data_df, on='id', how='left')

# Drop the description-related stuff (no text features).
# Reassigning instead of inplace=True keeps each step's lineage explicit.
listing_df = listing_df.drop(['description', 'description_vec'], axis=1)

# Unpack the doc vectors and add back into dataframe; then drop the vector column
# unpacked = unpack_vectors(listing_df.description_vec)
# listing_df[unpacked.columns] = unpacked
# listing_df.drop('description_vec', axis=1, inplace=True)

# Let's drop some more columns to see if we need them or not
drop_cols = ['review_scores_rating', 'review_scores_accuracy',
             'review_scores_cleanliness', 'review_scores_checkin',
             'review_scores_communication', 'review_scores_location',
             'review_scores_value', 'require_guest_phone_verification',
             'require_guest_profile_picture', 'instant_bookable',
             'host_is_superhost', 'host_identity_verified',
             'is_english', 'guests_included', 'extra_people',
             'estimated_revenue_per_month', 'reviews_per_month',
             'block_group_id', 'zipcode_id', 'neighborhood_id',
            ]
listing_df = listing_df.drop(drop_cols, axis=1)

# listing_df = pd.merge(listing_df, listing_topic_df, on='id', how='left')
listing_df.shape # Should be 26048 length

(26048, 57)

In [442]:
# Select the census tract-level features to attach to the listings:
# the tract key, four named ACS variables, and every 'percent_*' column.
base_census_cols = ('tract_id', 'B25064_001E', 'B19301_001E', 'B01003_001E', 'B25001_001E')
percent_vars = [name for name in TRACT_DATA.columns if name.startswith('percent_')]
census_vars = [name for name in TRACT_DATA.columns
               if name in base_census_cols or name in percent_vars]

# Work on a copy so the fill below never touches TRACT_DATA itself
tract_df = TRACT_DATA[census_vars].copy()

# Missing values in the percent columns are replaced with 0
percent_cols = list(percent_vars)
tract_df[percent_cols] = tract_df[percent_cols].fillna(value=0.0)

In [443]:
# Attach the tract-level features to every listing (left join on tract_id)
merged = pd.merge(listing_df, tract_df, on='tract_id', how='left')

# Keep only rows that have no null values in any column
merged = merged.dropna(axis=0)

# Row filters, applied together:
#  * price <= 1000 removes price outliers (noted by the author as a
#    massive boost in model accuracy)
#  * review_count >= 5 removes listings with fewer than five reviews,
#    whose prices haven't been validated by the market
keep_rows = (merged.price <= 1000) & (merged.review_count >= 5)
merged = merged[keep_rows]

# review_count was only needed for the filter above, so remove it
merged = merged.drop(['review_count'], axis=1)

merged.shape

(12994, 72)

In [484]:
from sklearn import preprocessing
from sklearn.model_selection import train_test_split

# Lookup table pairing each listing id with its row label in `merged`
# (X carries no id column, so this lets us map rows of X back to listings)
id_to_index_df = pd.DataFrame({'id': list(merged.id), 'X_index': list(merged.index)})

# Features are every column except the identifiers and the target; target is price
X = merged.drop(['id', 'tract_id', 'price'], axis=1).copy()
y = merged['price'].copy()

# Everything that is not a categorical column is cast to float64
category_cols = ['room_type', 'property_type', 'bed_type', 'cancellation_policy']
float_cols = [name for name in X.columns if name not in category_cols]
X[float_cols] = X[float_cols].astype(np.float64)

# Fit one LabelEncoder per categorical column; the fitted encoders are kept
# so that future prediction data can be transformed the same way
encoders = {name: preprocessing.LabelEncoder().fit(X[name]) for name in category_cols}

# Replace each categorical column with its integer-encoded values
for name in category_cols:
    X[name] = encoders[name].transform(X[name])

# Expose the fitted encoders under their individual names
room_type_le = encoders['room_type']
property_type_le = encoders['property_type']
bed_type_le = encoders['bed_type']
cancellation_policy_le = encoders['cancellation_policy']

# 60/40 train/test split with a fixed seed for reproducibility
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.4, random_state=0)

In [446]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import median_absolute_error, mean_absolute_error

# Train a random forest regressor on the training split
forest_params = dict(n_estimators=15, max_features=None, min_samples_leaf=8,
                     n_jobs=-1, verbose=1, random_state=42)
model = RandomForestRegressor(**forest_params)
model.fit(X_train, y_train)

# Report the estimator's score on both splits
train_score = model.score(X_train, y_train)
print('Train score: %s' % train_score)
test_score = model.score(X_test, y_test)
print('Test score: %s' % test_score)

# Predict on the held-out split and report absolute-error metrics
y_predict = model.predict(X_test)
median_err = median_absolute_error(y_test, y_predict)
print('median absolute error: ', median_err)
mean_err = mean_absolute_error(y_test, y_predict)
print('mean absolute error: ', mean_err)

[Parallel(n_jobs=-1)]: Done  15 out of  15 | elapsed:    0.3s finished
[Parallel(n_jobs=4)]: Done  15 out of  15 | elapsed:    0.0s finished
[Parallel(n_jobs=4)]: Done  15 out of  15 | elapsed:    0.0s finished


Train score: 0.837915183871
Test score: 0.743824416441
median absolute error:  18.4781344415
mean absolute error:  33.9354927161


[Parallel(n_jobs=4)]: Done  15 out of  15 | elapsed:    0.0s finished


In [225]:
# Fit the model on the whole data, and pickle it
# model.fit(X, y)
# model_with_extras = {
#     'model': model,
#     'room_type_le': room_type_le,
#     'property_type_le': property_type_le,
#     'bed_type_le': bed_type_le,
#     'cancellation_policy_le': cancellation_policy_le
# }

# pickle.dump(model_with_extras, open('../pickles/price_model_with_extras.p', 'wb'))

In [404]:
# Spot-check predictions against actual prices: every 50th test row,
# starting at position 3 and stopping before position 1000.
# Each line prints "<predicted>, <actual>" rounded to whole numbers.
for pos in range(3, 1000, 50):
    print('%.0f,' % y_predict[pos], '%.0f' % y_test.iloc[pos])

182, 127
27, 24
53, 65
263, 199
157, 199
106, 99
99, 100
81, 65
271, 395
100, 130
70, 70
184, 269
108, 80
122, 70
131, 195
98, 65
460, 249
147, 139
75, 59
176, 165


In [426]:
# Show which features were important to the model.
# Build the frame column-wise so 'importance' keeps a float dtype; stacking
# the names and the importances as two rows and transposing would leave both
# columns with object dtype.
feat_imp = pd.DataFrame({
    'variable': list(X.columns),
    'importance': model.feature_importances_,
}, columns=['variable', 'importance'])

# Temporarily lift the row limit so every feature is printed, most important first
with pd.option_context('display.max_rows', None):
    print(feat_imp.sort_values('importance', ascending=False))



                       variable   importance
45                     bedrooms       0.5301
43                    bathrooms     0.119503
51                    room_type    0.0590341
54                  B19301_001E    0.0490322
63  percent_bachelors_or_higher    0.0284418
48                    longitude    0.0196936
42             availability_365    0.0187893
67         percent_homes_vacant    0.0172215
0                  accommodates    0.0157679
58  percent_professional_degree    0.0141055
56                  B25064_001E    0.0127064
47                     latitude    0.0117314
64            percent_age_18_34   0.00764658
46         host_experience_days   0.00725235
57             percent_age_0_17   0.00680311
61      percent_doctoral_degree   0.00651873
59       percent_masters_degree    0.0064684
65     percent_bachelors_degree   0.00631671
3                    amenity_11   0.00517562
53                  B01003_001E   0.00502131
62            percent_age_65_up   0.00490082
50        

In [376]:
# Side-by-side table of actual vs. predicted prices for the test split
compare = pd.DataFrame([list(y_test), list(y_predict)]).T
compare.columns = ['actual', 'predict']

# Signed error (positive means the model over-predicted) and its magnitude
compare['err'] = compare['predict'] - compare['actual']
compare['abs_err'] = compare['err'].abs()

# Summary statistics for all four columns
compare.describe()

Unnamed: 0,actual,predict,err,abs_err
count,5198.0,5198.0,5198.0,5198.0
mean,142.99673,142.131821,-0.864908,33.935493
std,120.388853,98.462611,60.927223,50.606674
min,19.0,23.308678,-634.423019,0.012946
25%,75.0,78.586329,-14.581719,8.29955
50%,108.0,113.945244,4.013777,18.478134
75%,168.75,167.709842,21.694512,38.203975
max,1000.0,720.202446,369.132274,634.423019


In [685]:
# Find similar listings: set up the scaled feature space, the feature
# weights, the query row, and the row-label -> listing id lookup.
from sklearn.preprocessing import RobustScaler

# Feature importances from the trained model act as per-feature distance weights
weights = model.feature_importances_

# Example query: one row of the test split (positional row 78)
key = X_test.iloc[78]

# Scale every listing, keeping X's column names and row labels
robust_scaler = RobustScaler()
X_scaled = pd.DataFrame(robust_scaler.fit_transform(X), columns=X.columns, index=X.index)

# Scale the query row with the same fitted scaler. It is passed as a one-row
# DataFrame so its column names match the ones the scaler was fitted with.
key = pd.Series(robust_scaler.transform(key.to_frame().T)[0], index=key.index)

# Lookup from X's row labels to listing ids. Later cells use
# `listing_lookup_df`, so derive it here from the id/row-label table built
# alongside X, re-indexed by row label.
listing_lookup_df = id_to_index_df.set_index('X_index')

def get_similar_listings(key, top_n=5, scaled=None, feature_weights=None, lookup=None):
    """Return the ids of the listings closest to ``key`` in weighted feature space.

    Distance is the sum over features of the squared difference between the
    weighted listing values and the weighted ``key`` values.

    Parameters
    ----------
    key : pandas.Series
        Scaled feature values of the query listing, indexed by feature name.
    top_n : int, default 5
        Maximum number of listing ids to return.
    scaled : pandas.DataFrame, optional
        Scaled features of all candidate listings. Defaults to the
        notebook-level ``X_scaled``.
    feature_weights : array-like, optional
        One weight per feature column. Defaults to the notebook-level
        ``weights``.
    lookup : pandas.DataFrame, optional
        Frame indexed by the row labels of ``scaled`` with an ``'id'``
        column. Defaults to the notebook-level ``listing_lookup_df``.

    Returns
    -------
    list
        Listing ids ordered from smallest to largest distance.
    """
    # Fall back to the notebook-level objects when nothing is passed in
    if scaled is None:
        scaled = X_scaled
    if feature_weights is None:
        feature_weights = weights
    if lookup is None:
        lookup = listing_lookup_df

    diffs = (scaled * feature_weights) - (key * feature_weights)
    distances = np.square(diffs).sum(axis=1)
    nearest = distances.sort_values().index[:top_n]
    # DataFrame.get_value was removed in pandas 1.0; .at is the scalar accessor
    return [lookup.at[idx, 'id'] for idx in nearest]

In [695]:
# Print a one-line summary and the Airbnb URL for each listing similar to `key`.
# NOTE(review): `Listing` is not imported in any visible cell; it is presumably
# the project's Django listing model -- confirm and import it explicitly.
for similar_id in get_similar_listings(key):
    match = Listing.objects.get(pk=similar_id)
    summary = '%s ($%.0f) (%s)' % (match.name, match.price, match.neighborhood.name)
    link = 'https://www.airbnb.com/rooms/%d' % match.id
    print(summary, link)

        

HISTORIC LUXE VENICE CARRIAGE HOUSE ($189) (Venice) https://www.airbnb.com/rooms/1661684
Venice Cottage 3 Blocks to Beach ($135) (Venice) https://www.airbnb.com/rooms/1159836
BRIGHT ONE BEDROOM IN PRIME VENICE! ($195) (Venice) https://www.airbnb.com/rooms/11223716
Venice Beach/Abbot Kinney Hideaway ($129) (Venice) https://www.airbnb.com/rooms/5879466
Modern Oasis w/Outdoor Patio, BBQ & Chef's Kitchen ($99) (Venice) https://www.airbnb.com/rooms/3011569


In [700]:
# Bundle everything needed to run the similar-listings lookup outside this
# notebook: the fitted scaler, the row-label -> id lookup, the scaled feature
# matrix, and the per-feature weights.
similar_listings = {
    'scaler': robust_scaler,
    'listing_lookup_df': listing_lookup_df,
    'all_listings_scaled': X_scaled,
    'weights': weights,
}
# The context manager guarantees the file is flushed and closed after the dump
with open('../pickles/similar_listings.p', 'wb') as out_file:
    pickle.dump(similar_listings, out_file)