In [1]:
import pandas as pd
import numpy as np
import pickle
from django.db.models import F, ExpressionWrapper, Expression, DurationField
from django.contrib.gis.db.models.functions import AsGeoJSON
from ast import literal_eval

In [58]:
# Load Census tract-level data
TRACT_DATA = pickle.load(open('../pickles/census_data.p', 'rb'))['dataframe']

In [None]:
# Build a dataframe for all listings that have a non-null tract relation
all_listing_data = pd.DataFrame(extract_listing_data())

In [None]:
# Pull raw listing data out of DB in one swoop
# But exclude listings with a tract not in the census tract dataframe
all_tract_ids_in_df = list(TRACT_DATA.tract_id)
raw_data = (Listing.objects
    .filter(tract_id__in=all_tract_ids_in_df)
    .annotate(point_geojson=AsGeoJSON('point'))
    .annotate(host_experience_days=ExpressionWrapper(
            F('last_scraped') - F('host_since'), 
            output_field=DurationField()))
    ).values('id',
             'host_experience_days', 
             'point_geojson',
             'neighborhood_id',
             'tract_id',
             'zipcode_id',
             'block_group_id',
             'host_is_superhost',
             'host_identity_verified',
             'property_type',
             'room_type',
             'accommodates',
             'bathrooms',
             'bedrooms',
             'bed_type',
             'minimum_nights',
             'price',
             'availability_365',
             'estimated_revenue_per_month')

# Convert QuerySet to plain list of dicts
raw_data = list(raw_data)

# Convert timedeltas to ints; convert geojson points to lat, lon variables;
for item in raw_data:
    # geojson -- > lat, lon
    geojson = literal_eval(item['point_geojson']) # Parse json string
    item['longitude'] = geojson['coordinates'][0]
    item['latitude'] = geojson['coordinates'][1]
    item.pop('point_geojson') # we don't need this anymore    
    # timedelta --> int
    item['host_experience_days'] = item['host_experience_days'].days

# Add amenity fields
all_amenity_ids = Amenity.objects.order_by('id').values_list('id', flat=True)    
for item in raw_data:
    its_amenities = Listing.objects.get(id=item['id']).amenities.values_list('id', flat=True)
    amenity_data = {
        'amenity_%d' % amenity_id: amenity_id in its_amenities 
        for amenity_id in all_amenity_ids
    }
    item.update(amenity_data)

In [244]:
# Load pre-saved extracted listing data
# NOTE: This now includes a lot more data than is extracted above, so don't overwrite it!
listing_df = pickle.load(open('../pickles/listings_dataframe.p', 'rb'))
# listing_topic_df = pickle.load(open('../pickles/listing_topic_df.p', 'rb'))

In [245]:
# Add census tract-level data
census_vars = [col for col in TRACT_DATA.columns
              if col not in ('geoid', 'neighborhood_id', 
                           'state', 'county', 'tract')]
tract_df = TRACT_DATA[census_vars]

In [246]:
# Join the tables
merged=pd.merge(listing_df, tract_df, on='tract_id', how='left')


In [247]:
merged[merged.isnull().any(axis=1)].shape

(1913, 147)

In [248]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn import decomposition
from sklearn.feature_extraction.text import TfidfVectorizer

data = merged.dropna(axis=0).copy()

# Drop outliers: massive performance boost
data = data[data.price <= 1000]

#Keep only listing columns and certain census columns
data = data[[c for c in data.columns 
             if c in listing_df.columns 
             or c in ('B25064_001E', 'B19301_001E', 'B01003_001E', 'B25001_001E') 
             or 'percent' in c]]

X = data[[c for c 
          in list(data.columns) 
          if c not in ('id', 
                        'estimated_revenue_per_month', 
                        'reviews_per_month',
#                         'review_count',
                        'description',
                        'price')]]
y = data[['price']]

# Extract text features
# tf_vec = TfidfVectorizer(analyzer='word',
#                          ngram_range=(1,2),
#                          stop_words='english')
# text_features = tf_vec.fit_transform(data.description)

int_cols = ['id', 'tract_id', 'block_group_id', 'neighborhood_id', 
            'zipcode_id',]
category_cols = ['room_type', 'property_type', 'bed_type']
float_cols = [col for col in list(X.columns) 
              if col not in int_cols and col not in category_cols]
X[float_cols] = X[float_cols].astype(np.float64)
X = pd.get_dummies(X)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=0)

# # Scale features
# standard_scaler = StandardScaler()
# X_train = standard_scaler.fit_transform(X_train)
# X_test = standard_scaler.transform(X_test)

#PCA dimensionality reduction
# pca = decomposition.PCA(n_components=15)
# X_train = pca.fit_transform(X_train)
# X_test = pca.transform(X_test)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self[k1] = value[k2]


In [251]:
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.svm import LinearSVR
from sklearn.linear_model import Ridge, ElasticNet, SGDRegressor
from sklearn.metrics import r2_score, median_absolute_error, mean_absolute_error

# Train
model = RandomForestRegressor(n_estimators=500)
#model = GradientBoostingRegressor(loss='lad', n_estimators=500)
model.fit(X_train, y_train)

# Predict/evaluate
y_predict = model.predict(X_test)
print('r^2: ', r2_score(y_test, y_predict))
print('median absolute error: ', median_absolute_error(y_test, y_predict))
print('mean absolute error: ', mean_absolute_error(y_test, y_predict))



r^2:  0.70036300156
median absolute error:  20.598
mean absolute error:  40.5983903061


In [252]:
# Take a look at some predictions vs. actual values

for i in range(1300,1336):
    predict = y_predict[i]
    actual = y_test.iloc[i]
    print('%.0f,' % predict, '%.0f' % actual)

84, 96
121, 85
201, 185
128, 125
63, 85
139, 110
125, 99
143, 70
42, 43
90, 95
66, 66
62, 62
123, 155
91, 110
129, 99
113, 125
84, 50
110, 150
205, 145
262, 278
200, 160
115, 125
143, 175
90, 75
208, 250
150, 130
73, 60
75, 69
209, 350
64, 109
275, 165
96, 85
120, 56
57, 50
189, 89
62, 60


In [70]:
age_vars = [
    ['B01001_003E', 'B01001_004E', 'B01001_005E', 'B01001_006E', 'B01001_027E',
     'B01001_028E', 'B01001_029E', 'B01001_030E'], # 0-17
    ['B01001_007E', 'B01001_008E', 'B01001_009E', 'B01001_010E', 'B01001_011E', 'B01001_012E',
     'B01001_031E', 'B01001_032E', 'B01001_033E', 'B01001_034E', 'B01001_035E', 'B01001_036E'], # 18-34
    ['B01001_013E', 'B01001_014E', 'B01001_015E', 'B01001_037E', 'B01001_038E', 'B01001_039E'], # 35-49
    ['B01001_016E', 'B01001_017E', 'B01001_018E', 'B01001_019E', 'B01001_040E', 'B01001_041E',
     'B01001_042E', 'B01001_043E'], # 50-64
    ['B01001_020E', 'B01001_021E', 'B01001_022E', 'B01001_023E', 'B01001_024E', 'B01001_025E',
     'B01001_044E', 'B01001_045E', 'B01001_046E', 'B01001_047E', 'B01001_048E', 'B01001_049E'] # 65+
]

stats_meta = {
    'median_age': {
        'total_field': 'B01003_001E',
        'target_fields': ['B01002_001E',],
        'make_percent': False
    },
    'percent_age_0_17': {
        'total_field': 'B01003_001E',
        'target_fields': age_vars[0],
        'make_percent': True
    },
    'percent_age_18_34': {
        'total_field': 'B01003_001E',
        'target_fields': age_vars[1],
        'make_percent': True
    },
    'percent_age_35_49': {
        'total_field': 'B01003_001E',
        'target_fields': age_vars[2],
        'make_percent': True
    },
    'percent_age_50_64': {
        'total_field': 'B01003_001E',
        'target_fields': age_vars[3],
        'make_percent': True
    },
    'percent_age_65_up': {
        'total_field': 'B01003_001E',
        'target_fields': age_vars[4],
        'make_percent': True
    },    
    'per_capita_income': {
        'total_field': 'B01003_001E',
        'target_fields': ['B19301_001E',],
        'make_percent': False
    },
    'median_household_income': {
        'total_field': 'B19301_001E',
        'target_fields': ['B19013_001E',],
        'make_percent': False
    },
    'median_gross_rent': {
        'total_field': 'B25001_001E',
        'target_fields': ['B25064_001E',],
        'make_percent': False
    },
    'median_home_value_owner_occupied': {
        'total_field': 'B25001_001E',
        'target_fields': ['B25077_001E'],
        'make_percent': False
    },
    'percent_homes_vacant': {
        'total_field': 'B25002_001E',
        'target_fields': ['B25002_003E',],
        'make_percent': True
    },
    'percent_associate_degree': {
        'total_field': 'B15003_001E',
        'target_fields': ['B15003_021E',],
        'make_percent': True
    },
    'percent_bachelors_degree': {
        'total_field': 'B15003_001E',
        'target_fields': ['B15003_022E',],
        'make_percent': True
    },
    'percent_masters_degree': {
        'total_field': 'B15003_001E',
        'target_fields': ['B15003_023E',],
        'make_percent': True
    },
    'percent_professional_degree': {
        'total_field': 'B15003_001E',
        'target_fields': ['B15003_024E',],
        'make_percent': True
    },
    'percent_doctoral_degree': {
        'total_field': 'B15003_001E',
        'target_fields': ['B15003_025E',],
        'make_percent': True
    },
    'percent_bachelors_or_higher': {
        'total_field': 'B15003_001E',
        'target_fields': ['B15003_022E', 'B15003_023E', 'B15003_024E', 'B15003_025E'],
        'make_percent': True
    }
}

# Compute percent-based variables
for key in stats_meta.keys():
    if stats_meta[key]['make_percent']:
        total_field = stats_meta[key]['total_field']
        target_fields = stats_meta[key]['target_fields']
        horizontal_sum = tract_df[target_fields[0]].copy()
        for field in target_fields[1:]:
            horizontal_sum += tract_df[field]
        percent = horizontal_sum / tract_df[total_field]
        tract_df[key] = percent

In [72]:
pickle.dump(tract_df, open('../pickles/census_data.p', 'wb'))

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
tf_vec = TfidfVectorizer(analyzer='word',
                         ngram_range=(1,2),
                         stop_words='english')
features=tf_vec.fit_transform(listing_df.description)

In [253]:
# Show which features were important to the model
feat_imp = pd.DataFrame([X.columns, model.feature_importances_]).transpose()
feat_imp.columns = ['variable', 'importance']
feat_imp.sort_values('importance', ascending=False)

Unnamed: 0,variable,importance
44,bedrooms,0.458368
43,bathrooms,0.0831913
58,B19301_001E,0.079137
42,availability_365,0.0360492
92,room_type_Entire home/apt,0.0289615
50,longitude,0.0271364
0,accommodates,0.0270896
49,latitude,0.0224612
60,B25064_001E,0.0222969
46,host_experience_days,0.0199131
