In [1]:
import numpy as np
import pandas as pd
from django.contrib.gis.geos import Point
from django.db import transaction

from main.models import Neighborhood, Listing

In [2]:
# Parsing configuration for the listings CSV: column dtypes, date columns
# and per-column converter functions.


def _is_true_flag(raw):
    """Return True only when the raw CSV field is exactly 't'."""
    return raw == 't'


def _parse_price(raw):
    """Convert a currency string such as '$1,000.00' to the float 1000.00."""
    return float(raw.replace('$', '').replace(',', ''))


# Data types for the initial parse of the CSV (key order is kept as-is)
dtype = dict(
    id=np.int64,
    listing_url=str,
    scrape_id=np.int64,
    last_scraped=str,  # date
    name=str,
    description=str,
    host_id=np.int64,
    host_name=str,
    host_since=str,  # date
    host_is_superhost=str,
    host_identity_verified=str,
    latitude=np.float64,
    longitude=np.float64,
    neighbourhood_cleansed=str,
    property_type=str,
    room_type=str,
    accommodates=np.int16,
    bathrooms=np.float16,  # contains NAN values
    bedrooms=np.float16,  # contains NAN values
    bed_type=str,
    amenities=str,
    price=str,  # '$' prefixed text - handled by a converter below
    minimum_nights=np.int16,
    availability_365=np.int16,
    number_of_reviews=np.int16,
    reviews_per_month=np.float16,
    street=str,
)

# Columns that should be parsed as dates
parse_dates = ['last_scraped', 'host_since']

# Conversion functions applied to particular columns
converters = {
    'host_is_superhost': _is_true_flag,
    'host_identity_verified': _is_true_flag,
    'price': _parse_price,
}

In [3]:
# Read CSV contents into dataframe.
# The full dtype mapping still decides which columns are loaded (usecols),
# but columns that have a converter are left out of the dtype argument:
# when both are supplied pandas uses only the converter and emits a
# ParserWarning for each such column.
df = pd.read_csv(
    '../res/listings-3-aug-16.csv.gz',
    usecols=list(dtype.keys()),
    dtype={col: kind for col, kind in dtype.items() if col not in converters},
    parse_dates=parse_dates,
    converters=converters
)

In [6]:
# Report, for every column, whether it holds at least one null value
pd.isnull(df).any()

id                        False
listing_url               False
scrape_id                 False
last_scraped              False
name                      False
description               False
host_id                   False
host_name                 False
host_since                False
host_is_superhost         False
host_identity_verified    False
street                    False
neighbourhood_cleansed    False
latitude                  False
longitude                 False
property_type             False
room_type                 False
accommodates              False
bathrooms                 False
bedrooms                  False
bed_type                  False
amenities                 False
price                     False
minimum_nights            False
availability_365          False
number_of_reviews         False
reviews_per_month         False
dtype: bool

In [5]:
# Fill certain null / missing values.
# The results are assigned back to df instead of calling
# fillna(inplace=True) on a column selection: that chained in-place form
# operates on an intermediate object and does not update df when pandas
# Copy-on-Write is active.
df = df.fillna({'reviews_per_month': 0.0, 'bathrooms': 0.0, 'bedrooms': 0.0})

# Drop entries with certain missing values
df = df.dropna(subset=['id', 'description', 'host_since', 'host_name'])

In [14]:
# Dataframe columns copied verbatim onto the Listing field of the same name
LISTING_FIELDS = [
    'id', 'name', 'listing_url', 'scrape_id', 'last_scraped', 'description',
    'host_id', 'host_name', 'host_since', 'host_is_superhost',
    'host_identity_verified', 'neighbourhood_cleansed', 'property_type',
    'room_type', 'accommodates', 'bathrooms', 'bedrooms', 'bed_type',
    'amenities', 'price', 'minimum_nights', 'availability_365',
    'number_of_reviews', 'reviews_per_month', 'street',
]

# Replace the table contents in a single transaction: if any row fails to
# save, the delete is rolled back too, so the table is never left empty or
# partially loaded.
with transaction.atomic():
    # Delete all existing Listings
    Listing.objects.all().delete()

    # Instantiate and save a Listing for each row in the dataframe
    for _, row in df.iterrows():
        point = Point(x=row['longitude'], y=row['latitude'], srid=4326)

        new_listing = Listing(
            point=point,
            # First neighborhood whose polygon contains the point, or None
            neighborhood=Neighborhood.objects.filter(mpoly__contains=point).first(),
            **{field: row[field] for field in LISTING_FIELDS}
        )
        new_listing.save()  # written to the DB; committed when the block exits

In [21]:
# Check the integrity of the geo lookups

# Some listings ended up without a neighborhood because no neighborhood
# contained their search point. Print how many there are, then one line
# per listing: primary key, first 25 characters of the name, and the
# neighbourhood name taken from the CSV.
null_neighborhood_listings = Listing.objects.filter(neighborhood=None)
print(null_neighborhood_listings.count())
for listing in null_neighborhood_listings:
    print(listing.pk, listing.name[:25], '*', listing.neighbourhood_cleansed)

9
13957732 Wyndham Bonnet Creek 1 Be * Long Beach
14282984 Malibu's Finest Villa in  * Malibu
14231094 Clean, Comfortable, Great * La Mirada
14079456 No frills room Malibu bea * Malibu
13936161 The Iconic Malibu Beach C * Malibu
14238894 Sunny sailboat on the wat * Redondo Beach
14075166 Marilyn Monroes Home 180  * Malibu
13896738 Malibu La Costa Beach Hou * Malibu
13960487 Private Beach Studio Apar * Long Beach


In [26]:
# Look up one listing by primary key together with the neighborhood whose
# name matches the listing's neighbourhood_cleansed value.
bad_listing = Listing.objects.get(pk=13957732)
long_beach = Neighborhood.objects.get(name=bad_listing.neighbourhood_cleansed)

# Distance from the neighborhood polygon's centroid to the listing point.
# NOTE(review): the value is presumably in the geometries' coordinate units
# (degrees if both use SRID 4326) rather than metres - confirm.
centroid_distance = long_beach.mpoly.centroid.distance(bad_listing.point)
print(centroid_distance)

0.10094054351248673


In [51]:
"""
Build and pickle a dataframe with some additional listing attributes for use in machine learning algorithms.
"""
import pickle

extra_cols = [ 'id', 'guests_included', 'extra_people', 'review_scores_rating',
              'review_scores_accuracy', 'review_scores_cleanliness',
              'review_scores_checkin', 'review_scores_communication',
              'review_scores_location', 'review_scores_value', 'cancellation_policy',
              'require_guest_phone_verification', 'require_guest_profile_picture',
              'instant_bookable'
             ]

# Load only the columns that are needed instead of the whole file; indexing
# with extra_cols afterwards keeps the column order defined above.
df = pd.read_csv('../res/listings-3-aug-16.csv.gz', usecols=extra_cols)[extra_cols]

# Clean up dtypes
df['guests_included'] = df.guests_included.astype(np.float64)
# Strip the currency symbol AND thousands separators so values such as
# '$1,000.00' parse as well
df['extra_people'] = df.extra_people.apply(
    lambda x: np.float64(x.replace('$', '').replace(',', ''))
)
# 't' / 'f' flag columns become booleans (anything other than 't' is False)
for flag_col in ('require_guest_phone_verification',
                 'require_guest_profile_picture',
                 'instant_bookable'):
    df[flag_col] = df[flag_col].apply(lambda x: x == 't')
# One-hot encode the remaining non-numeric columns
df = pd.get_dummies(df)

# Fill nulls with mean values
df = df.fillna(df.mean())

# Pickle the dataframe; the context manager closes the file handle once the
# dump has finished
with open('../pickles/listings_extra_df.p', 'wb') as pickle_file:
    pickle.dump(df, pickle_file)

  interactivity=interactivity, compiler=compiler, result=result)


In [52]:
# Show the dtype of the id column
df['id'].dtype

dtype('int64')