In [1]:

import boto3 # AWS SDK 
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)


In [28]:
from matplotlib import pyplot as plt
import numpy as np 
import pandas as pd 
import seaborn as sns
plt.style.use('fivethirtyeight')
sns.set_style('darkgrid')
sns.set_context('notebook', font_scale=1.5)
sns.set_palette('colorblind')
%matplotlib inline
%pylab inline

# Data Introduction

In [3]:
# S3 location of the listings CSV.
bucket = "realtor-data"
file_name = "realtor_data.csv"

# Create the S3 client from the profile-bound session so that the 'simon'
# profile's credentials are actually used. A bare boto3.client('s3') goes
# through the default session and ignores the session created here.
session = boto3.Session(profile_name='simon')
s3 = session.client('s3')
obj = s3.get_object(Bucket=bucket, Key=file_name)

# obj['Body'] is a file-like streaming body, passed straight to read_csv.
housing = pd.read_csv(obj['Body'])
# Column names of interest (this name is reassigned in a later cell).
features = ['address','longitude', 'latitude', 'interior_size', 'building_type', 'bedrooms', 'bathrooms', 'price']
housing.head()

In [4]:
# Summary statistics for the housing frame.
housing.describe()

In [5]:
# Column dtypes and non-null counts for the housing frame.
housing.info()

In [6]:
# Distinct building types present in the listings.
housing['building_type'].unique()

In [22]:
# Number of missing values in each column.
missing_mask = housing.isnull()
missing_mask.sum()

# Data Processing
I would first like to add another column called 'city' by parsing the address column.

In [8]:
# Peek at a few raw addresses to see the format before parsing it.
# A fixed random_state makes the displayed sample identical on every re-run.
housing.address.sample(10, random_state=0)

In [9]:
# Derive a lower-cased 'city' column from the address: take the text after the
# last '|' and keep the part before the first ','.
# The vectorised .str accessor passes missing addresses through as NaN, whereas a
# Python-level x.split() inside map() raises AttributeError on a float NaN.
# .str.strip() drops stray whitespace around the name so that identical cities
# compare equal.
housing['city'] = (
    housing.address
    .str.split('|').str[-1]
    .str.split(',').str[0]
    .str.strip()
    .str.lower()
)
housing.city.unique()

# Exploratory Data Analysis

In [27]:
# Scatter of listing locations (longitude vs. latitude), coloured by price.
price_cmap = plt.get_cmap('jet')
housing.plot(
    kind='scatter',
    x='longitude',
    y='latitude',
    c='price',
    cmap=price_cmap,
    colorbar=True,
    alpha=0.6,
    figsize=(12, 8),
)

In [11]:
# NOTE(review): this scatter-matrix cell is disabled. Presumably it was commented
# out because it is slow or superseded by the seaborn pair plot later in the
# notebook. Confirm, then either re-enable it or delete the cell.
# from pandas.plotting import scatter_matrix
# attributes = ['price', 'interior_size', 'bedrooms', 'bathrooms']
# scatter_matrix(housing[attributes], figsize=(20,20))

## The Most Unaffordable City?
Let us analyze which cities are the most expensive places to buy a house. First, we will simply find the average house price for each city.

In [12]:
# Average listing price per city, shown as a bar chart from most to least expensive.
mean_price_by_city = housing.groupby('city')['price'].mean()
mean_price_by_city.sort_values(ascending=False).plot.bar()

Well, it seems pretty clear who the winner is. However, is this the most accurate representation of affordability? One of the most important things people consider when looking for a house is simply the size of the interior space. Hence, let's fit a linear regression of price against interior size for each city. (P.S. I am using the `robust` option to reduce the influence of outliers, and turning off the confidence interval with `ci=None` for faster computation.)

In [13]:
# Per-city robust linear fit of price against interior size, without a
# confidence band (ci=None).
sns.lmplot(
    data=housing,
    x='interior_size',
    y='price',
    hue='city',
    robust=True,
    ci=None,
    height=12,
)
# Restrict the visible range on both axes.
plt.xlim(0, 8000)
plt.ylim(0, 1e7)

A joint plot shows the joint distribution of x (interior size) and y (price) in the centre, with the marginal distribution of each variable along its axis, drawn as a histogram or a kernel density estimate.
The innermost enclosed contour marks the peak of the joint density of interior size and price.

In [14]:
# Joint kernel-density plot of interior size against price.
sns.jointplot(
    data=housing,
    x='interior_size',
    y='price',
    kind='kde',
    height=12,
)

Separating the dimensions, `sns.distplot` can overlay a histogram with kernel density estimation, getting the best of both worlds!

In [15]:
# Side-by-side distributions: interior size on the left, price on the right.
fig, ax = plt.subplots(nrows=1, ncols=2, figsize=(20, 6))
sns.distplot(housing['interior_size'], ax=ax[0])
sns.distplot(housing['price'], ax=ax[1])

Although West Vancouver has the highest average price, when comparing the price per square foot it seems like Vancouver is actually slightly more unaffordable than West Vancouver. Let's dive deeper into some of the most expensive cities of the metropolitan area.

In [16]:
# Keep only the listings located in the selected cities.
exp_cities = [
    'vancouver',
    'west vancouver',
    'richmond',
    'coquitlam',
    'burnaby',
]
in_exp_city = housing['city'].isin(exp_cities)
exp_housing = housing.loc[in_exp_city]

A pair plot extends the idea of the joint plot by drawing one panel for every pair of the selected dimensions.

In [17]:
# Columns shown in the pair plot; 'city' is included only to colour the points.
features = ['interior_size', 'bedrooms', 'bathrooms', 'city']
pair_data = exp_housing[features]
sns.pairplot(pair_data, hue='city', height=10)

For categorical plotting, `sns.catplot` is a great way to create boxplots and violin plots.

In [18]:
# Violin plot of the price distribution for each of the selected cities.
sns.catplot(
    data=exp_housing,
    x='city',
    y='price',
    kind='violin',
    height=12,
)
# Restrict the visible price range.
plt.ylim(-0.25e7, 1.5e7)

In [19]:
import plotly.express as px

# Interactive density heatmap of listing prices drawn over a tile base map.
fig = px.density_mapbox(
    housing,
    lat='latitude',
    lon='longitude',
    z='price',
    radius=5,
    center=dict(lat=49.29225, lon=-123.14),
    zoom=10,
    # The 'stamen-terrain' tiles are no longer served without a Stadia Maps
    # account, which leaves the base map blank. 'open-street-map' needs no
    # token or sign-up.
    mapbox_style='open-street-map',
)
fig.show()