# Start GraphLab Create

In [26]:
import graphlab

# Load house sales data

In [32]:
# Read the saved SFrame at 'home_data.gl/home_data.gl' (a path relative to the working directory).
houseSales = graphlab.SFrame('home_data.gl/home_data.gl')

In [33]:
# Bare expression so the notebook renders the frame as this cell's output.
houseSales

id,date,price,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront
7129300520,2014-10-13 00:00:00+00:00,221900,3,1.0,1180,5650,1,0
6414100192,2014-12-09 00:00:00+00:00,538000,3,2.25,2570,7242,2,0
5631500400,2015-02-25 00:00:00+00:00,180000,2,1.0,770,10000,1,0
2487200875,2014-12-09 00:00:00+00:00,604000,4,3.0,1960,5000,1,0
1954400510,2015-02-18 00:00:00+00:00,510000,3,2.0,1680,8080,1,0
7237550310,2014-05-12 00:00:00+00:00,1225000,4,4.5,5420,101930,1,0
1321400060,2014-06-27 00:00:00+00:00,257500,3,2.25,1715,6819,2,0
2008000270,2015-01-15 00:00:00+00:00,291850,3,1.5,1060,9711,1,0
2414600126,2015-04-15 00:00:00+00:00,229500,3,1.0,1780,7470,1,0
3793500160,2015-03-12 00:00:00+00:00,323000,3,2.5,1890,6560,2,0

view,condition,grade,sqft_above,sqft_basement,yr_built,yr_renovated,zipcode,lat
0,3,7,1180,0,1955,0,98178,47.51123398
0,3,7,2170,400,1951,1991,98125,47.72102274
0,3,6,770,0,1933,0,98028,47.73792661
0,5,7,1050,910,1965,0,98136,47.52082
0,3,8,1680,0,1987,0,98074,47.61681228
0,3,11,3890,1530,2001,0,98053,47.65611835
0,3,7,1715,0,1995,0,98003,47.30972002
0,3,7,1060,0,1963,0,98198,47.40949984
0,3,7,1050,730,1960,0,98146,47.51229381
0,3,7,1890,0,2003,0,98038,47.36840673

long,sqft_living15,sqft_lot15
-122.25677536,1340.0,5650.0
-122.3188624,1690.0,7639.0
-122.23319601,2720.0,8062.0
-122.39318505,1360.0,5000.0
-122.04490059,1800.0,7503.0
-122.00528655,4760.0,101930.0
-122.32704857,2238.0,6819.0
-122.31457273,1650.0,9711.0
-122.33659507,1780.0,8113.0
-122.0308176,2390.0,7570.0


# Exploration of housing sales data

House price is correlated with the number of square feet.

In [45]:
# Send canvas output to the 'ipynb' target before calling .show().
graphlab.canvas.set_target('ipynb')
# Scatter plot of price (y axis) against sqft_living (x axis).
houseSales.show(view="Scatter Plot", x="sqft_living", y="price")

# Simple regression model (sqft_living to price)

In [46]:
# Random split with fraction 0.8 for training; the fixed seed pins which rows land in each part.
train_data, test_data = houseSales.random_split(0.8, seed=0)

## Regression model using only sqft_living

In [47]:
# Fit price as a function of sqft_living alone, using only the training split.
# validation_set=None is passed explicitly, so no validation set is supplied.
sqft_model = graphlab.linear_regression.create(
    train_data,
    target='price',
    features=['sqft_living'],
    validation_set=None
)

PROGRESS: Linear regression:
PROGRESS: --------------------------------------------------------
PROGRESS: Number of examples          : 17384
PROGRESS: Number of features          : 1
PROGRESS: Number of unpacked features : 1
PROGRESS: Number of coefficients    : 2
PROGRESS: Starting Newton Method
PROGRESS: --------------------------------------------------------
PROGRESS: +-----------+----------+--------------+--------------------+---------------+
PROGRESS: | Iteration | Passes   | Elapsed Time | Training-max_error | Training-rmse |
PROGRESS: +-----------+----------+--------------+--------------------+---------------+
PROGRESS: | 1         | 2        | 1.004723     | 4349521.926170     | 262943.613754 |
PROGRESS: +-----------+----------+--------------+--------------------+---------------+


## Evaluation

In [48]:
# Mean sale price in the test split, as a reference scale for the error metrics.
print test_data['price'].mean()

543054.042563


In [None]:
# Evaluate the single-feature model on the held-out test split.
print sqft_model.evaluate(test_data)

RMSE of approx $255,191

## Predictions using Matplotlib

In [None]:
import matplotlib.pyplot as plt
%matplotlib inline

In [None]:
# Actual test prices drawn as dots, model predictions drawn as a line,
# both against the same sqft_living values.
sqft_test = test_data['sqft_living']
plt.plot(sqft_test, test_data['price'], '.',
         sqft_test, sqft_model.predict(test_data), '-')

In [None]:
# Look up the 'coefficients' entry of the fitted single-feature model.
sqft_model.get('coefficients')

## Explore rest of the features in the data

In [71]:
# Columns used as inputs for the multi-feature regression model.
my_features = [
    'bedrooms',
    'bathrooms',
    'sqft_living',
    'sqft_lot',
    'floors',
    'zipcode',
]

In [None]:
# Call .show() on only the columns listed in my_features.
houseSales[my_features].show()

In [70]:
# Box-and-whisker view with zipcode on the x axis and price on the y axis.
houseSales.show(view='BoxWhisker Plot', x='zipcode', y='price')

## Regression model with more features

In [72]:
# Fit price on all columns in my_features, using only the training split.
# validation_set=None is passed explicitly, so no validation set is supplied.
my_features_model = graphlab.linear_regression.create(
    train_data,
    target='price',
    features=my_features,
    validation_set=None
)

PROGRESS: Linear regression:
PROGRESS: --------------------------------------------------------
PROGRESS: Number of examples          : 17384
PROGRESS: Number of features          : 6
PROGRESS: Number of unpacked features : 6
PROGRESS: Number of coefficients    : 115
PROGRESS: Starting Newton Method
PROGRESS: --------------------------------------------------------
PROGRESS: +-----------+----------+--------------+--------------------+---------------+
PROGRESS: | Iteration | Passes   | Elapsed Time | Training-max_error | Training-rmse |
PROGRESS: +-----------+----------+--------------+--------------------+---------------+
PROGRESS: | 1         | 2        | 0.027083     | 3763208.270524     | 181908.848367 |
PROGRESS: +-----------+----------+--------------+--------------------+---------------+


In [73]:
# Echo the feature list the multi-feature model was trained on.
print my_features

['bedrooms', 'bathrooms', 'sqft_living', 'sqft_lot', 'floors', 'zipcode']


## Comparison of the results of the simple model with adding more features

In [77]:
print sqft_model.evaluate(test_data)
print my_features_model.evaluate(test_data)

{'max_error': 4143550.8825285956, 'rmse': 255191.0287052738}
{'max_error': 3486584.5093818563, 'rmse': 179542.4333126908}


The RMSE goes down from \$255,191 to \$179,542 with more features.

# Apply learned models to predict prices of 3 houses

The first house we will use is considered an "average" house in Seattle. 

In [None]:
# Select the row whose id equals '5309101200'.
# Uses `houseSales`, the name the frame was loaded under in this notebook.
house1 = houseSales[houseSales['id'] == '5309101200']

In [None]:
# Display the selected row.
house1

In [None]:
# Recorded sale price for this house, to compare with the predictions.
print house1['price']

In [None]:
# Price predicted by the sqft_living-only model.
print sqft_model.predict(house1)

In [None]:
# Price predicted by the model trained on my_features.
print my_features_model.predict(house1)

In this case, the model with more features provides a worse prediction than the simpler model with only 1 feature.  However, on average, the model with more features is better.

## Prediction for a second, more elite house

We will now examine the predictions for a second, fancier house.

In [None]:
# Select the row whose id equals '1925069082'.
# Uses `houseSales`, the name the frame was loaded under in this notebook.
house2 = houseSales[houseSales['id'] == '1925069082']

In [None]:
# Display the selected row.
house2

In [None]:
# Price predicted by the sqft_living-only model.
print sqft_model.predict(house2)

In [None]:
# Price predicted by the model trained on my_features.
print my_features_model.predict(house2)

In this case, the model with more features provides a better prediction.  This behavior is expected here, because this house is more differentiated by features that go beyond its square feet of living space, especially the fact that it's a waterfront house. 

## Last house

In [None]:
# Hand-built single-row record: every value is a one-element list, so each
# key maps to one column entry when the dict is wrapped in an SFrame.
bill_gates = {
    # Rooms, sizes and location code
    'bedrooms': [8],
    'bathrooms': [25],
    'sqft_living': [50000],
    'sqft_lot': [225000],
    'floors': [4],
    'zipcode': ['98039'],  # string, not int
    # Quality and setting
    'condition': [10],
    'grade': [10],
    'waterfront': [1],
    'view': [4],
    # Living-area breakdown
    'sqft_above': [37500],
    'sqft_basement': [12500],
    # Construction history
    'yr_built': [1994],
    'yr_renovated': [2010],
    # Coordinates
    'lat': [47.627606],
    'long': [-122.242054],
    # The *15 columns
    'sqft_living15': [5000],
    'sqft_lot15': [40000],
}

In [None]:
# Build an SFrame from the dict and run the my_features model on it.
print my_features_model.predict(graphlab.SFrame(bill_gates))

The model predicts a price of over \$13M for this house.

In [49]:
# Mean sale price over the rows whose zipcode equals '98039'.
# Uses `houseSales`, the name the frame was loaded under in this notebook.
houseSales[houseSales['zipcode'] == '98039']['price'].mean()

2160606.6000000006

In [67]:
# Number of rows with 2000 <= sqft_living < 4000.
# Uses `houseSales`, the name the frame was loaded under in this notebook.
in_sqft_range = (houseSales['sqft_living'] >= 2000) & (houseSales['sqft_living'] < 4000)
houseSales[in_sqft_range].num_rows()

9214

In [68]:
# Total number of rows in the loaded frame.
# Uses `houseSales`, the name the frame was loaded under in this notebook.
houseSales.num_rows()

21613

In [74]:
# Larger input set: the six my_features columns followed by three more.
advance_features = [
    'bedrooms',
    'bathrooms',
    'sqft_living',
    'sqft_lot',
    'floors',
    'zipcode',
    'view',
    'sqft_above',
    'sqft_basement',
]

In [75]:
# Echo the extended feature list.
print advance_features

['bedrooms', 'bathrooms', 'sqft_living', 'sqft_lot', 'floors', 'zipcode', 'view', 'sqft_above', 'sqft_basement']


In [76]:
# Fit price on all columns in advance_features, using only the training split.
# validation_set=None is passed explicitly, so no validation set is supplied.
advance_features_model = graphlab.linear_regression.create(
    train_data,
    target='price',
    features=advance_features,
    validation_set=None
)

PROGRESS: Linear regression:
PROGRESS: --------------------------------------------------------
PROGRESS: Number of examples          : 17384
PROGRESS: Number of features          : 9
PROGRESS: Number of unpacked features : 9
PROGRESS: Number of coefficients    : 118
PROGRESS: Starting Newton Method
PROGRESS: --------------------------------------------------------
PROGRESS: +-----------+----------+--------------+--------------------+---------------+
PROGRESS: | Iteration | Passes   | Elapsed Time | Training-max_error | Training-rmse |
PROGRESS: +-----------+----------+--------------+--------------------+---------------+
PROGRESS: | 1         | 2        | 0.038083     | 3826508.102710     | 169189.417285 |
PROGRESS: +-----------+----------+--------------+--------------------+---------------+


In [79]:
print my_features_model.evaluate(test_data);
print advance_features_model.evaluate(test_data);

{'max_error': 3486584.5093818563, 'rmse': 179542.4333126908}
{'max_error': 3235458.31003239, 'rmse': 166703.94733205126}
