In [11]:

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split


%matplotlib inline

# Load the Boston housing dataset.
# The path is a raw string so the backslash stays literal: '\h' is not a valid
# escape sequence, and newer Python versions warn about it (it is slated to
# become an error). The resulting path value is unchanged.
data = pd.read_csv(r'E:\housing.csv')

# 'MEDV' is the target column; every remaining column is used as a feature.
prices = data['MEDV']
features = data.drop('MEDV', axis = 1)

# Report the frame's dimensions (rows, columns) as a quick sanity check.
print ("Boston housing dataset has {} data points with {} variables each.".format(*data.shape))

Boston housing dataset has 489 data points with 4 variables each.


In [2]:
# Pull the target column out as a plain NumPy array for the summary statistics.
prices_numpy = prices.values

# Descriptive statistics of the target prices.
minimum_price = np.min(prices_numpy)
maximum_price = np.max(prices_numpy)
mean_price = prices_numpy.mean()
median_price = np.median(prices_numpy)
# NumPy's default (ddof=0) standard deviation, i.e. the population form.
std_price = np.std(prices_numpy)

# Show the calculated statistics
print ("Statistics for Boston housing dataset:\n")
for line_template, statistic in (
    ("Minimum price: ${:,.2f}", minimum_price),
    ("Maximum price: ${:,.2f}", maximum_price),
    ("Mean price: ${:,.2f}", mean_price),
    ("Median price ${:,.2f}", median_price),
    ("Standard deviation of prices: ${:,.2f}", std_price),
):
    print (line_template.format(statistic))

Statistics for Boston housing dataset:

Minimum price: $105,000.00
Maximum price: $1,024,800.00
Mean price: $454,342.94
Median price $438,900.00
Standard deviation of prices: $165,171.13


In [3]:
from sklearn.metrics import r2_score
def performance_metric(y_true, y_predict):
    """Score predictions against the true values with R^2.

    Parameters
    ----------
    y_true : observed target values.
    y_predict : predicted target values, in the same order as ``y_true``.

    Returns
    -------
    The value of ``r2_score(y_true, y_predict)`` (coefficient of
    determination).
    """
    return r2_score(y_true, y_predict)

In [4]:

# Sanity-check the metric on a small hand-written example.
score = performance_metric(
    y_true=[3, -0.5, 2, 7, 4.2],
    y_predict=[2.5, 0.0, 2.1, 7.8, 5.3],
)
print ("Model has a coefficient of determination, R^2, of {:.3f}.".format(score))

Model has a coefficient of determination, R^2, of 0.923.


In [5]:

from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    features,
    prices,
    test_size=0.2,
    random_state=2,
)

print ("Training and testing split was successful.")

Training and testing split was successful.


In [8]:

from sklearn.metrics import make_scorer
# GridSearchCV and ShuffleSplit both come from sklearn.model_selection, the same
# module the notebook already uses for train_test_split. The old
# sklearn.grid_search module no longer exists in current scikit-learn releases,
# and ShuffleSplit was previously not imported at all.
from sklearn.model_selection import GridSearchCV, ShuffleSplit
from sklearn.tree import DecisionTreeRegressor
def fit_model(X, y):
    """ Performs grid search over the 'max_depth' parameter for a 
        decision tree regressor trained on the input data [X, y].

        Parameters
        ----------
        X : feature matrix passed to ``GridSearchCV.fit``.
        y : target values aligned with the rows of ``X``.

        Returns
        -------
        The ``best_estimator_`` of the fitted grid search, i.e. the
        DecisionTreeRegressor refit with the best-scoring 'max_depth'.
    """

    # Ten random 80/20 train/validation splits. The model_selection splitter
    # takes the number of splits via n_splits and gets the sample count from
    # the data at fit time, so X.shape[0] is no longer passed in.
    cv_sets = ShuffleSplit(n_splits = 10, test_size = 0.20, random_state = 0)

    # Fixed seed so tree construction is repeatable.
    regressor = DecisionTreeRegressor(random_state=0)

    # Candidate depths 1 through 10 (the upper bound of arange is exclusive).
    params = {'max_depth': np.arange(1, 11)}

    # Wrap the notebook's R^2 helper as a scorer; make_scorer treats higher
    # scores as better by default.
    scoring_fnc = make_scorer(performance_metric)

    grid = GridSearchCV(regressor, params, scoring=scoring_fnc, cv=cv_sets)

    # Run the search; fit returns the fitted search object.
    grid = grid.fit(X, y)

    return grid.best_estimator_


In [9]:

# Tune and fit the regressor on the training portion only.
reg = fit_model(X_train, y_train)

# Report the tree depth selected by the grid search.
optimal_depth = reg.get_params()['max_depth']
print ("Parameter 'max_depth' is {0} for the optimal model.".format(optimal_depth))

Parameter 'max_depth' is 4 for the optimal model.


In [19]:
# One feature row per client.
# NOTE(review): column order presumably matches the training features - confirm.
client_data = [[5, 17, 15], # Client 1
               [4, 32, 22], # Client 2
               [8, 3, 12]]  # Client 3

# Number clients from 1 and print the model's predicted price for each.
for client_number, price in enumerate(reg.predict(client_data), start=1):
    print ("Predicted selling price for Client {}'s home: ${:,.3f}".format(client_number, price))

Predicted selling price for Client 1's home: $415,800.000
Predicted selling price for Client 2's home: $236,478.261
Predicted selling price for Client 3's home: $888,720.000
