In [1]:
import os
from sys import argv
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedShuffleSplit

%matplotlib inline

In [24]:
# Load the raw housing table from the compressed archive.
# NOTE(review): 'housing.tgz' looks like a tar archive that is only being
# gunzipped here, not untarred -- presumably why the following cell has to
# drop a trailing null row and rename a 'housing.csv' column; confirm.
housing = pd.read_csv(
    'housing.tgz',
    compression='gzip',
    header=0,
    sep=',',
    quotechar='"',
)

In [25]:
# Trim the final row (nulls) and give the column that was read in as
# 'housing.csv' its intended name, 'longitude'.
housing = housing.iloc[:-1, :].rename(columns={'housing.csv': 'longitude'})

In [26]:
# Target values for the regression: the 'median_house_value' column.
housing_labels = housing['median_house_value']

In [27]:
# New frame without the 'ocean_proximity' column (handled separately by the
# categorical pipeline further down); `housing` itself is left unchanged.
housing_num = housing.drop('ocean_proximity', axis =1)

In [28]:
# Remove the target column so housing_num holds only the feature columns
# that are imputed below.
# Reassigning (instead of inplace=True) together with errors='ignore' keeps
# this cell safe to re-run: a second execution is a no-op, not a KeyError.
housing_num = housing_num.drop('median_house_value', axis=1, errors='ignore')

In [29]:
# Replace missing values in the numerical features with each column's median.
# `Imputer` was removed from sklearn.preprocessing in scikit-learn 0.22, so
# prefer its replacement, SimpleImputer, and fall back to the legacy class on
# older installs. The alias keeps the name `Imputer` available to later cells.
try:
    from sklearn.impute import SimpleImputer as Imputer
except ImportError:  # scikit-learn < 0.20
    from sklearn.preprocessing import Imputer

imputer = Imputer(strategy='median')

# Learn the per-column medians and fill the gaps in one step; `imputer`
# stays fitted and X is the transformed array.
X = imputer.fit_transform(housing_num)

In [30]:
# Wrap the imputed array back into a labelled DataFrame, reusing the column
# names of housing_num and the row labels of housing.
housing_tr = pd.DataFrame(
    data=X,
    index=list(housing.index.values),
    columns=housing_num.columns,
)
housing_tr.head()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income
0,-122.23,37.88,41.0,880.0,129.0,322.0,126.0,8.3252
1,-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.3014
2,-122.24,37.85,52.0,1467.0,190.0,496.0,177.0,7.2574
3,-122.25,37.85,52.0,1274.0,235.0,558.0,219.0,5.6431
4,-122.25,37.85,52.0,1627.0,280.0,565.0,259.0,3.8462


In [31]:
#transformer class to add combined attributes
from sklearn.base import BaseEstimator, TransformerMixin

# Column positions (in the array passed to transform) of the raw counts that
# the derived ratio features are built from.
rooms_ix = 3
bedrooms_ix = 4
population_ix = 5
household_ix = 6


class CombinedAttributesAdder(BaseEstimator, TransformerMixin):
    """Append ratio columns derived from the room/population/household counts.

    Parameters
    ----------
    add_bedroooms_per_room : bool, default True
        When true, a bedrooms-per-room column is appended in addition to the
        two per-household columns. (The spelling of the parameter is kept
        as-is because callers pass it by keyword.)
    """

    def __init__(self, add_bedroooms_per_room=True):
        self.add_bedroooms_per_room = add_bedroooms_per_room

    def fit(self, X, y=None):
        """Nothing is learned from the data; return the transformer itself."""
        return self

    def transform(self, X, y=None):
        """Return X with the derived columns appended on the right.

        X is indexed as a 2-D array (``X[:, i]``) using the module-level
        ``*_ix`` column positions. The appended columns are, in order:
        rooms per household, population per household and, if enabled,
        bedrooms per room.
        """
        households = X[:, household_ix]
        extra_columns = [
            X[:, rooms_ix] / households,       # rooms per household
            X[:, population_ix] / households,  # population per household
        ]
        if self.add_bedroooms_per_room:
            extra_columns.append(X[:, bedrooms_ix] / X[:, rooms_ix])
        # np.c_[a, b, ...] is indexing with a tuple, so build that tuple here.
        return np.c_[tuple([X] + extra_columns)]

        

# Try the transformer on the whole table, without the bedrooms-per-room column.
# NOTE(review): housing.values still contains 'median_house_value' and
# 'ocean_proximity' -- presumably an object-dtype array; confirm this is
# intended rather than housing_num.values.
attr_adder = CombinedAttributesAdder(add_bedroooms_per_room=False)
housing_extra_attribs = attr_adder.transform(housing.values)
    

In [32]:
#class object to select attributes for transformation
from sklearn.base import BaseEstimator, TransformerMixin

class DataFrameSelector(BaseEstimator, TransformerMixin):
    """Pick a fixed set of DataFrame columns and pass them on as an array."""

    def __init__(self, attribute_names):
        # Column label(s) extracted from the frame in transform().
        self.attribute_names = attribute_names

    def fit(self, X, y=None):
        """Stateless: there is nothing to fit, so just return self."""
        return self

    def transform(self, X):
        """Return the ``.values`` of the selected columns of X."""
        selected = X[self.attribute_names]
        return selected.values
        

In [33]:
#Class object to binarize selected attributes
from sklearn.preprocessing import LabelBinarizer
class LabelBinarizerPipelineFriendly(LabelBinarizer):
    """LabelBinarizer whose methods accept the ``(X, y=None)`` call signature.

    Each method takes an optional ``y`` that is ignored and forwards only X
    to the corresponding LabelBinarizer method, so the binarizer can be used
    as a pipeline step.
    """

    def fit(self, X, y=None):
        """Fit the binarizer on X (y is accepted but ignored).

        Returns
        -------
        self
            Returned so that ``est.fit(X).transform(X)`` and any caller that
            uses the return value of ``fit`` keep working; the previous
            version implicitly returned None.
        """
        super(LabelBinarizerPipelineFriendly, self).fit(X)
        return self

    def transform(self, X, y=None):
        """Binarize X with the already-fitted classes (y is ignored)."""
        return super(LabelBinarizerPipelineFriendly, self).transform(X)

    def fit_transform(self, X, y=None):
        """Fit on X and return its binarized form (y is ignored)."""
        return super(LabelBinarizerPipelineFriendly, self).fit(X).transform(X)

In [34]:
"""create pipeline constructor by providing steps for data transformation. Num_pipeline tracsforms numerical values.
cat_pipeline transforms categorical features. full_pipeline merges the two pipelines into one."""

from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import FeatureUnion
from sklearn.preprocessing import LabelBinarizer

# Column groups: every column of housing_num goes through the numerical
# branch, 'ocean_proximity' through the categorical one.
num_attribs = housing_num.columns.tolist()
cat_attribs = ['ocean_proximity']

# Numerical branch: select columns -> impute medians -> add ratio features
# -> standardise.
num_pipeline = Pipeline(steps=[
    ('selector', DataFrameSelector(num_attribs)),
    ('imputer', Imputer(strategy='median')),
    ('attribs_adder', CombinedAttributesAdder()),
    ('std_scaler', StandardScaler()),
])

# Categorical branch: select the column -> binarize its labels.
cat_pipeline = Pipeline(steps=[
    ('selector', DataFrameSelector(cat_attribs)),
    ('label_binarizer', LabelBinarizerPipelineFriendly()),
])

# Run both branches on the same input and concatenate their outputs.
full_pipeline = FeatureUnion([
    ('num_pipeline', num_pipeline),
    ('cat_pipeline', cat_pipeline),
])


In [35]:
# Fit the full pipeline on the housing data and transform it; this is the
# last preparation step before fitting a model.
housing_prepared = full_pipeline.fit_transform(housing)
housing_prepared

array([[-1.32783522,  1.05254828,  0.98214266, ...,  0.        ,
         1.        ,  0.        ],
       [-1.32284391,  1.04318455, -0.60701891, ...,  0.        ,
         1.        ,  0.        ],
       [-1.33282653,  1.03850269,  1.85618152, ...,  0.        ,
         1.        ,  0.        ],
       ..., 
       [-0.8237132 ,  1.77823747, -0.92485123, ...,  0.        ,
         0.        ,  0.        ],
       [-0.87362627,  1.77823747, -0.84539315, ...,  0.        ,
         0.        ,  0.        ],
       [-0.83369581,  1.75014627, -1.00430931, ...,  0.        ,
         0.        ,  0.        ]])

In [36]:
# Sanity check: labels and prepared features should have the same number of rows.
housing_labels.shape, housing_prepared.shape

((20640,), (20640, 16))

In [37]:
from sklearn.ensemble import RandomForestRegressor

# Random forest with 30 trees, each split considering up to 6 features.
# No random_state is passed, so the fitted model may differ between runs.
rfr = RandomForestRegressor(max_features=6, n_estimators=30)
# Train on the prepared features against the label series defined earlier
# (the name was previously misspelled as `housing_labbels` -> NameError).
rfr.fit(housing_prepared, housing_labels)

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features=6, max_leaf_nodes=None, min_impurity_decrease=0.0,
           min_impurity_split=None, min_samples_leaf=1,
           min_samples_split=2, min_weight_fraction_leaf=0.0,
           n_estimators=30, n_jobs=1, oob_score=False, random_state=None,
           verbose=0, warm_start=False)

In [41]:
# Predict on the same rows the forest was fitted on (in-sample predictions).
housing_predicted = rfr.predict(housing_prepared)
housing_predicted

array([ 427603.43333333,  397926.73333333,  382843.46666667, ...,
         87463.33333333,   84843.33333333,   86133.33333333])

In [46]:
# mean_squared_error is not imported in any of the visible cells, so import
# it here to keep this cell runnable on a fresh kernel.
from sklearn.metrics import mean_squared_error

# RMSE of the in-sample predictions (same rows used for fitting, so this is
# a training error, not a generalisation estimate). Arguments follow the
# documented (y_true, y_pred) order; the MSE value itself is symmetric.
mse = mean_squared_error(housing_labels, housing_predicted)
rmse = np.sqrt(mse)
rmse

18845.182183150722