In [7]:
# block to run tests
# Setting ast_node_interactivity to "all" makes the notebook display the value
# of every top-level expression statement in a cell, not only the last one.
# Cells below rely on this (e.g. a bare `.fit(...)` call followed by a bare
# `np.sqrt(...)` both produce output).
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

import os
import tarfile
from six.moves import urllib
import pandas as pd

# Local directory (relative to the working directory) that holds the dataset.
HOUSING_PATH = os.path.join("datasets", "housing")

# Remote location of the compressed dataset.
DOWNLOAD_ROOT = "https://raw.githubusercontent.com/ageron/handson-ml2/master/"
HOUSING_URL = f"{DOWNLOAD_ROOT}datasets/housing/housing.tgz"

def fetch_housing_data(housing_url=None, housing_path=None):
    """Download the housing tarball and unpack it into the dataset directory.

    Parameters
    ----------
    housing_url : str, optional
        URL of the ``.tgz`` archive. Defaults to the module-level ``HOUSING_URL``.
    housing_path : str, optional
        Directory that receives both the downloaded ``housing.tgz`` and its
        extracted contents. Defaults to the module-level ``HOUSING_PATH``.

    The archive is downloaded on every call, overwriting any previous copy.
    Errors from the download or from extraction propagate to the caller.
    """
    # Resolve defaults at call time so the constants are looked up when the
    # function runs, exactly as the parameterless version did.
    if housing_url is None:
        housing_url = HOUSING_URL
    if housing_path is None:
        housing_path = HOUSING_PATH

    # exist_ok avoids the check-then-create race of a separate isdir() test.
    os.makedirs(housing_path, exist_ok=True)
    tgz_path = os.path.join(housing_path, "housing.tgz")
    urllib.request.urlretrieve(housing_url, tgz_path)

    # The context manager closes the archive even if extraction raises.
    with tarfile.open(tgz_path) as housing_tgz:
        if hasattr(tarfile, "data_filter"):
            # The archive comes from the network: where the runtime supports
            # extraction filters, refuse members that would escape the target
            # directory (absolute paths, "..", unsafe links).
            housing_tgz.extractall(path=housing_path, filter="data")
        else:
            housing_tgz.extractall(path=housing_path)

def load_housing_data():
    """Read ``housing.csv`` from the ``HOUSING_PATH`` directory.

    Returns the parsed file as a pandas DataFrame.
    """
    return pd.read_csv(os.path.join(HOUSING_PATH, "housing.csv"))

# Download and unpack the dataset (the download happens on every run of this
# cell), then load housing.csv into the `housing` DataFrame that the following
# cells start from.
fetch_housing_data()
housing = load_housing_data()

In [8]:
from sklearn.model_selection import StratifiedShuffleSplit
import numpy as np

# Bucket median income into five categories that are used only to stratify the
# split. They are kept in a separate Series, so no helper column has to be
# added to the DataFrames and dropped again afterwards.
income_cat = pd.cut(housing["median_income"],
                    bins=[0., 1.5, 3.0, 4.5, 6., np.inf],
                    labels=[1, 2, 3, 4, 5])

# A single 80/20 split with a fixed seed so the result is reproducible.
stratfold = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)

# split() yields positional row indices, so rows are selected with .iloc;
# label-based .loc only gives the same rows while the index is 0..n-1.
train_index, test_index = next(stratfold.split(housing, income_cat))
strat_train_set = housing.iloc[train_index]
strat_test_set = housing.iloc[test_index]

# NOTE: `housing` is rebound from the full dataset to the training features
# here. Re-running this cell without first re-running the loading cell would
# therefore split the training set a second time.
housing = strat_train_set.drop("median_house_value", axis=1)

# Training targets, copied so later edits cannot touch strat_train_set.
housing_labels = strat_train_set["median_house_value"].copy()

In [9]:
from sklearn.impute import SimpleImputer

# Median imputation only makes sense for numeric columns, so the text column
# "ocean_proximity" is removed before fitting.
imputer = SimpleImputer(strategy="median")
housing_num = housing.drop("ocean_proximity", axis=1)
imputer.fit(housing_num)

# transform() returns a bare array, so both the column names and the row index
# are restored. Without index=..., housing_tr would get a fresh 0..n-1 index
# and its rows would no longer line up with housing / housing_labels.
housing_tr = pd.DataFrame(imputer.transform(housing_num),
                          columns=housing_num.columns,
                          index=housing_num.index)

SimpleImputer(strategy='median')

In [10]:
from sklearn.base import BaseEstimator, TransformerMixin

# Positional column indices read by CombinedAttributesAdder.transform.
# Presumably these match the column order of the numeric housing features
# (total rooms, total bedrooms, population, households) -- TODO confirm.
rooms_ix, bedrooms_ix, population_ix, households_ix = range(3, 7)


class CombinedAttributesAdder(BaseEstimator, TransformerMixin):
    """Transformer that appends ratio features as extra columns.

    Two columns are always appended: rooms per household and population per
    household. A third one, bedrooms per room, is appended when
    ``add_bedrooms_per_room`` is true.
    """

    def __init__(self, add_bedrooms_per_room = True):
        self.add_bedrooms_per_room = add_bedrooms_per_room

    def fit(self, X, y=None):
        """Nothing is learned from the data; return the transformer itself."""
        return self

    def transform(self, X, y=None):
        """Return ``X`` with the ratio columns appended on the right.

        ``X`` is indexed as ``X[:, i]``, so it must support 2-D positional
        slicing (e.g. a NumPy array). ``y`` is ignored.
        """
        households = X[:, households_ix]
        rooms = X[:, rooms_ix]

        extra_columns = [
            rooms / households,                   # rooms per household
            X[:, population_ix] / households,     # population per household
        ]
        if self.add_bedrooms_per_room:
            extra_columns.append(X[:, bedrooms_ix] / rooms)  # bedrooms per room

        return np.c_[(X, *extra_columns)]


In [11]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

# Column groups: every column of housing_num is treated as numeric, and
# "ocean_proximity" as the single categorical column.
cat_attribs = ["ocean_proximity"]
num_attribs = housing_num.columns.tolist()

# Numeric branch: fill missing values with the median, append the combined
# attributes, then standardize.
num_pipeline = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="median")),
    ("attribs_adder", CombinedAttributesAdder()),
    ("std_scaler", StandardScaler()),
])

# Route each column group through its own transformer.
full_pipeline = ColumnTransformer(transformers=[
    ("num", num_pipeline, num_attribs),
    ("cat", OneHotEncoder(), cat_attribs),
])

housing_prepared = full_pipeline.fit_transform(housing)

In [12]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

# Fit a plain linear regression on the prepared training features.
lin_reg = LinearRegression()
lin_reg.fit(housing_prepared, housing_labels)
# Predictions are made on the same data the model was fit on, so the value
# displayed below is a training-set RMSE, not an estimate of how the model
# performs on unseen data.
housing_predictions = lin_reg.predict(housing_prepared)
lin_mse = mean_squared_error(housing_labels, housing_predictions)
np.sqrt(lin_mse)

LinearRegression()

68627.87390018745

In [13]:
from sklearn.svm import SVR
from sklearn.metrics import mean_squared_error

# Fit a support vector regressor with default hyperparameters on the prepared
# training features.
svm_reg = SVR()
svm_reg.fit(housing_prepared, housing_labels)
# Predictions are made on the data the model was fit on: this is a
# training-set RMSE.
housing_predictions = svm_reg.predict(housing_prepared)
# Stored under its own name so the linear model's `lin_mse` is not overwritten
# with the SVR's error.
svm_mse = mean_squared_error(housing_labels, housing_predictions)
np.sqrt(svm_mse)

SVR()

118578.69234925653

In [14]:
from sklearn.model_selection import GridSearchCV

# Two search spaces: 3 candidates for the linear kernel (C only) and
# 3 x 3 candidates for the RBF kernel (C and gamma).
linear_grid = {"kernel": ["linear"], "C": [3000.0, 10000.0, 30000.0]}
rbf_grid = {
    "kernel": ["rbf"],
    "C": [100.0, 300.0, 1000.0],
    "gamma": [0.3, 1.0, 3.0],
}
param_grid = [linear_grid, rbf_grid]

# 5-fold cross-validation over every candidate, scored by negated MSE
# (scikit-learn maximizes scores, hence the sign).
grid_search = GridSearchCV(
    estimator=svm_reg,
    param_grid=param_grid,
    scoring="neg_mean_squared_error",
    cv=5,
    return_train_score=True,
)
grid_search.fit(housing_prepared, housing_labels)

# Best hyperparameters and the estimator refit with them.
grid_search.best_params_
grid_search.best_estimator_

# Report the cross-validated RMSE of each candidate: flip the sign back to a
# positive MSE, then take the square root.
cvres = grid_search.cv_results_
for mean_score, params in zip(cvres["mean_test_score"], cvres["params"]):
    rmse = np.sqrt(-mean_score)
    print(rmse, params)

GridSearchCV(cv=5, estimator=SVR(),
             param_grid=[{'C': [3000.0, 10000.0, 30000.0],
                          'kernel': ['linear']},
                         {'C': [100.0, 300.0, 1000.0], 'gamma': [0.3, 1.0, 3.0],
                          'kernel': ['rbf']}],
             return_train_score=True, scoring='neg_mean_squared_error')

{'C': 30000.0, 'kernel': 'linear'}

SVR(C=30000.0, kernel='linear')

70323.59018383716 {'C': 3000.0, 'kernel': 'linear'}
70292.42766659604 {'C': 10000.0, 'kernel': 'linear'}
70286.61835383571 {'C': 30000.0, 'kernel': 'linear'}
106511.75079097725 {'C': 100.0, 'gamma': 0.3, 'kernel': 'rbf'}
115840.14747601148 {'C': 100.0, 'gamma': 1.0, 'kernel': 'rbf'}
118403.2105798184 {'C': 100.0, 'gamma': 3.0, 'kernel': 'rbf'}
95226.92638084691 {'C': 300.0, 'gamma': 0.3, 'kernel': 'rbf'}
110694.78952746006 {'C': 300.0, 'gamma': 1.0, 'kernel': 'rbf'}
117422.27876448397 {'C': 300.0, 'gamma': 3.0, 'kernel': 'rbf'}
81560.7141584214 {'C': 1000.0, 'gamma': 0.3, 'kernel': 'rbf'}
100822.48562769855 {'C': 1000.0, 'gamma': 1.0, 'kernel': 'rbf'}
114293.72754944804 {'C': 1000.0, 'gamma': 3.0, 'kernel': 'rbf'}
