# Downloading the data

In [1]:
import os
import tarfile
import urllib
import urllib.request

In [2]:
# Remote location of the compressed housing dataset archive.
HOUSING_URL = "https://raw.githubusercontent.com/ageron/handson-ml/master/datasets/housing/housing.tgz"
# Directory where the archive is saved and unpacked (default target of
# fetch_housing_data and default source of load_housing_data).
HOUSING_PATH = os.path.join('../../data/raw')

In [3]:
def fetch_housing_data(housing_url=HOUSING_URL, housing_path=HOUSING_PATH):
    """Download the housing archive and unpack it into ``housing_path``.

    Parameters
    ----------
    housing_url : str
        URL of the ``.tgz`` archive to download.
    housing_path : str
        Directory that receives ``housing.tgz`` and the extracted files;
        it is created if it does not exist yet.
    """
    os.makedirs(housing_path, exist_ok=True)
    tgz_path = os.path.join(housing_path, "housing.tgz")
    urllib.request.urlretrieve(housing_url, tgz_path)
    # The context manager closes the archive even when extraction raises.
    # NOTE(review): extractall trusts the member paths stored in the archive;
    # only point this at archives from a trusted source.
    with tarfile.open(tgz_path) as housing_tgz:
        housing_tgz.extractall(path=housing_path)

In [4]:
# Download and unpack the dataset using the default URL and target directory.
fetch_housing_data()

In [5]:
import pandas as pd

In [6]:
def load_housing_data(housing_path=HOUSING_PATH):
    """Read ``housing.csv`` located in ``housing_path`` with ``pd.read_csv``."""
    return pd.read_csv(os.path.join(housing_path, "housing.csv"))

In [7]:
# Load the extracted CSV from the default data directory.
housing = load_housing_data()

# Creating the training and test sets

In [8]:
import numpy as np

In [9]:
# Bucket median_income into five labelled categories (1-5) using the bin edges
# below; the resulting column is what the train/test split stratifies on.
housing["income_cat"] = pd.cut(housing["median_income"],
                               bins=[0., 1.5, 3.0, 4.5, 6., np.inf],
                               labels=[1, 2, 3, 4, 5])

In [10]:
from sklearn.model_selection import StratifiedShuffleSplit

# One shuffled 80/20 split that preserves the income_cat proportions.
split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
# n_splits=1, so take the single (train, test) index pair directly.
train_index, test_index = next(split.split(housing, housing["income_cat"]))
# The splitter yields positional indices, so rows are selected by position
# (iloc) rather than by label (loc); the two only agree for a default
# RangeIndex. The copies make both sets independent frames that can be
# modified later without touching (or warning about) `housing`.
strat_train_set = housing.iloc[train_index].copy()
strat_test_set = housing.iloc[test_index].copy()

In [11]:
# The helper column is only needed for stratification; remove it from both
# sets. Reassigning the result (instead of inplace=True) avoids mutating
# frames in place and leaves no stray loop variable in the namespace.
strat_train_set = strat_train_set.drop(columns="income_cat")
strat_test_set = strat_test_set.drop(columns="income_cat")

# Preparing the Data

In [12]:
# Separate the training set into the target column and the predictors.
housing_labels = strat_train_set["median_house_value"].copy()
housing = strat_train_set.drop(columns="median_house_value")

In [13]:
# Predictors without the ocean_proximity column; the remaining column names
# are the ones routed to the numeric pipeline.
housing_num = housing.drop(columns="ocean_proximity")

In [14]:
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

In [15]:
from sklearn.base import BaseEstimator, TransformerMixin

# Column positions, within the array handed to the transformer, of the
# attributes that get combined into ratio features.
rooms_ix, bedrooms_ix, population_ix, households_ix = 3, 4, 5, 6


class CombinedAttributesAdder(BaseEstimator, TransformerMixin):
    """Append ratio features as extra columns to a 2-D array.

    Always adds rooms-per-household and population-per-household; adds
    bedrooms-per-room as well when ``add_bedrooms_per_room`` is true.
    The source columns are located through the module-level ``*_ix`` indices.
    """

    def __init__(self, add_bedrooms_per_room=True):  # no *args or **kargs
        self.add_bedrooms_per_room = add_bedrooms_per_room

    def fit(self, X, y=None):
        """Nothing to learn; return the transformer itself."""
        return self

    def transform(self, X):
        """Return ``X`` with the ratio columns appended on the right."""
        households = X[:, households_ix]
        rooms = X[:, rooms_ix]
        extra_columns = [rooms / households,
                         X[:, population_ix] / households]
        if self.add_bedrooms_per_room:
            extra_columns.append(X[:, bedrooms_ix] / rooms)
        # Indexing np.c_ with a tuple is the same as listing the items inline.
        return np.c_[(X, *extra_columns)]

# Standalone run of the transformer outside any pipeline, configured to skip
# the bedrooms-per-room column.
attr_adder = CombinedAttributesAdder(add_bedrooms_per_room=False)
housing_extra_attribs = attr_adder.transform(housing.values)

In [16]:
# Column groups handled by each branch of the preprocessing.
num_attribs = list(housing_num)
cat_attribs = ["ocean_proximity"]

# Numeric branch: median imputation, ratio features, then standardization.
num_pipeline = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="median")),
    ("attribs_adder", CombinedAttributesAdder()),
    ("std_scaler", StandardScaler()),
])

# Send numeric columns through num_pipeline and one-hot encode the category.
full_pipeline = ColumnTransformer(transformers=[
    ("num", num_pipeline, num_attribs),
    ("cat", OneHotEncoder(), cat_attribs),
])

housing_prepared = full_pipeline.fit_transform(housing)

# Training the Model

In [17]:
from sklearn.model_selection import RandomizedSearchCV
from sklearn.svm import SVR

In [18]:
# Candidate hyperparameters: an RBF-kernel space over gamma and C, and a
# linear-kernel space over C only.
param_grid = [
    {"kernel": ["rbf"], "gamma": [1e-1, 1e-2, 1e-3, 1e-4], "C": [1, 10, 100, 1000]},
    {"kernel": ["linear"], "C": [1, 10, 100, 1000, 10000]},
]

svm_reg = SVR()

# Draw 12 candidates from the spaces above and score each with 5-fold
# cross-validation on negative MSE; random_state makes the draw repeatable.
random_search = RandomizedSearchCV(svm_reg, param_grid, cv=5, n_iter=12,
                           scoring='neg_mean_squared_error',
                           return_train_score=True,
                           random_state=0)

random_search.fit(housing_prepared, housing_labels)

RandomizedSearchCV(cv=5, estimator=SVR(), n_iter=12,
                   param_distributions=[{'C': [1, 10, 100, 1000],
                                         'gamma': [0.1, 0.01, 0.001, 0.0001],
                                         'kernel': ['rbf']},
                                        {'C': [1, 10, 100, 1000, 10000],
                                         'kernel': ['linear']}],
                   random_state=0, return_train_score=True,
                   scoring='neg_mean_squared_error')

In [19]:
# Parameter combination that achieved the best mean cross-validated score.
random_search.best_params_

{'kernel': 'linear', 'C': 10000}

In [20]:
# Estimator configured with the best parameters found by the search.
random_search.best_estimator_

SVR(C=10000, kernel='linear')

In [21]:
# The search scores with negative MSE, so negate each mean test score before
# taking the square root to print an RMSE per sampled candidate.
cvres = random_search.cv_results_
for mean_score, params in zip(cvres["mean_test_score"], cvres["params"]):
    print(np.sqrt(-mean_score), params)

98574.68232621568 {'kernel': 'rbf', 'gamma': 0.1, 'C': 100}
78409.42560534432 {'kernel': 'rbf', 'gamma': 0.01, 'C': 1000}
70292.4276504136 {'kernel': 'linear', 'C': 10000}
118819.34364522224 {'kernel': 'rbf', 'gamma': 0.01, 'C': 1}
118792.37077576606 {'kernel': 'rbf', 'gamma': 0.0001, 'C': 100}
117620.69007715455 {'kernel': 'rbf', 'gamma': 0.001, 'C': 100}
107354.76152040453 {'kernel': 'rbf', 'gamma': 0.001, 'C': 1000}
71635.55360917222 {'kernel': 'linear', 'C': 100}
118795.07721129213 {'kernel': 'rbf', 'gamma': 0.001, 'C': 10}
70396.4975696743 {'kernel': 'linear', 'C': 1000}
116181.25173057283 {'kernel': 'rbf', 'gamma': 0.1, 'C': 10}
118924.31070087965 {'kernel': 'rbf', 'gamma': 0.001, 'C': 1}


In [22]:
from sklearn.metrics import mean_squared_error

final_model = random_search.best_estimator_

# Held-out target and predictors.
y_test = strat_test_set["median_house_value"].copy()
X_test = strat_test_set.drop(columns="median_house_value")

# transform (not fit_transform): reuse the preprocessing fitted on the
# training data.
X_test_prepared = full_pipeline.transform(X_test)
final_predictions = final_model.predict(X_test_prepared)

final_mse = mean_squared_error(y_test, final_predictions)
final_rmse = np.sqrt(final_mse)

In [23]:
# Root mean squared error of the selected model on the test set.
final_rmse

68229.03414740493

In [24]:
from scipy import stats
confidence = 0.95
squared_errors = (final_predictions - y_test) ** 2
# t-interval around the mean squared error, using the standard error of the
# per-sample squared errors.
mse_interval = stats.t.interval(
    confidence,
    len(squared_errors) - 1,
    loc=squared_errors.mean(),
    scale=stats.sem(squared_errors),
)
# Square root maps the MSE bounds to RMSE bounds.
np.sqrt(mse_interval)

array([65818.77417793, 70557.00650601])