In [2]:
import os
import tarfile
import urllib
import pandas as pd
import numpy as np
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import cross_val_score
from sklearn.svm import SVR
import joblib
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV

Reading Data

In [3]:
DOWNLOAD_ROOT = "https://raw.githubusercontent.com/ageron/handson-ml2/master/"
HOUSING_PATH = os.path.join("datasets", "housing")
HOUSING_URL = DOWNLOAD_ROOT + "datasets/housing/housing.tgz"

def fetch_housing_data(housing_url=HOUSING_URL, housing_path=HOUSING_PATH):
    os.makedirs(housing_path, exist_ok=True)
    tgz_path = os.path.join(housing_path, "housing.tgz")
    urllib.request.urlretrieve(housing_url, tgz_path)
    housing_tgz = tarfile.open(tgz_path)
    housing_tgz.extractall(path=housing_path)
    housing_tgz.close()
fetch_housing_data()


def load_housing_data(housing_path=HOUSING_PATH):
    csv_path = os.path.join(housing_path, "housing.csv")
    return pd.read_csv(csv_path)
housing=load_housing_data()

Creating Stratified Train Test Split based on Household median income 

In [4]:
housing["income_cat"] = pd.cut(housing["median_income"],
                               bins=[0., 1.5, 3.0, 4.5, 6., np.inf],
                               labels=[1, 2, 3, 4, 5])
split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
for train_index, test_index in split.split(housing, housing["income_cat"]):
    strat_train_set = housing.loc[train_index]
    strat_test_set = housing.loc[test_index]

In [5]:
for set_ in (strat_train_set, strat_test_set):
    set_.drop("income_cat", axis=1, inplace=True)

housing = strat_train_set.drop("median_house_value", axis=1)
housing_labels = strat_train_set["median_house_value"].copy()

Below Imputes all the numerical variables with the median, 3 additional transformed variables and categorical variable into dummy variables 

In [6]:

rooms_ix, bedrooms_ix, population_ix, households_ix = 3, 4, 5, 6

class CombinedAttributesAdder(BaseEstimator, TransformerMixin):
    def __init__(self, add_bedrooms_per_room = True): # no *args or **kargs
        self.add_bedrooms_per_room = add_bedrooms_per_room
    def fit(self, X, y=None):
        return self  # nothing else to do
    def transform(self, X):
        rooms_per_household = X[:, rooms_ix] / X[:, households_ix]
        population_per_household = X[:, population_ix] / X[:, households_ix]
        if self.add_bedrooms_per_room:
            bedrooms_per_room = X[:, bedrooms_ix] / X[:, rooms_ix]
            return np.c_[X, rooms_per_household, population_per_household,
                         bedrooms_per_room]

        else:
            return np.c_[X, rooms_per_household, population_per_household]

attr_adder = CombinedAttributesAdder(add_bedrooms_per_room=False)



num_pipeline = Pipeline([
        ('imputer', SimpleImputer(strategy="median")),
        ('attribs_adder', CombinedAttributesAdder()),
        ('std_scaler', StandardScaler()),
    ])
housing_num = housing.drop("ocean_proximity", axis=1)
housing_num_tr = num_pipeline.fit_transform(housing_num)


num_attribs = list(housing_num)
cat_attribs = ["ocean_proximity"]

full_pipeline = ColumnTransformer([
        ("num", num_pipeline, num_attribs),
        ("cat", OneHotEncoder(), cat_attribs),
    ])

housing_prepared = full_pipeline.fit_transform(housing)

In [7]:

svm_ = SVR(kernel='linear')
svm_.fit(housing_prepared, housing_labels)

SVR(C=1.0, cache_size=200, coef0=0.0, degree=3, epsilon=0.1,
    gamma='auto_deprecated', kernel='linear', max_iter=-1, shrinking=True,
    tol=0.001, verbose=False)

In [8]:
def display_scores(scores):
    print("Scores:", scores)
    print("Mean:", scores.mean())
    print("Standard deviation:", scores.std())


In [9]:
housing_predictions = svm_.predict(housing_prepared)
scores = cross_val_score(svm_, housing_prepared, housing_labels,
                         scoring="neg_mean_squared_error", cv=10)
svm_rmse_scores = np.sqrt(-scores)
display_scores(svm_rmse_scores) 

Scores: [105342.09141998 112489.24624123 110092.35042753 113403.22892482
 110638.90119657 115675.8320024  110703.56887243 114476.89008206
 113756.17971227 111520.1120808 ]
Mean: 111809.84009600841
Standard deviation: 2762.393664321567


In [10]:
joblib.dump(svm_, "svm_.pkl")

['svm_.pkl']

In [12]:
parameters = {'kernel': ('linear', 'rbf'), 'C': range(1, 10)}
svm_ = SVR(gamma='auto')
clf = RandomizedSearchCV(svm_, parameters,n_iter=5, random_state=4, cv=5,
                           scoring='neg_mean_squared_error',
                           return_train_score=True)
clf.fit(housing_prepared, housing_labels)

RandomizedSearchCV(cv=5, error_score='raise-deprecating',
                   estimator=SVR(C=1.0, cache_size=200, coef0=0.0, degree=3,
                                 epsilon=0.1, gamma='auto', kernel='rbf',
                                 max_iter=-1, shrinking=True, tol=0.001,
                                 verbose=False),
                   iid='warn', n_iter=5, n_jobs=None,
                   param_distributions={'C': range(1, 10),
                                        'kernel': ('linear', 'rbf')},
                   pre_dispatch='2*n_jobs', random_state=4, refit=True,
                   return_train_score=True, scoring='neg_mean_squared_error',
                   verbose=0)

In [13]:
clf.best_params_

{'kernel': 'linear', 'C': 9}

In [14]:
clf.best_estimator_

SVR(C=9, cache_size=200, coef0=0.0, degree=3, epsilon=0.1, gamma='auto',
    kernel='linear', max_iter=-1, shrinking=True, tol=0.001, verbose=False)

Full pipeline - Data prep and Scoring 

In [15]:


parameters = {'kernel': ('linear', 'rbf'), 'C': range(1, 10)}
num_attribs = list(housing_num)
cat_attribs = ["ocean_proximity"]
t=  [("cat", OneHotEncoder(), cat_attribs), ("num", num_pipeline, num_attribs)]
col_transform = ColumnTransformer(transformers=t)
svm_ = SVR(gamma='auto')
clf = RandomizedSearchCV(svm_, parameters,n_iter=5, random_state=4, cv=5,
                           scoring='neg_mean_squared_error',
                           return_train_score=True)
pipeline = Pipeline(steps=[('prep',col_transform), ('m', clf)])
scores = cross_val_score(pipeline, housing,housing_labels, scoring='neg_mean_absolute_error',cv=5)
Final_rmse_scores = np.sqrt(-scores)
display_scores(Final_rmse_scores)   



Scores: [242.85444128 245.30843271 248.25687042 245.73437052 246.53136196]
Mean: 245.73709537963646
Standard deviation: 1.7591804561639326
