SVM Regression on the California Housing Dataset

In [4]:
import numpy as np
import pandas as pd
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder

import os
import tarfile
from six.moves import urllib
# import numpy as np
# import pandas as pd
# from sklearn.model_selection import StratifiedShuffleSplit

# Local directory where the housing archive is downloaded and unpacked.
HOUSING_PATH = os.path.join("datasets", "housing")

def fetch_housing_data(housing_url, housing_path=HOUSING_PATH):
    """Download the housing archive and extract it into ``housing_path``.

    Parameters
    ----------
    housing_url : str
        URL of the ``.tgz`` archive to download.
    housing_path : str, optional
        Directory that receives both ``housing.tgz`` and its extracted
        contents. It is created (including parents) when missing.
    """
    # exist_ok avoids the check-then-create race of isdir() + makedirs().
    os.makedirs(housing_path, exist_ok=True)
    tgz_path = os.path.join(housing_path, "housing.tgz")
    urllib.request.urlretrieve(housing_url, tgz_path)
    # The context manager closes the archive even when extraction raises.
    with tarfile.open(tgz_path) as housing_tgz:
        if hasattr(tarfile, "data_filter"):
            # The archive comes from the network: the "data" filter rejects
            # members that would land outside housing_path, device files and
            # unsafe links.
            housing_tgz.extractall(path=housing_path, filter="data")
        else:
            # Older interpreters have no extraction filters.
            housing_tgz.extractall(path=housing_path)

def load_housing_data(housing_path=HOUSING_PATH):
    """Read ``housing.csv`` from ``housing_path`` and return it as a DataFrame."""
    return pd.read_csv(os.path.join(housing_path, "housing.csv"))

# Source archive: housing.tgz from the ageron/handson-ml2 GitHub repository.
HOUSING_URL = (
    "https://raw.githubusercontent.com/ageron/"
    "handson-ml2/master/datasets/housing/housing.tgz"
)
# Download and unpack into the default HOUSING_PATH, then read the CSV from it.
fetch_housing_data(HOUSING_URL)
data = load_housing_data()

# Bucket median income into strata so the train/test split preserves the
# income distribution: ceil(income / 1.5), with every category from 5 upward
# merged into 5.
income_cat = np.ceil(data["median_income"] / 1.5)
# Assign the capped result back explicitly. The earlier
# `data["income_cat"].where(..., inplace=True)` mutated a column selected from
# the frame, which does not update `data` under pandas Copy-on-Write.
data["income_cat"] = income_cat.where(income_cat < 5, 5.0)

split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
# n_splits=1, so the generator yields exactly one (train, test) index pair.
train_index, test_index = next(split.split(data, data["income_cat"]))

# split() returns positional indices, hence .iloc; .loc only matched while
# `data` carried the default 0..n-1 index.
# drop() without inplace returns new frames, so no slice of `data` is mutated.
strat_train_set = data.iloc[train_index].drop(columns="income_cat")
strat_test_set = data.iloc[test_index].drop(columns="income_cat")

# Training features and target.
X_raw = strat_train_set.drop(columns="median_house_value")
y = strat_train_set["median_house_value"].copy()

# Every column except the single categorical one is treated as numeric.
cat_attribs = ["ocean_proximity"]
num_features = X_raw.drop(columns=cat_attribs)
num_attribs = num_features.columns.tolist()

# Numeric columns: fill missing values with the column median, then standardize.
num_pipeline = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="median")),
    ("std_scaler", StandardScaler()),
])

# Numeric columns go through num_pipeline; the categorical column is one-hot
# encoded.
full_pipeline = ColumnTransformer(transformers=[
    ("num", num_pipeline, num_attribs),
    ("cat", OneHotEncoder(), cat_attribs),
])

# Held-out split: separate the target from the features.
y_test = strat_test_set["median_house_value"].copy()
X_test_raw = strat_test_set.drop(columns="median_house_value")

# Fit the preprocessing on the training features only, then reuse the fitted
# transformer on the test features.
X = full_pipeline.fit_transform(X_raw)
X_test = full_pipeline.transform(X_test_raw)

In [5]:
from sklearn.svm import SVR
# Both regressors share the same solver iteration cap.
SVR_MAX_ITER = 50000
svm_rbf_reg = SVR(kernel="rbf", max_iter=SVR_MAX_ITER)  # "rbf" is SVR's default kernel
svm_sigmoid_reg = SVR(kernel="sigmoid", max_iter=SVR_MAX_ITER)

In [7]:
from scipy.stats import reciprocal, uniform
from sklearn.model_selection import RandomizedSearchCV

# gamma is sampled log-uniformly from [0.001, 0.1]; scipy's uniform(loc, scale)
# covers [loc, loc + scale], so C is drawn from [1, 11].
param_distributions = {
    "gamma": reciprocal(0.001, 0.1),
    "C": uniform(1, 10),
}
rnd_search_cv_rbf = RandomizedSearchCV(
    estimator=svm_rbf_reg,
    param_distributions=param_distributions,
    n_iter=10,
    cv=3,
    scoring="neg_mean_squared_error",
    random_state=42,
    n_jobs=-1,
)
rnd_search_cv_rbf.fit(X, y)

# The scorer is negated MSE (higher is better), so flip the sign for display.
best_mse_rbf = -rnd_search_cv_rbf.best_score_
print(f'Best Parameters for rbf kernel:{rnd_search_cv_rbf.best_params_}')
print(f'best MSE score for rbf kernel: {best_mse_rbf}')

Best Parameters for rbf kernel:{'C': 4.745401188473625, 'gamma': 0.07969454818643928}
best MSE score for rbf kernel: 13877012057.239176


In [8]:
# Same search space as the RBF search: log-uniform gamma over [0.001, 0.1] and
# C uniform over [1, 11] (scipy's uniform takes loc and scale).
gamma_dist = reciprocal(0.001, 0.1)
c_dist = uniform(1, 10)
param_distributions = {"gamma": gamma_dist, "C": c_dist}

search_options = dict(
    n_iter=10,
    cv=3,
    scoring="neg_mean_squared_error",
    random_state=42,
    n_jobs=-1,
)
rnd_search_cv_sigmoid = RandomizedSearchCV(
    svm_sigmoid_reg, param_distributions, **search_options
)
rnd_search_cv_sigmoid.fit(X, y)

# best_score_ is negated MSE; negate it again to report a positive MSE.
best_mse_sigmoid = -rnd_search_cv_sigmoid.best_score_
print(f'Best Parameters for sigmoid kernel:{rnd_search_cv_sigmoid.best_params_}')
print(f'best MSE score for sigmoid kernel: {best_mse_sigmoid}')

Best Parameters for sigmoid kernel:{'C': 4.745401188473625, 'gamma': 0.07969454818643928}
best MSE score for sigmoid kernel: 13744315287.792427


In [9]:
# best_score_ holds negated MSE; flip the sign so that lower means better.
score_rbf = -rnd_search_cv_rbf.best_score_
score_sigmoid = -rnd_search_cv_sigmoid.best_score_

# A tie is reported in favour of RBF because the comparison is strict.
if score_sigmoid < score_rbf:
    verdict = "Sigmoid best estimator works better than RBF best estimator"
else:
    verdict = "RBF best estimator works better than Sigmoid best estimator"
print(verdict)


Sigmoid best estimator works better than RBF best estimator


In [11]:
from sklearn.metrics import mean_squared_error

# Evaluate whichever search achieved the better cross-validated score instead
# of hard-coding the sigmoid search. best_score_ is negated MSE, so larger is
# better; on a tie max() keeps the first candidate (RBF).
best_search = max(
    (rnd_search_cv_rbf, rnd_search_cv_sigmoid),
    key=lambda search: search.best_score_,
)
final_model = best_search.best_estimator_
final_pred = final_model.predict(X_test)
# Root of the test-set mean squared error.
final_rmse = np.sqrt(mean_squared_error(y_test, final_pred))
print(final_rmse)


114741.28850451492
