# Prediction of median housing price 

This example includes only data preparation and training with the SVM model. Data exploration was skipped.

In [None]:
import os
import tarfile
import urllib
import pandas as pd

In [3]:
housing = pd.read_csv(r'C:\Users\saltanat.ospanova\Dropbox\AI_book_exercises\datasets\housing\housing.csv')

In [4]:
housing.head()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
0,-122.23,37.88,41.0,880.0,129.0,322.0,126.0,8.3252,452600.0,NEAR BAY
1,-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,358500.0,NEAR BAY
2,-122.24,37.85,52.0,1467.0,190.0,496.0,177.0,7.2574,352100.0,NEAR BAY
3,-122.25,37.85,52.0,1274.0,235.0,558.0,219.0,5.6431,341300.0,NEAR BAY
4,-122.25,37.85,52.0,1627.0,280.0,565.0,259.0,3.8462,342200.0,NEAR BAY


In [5]:
import numpy as np

Median income showed high correlation with median house value. Therefore, median income was divided into categories in order to create stratified sampling. 

In [6]:
housing['income_cat'] = pd.cut(housing['median_income'],
                              bins=[0., 1.5, 3.0, 4.5, 6., np.inf],
                              labels=[1,2,3,4,5])

In [7]:
!pip show sklearn

Name: sklearn
Version: 0.0
Summary: A set of python modules for machine learning and data mining
Home-page: https://pypi.python.org/pypi/scikit-learn/
Author: UNKNOWN
Author-email: UNKNOWN
License: 
Location: c:\users\saltanat.ospanova\appdata\local\packages\pythonsoftwarefoundation.python.3.9_qbz5n2kfra8p0\localcache\local-packages\python39\site-packages
Requires: scikit-learn
Required-by: 


In [8]:
import sys
sys.path.append(r'c:\users\saltanat.ospanova\appdata\local\packages\pythonsoftwarefoundation.python.3.9_qbz5n2kfra8p0\localcache\local-packages\python39\site-packages')

In [9]:
from sklearn.model_selection import StratifiedShuffleSplit

In [10]:
split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
for train_index, test_index in split.split(housing, housing['income_cat']):
    strat_train_set = housing.loc[train_index]
    strat_test_set = housing.loc[test_index]

In [11]:
for set_ in (strat_train_set, strat_test_set):
    set_.drop('income_cat', axis=1, inplace=True)

In [12]:
housing = strat_train_set.drop('median_house_value', axis=1).copy()
housing_labels = strat_train_set['median_house_value'].copy()

Filling missing values with attribute's median value

In [13]:
from sklearn.impute import SimpleImputer
imputer = SimpleImputer(strategy='median')

In [14]:
housing_num = housing.drop('ocean_proximity', axis=1)
imputer.fit(housing_num)

SimpleImputer(strategy='median')

In [15]:
X = imputer.transform(housing_num)
housing_tr = pd.DataFrame(X, columns=housing_num.columns, index=housing_num.index)

In [16]:
housing_cat = housing[['ocean_proximity']]

from sklearn.preprocessing import OneHotEncoder
cat_encoder = OneHotEncoder()
housing_cat_1hot = cat_encoder.fit_transform(housing_cat)
cat_encoder.categories_

[array(['<1H OCEAN', 'INLAND', 'ISLAND', 'NEAR BAY', 'NEAR OCEAN'],
       dtype=object)]

In [17]:
from sklearn.base import BaseEstimator, TransformerMixin

rooms_ix, bedrooms_ix, population_ix, households_ix = 3, 4, 5, 6

class CombinedAttributesAdder(BaseEstimator, TransformerMixin):
    def __init__(self, add_bedrooms_per_room = True): 
        self.add_bedrooms_per_room = add_bedrooms_per_room
    def fit(self, X, y=None):
        return self 
    def transform(self, X, y=None):
        rooms_per_household = X[:, rooms_ix] / X[:, households_ix]
        population_per_household = X[:, population_ix] / X[:, households_ix]
        if self.add_bedrooms_per_room:
            bedrooms_per_room = X[:, bedrooms_ix] / X[:, rooms_ix]
            return np.c_[X, rooms_per_household, population_per_household,
            bedrooms_per_room]
        else:
            return np.c_[X, rooms_per_household, population_per_household]


In [18]:
attr_adder = CombinedAttributesAdder(add_bedrooms_per_room=False)
housing_extra_attribs = attr_adder.transform(housing.values)

In [19]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

num_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('attribs_adder', CombinedAttributesAdder()),
    ('std_scaler', StandardScaler()),
])
housing_num_tr = num_pipeline.fit_transform(housing_num)

In [20]:
from sklearn.compose import ColumnTransformer

num_attr = list(housing_num)
cat_attr = ['ocean_proximity']

full_pipeline = ColumnTransformer([
    ('num_tr', num_pipeline, num_attr),
    ('cat_tr', OneHotEncoder(), cat_attr),
])

housing_prepared = full_pipeline.fit_transform(housing)

### Example of using Support Vector Machine regressor with GridSearchCV and RandomizedSearchCV.

In [21]:
from sklearn.svm import SVR

In [22]:
from sklearn.model_selection import cross_val_score

In [23]:
from sklearn.model_selection import GridSearchCV

param_grid = [
    {'kernel': ['linear'], 'C': [1, 50, 150, 500, 1001]},
    {'kernel': ['rbf'], 'C':[1, 10, 150, 500, 1001, 10000],
    'gamma': [0.01, 0.05, 0.1, 0.5, 1.0, 5.0]}
]

svr_reg = SVR()

grid = GridSearchCV(estimator=svr_reg, param_grid=param_grid, cv=5,
                       scoring='neg_mean_squared_error')

In [24]:
grid.fit(housing_prepared, housing_labels)
grid.best_params_

{'C': 10000, 'gamma': 0.1, 'kernel': 'rbf'}

In [25]:
cvres = grid.cv_results_
for mean_score, params in zip(cvres['mean_test_score'], cvres['params']):
    print(np.sqrt(-mean_score), params)

112571.06378605746 {'C': 1, 'kernel': 'linear'}
73258.54197846391 {'C': 50, 'kernel': 'linear'}
71130.88663317841 {'C': 150, 'kernel': 'linear'}
70491.91398523253 {'C': 500, 'kernel': 'linear'}
70396.49208778022 {'C': 1001, 'kernel': 'linear'}
118819.34364522224 {'C': 1, 'gamma': 0.01, 'kernel': 'rbf'}
118651.46094146419 {'C': 1, 'gamma': 0.05, 'kernel': 'rbf'}
118643.66544284696 {'C': 1, 'gamma': 0.1, 'kernel': 'rbf'}
118837.96076770847 {'C': 1, 'gamma': 0.5, 'kernel': 'rbf'}
118898.89058474178 {'C': 1, 'gamma': 1.0, 'kernel': 'rbf'}
118937.209249996 {'C': 1, 'gamma': 5.0, 'kernel': 'rbf'}
117862.25734600889 {'C': 10, 'gamma': 0.01, 'kernel': 'rbf'}
116264.9204200056 {'C': 10, 'gamma': 0.05, 'kernel': 'rbf'}
116181.25173057283 {'C': 10, 'gamma': 0.1, 'kernel': 'rbf'}
118024.45519726495 {'C': 10, 'gamma': 0.5, 'kernel': 'rbf'}
118591.6498917307 {'C': 10, 'gamma': 1.0, 'kernel': 'rbf'}
118913.93622504613 {'C': 10, 'gamma': 5.0, 'kernel': 'rbf'}
104979.3331516692 {'C': 150, 'gamma': 0.01

In [27]:
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import expon, reciprocal

param_dist = {
        'kernel': ['linear', 'rbf'],
        'C': reciprocal(20, 200000),
        'gamma': expon(scale=1.0),
    }

random_cv = RandomizedSearchCV(estimator=svr_reg, param_distributions=param_dist, 
                               cv=5, scoring='neg_mean_squared_error')
random_cv.fit(housing_prepared, housing_labels)

RandomizedSearchCV(cv=5, estimator=SVR(),
                   param_distributions={'C': <scipy.stats._distn_infrastructure.rv_frozen object at 0x00000192148FD250>,
                                        'gamma': <scipy.stats._distn_infrastructure.rv_frozen object at 0x000001921D4634F0>,
                                        'kernel': ['linear', 'rbf']},
                   scoring='neg_mean_squared_error')

In [28]:
random_cv.best_params_

{'C': 169561.24535404285, 'gamma': 0.620965976302143, 'kernel': 'rbf'}

In [30]:
negative_mse = random_cv.best_score_
rmse = np.sqrt(-negative_mse)
rmse

56094.94787394007

### Final pipeline for data preparation and prediction

In [32]:
final_pipeline = Pipeline([
    ('prep', full_pipeline),
    ('pred', random_cv.best_estimator_)
])