In [24]:
from sklearn.base import BaseEstimator, TransformerMixin
import numpy as np
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedShuffleSplit
import os
import tarfile
import urllib
import pandas as pd
from sklearn.svm import SVR
from sklearn.metrics import mean_squared_error

In [12]:
### Getting housing data

In [9]:
DOWNLOAD_ROOT = "https://raw.githubusercontent.com/ageron/handson-ml2/master/"
HOUSING_PATH = os.path.join("datasets", "housing")
HOUSING_URL = DOWNLOAD_ROOT + "datasets/housing/housing.tgz"

def fetch_housing_data(housing_url=HOUSING_URL, housing_path=HOUSING_PATH):
    os.makedirs(housing_path, exist_ok=True)
    tgz_path = os.path.join(housing_path, "housing.tgz")
    urllib.request.urlretrieve(housing_url, tgz_path)
    housing_tgz = tarfile.open(tgz_path)
    housing_tgz.extractall(path=housing_path)
    housing_tgz.close()

In [10]:
def load_housing_data(housing_path=HOUSING_PATH):
    csv_path = os.path.join(housing_path, "housing.csv")
    return pd.read_csv(csv_path)

In [11]:
fetch_housing_data()
housing = load_housing_data()

In [14]:
housing["income_cat"] = pd.cut(housing["median_income"],
                               bins=[0., 1.5, 3.0, 4.5, 6., np.inf],
                               labels=[1, 2, 3, 4, 5])

In [15]:
split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
for train_index, test_index in split.split(housing, housing["income_cat"]):
    strat_train_set = housing.loc[train_index]
    strat_test_set = housing.loc[test_index]

In [16]:
for set_ in (strat_train_set, strat_test_set):
    set_.drop("income_cat", axis=1, inplace=True)

In [17]:
housing = strat_train_set.copy()

In [18]:
housing = strat_train_set.drop("median_house_value", axis=1)
housing_labels = strat_train_set["median_house_value"].copy()

In [19]:


rooms_ix, bedrooms_ix, population_ix, households_ix = 3, 4, 5, 6

class CustomAttributesAdder(BaseEstimator, TransformerMixin):
    def __init__(self, add_bedrooms_per_room = True, add_rooms_per_household = True, add_population_per_household=True):
        self.add_bedrooms_per_room = add_bedrooms_per_room
        self.add_rooms_per_household = add_rooms_per_household
        self.add_population_per_household = add_population_per_household

    # The fit method just returns self
    def fit(self, X, y=None):
        return self

    # Using the transform method attributes which are required can be added
    def transform(self, X):
        new_attri= X
        if self.add_bedrooms_per_room:
            bedrooms_per_room = X[:, bedrooms_ix] / X[:, rooms_ix]
            new_attri= np.c_[new_attri, bedrooms_per_room]
            print(new_attri)
        if self.add_rooms_per_household:
            rooms_per_household = X[:, rooms_ix] / X[:, households_ix]
            new_attri= np.c_[new_attri, rooms_per_household]

        if self.add_population_per_household:
            population_per_household = X[:, population_ix] / X[:, households_ix]
            new_attri= np.c_[new_attri, population_per_household]
        return new_attri
new_attri = CustomAttributesAdder()
housing_new_attribs = new_attri.transform(housing.values)

[[-121.46 38.52 29.0 ... 2.1736 'INLAND' 0.20578363026077975]
 [-117.23 33.09 7.0 ... 6.3373 'NEAR OCEAN' 0.16071428571428573]
 [-119.04 35.37 44.0 ... 2.875 'INLAND' 0.1915945611866502]
 ...
 [-122.72 38.44 48.0 ... 3.1797 '<1H OCEAN' 0.2347949080622348]
 [-122.7 38.31 14.0 ... 4.1964 '<1H OCEAN' 0.1838351822503962]
 [-122.14 39.97 27.0 ... 3.1319 'INLAND' 0.2057460611677479]]


In [20]:
housing_num = housing.drop("ocean_proximity", axis=1)
num_attribs = list(housing_num)
cat_attribs = ["ocean_proximity"]

In [21]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

num_pipeline = Pipeline([
        ('imputer', SimpleImputer(strategy="median")),
        ('attribs_adder', CustomAttributesAdder()),
        ('std_scaler', StandardScaler()),
    ])

housing_num_tr = num_pipeline.fit_transform(housing_num)

[[-1.21460000e+02  3.85200000e+01  2.90000000e+01 ...  7.06000000e+02
   2.17360000e+00  2.05783630e-01]
 [-1.17230000e+02  3.30900000e+01  7.00000000e+00 ...  7.68000000e+02
   6.33730000e+00  1.60714286e-01]
 [-1.19040000e+02  3.53700000e+01  4.40000000e+01 ...  3.00000000e+02
   2.87500000e+00  1.91594561e-01]
 ...
 [-1.22720000e+02  3.84400000e+01  4.80000000e+01 ...  1.72000000e+02
   3.17970000e+00  2.34794908e-01]
 [-1.22700000e+02  3.83100000e+01  1.40000000e+01 ...  5.01000000e+02
   4.19640000e+00  1.83835182e-01]
 [-1.22140000e+02  3.99700000e+01  2.70000000e+01 ...  1.97000000e+02
   3.13190000e+00  2.05746061e-01]]


In [22]:
num_attribs = list(housing_num.columns)
cat_attribs = ["ocean_proximity"]

full_pipeline = ColumnTransformer([
        ("num", num_pipeline, num_attribs),
        ("cat", OneHotEncoder(), cat_attribs),
    ])

housing_prepared = full_pipeline.fit_transform(housing)

[[-1.21460000e+02  3.85200000e+01  2.90000000e+01 ...  7.06000000e+02
   2.17360000e+00  2.05783630e-01]
 [-1.17230000e+02  3.30900000e+01  7.00000000e+00 ...  7.68000000e+02
   6.33730000e+00  1.60714286e-01]
 [-1.19040000e+02  3.53700000e+01  4.40000000e+01 ...  3.00000000e+02
   2.87500000e+00  1.91594561e-01]
 ...
 [-1.22720000e+02  3.84400000e+01  4.80000000e+01 ...  1.72000000e+02
   3.17970000e+00  2.34794908e-01]
 [-1.22700000e+02  3.83100000e+01  1.40000000e+01 ...  5.01000000e+02
   4.19640000e+00  1.83835182e-01]
 [-1.22140000e+02  3.99700000e+01  2.70000000e+01 ...  1.97000000e+02
   3.13190000e+00  2.05746061e-01]]


In [30]:
def indices_of_top_k(arr, k):
    return np.argsort(arr)[-k:]

class SelectTopFeatures(BaseEstimator, TransformerMixin):
    def __init__(self, feature_importances, k):
        self.feature_importances = feature_importances
        self.k = k
    def fit(self, X, y=None):
        self.feature_indices_ = indices_of_top_k(self.feature_importances, self.k)
        return self
    def transform(self, X):
        return X[:, self.feature_indices_]

In [27]:
#  From exercise 2
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestRegressor
k = 5
best_params = {'C': 98.42308858067881, 'gamma': 4.677628932479799, 'kernel': 'linear'}
param_grid = [
    {'n_estimators': [3, 10, 30], 'max_features': [2, 4, 6, 8]},
    {'bootstrap': [False], 'n_estimators': [3, 10], 'max_features': [2, 3, 4]},
  ]

forest_reg = RandomForestRegressor()

grid_search = GridSearchCV(forest_reg, param_grid, cv=5,
                           scoring='neg_mean_squared_error',
                           return_train_score=True)

grid_search.fit(housing_prepared, housing_labels)

feature_importances = grid_search.best_estimator_.feature_importances_
feature_importances

array([8.60123366e-02, 8.35422601e-02, 4.23754452e-02, 2.51648530e-02,
       2.00209509e-02, 2.39123042e-02, 2.16166916e-02, 2.93320246e-01,
       6.54402507e-02, 7.36560241e-02, 9.26651000e-02, 2.36989994e-02,
       1.31439521e-01, 8.03835862e-05, 8.02497003e-03, 9.02966310e-03])

In [31]:
# The below pipeline combines the transformations for numerical and categorical pipelines.
full_pipeline = ColumnTransformer([
        ("num", Pipeline([
            ('imputer', SimpleImputer(strategy="median")),
            ('custom_attribs_adder', CustomAttributesAdder()),
            ('std_scaler', StandardScaler()),]), num_attribs),
        ("cat", OneHotEncoder(handle_unknown='ignore'), cat_attribs),
    ])
housing_prepared_new = full_pipeline.fit_transform(housing)

[[-1.21460000e+02  3.85200000e+01  2.90000000e+01 ...  7.06000000e+02
   2.17360000e+00  2.05783630e-01]
 [-1.17230000e+02  3.30900000e+01  7.00000000e+00 ...  7.68000000e+02
   6.33730000e+00  1.60714286e-01]
 [-1.19040000e+02  3.53700000e+01  4.40000000e+01 ...  3.00000000e+02
   2.87500000e+00  1.91594561e-01]
 ...
 [-1.22720000e+02  3.84400000e+01  4.80000000e+01 ...  1.72000000e+02
   3.17970000e+00  2.34794908e-01]
 [-1.22700000e+02  3.83100000e+01  1.40000000e+01 ...  5.01000000e+02
   4.19640000e+00  1.83835182e-01]
 [-1.22140000e+02  3.99700000e+01  2.70000000e+01 ...  1.97000000e+02
   3.13190000e+00  2.05746061e-01]]


In [33]:
combined_pipeline= Pipeline([
    ('data_preparation', full_pipeline),
    ('feature_selection', SelectTopFeatures(feature_importances, k)),
    ('svm_reg', SVR()),
])

# Setting the parameters for SVM which was got from randomised search
combined_pipeline.named_steps['svm_reg'].set_params(**best_params)
combined_pipeline.fit(housing, housing_labels)

[[-1.21460000e+02  3.85200000e+01  2.90000000e+01 ...  7.06000000e+02
   2.17360000e+00  2.05783630e-01]
 [-1.17230000e+02  3.30900000e+01  7.00000000e+00 ...  7.68000000e+02
   6.33730000e+00  1.60714286e-01]
 [-1.19040000e+02  3.53700000e+01  4.40000000e+01 ...  3.00000000e+02
   2.87500000e+00  1.91594561e-01]
 ...
 [-1.22720000e+02  3.84400000e+01  4.80000000e+01 ...  1.72000000e+02
   3.17970000e+00  2.34794908e-01]
 [-1.22700000e+02  3.83100000e+01  1.40000000e+01 ...  5.01000000e+02
   4.19640000e+00  1.83835182e-01]
 [-1.22140000e+02  3.99700000e+01  2.70000000e+01 ...  1.97000000e+02
   3.13190000e+00  2.05746061e-01]]


In [36]:
housing.head()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,ocean_proximity
12655,-121.46,38.52,29.0,3873.0,797.0,2237.0,706.0,2.1736,INLAND
15502,-117.23,33.09,7.0,5320.0,855.0,2015.0,768.0,6.3373,NEAR OCEAN
2908,-119.04,35.37,44.0,1618.0,310.0,667.0,300.0,2.875,INLAND
14053,-117.13,32.75,24.0,1877.0,519.0,898.0,483.0,2.2264,NEAR OCEAN
20496,-118.7,34.28,27.0,3536.0,646.0,1837.0,580.0,4.4964,<1H OCEAN


In [35]:
predictions = combined_pipeline.predict(housing)
print(np.sqrt(mean_squared_error(housing_labels, predictions)))

[[-1.21460000e+02  3.85200000e+01  2.90000000e+01 ...  7.06000000e+02
   2.17360000e+00  2.05783630e-01]
 [-1.17230000e+02  3.30900000e+01  7.00000000e+00 ...  7.68000000e+02
   6.33730000e+00  1.60714286e-01]
 [-1.19040000e+02  3.53700000e+01  4.40000000e+01 ...  3.00000000e+02
   2.87500000e+00  1.91594561e-01]
 ...
 [-1.22720000e+02  3.84400000e+01  4.80000000e+01 ...  1.72000000e+02
   3.17970000e+00  2.34794908e-01]
 [-1.22700000e+02  3.83100000e+01  1.40000000e+01 ...  5.01000000e+02
   4.19640000e+00  1.83835182e-01]
 [-1.22140000e+02  3.99700000e+01  2.70000000e+01 ...  1.97000000e+02
   3.13190000e+00  2.05746061e-01]]
76171.993097255
