In [1]:
# block to run tests
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

import os
import tarfile
from six.moves import urllib
import pandas as pd

DOWNLOAD_ROOT = "https://raw.githubusercontent.com/ageron/handson-ml2/master/"
HOUSING_PATH = os.path.join("datasets", "housing")
HOUSING_URL = DOWNLOAD_ROOT + "datasets/housing/housing.tgz"

def fetch_housing_data():
    if not os.path.isdir(HOUSING_PATH):
        os.makedirs(HOUSING_PATH)
    tgz_path = os.path.join(HOUSING_PATH, "housing.tgz")
    urllib.request.urlretrieve(HOUSING_URL, tgz_path)
    housing_tgz = tarfile.open(tgz_path)
    housing_tgz.extractall(HOUSING_PATH)
    housing_tgz.close()

def load_housing_data( ):
    csv_path = os.path.join(HOUSING_PATH, "housing.csv")
    return pd.read_csv(csv_path)

fetch_housing_data()
housing = load_housing_data()

In [2]:
from sklearn.model_selection import StratifiedShuffleSplit
import numpy as np

housing["income_cat"] = pd.cut(housing["median_income"], bins = [0.,1.5,3.0,4.5,6,np.inf],labels=[1,2,3,4,5])
stratfold = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
for train_index, test_index in stratfold.split(housing, housing["income_cat"]):
    strat_train_set = housing.loc[train_index]
    strat_test_set = housing.loc[test_index]

for set_ in (strat_train_set, strat_test_set, housing):
    set_.drop("income_cat", axis=1, inplace=True)    
    
housing = strat_train_set.drop("median_house_value",axis = 1)

housing_labels = strat_train_set["median_house_value"].copy()

In [3]:
from sklearn.impute import SimpleImputer

imputer = SimpleImputer(strategy="median")
housing_num = housing.drop("ocean_proximity", axis=1)
imputer.fit(housing_num)

housing_tr = pd.DataFrame(imputer.transform(housing_num), columns=housing_num.columns)

SimpleImputer(strategy='median')

In [4]:
from sklearn.base import BaseEstimator, TransformerMixin

rooms_ix, bedrooms_ix, population_ix, households_ix = 3, 4, 5, 6

class CombinedAttributesAdder(BaseEstimator, TransformerMixin):
    def __init__(self, add_bedrooms_per_room = True):
        self.add_bedrooms_per_room = add_bedrooms_per_room
    
    def fit(self, X, y=None):
        return self
    
    def transform(self, X, y=None):
        rooms_per_household = X[:, rooms_ix] / X[:, households_ix]
        population_per_household = X[:, population_ix] / X[:, households_ix]
        if self.add_bedrooms_per_room:
            bedrooms_per_room = X[:, bedrooms_ix] / X[:, rooms_ix]
            return np.c_[X, rooms_per_household, population_per_household,bedrooms_per_room]
        else:
            return np.c_[X, rooms_per_household, population_per_household]


In [5]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

num_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy="median")),
    ('attribs_adder', CombinedAttributesAdder()),
    ('std_scaler', StandardScaler()),
    ])

num_attribs = list(housing_num)
cat_attribs = ["ocean_proximity"]
full_pipeline = ColumnTransformer([
    ("num", num_pipeline, num_attribs),
    ("cat", OneHotEncoder(), cat_attribs),
    ])

housing_prepared = full_pipeline.fit_transform(housing)

In [6]:
# from sklearn.linear_model import LinearRegression
# from sklearn.metrics import mean_squared_error

# lin_reg = LinearRegression()
# lin_reg.fit(housing_prepared, housing_labels)
# housing_predictions = lin_reg.predict(housing_prepared)
# lin_mse = mean_squared_error(housing_labels, housing_predictions)
# np.sqrt(lin_mse)

RandomForestRegressor(max_features=6, n_estimators=30)

NameError: name 'mean_squared_error' is not defined

In [7]:
# from sklearn.svm import SVR
# from sklearn.metrics import mean_squared_error

# svm_reg = SVR()
# svm_reg.fit(housing_prepared, housing_labels)
# housing_predictions = svm_reg.predict(housing_prepared)
# lin_mse = mean_squared_error(housing_labels, housing_predictions)
# np.sqrt(lin_mse)

In [12]:
# from sklearn.model_selection import RandomizedSearchCV
# from scipy.stats import expon, reciprocal

# param_grid = {
#         'kernel': ['linear', 'rbf'],
#         'C': reciprocal(20, 200000),
#         'gamma': expon(scale=1.0),
#     }


# grid_search = RandomizedSearchCV(svm_reg, param_grid, cv=5, scoring='neg_mean_squared_error', 
#                                  return_train_score=True, verbose=20)
# grid_search.fit(housing_prepared, housing_labels)

# grid_search.best_params_
# grid_search.best_estimator_
# cvres = grid_search.cv_results_
# for mean_score, params in zip(cvres["mean_test_score"], cvres["params"]):
#     print(np.sqrt(-mean_score), params)

Fitting 5 folds for each of 10 candidates, totalling 50 fits
[CV 1/5; 1/10] START C=50204.96160909469, gamma=0.9646886306031986, kernel=linear
[CV 1/5; 1/10] END C=50204.96160909469, gamma=0.9646886306031986, kernel=linear;, score=(train=-4955815351.814, test=-4820150425.236) total time=  46.5s
[CV 2/5; 1/10] START C=50204.96160909469, gamma=0.9646886306031986, kernel=linear
[CV 2/5; 1/10] END C=50204.96160909469, gamma=0.9646886306031986, kernel=linear;, score=(train=-4990046644.620, test=-4710830944.600) total time=  47.2s
[CV 3/5; 1/10] START C=50204.96160909469, gamma=0.9646886306031986, kernel=linear
[CV 3/5; 1/10] END C=50204.96160909469, gamma=0.9646886306031986, kernel=linear;, score=(train=-4901680131.295, test=-4979948463.147) total time=  44.0s
[CV 4/5; 1/10] START C=50204.96160909469, gamma=0.9646886306031986, kernel=linear
[CV 4/5; 1/10] END C=50204.96160909469, gamma=0.9646886306031986, kernel=linear;, score=(train=-4822268373.378, test=-5396907274.995) total time=  47.4s

[CV 1/5; 8/10] END C=1058.6120738652057, gamma=0.8843371089460658, kernel=linear;, score=(train=-4960809351.662, test=-4847493461.120) total time=  12.8s
[CV 2/5; 8/10] START C=1058.6120738652057, gamma=0.8843371089460658, kernel=linear
[CV 2/5; 8/10] END C=1058.6120738652057, gamma=0.8843371089460658, kernel=linear;, score=(train=-5007194870.538, test=-4712777013.467) total time=  12.4s
[CV 3/5; 8/10] START C=1058.6120738652057, gamma=0.8843371089460658, kernel=linear
[CV 3/5; 8/10] END C=1058.6120738652057, gamma=0.8843371089460658, kernel=linear;, score=(train=-4915463022.328, test=-5004132489.319) total time=  13.4s
[CV 4/5; 8/10] START C=1058.6120738652057, gamma=0.8843371089460658, kernel=linear
[CV 4/5; 8/10] END C=1058.6120738652057, gamma=0.8843371089460658, kernel=linear;, score=(train=-4849973373.597, test=-5389655408.971) total time=  12.1s
[CV 5/5; 8/10] START C=1058.6120738652057, gamma=0.8843371089460658, kernel=linear
[CV 5/5; 8/10] END C=1058.6120738652057, gamma=0.884

RandomizedSearchCV(cv=5, estimator=SVR(C=30000.0, kernel='linear'),
                   param_distributions={'C': <scipy.stats._distn_infrastructure.rv_frozen object at 0x0000012EECDE3CA0>,
                                        'gamma': <scipy.stats._distn_infrastructure.rv_frozen object at 0x0000012EEE1EFF40>,
                                        'kernel': ['linear', 'rbf']},
                   return_train_score=True, scoring='neg_mean_squared_error',
                   verbose=20)

{'C': 39637.36456854154, 'gamma': 0.2514383593052343, 'kernel': 'rbf'}

SVR(C=39637.36456854154, gamma=0.2514383593052343)

70285.63865558396 {'C': 50204.96160909469, 'gamma': 0.9646886306031986, 'kernel': 'linear'}
56770.22529525292 {'C': 39637.36456854154, 'gamma': 0.2514383593052343, 'kernel': 'rbf'}
70285.04163422027 {'C': 65492.20660491219, 'gamma': 0.31889689377250224, 'kernel': 'linear'}
104783.12771125072 {'C': 157.80753178176175, 'gamma': 0.3786103402918297, 'kernel': 'rbf'}
73133.83296233989 {'C': 51.94017973264215, 'gamma': 1.7672513911898342, 'kernel': 'linear'}
59448.63561714737 {'C': 20906.62254345822, 'gamma': 0.09287184336093555, 'kernel': 'rbf'}
70397.32777530182 {'C': 951.4431143639041, 'gamma': 1.5575111801851313, 'kernel': 'linear'}
70392.68715692266 {'C': 1058.6120738652057, 'gamma': 0.8843371089460658, 'kernel': 'linear'}
70398.34989009793 {'C': 1106.7297512177352, 'gamma': 1.0384841759526438, 'kernel': 'linear'}
70382.62416669201 {'C': 1276.0307437021293, 'gamma': 1.388005318004923, 'kernel': 'linear'}


In [14]:
# import joblib
# joblib.dump(grid_search.best_estimator_,"SVMRegressorBest.pkl")

svm_reg = joblib.load("SVMRegressorBest.pkl")

['SVMRegressorBest.pkl']

In [8]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error

forest_reg = RandomForestRegressor( max_features = 6 , n_estimators = 30)
forest_reg.fit(housing_prepared, housing_labels)
housing_predictions = svm_reg.predict(housing_prepared)
lin_mse = mean_squared_error(housing_labels, housing_predictions)
np.sqrt(lin_mse)

feature_importances = forest_reg.feature_importances_
# extra_attribs = ["rooms_per_hhold", "pop_per_hhold", "bedrooms_per_room"]
cat_encoder = full_pipeline.named_transformers_["cat"]
cat_one_hot_attribs = list(cat_encoder.categories_[0])
attributes = num_attribs + cat_one_hot_attribs
sorted(zip(feature_importances, attributes), reverse=True)

RandomForestRegressor(max_features=6, n_estimators=30)

[(0.3089359470392112, 'median_income'),
 (0.15695055588887333, 'NEAR OCEAN'),
 (0.10607923850122121, 'INLAND'),
 (0.08947646114524932, 'ISLAND'),
 (0.0777104417974525, 'longitude'),
 (0.064868769609422, 'latitude'),
 (0.06352861174579011, '<1H OCEAN'),
 (0.04108662590240457, 'housing_median_age'),
 (0.01823052977321438, 'total_rooms'),
 (0.017688133828864923, 'population'),
 (0.016504130010740126, 'NEAR BAY'),
 (0.01548457803749655, 'households'),
 (0.015443633461259753, 'total_bedrooms')]