In [45]:

import os
import tarfile
import joblib
import pandas as pd
import numpy as np

from IPython.core.interactiveshell import InteractiveShell
from six.moves import urllib
from scipy.stats import expon, reciprocal

from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestRegressor
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVR

# block to run tests
InteractiveShell.ast_node_interactivity = "all"

In [46]:
DOWNLOAD_ROOT = "https://raw.githubusercontent.com/ageron/handson-ml2/master/"
HOUSING_PATH = os.path.join("datasets", "housing")
HOUSING_URL = DOWNLOAD_ROOT + "datasets/housing/housing.tgz"

def fetch_housing_data():
    if not os.path.isdir(HOUSING_PATH):
        os.makedirs(HOUSING_PATH)
    tgz_path = os.path.join(HOUSING_PATH, "housing.tgz")
    urllib.request.urlretrieve(HOUSING_URL, tgz_path)
    housing_tgz = tarfile.open(tgz_path)
    housing_tgz.extractall(HOUSING_PATH)
    housing_tgz.close()

def load_housing_data( ):
    csv_path = os.path.join(HOUSING_PATH, "housing.csv")
    return pd.read_csv(csv_path)

fetch_housing_data()
housing = load_housing_data()

In [47]:
housing["income_cat"] = pd.cut(housing["median_income"], bins = [0.,1.5,3.0,4.5,6,np.inf],labels=[1,2,3,4,5])
stratfold = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
for train_index, test_index in stratfold.split(housing, housing["income_cat"]):
    strat_train_set = housing.loc[train_index]
    strat_test_set = housing.loc[test_index]

for set_ in (strat_train_set, strat_test_set, housing):
    set_.drop("income_cat", axis=1, inplace=True)    
    
housing = strat_train_set.drop("median_house_value",axis = 1)

housing_labels = strat_train_set["median_house_value"].copy()

In [48]:
# imputer = SimpleImputer(strategy="median")
# housing_num = housing.drop("ocean_proximity", axis=1)
# imputer.fit(housing_num)

# housing_tr = pd.DataFrame(imputer.transform(housing_num), columns=housing_num.columns)

In [49]:
rooms_ix, bedrooms_ix, population_ix, households_ix = 3, 4, 5, 6

class CombinedAttributesAdder(BaseEstimator, TransformerMixin):
    def __init__(self, add_bedrooms_per_room = True):
        self.add_bedrooms_per_room = add_bedrooms_per_room
    
    def fit(self, X, y=None):
        return self
    
    def transform(self, X, y=None):
        rooms_per_household = X[:, rooms_ix] / X[:, households_ix]
        population_per_household = X[:, population_ix] / X[:, households_ix]
        if self.add_bedrooms_per_room:
            bedrooms_per_room = X[:, bedrooms_ix] / X[:, rooms_ix]
            return np.c_[X, rooms_per_household, population_per_household,bedrooms_per_room]
        else:
            return np.c_[X, rooms_per_household, population_per_household]


In [50]:
num_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy="median")),
    ('attribs_adder', CombinedAttributesAdder()),
    ('std_scaler', StandardScaler()),
    ])

num_attribs = list(housing_num)
cat_attribs = ["ocean_proximity"]
full_pipeline = ColumnTransformer([
    ("num", num_pipeline, num_attribs),
    ("cat", OneHotEncoder(), cat_attribs),
    ])

housing_prepared = full_pipeline.fit_transform(housing)

In [51]:
# lin_reg = LinearRegression()
# lin_reg.fit(housing_prepared, housing_labels)
# housing_predictions = lin_reg.predict(housing_prepared)
# lin_mse = mean_squared_error(housing_labels, housing_predictions)
# np.sqrt(lin_mse)

In [52]:
# svm_reg = SVR()
# svm_reg.fit(housing_prepared, housing_labels)
# housing_predictions = svm_reg.predict(housing_prepared)
# lin_mse = mean_squared_error(housing_labels, housing_predictions)
# np.sqrt(lin_mse)

In [53]:
# param_grid = {
#         'kernel': ['linear', 'rbf'],
#         'C': reciprocal(20, 200000),
#         'gamma': expon(scale=1.0),
#     }

# grid_search = RandomizedSearchCV(svm_reg, param_grid, cv=5, scoring='neg_mean_squared_error', 
#                                  return_train_score=True, verbose=20)
# grid_search.fit(housing_prepared, housing_labels)

# grid_search.best_params_
# grid_search.best_estimator_
# cvres = grid_search.cv_results_
# for mean_score, params in zip(cvres["mean_test_score"], cvres["params"]):
#     print(np.sqrt(-mean_score), params)

In [54]:
# joblib.dump(grid_search.best_estimator_,"SVMRegressorBest.pkl")
svm_reg = joblib.load("SVMRegressorBest.pkl")

In [55]:
# joblib.dump(RandomForestRegressor( max_features = 6 , n_estimators = 30),"RandomForestRegressorBest.pkl")

forest_reg = joblib.load("RandomForestRegressorBest.pkl")
forest_reg.fit(housing_prepared, housing_labels)

feature_importances = forest_reg.feature_importances_

extra_attribs = ["rooms_per_hhold", "pop_per_hhold", "bedrooms_per_room"]
cat_encoder = full_pipeline.named_transformers_["cat"]
cat_one_hot_attribs = list(cat_encoder.categories_[0])
attributes = num_attribs + extra_attribs + cat_one_hot_attribs
sorted_by_order = sorted(zip(feature_importances, attributes), reverse=True)
sorted_by_order

def choose_top_functions(arr,k):
    k = min(k,len(np.array(arr)))
    return np.sort(np.argpartition(np.array(arr), -k)[-k:])

np.array(attributes)[choose_top_functions(feature_importances,5)]

RandomForestRegressor(max_features=6, n_estimators=30)

[(0.31848729404458526, 'median_income'),
 (0.1534594859381008, 'INLAND'),
 (0.11094704394039175, 'pop_per_hhold'),
 (0.08933110190804996, 'bedrooms_per_room'),
 (0.07456988736723429, 'longitude'),
 (0.06722766549947996, 'latitude'),
 (0.060268651392560366, 'rooms_per_hhold'),
 (0.0440516662856758, 'housing_median_age'),
 (0.01777383895447533, 'total_rooms'),
 (0.017485696918057632, 'total_bedrooms'),
 (0.01620861917764268, 'population'),
 (0.015417751820028185, 'households'),
 (0.00860920593322481, '<1H OCEAN'),
 (0.0035232529690518966, 'NEAR OCEAN'),
 (0.0025171234837905194, 'NEAR BAY'),
 (0.00012171436765082942, 'ISLAND')]

array(['longitude', 'median_income', 'pop_per_hhold', 'bedrooms_per_room',
       'INLAND'], dtype='<U18')

In [64]:
class RemoveUnnecessaryFeatures(BaseEstimator, TransformerMixin):
    
    def __init__(self, feature_importance, k = 5 ):
        self.feature_importance = feature_importance
        self.k = k
    
    def fit(self, X, y=None):
        self.feature_ind = choose_top_functions(self.feature_importance,self.k)
        return self
    
    def transform(self, X, y=None):
        return X[:,self.feature_ind]
    
select_top = RemoveUnnecessaryFeatures(feature_importances)
select_top.fit_transform(housing_prepared)

array([[-0.94135046, -0.8936472 ,  0.00622264, -0.12112176,  1.        ],
       [ 1.17178212,  1.292168  , -0.04081077, -0.81086696,  0.        ],
       [ 0.26758118, -0.52543365, -0.07537122, -0.33827252,  1.        ],
       ...,
       [-1.5707942 , -0.36547546, -0.03743619,  0.32286937,  0.        ],
       [-1.56080303,  0.16826095, -0.05915604, -0.45702273,  0.        ],
       [-1.28105026, -0.390569  ,  0.00657083, -0.12169672,  1.        ]])