In [7]:
import os
import tarfile
import joblib
import pandas as pd
import numpy as np

from IPython.core.interactiveshell import InteractiveShell
from six.moves import urllib
from scipy.stats import expon, reciprocal

from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestRegressor
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVR

# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ CONSTANT DATA SOURCE ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

# Base URL of the repository that hosts the dataset archive.
DOWNLOAD_ROOT = "https://raw.githubusercontent.com/ageron/handson-ml2/master/"
# Local directory the archive is downloaded into and extracted in.
HOUSING_PATH = "datasets"
# Local directory the persisted models are loaded from.
MODEL_PATH = "models"
# Full URL of the housing archive under DOWNLOAD_ROOT.
HOUSING_URL = "".join([DOWNLOAD_ROOT, "datasets/housing/housing.tgz"])

# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ TRANSFORMERS AND FUNCTIONS ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

# Column positions that CombinedAttributesAdder.transform reads from its
# 2-D input when it builds the ratio features.
# NOTE(review): presumably these match the column order of the numeric
# housing attributes -- confirm against the dataset.
rooms_ix = 3
bedrooms_ix = 4
population_ix = 5
households_ix = 6

def load_housing_data():
    """Download the housing archive, extract it and load the CSV.

    The archive at ``HOUSING_URL`` is saved as ``housing.tgz`` inside
    ``HOUSING_PATH`` (created if missing) and extracted there; the
    ``housing.csv`` file in that directory is then read with pandas.

    Returns:
        pandas.DataFrame: the contents of ``housing.csv``.
    """
    # exist_ok avoids the check-then-create race of isdir() + makedirs().
    os.makedirs(HOUSING_PATH, exist_ok=True)
    tgz_path = os.path.join(HOUSING_PATH, "housing.tgz")
    urllib.request.urlretrieve(HOUSING_URL, tgz_path)
    # The context manager closes the archive even if extraction raises.
    with tarfile.open(tgz_path) as housing_tgz:
        # NOTE(review): extractall() writes whatever member paths the
        # archive contains; acceptable only while HOUSING_URL is trusted.
        housing_tgz.extractall(HOUSING_PATH)
    csv_path = os.path.join(HOUSING_PATH, "housing.csv")
    return pd.read_csv(csv_path)
    
class CombinedAttributesAdder(BaseEstimator, TransformerMixin):
    """Append ratio columns to a 2-D array.

    ``transform`` always appends ``X[:, rooms_ix] / X[:, households_ix]``
    and ``X[:, population_ix] / X[:, households_ix]``; when
    ``add_bedrooms_per_room`` is true it also appends
    ``X[:, bedrooms_ix] / X[:, rooms_ix]`` as the last column.
    """

    def __init__(self, add_bedrooms_per_room=True):
        self.add_bedrooms_per_room = add_bedrooms_per_room

    def fit(self, X, y=None):
        """Learn nothing; return the transformer itself."""
        return self

    def transform(self, X, y=None):
        """Return ``X`` with the ratio columns appended on the right."""
        households = X[:, households_ix]
        rooms = X[:, rooms_ix]
        columns = [X, rooms / households, X[:, population_ix] / households]
        if self.add_bedrooms_per_room:
            columns.append(X[:, bedrooms_ix] / rooms)
        # np.c_ stacks the 1-D ratio arrays as extra columns next to X.
        return np.c_[tuple(columns)]
        
def choose_top_functions(arr, k):
    """Return the ascending indices of the ``k`` largest values in ``arr``.

    Args:
        arr: sequence or array of scores.
        k: how many indices to keep; values above ``len(arr)`` are
            clamped to ``len(arr)``.

    Returns:
        numpy.ndarray: sorted indices of the ``k`` largest entries, or an
        empty index array when ``k <= 0`` or ``arr`` is empty.
    """
    scores = np.asarray(arr)
    k = min(k, len(scores))
    if k <= 0:
        # -0 == 0, so the slice [-k:] would select every index, and
        # argpartition raises on an empty array; return "nothing" instead.
        return np.array([], dtype=np.intp)
    # argpartition places the k largest values in the last k slots
    # (unordered); sort the indices so column order is preserved.
    return np.sort(np.argpartition(scores, -k)[-k:])

class RemoveUnnecessaryFeatures(BaseEstimator, TransformerMixin):
    """Keep only the columns selected from a vector of importance scores.

    ``fit`` passes ``feature_importance`` and ``k`` to
    ``choose_top_functions`` and stores the resulting column indices in
    ``feature_ind``; ``transform`` returns just those columns of ``X``.
    """

    def __init__(self, feature_importance, k=5):
        self.feature_importance = feature_importance
        self.k = k

    def fit(self, X, y=None):
        """Compute the column indices to keep; ``X`` and ``y`` are unused."""
        kept_columns = choose_top_functions(self.feature_importance, self.k)
        self.feature_ind = kept_columns
        return self

    def transform(self, X, y=None):
        """Return the columns of ``X`` chosen during ``fit``."""
        kept_columns = self.feature_ind
        return X[:, kept_columns]

# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ DATA PREPARATION ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

# Download the dataset and load it as a DataFrame.
housing = load_housing_data()

# Bucket median income into five categories; the column exists only to
# stratify the train/test split and is dropped again below.
housing["income_cat"] = pd.cut(
    housing["median_income"],
    bins=[0., 1.5, 3.0, 4.5, 6, np.inf],
    labels=[1, 2, 3, 4, 5],
)
stratfold = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
# n_splits=1, so the splitter yields exactly one (train, test) pair.
train_index, test_index = next(stratfold.split(housing, housing["income_cat"]))
# NOTE(review): .loc is label-based; it lines up with the indices from
# split() only while `housing` keeps the default 0..n-1 index -- confirm
# if the loading code ever sets a custom index.
strat_train_set = housing.loc[train_index]
strat_test_set = housing.loc[test_index]

# Remove the helper column from every frame it was copied into.
for set_ in (strat_train_set, strat_test_set, housing):
    set_.drop("income_cat", axis=1, inplace=True)

# From here on `housing` holds the training features only (target removed).
housing = strat_train_set.drop("median_house_value", axis=1)
housing_labels = strat_train_set["median_house_value"].copy()
cat_attribs = ["ocean_proximity"]
housing_num = housing.drop("ocean_proximity", axis=1)
# Iterating a DataFrame yields its column labels.
num_attribs = list(housing_num)

# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ PIPELINES ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

# Numeric columns: impute with the median, append the ratio features from
# CombinedAttributesAdder, then apply StandardScaler.
num_pipeline = Pipeline(
    steps=[
        ("imputer", SimpleImputer(strategy="median")),
        ("attribs_adder", CombinedAttributesAdder()),
        ("std_scaler", StandardScaler()),
    ]
)

# Send the numeric columns through num_pipeline and the categorical
# column(s) through a one-hot encoder.
full_pipeline = ColumnTransformer(
    transformers=[
        ("num", num_pipeline, num_attribs),
        ("cat", OneHotEncoder(), cat_attribs),
    ]
)
    
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ PROCESSING AND MODELLING ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

# Fit the preprocessing pipeline on the training features and transform them.
housing_prepared = full_pipeline.fit_transform(housing)

# Build the model paths with os.path.join: the previous "\" separator only
# worked on Windows and relied on the invalid escape sequences "\S" / "\R".
# NOTE: joblib.load unpickles arbitrary objects -- load only trusted files.
svm_reg = joblib.load(os.path.join(MODEL_PATH, "SVMRegressorBest.pkl"))
forest_reg = joblib.load(os.path.join(MODEL_PATH, "RandomForestRegressorBest.pkl"))

# Fit the loaded forest on every prepared feature to obtain the importance
# scores that drive the feature selection step below.
forest_reg.fit(housing_prepared, housing_labels)
feature_importances = forest_reg.feature_importances_

# End-to-end pipeline: preprocess, keep the 6 highest-scoring columns,
# then fit the forest on the reduced feature set.
transform_and_fit = Pipeline([
    ("prepare", full_pipeline),
    ('choose_top', RemoveUnnecessaryFeatures(feature_importances, 6)),
    ('rand_reg_fit', forest_reg)
])

transform_and_fit.fit(housing, housing_labels)

# Sanity check on the first four training rows (not a held-out evaluation).
some_data = housing.iloc[:4]
some_labels = housing_labels.iloc[:4]

print("Predictions:\t", transform_and_fit.predict(some_data))
print("Labels:\t\t", list(some_labels))

Predictions:	 [ 76733.33333333 291906.66666667  80556.66666667 123156.66666667]
Labels:		 [72100.0, 279600.0, 82700.0, 112500.0]
