In [1]:
import os , tarfile
from six.moves import urllib
import pandas as pd 
import matplotlib.pyplot as plt
import numpy as np
from sklearn.model_selection import train_test_split, StratifiedShuffleSplit
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from pandas.plotting import scatter_matrix
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import cross_val_score
from sklearn.svm import SVR

In [2]:
# Base URL of the repository that hosts the dataset archive.
DOWNLOAD_ROOT = "https://raw.githubusercontent.com/ageron/handson-ml/master/"
# Relative location of the housing data; used both as the remote sub-path
# and as the local directory the archive is downloaded into.
HOUSING_PATH = "datasets/housing"
# Full URL of the compressed housing archive.
HOUSING_URL = "".join((DOWNLOAD_ROOT, HOUSING_PATH, "/housing.tgz"))

In [3]:
def fetch_housing_data(housing_url=HOUSING_URL, housing_path=HOUSING_PATH):
    """Download the housing archive and unpack it into ``housing_path``.

    Parameters
    ----------
    housing_url : str
        URL of the ``.tgz`` archive to download.
    housing_path : str
        Local directory that receives both the downloaded ``housing.tgz``
        and its extracted contents. Created if it does not exist.

    Returns
    -------
    None
        The function works purely through side effects on the filesystem.
    """
    if not os.path.isdir(housing_path):
        os.makedirs(housing_path)
    tgz_path = os.path.join(housing_path, "housing.tgz")
    urllib.request.urlretrieve(housing_url, tgz_path)
    # The context manager closes the archive even when extraction raises,
    # which a manual open()/close() pair would not.
    # NOTE(review): the archive comes from the network and extractall() is
    # called without validating member paths; confirm the source is trusted.
    with tarfile.open(tgz_path) as housing_tgz:
        housing_tgz.extractall(path=housing_path)
    

    
def load_housing_data(data_path = HOUSING_PATH):
    """Read ``housing.csv`` located in ``data_path`` and return it as a DataFrame."""
    return pd.read_csv(os.path.join(data_path, "housing.csv"))



def stratified_split(data, test_size=0.2):
    """Split ``data`` into train and test sets stratified on binned median income.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame containing a ``median_income`` column.
    test_size : float, default 0.2
        Forwarded to ``StratifiedShuffleSplit`` as the test-set size.

    Returns
    -------
    tuple of (pandas.DataFrame, pandas.DataFrame)
        ``(strat_train_set, strat_test_set)``: independent copies of the
        selected rows with the same columns as ``data``. ``data`` itself is
        left unmodified.
    """
    # Bin number (1-5) of each row's median income. Kept as a standalone
    # Series so no helper column has to be added to (and later removed from)
    # the caller's frame.
    income_cat = pd.cut(data['median_income'],
                        bins=[0., 1.5, 3., 4.5, 6., np.inf],
                        labels=[1, 2, 3, 4, 5])

    split = StratifiedShuffleSplit(n_splits=1, test_size=test_size, random_state=42)
    # n_splits=1, so the generator yields exactly one (train, test) pair.
    train_index, test_index = next(split.split(data, income_cat))

    # split() yields positional row numbers, so select with iloc; this stays
    # correct even when ``data`` does not carry a default 0..n-1 index.
    strat_train_set = data.iloc[train_index].copy()
    strat_test_set = data.iloc[test_index].copy()

    return strat_train_set, strat_test_set

In [5]:
# A custom transformer 

from sklearn.base import BaseEstimator, TransformerMixin
# Column positions read by CombinedAttributesAdder.transform.
# NOTE(review): presumably these match the order of the numeric housing
# columns (total_rooms, total_bedrooms, population, households) — confirm
# against the column list fed to the pipeline.
rooms_ix, bedrooms_ix, population_ix, household_ix = 3, 4, 5, 6


class CombinedAttributesAdder(BaseEstimator, TransformerMixin):
    """Append ratio features to a 2-D array.

    Always appends rooms-per-household and population-per-household columns;
    additionally appends bedrooms-per-room when ``add_bedrooms_per_room`` is
    true. Columns are located via the module-level ``*_ix`` constants.
    """

    def __init__(self, add_bedrooms_per_room=True):
        self.add_bedrooms_per_room = add_bedrooms_per_room

    def fit(self, X, y=None):
        """Nothing is learned from the data; returns ``self``."""
        return self

    def transform(self, X, y=None):
        """Return ``X`` with the derived ratio columns appended on the right."""
        households = X[:, household_ix]
        rooms = X[:, rooms_ix]

        pieces = [X, rooms / households, X[:, population_ix] / households]
        if self.add_bedrooms_per_room:
            pieces.append(X[:, bedrooms_ix] / rooms)

        # Indexing np.c_ with a tuple is the same as writing the items inline.
        return np.c_[tuple(pieces)]
        
# attr_adder = CombinedAttributesAdder(add_bedrooms_per_room=False)
# housing_extra_attribs = attr_adder.transform(housing.values)


In [19]:
# Load the raw data and split it with income-stratified sampling
# (a plain random train_test_split was the alternative considered).
housing = load_housing_data()
train_set, test_set = stratified_split(housing, test_size=0.2)

# Separate predictors from the target for the training set.
# Note: `housing` is rebound here from the raw frame to the training predictors.
housing = train_set.drop(columns="median_house_value")
housing_labels = train_set["median_house_value"].copy()

# Same separation for the held-out test set.
test_housing = test_set.drop(columns="median_house_value")
test_housing_labels = test_set["median_house_value"].copy()

# Column groups: the single categorical column, and every remaining column.
cat_attr = ["ocean_proximity"]
num_attr = housing.drop(columns=cat_attr).columns.tolist()


In [28]:
# Build separate transformation pipelines for numerical and categorical data

# Numeric columns: fill missing values with the column median, append the
# ratio features, then standardize.
num_pipeline = Pipeline([('imputer', SimpleImputer(strategy="median")),
                         ('attr_adder', CombinedAttributesAdder(add_bedrooms_per_room=True)),
                         ('std_scalar', StandardScaler()),
                        ])

# Column groups are derived from `housing` itself so this cell does not rely
# on any variable that is not defined in the notebook.
cat_attr = ["ocean_proximity"]
num_attr = [col for col in housing.columns if col not in cat_attr]

# Route each column group through its own transformer; the outputs are
# concatenated in the order listed (numeric first, then one-hot columns).
full_pipeline = ColumnTransformer([
                                    ("num", num_pipeline, num_attr),
                                    ("cat", OneHotEncoder(), cat_attr),
                                  ])

housing_prepared = full_pipeline.fit_transform(housing)


In [None]:
# Randomized hyper-parameter search for a support vector regressor

from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import expon, reciprocal

# Search space: kernel is drawn from a fixed list, C and gamma from
# continuous distributions.
params = dict(
    kernel=['linear', 'rbf'],
    C=reciprocal(20, 20000),
    gamma=expon(scale=1),
)

svm_reg = SVR()
# 50 sampled candidates, each scored with 5-fold CV on negative MSE.
rnd_search = RandomizedSearchCV(
    svm_reg,
    param_distributions=params,
    n_iter=50,
    cv=5,
    scoring='neg_mean_squared_error',
    verbose=2,
    n_jobs=4,
    random_state=42,
)
rnd_search.fit(housing_prepared, housing_labels)

