In [47]:
from importnb import Notebook
with Notebook():
    import Housing_visual

In [48]:
# Housing_visual.housing = Housing_test_data.strat_train_set.copy()
housing = Housing_visual.housing.drop("median_house_value", axis=1)
housing_label = Housing_visual.housing["median_house_value"].copy()
print(housing_label)

12655     72100.0
15502    279600.0
2908      82700.0
14053    112500.0
20496    238300.0
           ...   
15174    268500.0
12661     90400.0
19263    140400.0
19140    258100.0
19773     62700.0
Name: median_house_value, Length: 16512, dtype: float64


In [49]:
# housing.dropna(subset=["total_bedrooms"])   # option 1
# housing.drop("total_bedrooms", axis=1)      # option 2
median = housing["total_bedrooms"].median()
housing["total_bedrooms"].fillna(median)    # option 3

12655     797.0
15502     855.0
2908      310.0
14053     519.0
20496     646.0
          ...  
15174    1231.0
12661    1422.0
19263     166.0
19140     580.0
19773     222.0
Name: total_bedrooms, Length: 16512, dtype: float64

In [50]:
from sklearn. impute import SimpleImputer

imputer = SimpleImputer(strategy="median")

In [51]:
# remove the column 'ocean_proximity since it's a string and median can only be computed on numbers
housing_num = housing.drop("ocean_proximity", axis=1)

In [52]:
# The imputer computes the median of each attribute and stores the result in it's statistics_ instance variable
imputer.fit(housing_num)

In [53]:
imputer.statistics_

array([-1.18510000e+02,  3.42600000e+01,  2.90000000e+01,  2.11900000e+03,
        4.33000000e+02,  1.16400000e+03,  4.08000000e+02,  3.54155000e+00,
        5.23234164e+00,  2.03027043e-01,  2.81766108e+00])

In [54]:
housing_num.median().values

array([-1.18510000e+02,  3.42600000e+01,  2.90000000e+01,  2.11900000e+03,
        4.33000000e+02,  1.16400000e+03,  4.08000000e+02,  3.54155000e+00,
        5.23234164e+00,  2.03027043e-01,  2.81766108e+00])

In [55]:
# Transform the training set by replacing missing values by the learned medians
X = imputer.transform(housing_num)

### all transformers also have a convenience method called fit_transform() which is equivalent to calling fit() and then transform()
## fit_tranform() sometimes can be faster and more optimized than calling fit() and then transform()
# housing_tr = imputer.fit_transform(housing_num)

In [56]:
# Transform the result back to a pandas DataFrame from a numpy array
housing_tr = Housing_visual.Housing_test_data.Housing.pd.DataFrame(X, columns=housing_num.columns)

In [57]:
# Convert the text labels to numbers

from sklearn.preprocessing import LabelEncoder
encoder = LabelEncoder()
housing_cat = housing["ocean_proximity"]
housing_cat_encoded = encoder.fit_transform(housing_cat)
housing_cat_encoded

array([1, 4, 1, ..., 0, 0, 1])

In [58]:
encoder.classes_

array(['<1H OCEAN', 'INLAND', 'ISLAND', 'NEAR BAY', 'NEAR OCEAN'],
      dtype=object)

In [59]:
# Convert the integer categorical values into one-hot vectors
from sklearn.preprocessing import OneHotEncoder
encoder = OneHotEncoder()
housing_cat_1hot = encoder.fit_transform(housing_cat_encoded.reshape(-1, 1))
housing_cat_1hot

<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 16512 stored elements and shape (16512, 5)>

In [60]:
housing_cat_1hot.toarray()

array([[0., 1., 0., 0., 0.],
       [0., 0., 0., 0., 1.],
       [0., 1., 0., 0., 0.],
       ...,
       [1., 0., 0., 0., 0.],
       [1., 0., 0., 0., 0.],
       [0., 1., 0., 0., 0.]])

In [61]:
# Both transformations from text categories to integer categories and then to one-hot vectors can be done in one shot using the LabelBinarizer class

from sklearn.preprocessing import LabelBinarizer
encoder = LabelBinarizer() # can get a sparse matrix by setting sparse_output=True LabelBinarizer(sparse_output=True)
housing_cat_1hot = encoder.fit_transform(housing_cat)
housing_cat_1hot

array([[0, 1, 0, 0, 0],
       [0, 0, 0, 0, 1],
       [0, 1, 0, 0, 0],
       ...,
       [1, 0, 0, 0, 0],
       [1, 0, 0, 0, 0],
       [0, 1, 0, 0, 0]])

In [62]:
# Custom Transformers
# Create a custom transformer to add the combined attributes

from sklearn.base import BaseEstimator, TransformerMixin

rooms_ix, bedrooms_ix, population_ix, household_ix = 3, 4, 5, 6

class CombinedAttributesAdder(BaseEstimator, TransformerMixin):
    def __init__(self, add_bedrooms_per_room=True): # no *args or **kargs
        self.add_bedrooms_per_room = add_bedrooms_per_room
    def fit(self, X, y=None):
        return self # nothing else to do
    def transform(self, X, y=None):
        rooms_per_household = X[:, rooms_ix] / X[:, household_ix]
        population_per_household = X[:, population_ix] / X[:, household_ix]
        if self.add_bedrooms_per_room:
            bedrooms_per_room = X[:, bedrooms_ix] / X[:, rooms_ix]
            return Housing_visual.Housing_test_data.np.c_[X, rooms_per_household, population_per_household, bedrooms_per_room]
        else:
            return Housing_visual.Housing_test_data.np.c_[X, rooms_per_household, population_per_household]
attr_adder = CombinedAttributesAdder(add_bedrooms_per_room=False)
housing_extra_attribs = attr_adder.transform(housing.values)

In [63]:
### Feature Scaling
## Two common ways to get all attributes to have the same scale: min-max scaling and standardization

# Min-max scaling (normalization): values are shifted and rescaled so they range from 0 to 1 (value - min /max - min)
# Scikit-Learn provides a transformer called MinMaxScaler for this

# Standardization: subtracts the mean value and then divides by the variance so that the resulting distribution has unit variance, 
# Standardization is less affected by outliers than min-max scaling (value - mean / variance)
# Scikit-Learn provides a transformer called StandardScaler for standardization

In [64]:
#Pipelines Imputer, customer transformer(CombinedAttributesAdder) and scaling Standardization (std_scaler)
# A Pipeline takes a list of transformer, all except from the last which must be a scaler 

from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

num_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy="median")),
    ('attribs_adder', CombinedAttributesAdder()),
    ('std_scaler', StandardScaler()),
])

housing_num_tr = num_pipeline.fit_transform(housing_num)

In [65]:
# Custom transformer DataFrameSelector

from sklearn.base import BaseEstimator, TransformerMixin

class DataFrameSelector(BaseEstimator, TransformerMixin):
    def __init__(self, attribute_names):
        self.attribute_names = attribute_names
    def fit(self, X, y=None):
        return self
    def transform(self, X):
        return X[self.attribute_names].values

In [66]:
# DataFrameSelector(["ocean_proximity"])
# test_pipeline = Pipeline([
#     ('selector', DataFrameSelector(["ocean_proximity"])),
#     ('one_hot_encoder', OneHotEncoder())
# ])
# test_result = test_pipeline.fit_transform(housing)
# print(test_result)

In [67]:
# Combine multiple Pipelines using FeatureUnion

from sklearn.pipeline import FeatureUnion

num_attribs = list(housing_num)
cat_attribs = ["ocean_proximity"]

num_pipeline = Pipeline([
    ('selector', DataFrameSelector(num_attribs)),
    ('imputer', SimpleImputer(strategy="median")),
    ('attribs_adder', CombinedAttributesAdder()),
    ('std_scaler', StandardScaler()),
])

cat_pipeline = Pipeline([
    ('selector', DataFrameSelector(cat_attribs)),
    ('one_hot_encoder', OneHotEncoder())
    # ('label_binarizer', LabelBinarizer()),
])

full_pipeline = FeatureUnion(transformer_list=[
    ("num_pipeline", num_pipeline),
    ("cat_pipeline", cat_pipeline),
])

housing_prepared = full_pipeline.fit_transform(housing)
print(housing_prepared)

<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 247680 stored elements and shape (16512, 19)>
  Coords	Values
  (0, 0)	-0.9413504586000941
  (0, 1)	1.347438216815126
  (0, 2)	0.02756357138483158
  (0, 3)	0.5847774454783182
  (0, 4)	0.6403712747566713
  (0, 5)	0.7326023581928217
  (0, 6)	0.556286018753369
  (0, 7)	-0.8936472017581817
  (0, 8)	0.017395255354801475
  (0, 9)	-0.12248362298913629
  (0, 10)	0.006222642111402968
  (0, 11)	0.017395255354801475
  (0, 12)	0.006222642111402968
  (0, 13)	-0.12112176143791016
  (0, 15)	1.0
  (1, 0)	1.1717821162456232
  (1, 1)	-1.1924396559322872
  (1, 2)	-1.7220176265077816
  (1, 3)	1.2614666806325303
  (1, 4)	0.7815613248778976
  (1, 5)	0.5336115200296541
  (1, 6)	0.7213179906343863
  (1, 7)	1.2921680006896117
  (1, 8)	0.5692555390956567
  (1, 9)	-0.9116633334065556
  :	:
  (16510, 5)	-0.18974706734726624
  (16510, 6)	0.010615789146779494
  (16510, 7)	0.16826095481203124
  (16510, 8)	0.3281489106555445
  (16510, 9)	-0.506808489907879

In [69]:
def display_scores(scores):
    print("Scores:", scores)
    print("Mean:", scores.mean())
    print("Standard deviation:", scores.std())