Create a workspace directory for your Machine Learning code and
datasets

In [None]:
$ export ML_PATH="$HOME/ml" # You can change the path if you prefer
$ mkdir -p $ML_PATH

Check to see if pip is installed

In [None]:
$ python3 -m pip --version

Upgrade the pip module

In [None]:
$ python3 -m pip install --user -U pip

Install all the required modules and their dependencies

In [None]:
$ python3 -m pip install -U jupyter matplotlib numpy pandas scipy scikit-learn

Register a virtualenv to Jupyter and give it a name

In [None]:
$ python3 -m ipykernel install --user --name=python3

Start up Jupyter

In [None]:
$ jupyter notebook

Fetch the data

In [None]:
import os
import tarfile
import urllib

DOWNLOAD_ROOT = "https://raw.githubusercontent.com/ageron/handson-ml2/master/"
HOUSING_PATH = os.path.join("datasets", "housing")
HOUSING_URL = DOWNLOAD_ROOT + "datasets/housing/housing.tgz"

def fetch_housing_data(housing_url=HOUSING_URL, housing_path=HOUSING_PATH):
    """Download the housing archive and unpack it into ``housing_path``.

    Creates ``housing_path`` if it does not exist, saves the archive found
    at ``housing_url`` there as ``housing.tgz`` and extracts its contents
    into the same directory.
    """
    # A bare `import urllib` does not load the `request` submodule, so
    # import it explicitly before using it.
    import urllib.request

    os.makedirs(housing_path, exist_ok=True)
    tgz_path = os.path.join(housing_path, "housing.tgz")
    urllib.request.urlretrieve(housing_url, tgz_path)
    # The context manager closes the archive even if extraction fails.
    # NOTE(review): the archive is extracted as-is; only use trusted URLs.
    with tarfile.open(tgz_path) as housing_tgz:
        housing_tgz.extractall(path=housing_path)

Load the data using pandas

In [None]:
import pandas as pd

def load_housing_data(housing_path=HOUSING_PATH):
    """Read ``housing.csv`` from ``housing_path`` and return it as a DataFrame."""
    return pd.read_csv(os.path.join(housing_path, "housing.csv"))

Find out what categories exist and how many districts belong to each category

In [None]:
>>> housing["ocean_proximity"].value_counts()

Call `hist()` to plot a histogram for each numerical attribute

In [None]:
%matplotlib inline # only in a Jupyter notebook
import matplotlib.pyplot as plt
housing.hist(bins=50, figsize=(20,15))
plt.show()

Creating a test set from 20% of the dataset

In [None]:
import numpy as np

def split_train_test(data, test_ratio, random_state=None):
    """Randomly split ``data`` into a training set and a test set.

    Parameters
    ----------
    data : object supporting ``len()`` and ``.iloc`` (e.g. a pandas DataFrame)
        The rows to split.
    test_ratio : float
        Fraction of the rows to put in the test set; the test set holds
        ``int(len(data) * test_ratio)`` rows.
    random_state : int, optional
        Seed for the shuffle. With a seed, repeated calls return the same
        split. When omitted, NumPy's global random state is used, exactly
        as before this parameter existed.

    Returns
    -------
    tuple
        ``(train_set, test_set)``.
    """
    if random_state is None:
        shuffled_indices = np.random.permutation(len(data))
    else:
        # A dedicated generator keeps the split reproducible without
        # touching NumPy's global random state.
        rng = np.random.default_rng(random_state)
        shuffled_indices = rng.permutation(len(data))
    test_set_size = int(len(data) * test_ratio)
    test_indices = shuffled_indices[:test_set_size]
    train_indices = shuffled_indices[test_set_size:]
    return data.iloc[train_indices], data.iloc[test_indices]

In [None]:
>>> train_set, test_set = split_train_test(housing, 0.2)
>>> len(train_set)

In [None]:
>>> len(test_set)

Compute a hash of each instance’s identifier and put that instance in the test set if the hash is lower than `test_ratio` (e.g. 20%) of the maximum hash value

In [None]:
from zlib import crc32
def test_set_check(identifier, test_ratio):
    """Return True when the identifier's CRC32 checksum falls in the test share.

    The identifier is converted to ``np.int64`` and hashed with ``crc32``;
    the instance belongs to the test set when the 32-bit checksum is below
    ``test_ratio * 2**32``.
    """
    checksum = crc32(np.int64(identifier)) & 0xffffffff
    threshold = test_ratio * 2**32
    return checksum < threshold

def split_train_test_by_id(data, test_ratio, id_column):
    """Split ``data`` into ``(train, test)`` based on each row's identifier.

    A row goes to the test set when ``test_set_check`` accepts the value in
    its ``id_column`` for the given ``test_ratio``; every other row goes to
    the training set.
    """
    goes_to_test = data[id_column].apply(test_set_check, args=(test_ratio,))
    train_part = data.loc[~goes_to_test]
    test_part = data.loc[goes_to_test]
    return train_part, test_part

The housing dataset does not have an identifier column, so use the row index as the ID

In [None]:
housing_with_id = housing.reset_index() # adds an `index` column
train_set, test_set = split_train_test_by_id(housing_with_id, 0.2, "index")

If you cannot guarantee that new data gets appended to the end of the dataset and that no row ever gets deleted, use the most stable features (here, longitude and latitude) to build a unique identifier

In [None]:
housing_with_id["id"] = housing["longitude"] * 1000 + housing["latitude"]
train_set, test_set = split_train_test_by_id(housing_with_id, 0.2, "id")

Scikit-Learn provides functions to split datasets into multiple subsets, such as `train_test_split()`: its `random_state` parameter sets the random generator
seed, and you can pass it multiple datasets with an identical number of rows to split them on the same indices

In [None]:
from sklearn.model_selection import train_test_split

train_set, test_set = train_test_split(housing, test_size=0.2, random_state=42)

Create an income category attribute with five categories

In [None]:
housing["income_cat"] = pd.cut(housing["median_income"],
                               bins=[0., 1.5, 3.0, 4.5, 6., np.inf],
                               labels=[1, 2, 3, 4, 5])

Creating histogram of income categories

In [None]:
housing["income_cat"].hist()

Do stratified sampling based on the income category

In [None]:
from sklearn.model_selection import StratifiedShuffleSplit

split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
# split() yields integer row positions, not index labels, so rows are
# selected with .iloc; .loc picks the same rows only while the index is
# the default 0..n-1 range.
for train_index, test_index in split.split(housing, housing["income_cat"]):
    strat_train_set = housing.iloc[train_index]
    strat_test_set = housing.iloc[test_index]

Looking at the income category proportions in the test set

In [None]:
>>> strat_test_set["income_cat"].value_counts() / len(strat_test_set)

Remove the `income_cat` attribute so the data is back to its original
state

In [None]:
for set_ in (strat_train_set, strat_test_set):
    set_.drop("income_cat", axis=1, inplace=True)

Create a copy to play with so that you
don't harm the training set

In [None]:
housing = strat_train_set.copy()

Create a scatterplot of all districts.

In [None]:
housing.plot(kind="scatter", x="longitude", y="latitude")

Setting the alpha option to 0.1 for easier visualization of the places
with high density data points.

In [None]:
housing.plot(kind="scatter", x="longitude", y="latitude", alpha=0.1)

Make the patterns stand out. The radius of each circle represents
the district’s population (option s), and the color represents the price (option c). We
will use a predefined color map (option cmap) called jet, which ranges from blue
(low values) to red (high prices).

In [None]:
housing.plot(kind="scatter", x="longitude", y="latitude", alpha=0.4,
             s=housing["population"]/100, label="population", figsize=(10,7),
             c="median_house_value", cmap=plt.get_cmap("jet"), colorbar=True,
)
plt.legend()

Compute the standard correlation
coefficient between every pair of attributes

In [None]:
# Correlate numeric columns only: recent pandas versions raise on text
# columns such as `ocean_proximity` instead of silently skipping them.
corr_matrix = housing.select_dtypes(include="number").corr()

Look at how much each attribute correlates with the median house value

In [None]:
>>> corr_matrix["median_house_value"].sort_values(ascending=False)

Focusing on a few promising
attributes that seem most correlated with the median housing value

In [None]:
from pandas.plotting import scatter_matrix

attributes = ["median_house_value", "median_income", "total_rooms",
              "housing_median_age"]
scatter_matrix(housing[attributes], figsize=(12, 8))

Zoom in on the correlation scatterplot of `median_income` and `median_house_value`

In [None]:
housing.plot(kind="scatter", x="median_income", y="median_house_value", alpha=0.1)

Create new attributes

In [None]:
housing["rooms_per_household"] = housing["total_rooms"]/housing["households"]
housing["bedrooms_per_room"] = housing["total_bedrooms"]/housing["total_rooms"]
housing["population_per_household"]=housing["population"]/housing["households"]

Look at the correlation matrix

In [None]:
>>> corr_matrix = housing.corr()
>>> corr_matrix["median_house_value"].sort_values(ascending=False)

Revert to a clean training set and separate the predictors and the labels

In [None]:
housing = strat_train_set.drop("median_house_value", axis=1)
housing_labels = strat_train_set["median_house_value"].copy()

Taking care of missing values in `total_bedrooms` with options:
1. Get rid of the corresponding districts.
2. Get rid of the whole attribute.
3. Set the values to some value (zero, the mean, the median, etc.).

In [None]:
# Options 1 and 2 return new DataFrames and leave `housing` unchanged;
# they are shown for illustration and their results are discarded here.
housing.dropna(subset=["total_bedrooms"]) # option 1
housing.drop("total_bedrooms", axis=1) # option 2
median = housing["total_bedrooms"].median() # option 3
# Assign the filled column back: an in-place fillna on a column selection
# is a chained assignment and may not modify `housing` itself.
housing["total_bedrooms"] = housing["total_bedrooms"].fillna(median)

Scikit-Learn's handy class to take care of missing values

In [None]:
from sklearn.impute import SimpleImputer

imputer = SimpleImputer(strategy="median")

Create a copy of the data without the text attribute `ocean_proximity`

In [None]:
housing_num = housing.drop("ocean_proximity", axis=1)

Fit the imputer instance to the training data

In [None]:
imputer.fit(housing_num)

The imputer was fitted on all the numerical attributes; compare its learned `statistics_` with the medians computed directly by pandas

In [None]:
>>> imputer.statistics_
>>> housing_num.median().values

Use “trained” imputer to transform the training set by replacing
missing values with the learned medians

In [None]:
X = imputer.transform(housing_num)

If you want to put it back into a pandas DataFrame

In [None]:
housing_tr = pd.DataFrame(X, columns=housing_num.columns, index=housing_num.index)

Look at `ocean_proximity`'s value for the first 10 instances

In [None]:
>>> housing_cat = housing[["ocean_proximity"]]
>>> housing_cat.head(10)

Convert categories from text to numbers with Scikit-Learn’s `OrdinalEncoder` class

In [None]:
>>> from sklearn.preprocessing import OrdinalEncoder
>>> ordinal_encoder = OrdinalEncoder()
>>> housing_cat_encoded = ordinal_encoder.fit_transform(housing_cat)
>>> housing_cat_encoded[:10]

Get the list of categories using the `categories_` instance variable

In [None]:
>>> ordinal_encoder.categories_

Scikit-Learn's `OneHotEncoder` class to convert categorical values into one-hot vectors

In [None]:
>>> from sklearn.preprocessing import OneHotEncoder
>>> cat_encoder = OneHotEncoder()
>>> housing_cat_1hot = cat_encoder.fit_transform(housing_cat)
>>> housing_cat_1hot

To convert it to a (dense) NumPy array

In [None]:
>>> housing_cat_1hot.toarray()

Get the list of categories

In [None]:
>>> cat_encoder.categories_

Adds combined attributes with Custom Transformers

In [None]:
from sklearn.base import BaseEstimator, TransformerMixin

rooms_ix, bedrooms_ix, population_ix, households_ix = 3, 4, 5, 6

class CombinedAttributesAdder(BaseEstimator, TransformerMixin):
    """Transformer that appends ratio columns to a 2-D array.

    Columns are looked up by position through the module-level indices
    ``rooms_ix``, ``bedrooms_ix``, ``population_ix`` and ``households_ix``.
    """

    def __init__(self, add_bedrooms_per_room=True): # no *args or **kargs
        # When True, transform() also appends the bedrooms-per-room ratio.
        self.add_bedrooms_per_room = add_bedrooms_per_room

    def fit(self, X, y=None):
        """Return ``self`` unchanged; there is nothing to learn."""
        return self # nothing else to do

    def transform(self, X):
        """Return ``X`` with the ratio columns appended on the right.

        Always adds rooms-per-household and population-per-household;
        adds bedrooms-per-room as well when ``add_bedrooms_per_room`` is
        true.
        """
        rooms_per_household = X[:, rooms_ix] / X[:, households_ix]
        population_per_household = X[:, population_ix] / X[:, households_ix]
        if self.add_bedrooms_per_room:
            bedrooms_per_room = X[:, bedrooms_ix] / X[:, rooms_ix]
            return np.c_[X, rooms_per_household, population_per_household,
                         bedrooms_per_room]
        return np.c_[X, rooms_per_household, population_per_household]

attr_adder = CombinedAttributesAdder(add_bedrooms_per_room=False)
housing_extra_attribs = attr_adder.transform(housing.values)

Scikit-Learn's data transformation Pipeline class for the numerical attributes

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

num_pipeline = Pipeline([
        ('imputer', SimpleImputer(strategy="median")),
        ('attribs_adder', CombinedAttributesAdder()),
        ('std_scaler', StandardScaler()),
    ])

housing_num_tr = num_pipeline.fit_transform(housing_num)

Single transformer to handle all columns (categorical columns and the numerical columns)

In [None]:
from sklearn.compose import ColumnTransformer

num_attribs = list(housing_num)
cat_attribs = ["ocean_proximity"]

full_pipeline = ColumnTransformer([
        ("num", num_pipeline, num_attribs),
        ("cat", OneHotEncoder(), cat_attribs),
    ])

housing_prepared = full_pipeline.fit_transform(housing)

Train a Linear Regression model

In [None]:
from sklearn.linear_model import LinearRegression

lin_reg = LinearRegression()
lin_reg.fit(housing_prepared, housing_labels)

Try Linear Regression model on a few
instances from the training set

In [None]:
>>> some_data = housing.iloc[:5]
>>> some_labels = housing_labels.iloc[:5]
>>> some_data_prepared = full_pipeline.transform(some_data)
>>> print("Predictions:", lin_reg.predict(some_data_prepared))
>>> print("Labels:", list(some_labels))

Measure this regression model’s RMSE on the whole training set

In [None]:
>>> from sklearn.metrics import mean_squared_error
>>> housing_predictions = lin_reg.predict(housing_prepared)
>>> lin_mse = mean_squared_error(housing_labels, housing_predictions)
>>> lin_rmse = np.sqrt(lin_mse)
>>> lin_rmse

Train a DecisionTreeRegressor model

In [None]:
from sklearn.tree import DecisionTreeRegressor

tree_reg = DecisionTreeRegressor()
tree_reg.fit(housing_prepared, housing_labels)

Evaluate the trained model on the training set

In [None]:
>>> housing_predictions = tree_reg.predict(housing_prepared)
>>> tree_mse = mean_squared_error(housing_labels, housing_predictions)
>>> tree_rmse = np.sqrt(tree_mse)
>>> tree_rmse

Scikit-Learn’s K-fold cross-validation feature to evaluate the Decision Tree model

In [None]:
from sklearn.model_selection import cross_val_score
scores = cross_val_score(tree_reg, housing_prepared, housing_labels, scoring="neg_mean_squared_error", cv=10)
tree_rmse_scores = np.sqrt(-scores)

Look at the results

In [None]:
>>> def display_scores(scores):
...     """Print the given scores, then their mean and standard deviation."""
...     print("Scores:", scores)
...     print("Mean:", scores.mean())
...     print("Standard deviation:", scores.std())
...
>>> display_scores(tree_rmse_scores)

Compute the same scores for the Linear Regression model

In [None]:
>>> lin_scores = cross_val_score(lin_reg, housing_prepared, housing_labels, scoring="neg_mean_squared_error", cv=10)
...
>>> lin_rmse_scores = np.sqrt(-lin_scores)
>>> display_scores(lin_rmse_scores)

Train a RandomForestRegressor model

In [None]:
>>> from sklearn.ensemble import RandomForestRegressor
>>> forest_reg = RandomForestRegressor()
>>> forest_reg.fit(housing_prepared, housing_labels)
>>> [...]
>>> forest_rmse
>>> display_scores(forest_rmse_scores)

Searching for the best combination of hyperparameter values for the RandomForestRegressor

In [None]:
from sklearn.model_selection import GridSearchCV

param_grid = [
    {'n_estimators': [3, 10, 30], 'max_features': [2, 4, 6, 8]},
    {'bootstrap': [False], 'n_estimators': [3, 10], 'max_features': [2, 3, 4]},
    ]

forest_reg = RandomForestRegressor()

grid_search = GridSearchCV(forest_reg, param_grid, cv=5,
                           scoring='neg_mean_squared_error',
                           return_train_score=True)

grid_search.fit(housing_prepared, housing_labels)

Get the best combination of parameters

In [None]:
>>> grid_search.best_params_

Get the best estimator directly

In [None]:
>>> grid_search.best_estimator_

Look at the evaluation scores

In [None]:
>>> cvres = grid_search.cv_results_
>>> for mean_score, params in zip(cvres["mean_test_score"], cvres["params"]):
... print(np.sqrt(-mean_score), params)

Indicate the relative importance of each
attribute for making accurate predictions with RandomForestRegressor

In [None]:
>>> feature_importances = grid_search.best_estimator_.feature_importances_
>>> feature_importances

Display the scores next to their corresponding attribute names

In [None]:
>>> extra_attribs = ["rooms_per_hhold", "pop_per_hhold", "bedrooms_per_room"]
>>> cat_encoder = full_pipeline.named_transformers_["cat"]
>>> cat_one_hot_attribs = list(cat_encoder.categories_[0])
>>> attributes = num_attribs + extra_attribs + cat_one_hot_attribs
>>> sorted(zip(feature_importances, attributes), reverse=True)

Get the predictors and the labels from your
test set, run your `full_pipeline` to transform the data, and evaluate the final model
on the test set

In [None]:
final_model = grid_search.best_estimator_

X_test = strat_test_set.drop("median_house_value", axis=1)
y_test = strat_test_set["median_house_value"].copy()

X_test_prepared = full_pipeline.transform(X_test)

final_predictions = final_model.predict(X_test_prepared)

final_mse = mean_squared_error(y_test, final_predictions)
final_rmse = np.sqrt(final_mse) # => evaluates to 47,730.2

Compute a 95% confidence interval for the generalization error

In [None]:
>>> from scipy import stats
>>> confidence = 0.95
>>> squared_errors = (final_predictions - y_test) ** 2
>>> np.sqrt(stats.t.interval(confidence, len(squared_errors) - 1,
... loc=squared_errors.mean(),
... scale=stats.sem(squared_errors)))