In [4]:
import pandas as pd
# load the CSV file (path is relative to the notebook's working directory) into a DataFrame
housing = pd.read_csv("../data/housing.csv")
# print the first 5 rows of the dataset as plain text
print(housing.head())

   longitude  latitude  housing_median_age  total_rooms  total_bedrooms  \
0    -122.23     37.88                41.0        880.0           129.0   
1    -122.22     37.86                21.0       7099.0          1106.0   
2    -122.24     37.85                52.0       1467.0           190.0   
3    -122.25     37.85                52.0       1274.0           235.0   
4    -122.25     37.85                52.0       1627.0           280.0   

   population  households  median_income  median_house_value ocean_proximity  
0       322.0       126.0         8.3252            452600.0        NEAR BAY  
1      2401.0      1138.0         8.3014            358500.0        NEAR BAY  
2       496.0       177.0         7.2574            352100.0        NEAR BAY  
3       558.0       219.0         5.6431            341300.0        NEAR BAY  
4       565.0       259.0         3.8462            342200.0        NEAR BAY  


In [None]:
# get the data types and other metadata of the dataset
housing.info()

In [None]:
# ocean_proximity holds text values that look repetitive, so it is probably a categorical attribute;
# count how many rows fall into each distinct value
housing["ocean_proximity"].value_counts()

In [None]:
# now we'll use the describe() method to summarize the numerical attributes
housing.describe()

In [None]:
# created a histogram out of the dataset using matplotlib's hist() method
%matplotlib inline
import matplotlib.pyplot as plt
housing.hist(bins = 50, figsize = (20, 15))
plt.show

In [None]:
# create a test set
import numpy as np
def split_train_test(data, test_ratio, random_state = None):
    """Randomly split ``data`` into a training set and a test set.

    Parameters
    ----------
    data : pandas.DataFrame
        Rows to split; must support ``len()`` and positional ``.iloc`` indexing.
    test_ratio : float
        Fraction of rows assigned to the test set. The test set holds
        ``int(len(data) * test_ratio)`` rows (rounded down).
    random_state : int or None, optional
        Seed for a private NumPy random generator so the split can be
        reproduced. ``None`` (the default) keeps the original behaviour of
        drawing from NumPy's global random state.

    Returns
    -------
    tuple
        ``(train_set, test_set)`` selected from ``data`` by position.
    """
    if random_state is None:
        shuffled_indices = np.random.permutation(len(data))
    else:
        # a dedicated generator leaves the global NumPy random state untouched
        shuffled_indices = np.random.RandomState(random_state).permutation(len(data))
    test_set_size = int(len(data) * test_ratio)
    test_indices = shuffled_indices[:test_set_size]
    train_indices = shuffled_indices[test_set_size:]
    return data.iloc[train_indices], data.iloc[test_indices]

In [None]:
# split the data: 20% of the rows go to the test set, the rest to the training set
train_set, test_set = split_train_test(housing, 0.2)

In [None]:
# number of rows in train_set
len(train_set)

In [None]:
# number of rows in test_set
len(test_set)

In [None]:
# solution to use each instance's identifier to decide whether or not it should go 
# in the test set (assuming instances have a unique and immutable identifier)

from zlib import crc32

def test_set_check(identifier, test_ratio):
    return crc32(np.int64(identifier)) & 0xffffffff < test_ratio * 2**32

def split_train_test_by_id(data, test_ratio, id_column):
    """Split ``data`` into ``(train_set, test_set)`` using each row's identifier.

    Every value of ``data[id_column]`` is passed to ``test_set_check`` together
    with ``test_ratio``. Rows for which it returns True form the test set and
    all other rows form the training set.
    """
    test_mask = data[id_column].apply(test_set_check, args = (test_ratio,))
    train_part = data.loc[~test_mask]
    test_part = data.loc[test_mask]
    return train_part, test_part


In [None]:
# the housing data has no identifier column, so use the row index as the ID
housing_with_id = housing.reset_index()   # adds an 'index' column
train_set, test_set = split_train_test_by_id(housing_with_id, 0.2, "index")

In [None]:
# alternative identifier built by combining each district's longitude and latitude
housing_with_id["id"] = housing["longitude"] * 1000 + housing["latitude"]
train_set, test_set = split_train_test_by_id(housing_with_id, 0.2, "id")

In [None]:
# bucket median_income into 5 categories (labels 1 to 5) using the bin edges below;
# the last bin is open-ended (np.inf)
housing["income_cat"] = pd.cut(housing["median_income"],
                               bins = [0., 1.5, 3.0, 4.5, 6., np.inf],
                               labels = [1, 2, 3, 4, 5])

# histogram of income categories
housing["income_cat"].hist()

In [None]:
# stratified sampling based on income category
from sklearn.model_selection import StratifiedShuffleSplit

# a single stratified split: 20% test set, fixed seed so the split is reproducible
split = StratifiedShuffleSplit(n_splits = 1, test_size = 0.2, random_state = 42)
for train_index, test_index in split.split(housing, housing["income_cat"]):
    # split() yields positional row numbers, so select with iloc rather than label-based loc
    # (the two only coincide while the DataFrame keeps its default 0..n-1 index)
    strat_train_set = housing.iloc[train_index]
    strat_test_set = housing.iloc[test_index]

In [None]:
# proportion of each income category in the stratified test set
strat_test_set["income_cat"].value_counts() / len(strat_test_set)

In [None]:
# remove the income_cat attribute from both stratified sets so the data is back to its original columns
# (the drop is done in place on each set)
for set_ in (strat_train_set, strat_test_set):
    set_.drop("income_cat", axis = 1, inplace = True)

In [None]:
# create a copy so that we can explore and modify the data without harming the training set
housing = strat_train_set.copy()

In [None]:
# we have latitude and longitude, so create a scatterplot of all districts to visualize the data
housing.plot(kind = "scatter", x = "longitude", y = "latitude")

In [None]:
# setting the alpha option makes it much easier to see the places where there is a high density of data points
housing.plot(kind = "scatter", x = "longitude", y = "latitude", alpha = 0.1)

In [None]:
# scatterplot of districts that also encodes population and price:
# radius of circles -> population / 100 (option s)
# color -> median_house_value (option c)
# colors come from the pre-defined color map called jet (option cmap),
# which ranges from blue (low values) to red (high values)

housing.plot(kind = "scatter", x = "longitude", y = "latitude", alpha = 0.4,
             s = housing["population"]/100, label = "population", figsize = (10, 7),
             c = "median_house_value", cmap = plt.get_cmap("jet"), colorbar = True
            )
plt.legend()

In [None]:
# computing the standard correlation coefficient between every pair of numerical attributes using the corr() method
# the text column ocean_proximity is excluded explicitly: recent pandas versions raise on
# non-numeric columns instead of silently dropping them
corr_matrix = housing.select_dtypes(include = "number").corr()
corr_matrix["median_house_value"].sort_values(ascending = False)

In [None]:
# another way to check for correlation b/w attributes is to use panda's scatter_matrix function

from pandas.plotting import scatter_matrix
# restrict the scatter matrix to a few attributes; the target column is named
# "median_house_value" (singular), so the plural spelling raised a KeyError
attributes = ["median_house_value", "median_income", "total_rooms", "housing_median_age"]
scatter_matrix(housing[attributes], figsize = (12, 8))

In [None]:
# scatterplot of median income against median house value, to inspect their correlation
housing.plot(kind = "scatter", x = "median_income", y = "median_house_value", alpha = 0.1)

In [None]:
# preparing the data for machine learning algorithms: derive per-household and per-room ratios
housing["rooms_per_household"] = housing["total_rooms"] / housing["households"]
# column name uses underscores throughout (was "bedrooms_per-room")
housing["bedrooms_per_room"] = housing["total_bedrooms"] / housing["total_rooms"]
housing["population_per_households"] = housing["population"] / housing["households"]

In [None]:
# checking the correlation matrix again, now including the derived attributes
# only numeric columns are correlated: recent pandas versions raise on the text column ocean_proximity
corr_matrix = housing.select_dtypes(include = "number").corr()
corr_matrix["median_house_value"].sort_values(ascending = False)

In [None]:
# separate the predictors from the labels, starting again from strat_train_set;
# drop() returns a new DataFrame, so strat_train_set itself is not affected
housing = strat_train_set.drop("median_house_value", axis = 1)
housing_labels = strat_train_set["median_house_value"].copy()

In [None]:
# data cleaning
# the total_bedrooms attribute has some missing data, so we have 3 options
# 1. get rid of the corresponding districts
# 2. get rid of the whole attribute
# 3. set the missing values to some value (zero, the mean, the median, etc.)

# housing.dropna(subset = ["total_bedrooms"])  # option 1
housing.drop("total_bedrooms", axis = 1)  # option 2 (result is only displayed, not assigned, so housing is unchanged)
# median = housing["total_bedrooms"].median()  # option 3
# housing["total_bedrooms"].fillna(median, inplace = True)

In [None]:
# when going with option 3, we need to create a SimpleImputer instance,
# specifying that you want to replace each attribute's missing values with the median of that attribute

from sklearn.impute import SimpleImputer
# replace each attribute's missing values with the median of that attribute
imputer = SimpleImputer(strategy = "median")

# since the median can only be computed on numerical values,
# we need to create a copy of the data without the text attribute ocean_proximity

housing_num  = housing.drop("ocean_proximity", axis = 1)
imputer.fit(housing_num)

# the imputer simply calculates the median of each attribute and stores the result in its statistics_ instance variable
# right now only the total_bedrooms attribute has missing values, but new data may have gaps elsewhere
# after the system goes live, so it is safer to apply the imputer to all the numerical attributes
imputer.statistics_

In [None]:
# medians computed directly with pandas, for comparison with imputer.statistics_
housing_num.median().values

In [None]:
# now we can use this 'trained' imputer to transform the training set by replacing missing values with the learned medians

X = imputer.transform(housing_num)

# the result is a plain NumPy array containing the transformed features
# if you want to put it back into a Pandas DataFrame, run the code below -

# housing_tr = pd.DataFrame(X, columns = housing_num.columns, index = housing_num.index)

In [1]:
# we left out the categorical attribute ocean_proximity because it is a text attribute and we cannot compute its median
# double brackets keep the selection as a one-column DataFrame rather than a Series
housing_cat = housing[["ocean_proximity"]]
housing_cat.head(10)

NameError: name 'housing' is not defined

In [2]:
# most ML algorithms prefer to work with numbers anyway
# so let's convert these categories from text to numbers
# we'll use Scikit-Learn's OrdinalEncoder class
from sklearn.preprocessing import OrdinalEncoder
ordinal_encoder = OrdinalEncoder()

In [None]:
# convert the text attribute to numbers and show the first 10 encoded rows
housing_cat_encoded = ordinal_encoder.fit_transform(housing_cat)
housing_cat_encoded[:10]

In [None]:
# you can get the list of categories using the categories_ instance variable
# it is a list containing a 1-D array of categories for each categorical attribute
ordinal_encoder.categories_

In [None]:
# one issue with this representation is that ML algorithms will assume that two nearby values are more similar than two distant values
# that assumption does not hold for ocean_proximity, whose category numbers carry no ordering
# solution -> one-hot encoding, which creates one binary attribute per category
# the new attributes are called 'dummy' attributes
# Scikit-Learn provides the OneHotEncoder class to convert categorical values into one-hot vectors

from sklearn.preprocessing import OneHotEncoder
cat_encoder = OneHotEncoder()
housing_cat_1hot = cat_encoder.fit_transform(housing_cat)
housing_cat_1hot

# notice that the output is a SciPy sparse matrix, instead of a NumPy array
# that is very useful when you have categorical attributes with thousands of categories

In [None]:
# with thousands of categories, one-hot encoding yields a matrix with thousands of columns that is full of zeros except for a single 1 per row
# using lots of memory mostly to store zeros would be very wasteful, so a sparse matrix only stores the location of the non-zero elements
# you can use it mostly like a normal 2-D array, but if you really want a (dense) NumPy array, just call the toarray() method
housing_cat_1hot.toarray()

In [None]:
# once again, you can get the list of categories using the encoder's categories_ instance variable
cat_encoder.categories_

In [None]:
# custom transformers
from sklearn.base import BaseEstimator, TransformerMixin

# column positions, inside the NumPy array passed to transform(), of the attributes combined below
# NOTE(review): assumes the column order of housing.csv shown at the top of the notebook - confirm if the schema changes
rooms_ix, bedrooms_ix, population_ix, households_ix = 3, 4, 5, 6

class CombinedAttributesAdder(BaseEstimator, TransformerMixin):
    """Transformer that appends ratio attributes as extra columns of a 2-D array.

    Always adds ``rooms_per_household`` and ``population_per_household``.
    Adds ``bedrooms_per_room`` as a third extra column when
    ``add_bedrooms_per_room`` is True.

    Parameters
    ----------
    add_bedrooms_per_room : bool, default True
        Whether to append the bedrooms-per-room column.
    """
    def __init__(self, add_bedrooms_per_room = True):  # no *args or **kwargs
        self.add_bedrooms_per_room = add_bedrooms_per_room
    def fit(self, X, y = None):
        """Nothing is learned from the data; return the transformer itself."""
        return self   # nothing else to do
    def transform(self, X, y = None):
        """Return ``X`` with the ratio columns appended on the right.

        ``X`` must be a 2-D array indexable as ``X[:, i]`` whose columns at
        ``rooms_ix``, ``bedrooms_ix``, ``population_ix`` and ``households_ix``
        hold the corresponding attributes.
        """
        rooms_per_household = X[:, rooms_ix] / X[:, households_ix]
        population_per_household = X[:, population_ix] / X[:, households_ix]
        if self.add_bedrooms_per_room:
            bedrooms_per_room = X[:, bedrooms_ix] / X[:, rooms_ix]
            return np.c_[X, rooms_per_household, population_per_household, bedrooms_per_room]
        else:
            return np.c_[X, rooms_per_household, population_per_household]

# build the transformer without the bedrooms-per-room column and apply it to the raw NumPy values of housing
attr_adder = CombinedAttributesAdder(add_bedrooms_per_room = False)
housing_extra_attribs = attr_adder.transform(housing.values)

In [None]:
# pipeline for the numerical attributes using Pipline class

from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

# numerical pipeline; the steps run in the order listed:
# median imputation -> combined attributes -> standardization
num_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy = "median")),
    ('attribs_adder', CombinedAttributesAdder()),
    ('std_scaler', StandardScaler()),
])

In [3]:
# create single transformer which is able to handle all columns, applying the appropriate transformations to each column
# we'll use the ColumnTransformer apply to all the transformations to the housing data

from sklearn.compose import ColumnTransformer

# names of the numerical columns (iterating a DataFrame yields its column labels)
num_attribs = list(housing_num)
cat_attribs = ["ocean_proximity"]

# numerical columns go through num_pipeline, the categorical column is one-hot encoded
full_pipeline = ColumnTransformer([
    ("num", num_pipeline, num_attribs),
    ("cat", OneHotEncoder(), cat_attribs),
])

# fit every transformer on the training predictors and return the prepared feature matrix
housing_prepared = full_pipeline.fit_transform(housing)

NameError: name 'housing_num' is not defined

In [None]:
# select and train a model
# first train a liner regression model

from sklearn.linear_model import LinearRegression

# fit a linear regression model on the prepared training features and their labels
lin_reg = LinearRegression()
lin_reg.fit(housing_prepared, housing_labels)

In [None]:
# try the fitted linear regression model on the first 5 training instances
some_data = housing.iloc[:5]
some_labels = housing_labels.iloc[:5]
# transform() only (not fit_transform): reuse what the pipeline already learned from the training set
some_data_prepared = full_pipeline.transform(some_data)
print("Predictions: ", lin_reg.predict(some_data_prepared))
print("Labels: ", list(some_labels))

In [None]:
# measuring the regresion model's RMSE on the whole training set using SciKit-Learn's mean_squared_error function
from sklearn.metrics import mean_squared_error
# RMSE of the linear model on the whole training set
housing_predictions = lin_reg.predict(housing_prepared)
lin_mse = mean_squared_error(housing_labels, housing_predictions)
# mean_squared_error returns the MSE; take the square root to get the RMSE
lin_rmse = np.sqrt(lin_mse)
lin_rmse

In [None]:
# train a DecisionTreeRegressor, a poweful model, capable of finding complex nonlinear relationship in the data
from sklearn.tree import DecisionTreeRegressor

# fit a decision tree regressor on the prepared training features and their labels
tree_reg = DecisionTreeRegressor()
tree_reg.fit(housing_prepared, housing_labels)

In [None]:
# evaluating the decision tree on the training set it was fitted on
housing_predictions = tree_reg.predict(housing_prepared)
tree_mse = mean_squared_error(housing_labels, housing_predictions)
tree_rmse = np.sqrt(tree_mse)
tree_rmse

In [None]:
# better evaluation using Cross-Validation
from sklearn.model_selection import cross_val_score
# 10-fold cross-validation of the decision tree
scores = cross_val_score(tree_reg, housing_prepared, housing_labels, scoring = "neg_mean_squared_error", cv = 10)
# the scoring function returns the negative MSE, so negate before taking the square root
tree_rmse_scores = np.sqrt(-scores)

In [None]:
# results of cross-validations
def display_scores(scores):
    """Print the given scores followed by their mean and standard deviation.

    ``scores`` must provide ``mean()`` and ``std()`` methods (e.g. a NumPy array).
    """
    print("Scores: ", scores)
    print("Mean: ", scores.mean())
    print("Standard deviation: ", scores.std())

# cross-validation RMSE scores of the decision tree
display_scores(tree_rmse_scores)

In [None]:
# compute the same cross-validation scores for the linear regression model, for comparison
lin_scores = cross_val_score(lin_reg, housing_prepared, housing_labels, scoring = "neg_mean_squared_error", cv = 10)
# scores are negative MSE values, so negate before taking the square root
lin_rmse_scores = np.sqrt(-lin_scores)
display_scores(lin_rmse_scores)

In [None]:
# try the RandomForestRegressor model

from sklearn.ensemble import RandomForestRegressor
forest_reg = RandomForestRegressor()
forest_reg.fit(housing_prepared, housing_labels)

# RMSE on the training set itself (the model has already seen this data)
forest_predictions = forest_reg.predict(housing_prepared)
forest_mse = mean_squared_error(housing_labels, forest_predictions)
forest_rmse = np.sqrt(forest_mse)
print("Training RMSE: ", forest_rmse)

# 10-fold cross-validation, same protocol as used for the decision tree and the linear model
forest_scores = cross_val_score(forest_reg, housing_prepared, housing_labels,
                                scoring = "neg_mean_squared_error", cv = 10)
forest_rmse_scores = np.sqrt(-forest_scores)
display_scores(forest_rmse_scores)

In [None]:
# we should save every model we experiment with, so that we keep both the hyperparameters and the trained parameters,
# as well as the cross-validation scores and perhaps the actual predictions as well
# this will allow you to easily compare scores across model types, and compare the types of errors they make
# you can easily save Scikit-Learn models by using Python's pickle module,
# or the joblib library, which is more efficient at serializing large NumPy arrays

# sklearn.externals.joblib has been removed from scikit-learn; joblib is imported directly instead
import joblib

# the model to persist; the random forest fitted above is used here - substitute any fitted model
my_model = forest_reg

joblib.dump(my_model, "my_model.pkl")

# and later
# NOTE: joblib files are pickle-based, so only load files that come from a trusted source
my_model_loaded = joblib.load("my_model.pkl")


In [None]:
# fine tuning
# grid search

from sklearn.model_selection import GridSearchCV

# two grids are searched: 3 x 4 combinations with the default bootstrap setting,
# then 2 x 3 combinations with bootstrap turned off
param_grid = [
    {'n_estimators': [3, 10, 30], 'max_features': [2, 4, 6, 8]},
    {'bootstrap': [False], 'n_estimators': [3, 10], 'max_features': [2, 3, 4]},
]

forest_reg = RandomForestRegressor()

# 5-fold cross-validation for every combination, scored by negative MSE
grid_search = GridSearchCV(forest_reg, param_grid, cv = 5,
                           scoring = 'neg_mean_squared_error',
                           return_train_score = True)
grid_search.fit(housing_prepared, housing_labels)

In [None]:
# get the best combination of parameters (fitted attributes end with an underscore)
grid_search.best_params_

In [None]:
# get the best estimator directly
grid_search.best_estimator_

In [None]:
# print the RMSE next to each parameter combination that was tried
cvres = grid_search.cv_results_
for mean_score, params in zip(cvres["mean_test_score"], cvres["params"]):
    # scores are negative MSE values, so negate before taking the square root
    print(np.sqrt(-mean_score), params)

In [None]:
# analyze the best model: relative importance of each feature
# (the variable keeps the name features_importances because the next cell reads it under that name)
features_importances = grid_search.best_estimator_.feature_importances_
features_importances

In [None]:
# display the importance scores next to their corresponding attribute names
# NOTE(review): extra_attribs must list the added columns in the order CombinedAttributesAdder.transform() appends them - confirm
extra_attribs = ["rooms_per_hhold", "pop_per_hhold", "bedrooms_per_rooms"]
cat_encoder = full_pipeline.named_transformers_["cat"]
cat_one_hot_attribs = list(cat_encoder.categories_[0])
# same column order as the ColumnTransformer output: numerical, added, then one-hot columns
attributes = num_attribs + extra_attribs + cat_one_hot_attribs
sorted(zip(features_importances, attributes), reverse = True)

In [None]:
# evaluate your system on the test set
final_model = grid_search.best_estimator_

X_test = strat_test_set.drop("median_house_value", axis = 1)
y_test = strat_test_set["median_house_value"].copy()

# transform() only, never fit_transform(): the pipeline must not be re-fitted on the test set
X_test_prepared = full_pipeline.transform(X_test)

final_predictions = final_model.predict(X_test_prepared)

final_mse = mean_squared_error(y_test, final_predictions)
final_rmse = np.sqrt(final_mse)  # evaluates to 47,730.2

In [None]:
# to find how precise this estimte is, compute a 95% confidence interval for the generalization error using scipy.stats.t.interval()

from scipy import stats

confidence = 0.95
# per-instance squared errors on the test set
squared_errors = (final_predictions - y_test) ** 2
# t-interval around the mean squared error; the square root turns its bounds into RMSE bounds
np.sqrt(stats.t.interval(confidence, len(squared_errors) - 1,
                         loc = squared_errors.mean(),
                         scale = stats.sem(squared_errors)
                        )
       )

# -------- launch, monitor and maintain your system