## Download data

In [None]:
import os
import tarfile
import urllib.request
import numpy as np
import pandas as pd

DOWNLOAD_ROOT = "https://raw.githubusercontent.com/ageron/handson-ml2/master/"
HOUSING_PATH = os.path.join("datasets", "housing")
HOUSING_URL = DOWNLOAD_ROOT + "datasets/housing/housing.tgz"


def fetch_hosing_data(housing_url=HOUSING_URL, housing_path=HOUSING_PATH):
    """Download the housing archive and unpack it into ``housing_path``.

    Args:
        housing_url: URL of the ``housing.tgz`` archive to download.
        housing_path: Directory that receives the downloaded archive and its
            extracted contents. It is created (including parents) when it
            does not exist yet.
    """
    # Create the directory that was actually requested via the argument, not
    # the module-level default, so custom targets always exist before the
    # download writes into them.
    os.makedirs(housing_path, exist_ok=True)
    tgz_path = os.path.join(housing_path, 'housing.tgz')
    urllib.request.urlretrieve(housing_url, tgz_path)
    # The context manager closes the archive even if extraction raises.
    with tarfile.open(tgz_path) as housing_tgz:
        housing_tgz.extractall(path=housing_path)

fetch_hosing_data()

## Load the data

In [None]:
import pandas as pd

def load_housing_data(housing_path=HOUSING_PATH):
    """Read ``housing.csv`` from ``housing_path`` into a DataFrame.

    Args:
        housing_path: Directory containing ``housing.csv``.

    Returns:
        The result of ``pd.read_csv`` on that file.
    """
    return pd.read_csv(os.path.join(housing_path, 'housing.csv'))

housing = load_housing_data()
housing.head(5)

In [None]:
housing.info()

In [None]:
# print(housing['ocean_proximity'].value_counts())
housing['housing_median_age'].value_counts().head()

In [None]:
housing.describe()

In [None]:
### show in histogram (for numerical attributes)

In [None]:
%matplotlib inline
import matplotlib.pyplot as plt

housing.hist(bins=50, figsize=(20, 15))
plt.show()

In [None]:
# to make the notebook's output identical at every run
np.random.seed(42)

In [None]:
def split_train_test(data, test_ratio=0.5):
    """Randomly partition ``data`` into a training part and a test part.

    Row positions are shuffled with ``np.random.permutation``; the first
    ``int(len(data) * test_ratio)`` shuffled positions form the test set and
    the remaining positions form the training set.

    Args:
        data: Object supporting ``len`` and positional ``iloc`` indexing.
        test_ratio: Fraction of rows assigned to the test set.

    Returns:
        A ``(train, test)`` tuple selected from ``data`` via ``iloc``.
    """
    n_rows = len(data)
    order = np.random.permutation(n_rows)
    n_test = int(n_rows * test_ratio)
    test_positions, train_positions = order[:n_test], order[n_test:]
    return data.iloc[train_positions], data.iloc[test_positions]

In [None]:
train_set, test_set = split_train_test(housing, 0.2)

len(test_set)

In [None]:
from zlib import crc32

def test_set_check(identifier, test_ratio):
    return crc32(np.int64(identifier)) & 0xffffffff < test_ratio * 2**32

def split_train_set_by_id(data, test_ratio, id_column):
    """Split ``data`` into train/test parts based on a per-row identifier.

    Each value in ``data[id_column]`` is passed to ``test_set_check``; rows
    for which it returns True go to the test part, all others to the
    training part.

    Returns:
        A ``(train, test)`` tuple selected from ``data`` via ``loc``.
    """
    def _belongs_to_test(id_):
        return test_set_check(id_, test_ratio)

    test_mask = data[id_column].apply(_belongs_to_test)
    train_part = data.loc[~test_mask]
    test_part = data.loc[test_mask]
    return train_part, test_part

In [None]:
from sklearn.model_selection import train_test_split

train_set, test_set = train_test_split(housing, test_size=0.2, random_state=42)

test_set.head()

In [None]:
housing['median_income'].hist()
print(housing['median_income'].value_counts())
# len(housing)
nan_median_income = housing[housing['median_income'].isna()]
len(nan_median_income)

In [None]:
housing['income_cat'] = pd.cut(
    housing['median_income'],
    bins=[0., 1.5, 3.0, 4.5, 6, np.inf],
    labels=[1, 2, 3, 4, 5]
)

housing['income_cat'].value_counts()

In [None]:
housing['income_cat'].hist()

In [None]:
from sklearn.model_selection import StratifiedShuffleSplit

split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)

for train_index, test_index in split.split(housing, housing['income_cat']):
    strat_train_set = housing.iloc[train_index]
    strat_test_set = housing.iloc[test_index]

In [None]:
# Category proportions within the stratified test set: counts and the
# denominator must come from the same DataFrame to yield proportions.
strat_test_set['income_cat'].value_counts() / len(strat_test_set)

In [None]:
housing['income_cat'].value_counts() / len(housing)

In [None]:
def income_cat_proportions(data):
    """Return each ``income_cat`` value's count divided by ``len(data)``.

    Args:
        data: DataFrame with an ``income_cat`` column.

    Returns:
        A Series indexed by category holding count / number of rows.
    """
    category_counts = data['income_cat'].value_counts()
    total_rows = len(data)
    return category_counts / total_rows

train_set, test_set = train_test_split(housing, test_size=0.2, random_state=42)

compare_props = pd.DataFrame({
    "Overall": income_cat_proportions(housing),
    "Stratified": income_cat_proportions(strat_test_set),
    "Random": income_cat_proportions(test_set)
}).sort_index()

compare_props['Rand. %error'] = 100 * compare_props['Random'] / compare_props['Overall'] - 100
compare_props['Strat. %error'] = 100 * compare_props['Stratified'] / compare_props['Overall'] - 100

compare_props

In [None]:
for  set_ in (strat_train_set, strat_test_set):
    set_.drop('income_cat', axis=1, inplace=True)

### Discover and Visualize data to gain insight

In [None]:
housing = strat_train_set.copy()

### Visualizing Geographical data

In [None]:
housing.plot(kind='scatter', x='longitude', y='latitude')

In [None]:
housing.plot(kind='scatter', x='longitude', y='latitude', alpha=0.1)

In [None]:
housing.plot(kind='scatter', x='longitude', y='latitude', alpha=0.4,
             s=housing['population']/100, label='population', figsize=(10, 7),
             c='median_house_value', cmap=plt.get_cmap('jet'), colorbar=True,
             sharex=False)
plt.legend()

In [None]:
# Download the california image
PROJECT_ROOT_DIR = os.getcwd()
images_path = os.path.join(PROJECT_ROOT_DIR, 'images')
os.makedirs(images_path, exist_ok=True)
DOWNLOAD_ROOT = "https://raw.githubusercontent.com/ageron/handson-ml2/master/"
filename = 'california.png'
print('Downloading....', filename)
url = DOWNLOAD_ROOT + "images/end_to_end_project/" + filename
urllib.request.urlretrieve(url, os.path.join(images_path, filename))


In [None]:
import matplotlib.image as mpimg

# Load the image stored at images_path/filename.
california_img = mpimg.imread(os.path.join(images_path, filename))
# Scatter plot of the districts: marker size is population / 100 and marker
# colour comes from median_house_value through the 'jet' colormap. The
# automatic colorbar is disabled because one is built explicitly below.
ax = housing.plot(kind='scatter', x='longitude', y='latitude', figsize=(10, 7),
                  s=housing['population']/100, label='Population',
                  c='median_house_value', cmap=plt.get_cmap('jet'),
                  colorbar=False, alpha=0.4)
# Draw the image semi-transparently; `extent` gives the x/y data-coordinate
# bounds the image is stretched over (x is longitude, y is latitude here).
plt.imshow(california_img, extent=[-124.55, -113.80, 32.45, 42.05], alpha=0.5,
           cmap=plt.get_cmap("jet"))
plt.ylabel('Latitude', fontsize=14)
plt.xlabel('Longitude', fontsize=14)

# Build the colorbar: 11 evenly spaced prices between the min and max house
# value, positioned at price / max price and labelled in thousands of dollars.
prices = housing['median_house_value']
tick_values = np.linspace(prices.min(), prices.max(), 11)
cbar = plt.colorbar(ticks=tick_values/prices.max())
cbar.ax.set_yticklabels(["$%dk"%(round(v/1000)) for v in tick_values], fontsize=14)
cbar.set_label('Median House Value', fontsize=16)

plt.legend(fontsize=16)
plt.show()

#### Look for the Correlations
Correlation is a statistical measure that tells you how strongly two variables are related to each other — and in which direction (positive or negative).

In [None]:
corr_matrix = housing.corr(numeric_only=True)
corr_matrix["median_house_value"].sort_values(ascending=False)



In [None]:
# from pandas.tools.plotting import scatter_matrix
from pandas.plotting import scatter_matrix

attributes = ['median_house_value', 'median_income', 'total_rooms', 'housing_median_age']
scatter_matrix (housing[attributes], figsize=(12, 8))

In [None]:
housing.plot(kind='scatter', x='median_income', y='median_house_value', alpha=0.1)
plt.axis([0, 16, 0, 550000])

#### Experimenting with attribute combinations

In [None]:
housing['rooms_per_household'] = housing['total_rooms'] / housing['households']
housing['bedrooms_per_room'] = housing['total_bedrooms'] / housing['total_rooms']
housing['population_per_household'] = housing['population'] / housing['households']

In [None]:
corr_matrix = housing.corr(numeric_only=True)
corr_matrix['median_house_value'].sort_values(ascending=False)

In [None]:
housing.plot(kind='scatter', x='rooms_per_household', y='median_house_value')
plt.axis([0, 5, 0, 520000])
plt.show()

In [None]:
housing.describe()

#### Prepare the data for ML algorithms

In [None]:
# remove 'median_house_value' from the training set
# store it in another variables
housing = strat_train_set.drop('median_house_value', axis=1) # drop labels for training set
housing_labels = strat_train_set['median_house_value'].copy()

#### Data cleaning

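There are three options for handling the missing `total_bedrooms` values: get rid of the corresponding districts, get rid of the whole attribute, or set the missing values to some value (zero, the mean, the median, etc.). To demonstrate each of them, let's create a copy of the housing dataset that keeps only the rows containing at least one null. Then it will be easier to visualize exactly what each option does: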

In [None]:
# # Get rid of the corresponding districts.
# housing.dropna(subset=['total_bedrooms'])
# # Get rid of the whole attribute.
# housing.drop('total_bedrooms', axis=1)

# # Set the values to some value (zero, the mean, the median, etc.)
# median = housing['total_bedrooms'].median()
# housing['total_bedrooms'].fillna(median)
# median


In [None]:
sample_incomplete_rows = housing[housing.isnull().any(axis=1)].head()
sample_incomplete_rows

In [None]:
# option 1
# Get rid of the corresponding districts.
# dropna returns a new DataFrame, so it is kept as the cell's last expression
# for the notebook to display the result; sample_incomplete_rows itself stays
# unchanged. A list is passed to `subset` so this also works on pandas
# versions that do not accept a bare string there.
sample_incomplete_rows.dropna(subset=['total_bedrooms'])

In [None]:
# option 2
# Get rid of the whole attribute.
# Do not drop the 'total_bedrooms' column, just display the DataFrame as is
# sample_incomplete_rows = sample_incomplete_rows.drop('total_bedrooms', axis=1)
# sample_incomplete_rows

In [None]:
# option 3
# Set the values to some value (zero, the mean, the median, etc.)
median = housing['total_bedrooms'].median()
# Rebind to the filled result instead of calling fillna(inplace=True) on a
# column selection: that form is chained assignment, which emits warnings on
# recent pandas and does not update the DataFrame under copy-on-write.
sample_incomplete_rows = sample_incomplete_rows.fillna({'total_bedrooms': median})
sample_incomplete_rows

In [None]:
from sklearn.impute import SimpleImputer

imputer = SimpleImputer(strategy='median')

Remove the text attribute because median can only be calculated on numerical attributes

In [None]:
housing_num = housing.drop("ocean_proximity", axis=1)
imputer.fit(housing_num)

In [None]:
imputer.statistics_

In [None]:
housing_num.median().values

##### Transform the training set
Now you can use this “trained” imputer to transform the training set by replacing
missing values with the learned medians:

In [None]:
X = imputer.transform(housing_num)

In [None]:
housing_tr = pd.DataFrame(X, columns=housing_num.columns, index=housing.index)
housing_tr.loc[sample_incomplete_rows.index.values]

In [None]:
imputer.strategy

In [None]:
housing_tr = pd.DataFrame(X, columns=housing_num.columns, index = housing_num.index)
housing_tr.head()

#### Handling Text and Categorical Attributes

NB: 
housing['ocean_proximity'] => single square brackets [] return a Series <br>
housing[['ocean_proximity']] => double square brackets [[]] return a DataFrame


In [None]:
housing_cat = housing[['ocean_proximity']]
# housing_cat.head(10)

In [None]:
from sklearn.preprocessing import OrdinalEncoder

ordinal_encoder = OrdinalEncoder()
housing_cat_encoded = ordinal_encoder.fit_transform(housing_cat)
housing_cat_encoded[:10]

In [None]:
ordinal_encoder.categories_

##### OneHotEncoder

In [None]:
from sklearn.preprocessing import OneHotEncoder

cat_encoder = OneHotEncoder()
housing_cat_1hot = cat_encoder.fit_transform(housing_cat)
housing_cat_1hot.toarray()

#### Custom transformers
Let's create a custom transformer to add extra attributes

In [None]:
from sklearn.base import BaseEstimator, TransformerMixin

# column index
rooms_ix, bedrooms_ix, population_ix, households_ix = 3, 4, 5, 6

class CombinedAttributesAdder(BaseEstimator, TransformerMixin):
    """Transformer that appends ratio columns to a 2-D array.

    The columns used are selected with the module-level indices
    ``rooms_ix``, ``bedrooms_ix``, ``population_ix`` and ``households_ix``.

    Args:
        add_bedrooms_per_room: When True, a bedrooms / rooms column is
            appended after the two per-household columns.
    """

    def __init__(self, add_bedrooms_per_room=True):
        self.add_bedrooms_per_room = add_bedrooms_per_room

    def fit(self, X, y=None):
        """Nothing is learned from the data; return the transformer itself."""
        return self

    def transform(self, X):
        """Return ``X`` with the derived ratio columns appended on the right.

        Appended in order: rooms / households, population / households and,
        if ``add_bedrooms_per_room`` is set, bedrooms / rooms.
        """
        households = X[:, households_ix]
        rooms = X[:, rooms_ix]
        extra_columns = [rooms / households, X[:, population_ix] / households]
        if self.add_bedrooms_per_room:
            extra_columns.append(X[:, bedrooms_ix] / rooms)
        # np.c_ receives the same tuple as the literal np.c_[X, a, b, ...] form.
        return np.c_[(X, *extra_columns)]
        
attr_adder = CombinedAttributesAdder(add_bedrooms_per_room=False)
housing_extra_attributes = attr_adder.transform(housing.values)
housing_extra_attributes