In [None]:
# Define fetch_housing_data, which downloads the housing.tgz archive from a URL
# into a local directory and extracts it there (this yields housing.csv).

import os         # create and read local files
import tarfile    # extract from tarfiles
import requests   # download from url

# Base URL under which the dataset files are published.
DOWNLOAD_ROOT = "https://raw.githubusercontent.com/ageron/handson-ml/master/"
# Full URL of the compressed housing archive.
HOUSING_URL = "".join([DOWNLOAD_ROOT, "datasets/housing/housing.tgz"])
# Local directory (relative to the working directory) that receives the data.
HOUSING_PATH = os.path.join("datasets", "housing")

def fetch_housing_data(housing_url=HOUSING_URL, housing_path=HOUSING_PATH):
    """Download the housing archive and unpack it into a local directory.

    Parameters
    ----------
    housing_url : str
        URL of the (gzipped) tar archive to download.
    housing_path : str
        Directory that receives ``housing.tgz`` and the extracted members.
        It is created if it does not exist yet.

    Raises
    ------
    requests.HTTPError
        If the server answers with a 4xx/5xx status code.
    tarfile.TarError
        If the downloaded file cannot be read as a tar archive.
    """
    # exist_ok avoids the check-then-create race of a separate isdir() test.
    os.makedirs(housing_path, exist_ok=True)
    tgz_path = os.path.join(housing_path, "housing.tgz")

    # The context manager releases the connection even if writing fails.
    with requests.get(housing_url, stream=True) as response:
        # Fail here instead of saving an HTML error page as "housing.tgz".
        response.raise_for_status()
        with open(tgz_path, 'wb') as fd:
            for chunk in response.iter_content(chunk_size=8192):
                fd.write(chunk)

    with tarfile.open(tgz_path) as housing_tgz:
        # The archive comes from the network: where the interpreter supports
        # extraction filters, refuse members that would escape housing_path.
        if hasattr(tarfile, "data_filter"):
            housing_tgz.extractall(path=housing_path, filter="data")
        else:
            housing_tgz.extractall(path=housing_path)    # this extracts housing.csv


In [None]:
# Download and unpack the dataset using the default URL and target directory
# (HOUSING_URL and HOUSING_PATH). This cell performs network and file I/O.
fetch_housing_data()

In [None]:
# Define load_housing_data, which reads the local
# housing.csv file into a pandas DataFrame.

import pandas as pd

def load_housing_data(housing_path=HOUSING_PATH):
    """Read ``housing.csv`` located in ``housing_path`` and return it as a DataFrame."""
    return pd.read_csv(os.path.join(housing_path, "housing.csv"))
    
    

In [None]:
# Load the CSV into the DataFrame `housing` and display its first rows
# (head() shows 5 rows by default).
housing = load_housing_data()
housing.head()   # last expression of the cell, so the notebook renders it


In [None]:
# Print a summary of the frame: number of rows, column names,
# column dtypes and non-null counts.
housing.info()


In [None]:
# Count how often each distinct value occurs in the "ocean_proximity" column.
housing["ocean_proximity"].value_counts()

In [None]:
# Simple summary statistics (count, mean, std, min, quartiles, max)
# for the numeric columns.
housing.describe()

In [None]:
# not part of the handson book:
# a first example using matplotlib
import matplotlib.pyplot as plt
import numpy as np

# Sample data: a sine wave shifted up by one, evaluated on [0, 2) in steps of 0.01.
t = np.arange(0.0, 2.0, 0.01)
s = np.sin(2 * np.pi * t) + 1

# plt.subplots() without arguments creates a figure holding a single axes;
# it is shorthand for fig = plt.figure() followed by ax = fig.add_subplot(111).
fig, ax = plt.subplots()
ax.plot(t, s)

# Label the axes and the plot one property at a time.
ax.set_xlabel('time (s)')
ax.set_ylabel('voltage (mV)')
ax.set_title('About as simple as it gets, folks')

# Optional extras, left disabled: ax.grid() draws a grid and
# fig.savefig("test.png") writes the figure to disk.
plt.show()

In [None]:
# Draw one histogram with 50 bins for every numeric column of housing,
# all on a single 20 x 15 inch figure.
housing.hist(bins=50, figsize=(20,15))
plt.show()


In [None]:
# Split the dataset into a training set and a test set.
# The following function randomly sets aside a given ratio of the data points as test set;
# a constant seed makes sure that the same choice is made on every run.
import numpy as np

def split_random(data, testRatio, seed=42):
    """Randomly split ``data`` into a training part and a test part.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame whose rows are distributed over the two parts.
    testRatio : float
        Fraction of the rows (between 0 and 1) that goes into the test part;
        the row count is truncated to an integer.
    seed : int, optional
        Seed of the private random generator. The default of 42 reproduces
        the permutation of the former ``np.random.seed(42)`` implementation.

    Returns
    -------
    (pandas.DataFrame, pandas.DataFrame)
        The training rows and the test rows, in shuffled order.

    Raises
    ------
    ValueError
        If ``testRatio`` lies outside the interval [0, 1].
    """
    if not 0 <= testRatio <= 1:
        raise ValueError("testRatio must be between 0 and 1, got %r" % (testRatio,))
    # A private generator keeps the split reproducible without reseeding
    # numpy's global random state as a side effect of every call.
    rng = np.random.RandomState(seed)
    randomIndices = rng.permutation(len(data))
    testSize = int(len(data) * testRatio)
    testIndices = randomIndices[:testSize]
    trainIndices = randomIndices[testSize:]
    return data.iloc[trainIndices], data.iloc[testIndices]

# Set aside 20% of the rows as test set and report the size of both parts.
trainSet, testSet = split_random(housing, 0.2)
print(len(trainSet), "train and ", len(testSet), " test")


In [None]:
# to have a split that is also stable when part of the data is 
# deleted / completed / added it is a nice idea to base the split
# on the hash of an ID, that here we first construct from a stable attribute

import hashlib

# Build a separate frame that carries the synthetic "id" column. assign()
# returns a new DataFrame, so the original housing frame is left untouched
# (a plain "housing_withID = housing" would only create a second name for it).
housing_withID = housing.assign(id=housing["longitude"] * 1000 + housing["latitude"])


def isInTest(id, testRatio, hash):
    """Decide from the hash of ``id`` whether a row belongs to the test set.

    ``id`` is converted to a 64-bit integer and hashed with ``hash`` (a
    hashlib-style constructor). The row is a test row when the last byte of
    the digest (0..255) is smaller than ``256 * testRatio``.
    """
    hashedID = hash(np.int64(id))
    return hashedID.digest()[-1] < 256 * testRatio


def split_byID(data, testRatio, idColumn, hash=hashlib.md5):
    """Split ``data`` into (train, test) based on the hash of an ID column.

    Membership depends only on a row's ID, so a row keeps its side of the
    split when other rows are added to or removed from the data.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to split.
    testRatio : float
        Approximate fraction of IDs assigned to the test set.
    idColumn : str
        Name of the column in ``data`` that holds the IDs.
    hash : callable, optional
        hashlib-style constructor used to hash the IDs (default: md5).

    Returns
    -------
    (pandas.DataFrame, pandas.DataFrame)
        The training rows and the test rows.
    """
    ids = data[idColumn]
    # testIDs is a boolean Series aligned with data (True marks a test row).
    # astype(bool) keeps the mask boolean even when data has no rows.
    testIDs = ids.apply(lambda id_: isInTest(id_, testRatio, hash)).astype(bool)
    # "~" is the logical NOT for boolean masks; unary minus is not.
    return data.loc[~testIDs], data.loc[testIDs]


trainSet, testSet = split_byID(housing_withID, 0.3, "id")
print(len(trainSet), "train and ", len(testSet), " test")
    

In [None]:
# a predefined function in sklearn does the same as our first, i.e. it 
# is not stable against data changes. It has the advantage that it can be applied
# to >1 table to select from each the same rows (not shown here)

from sklearn.model_selection import train_test_split
# Set aside 25% of the rows as test set; the fixed random_state makes the
# split reproducible across runs.
trainSet, testSet = train_test_split(housing, test_size=0.25, random_state=42)
print(len(trainSet), "train and ", len(testSet), " test")

In [None]:
# To get a test set that is (guaranteed) representative with respect to the
# income attribute, first derive a categorical income column: divide the
# median income by 1.5, round up to the next integer, and merge every value
# of 5 or more into the single category 5.0.
housing["income_cat"] = np.ceil(housing["median_income"] / 1.5)
# Assign the capped column back explicitly. Calling where(..., inplace=True)
# on housing["income_cat"] modifies a temporary object; under pandas
# Copy-on-Write that change never reaches the housing frame.
housing["income_cat"] = housing["income_cat"].where(housing["income_cat"] < 5, 5.0)
housing["income_cat"].hist()
plt.show()


In [None]:
import sklearn
# Import the class from its submodule explicitly: a bare "import sklearn" is
# not guaranteed to load sklearn.model_selection, so attribute access on the
# package only works if another cell imported that submodule earlier.
from sklearn.model_selection import StratifiedShuffleSplit

splitter = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
mySplit = splitter.split(housing, housing["income_cat"])
# mySplit is a generator, i.e. an iterable that is consumed with "for ... in".
# Each item is a pair (train_index, test_index); with n_splits=1 the loop body
# runs exactly once, which is equivalent to:
#   train_index, test_index = next(mySplit)
for train_index, test_index in mySplit:
    # The index arrays hold row positions, so both sets are selected with
    # iloc; loc would give the same rows only while the frame has its
    # default 0..n-1 row labels.
    strat_train_set = housing.iloc[train_index]
    strat_test_set = housing.iloc[test_index]
    print("len train index:" , len(train_index))

# Check that the split is representative: for every income category print
# its share of the training set next to its share of the test set.
# value_counts() orders its result by frequency, so each share is looked up
# by category label (loc) and not by position (iloc).
train = strat_train_set["income_cat"].value_counts().div(len(strat_train_set))
test = strat_test_set["income_cat"].value_counts().div(len(strat_test_set))
for i in (1.0, 2.0, 3.0, 4.0, 5.0):
    print("%5d %10.4f %10.4f" % (i, train.loc[i], test.loc[i]))
    


