In [None]:
# define the function fetch_housing_data that downloads a tgz-archive
# with the housing data from a url and extracts it into a local directory

import os         # create and read local files
import tarfile    # extract from tarfiles
import requests   # download from url

# root of the remote repository and the archive url derived from it
DOWNLOAD_ROOT = "https://raw.githubusercontent.com/ageron/handson-ml/master/"
HOUSING_URL = f"{DOWNLOAD_ROOT}datasets/housing/housing.tgz"
# local directory (relative to the working directory) used for the download
HOUSING_PATH = os.path.join("datasets", "housing")

def fetch_housing_data(housing_url=HOUSING_URL, housing_path=HOUSING_PATH):
    """Download the housing archive and unpack it into ``housing_path``.

    Parameters
    ----------
    housing_url : str
        URL of the tar archive to download.
    housing_path : str
        Local directory that receives ``housing.tgz`` and the extracted
        content; it is created when it does not exist yet.

    Raises
    ------
    requests.HTTPError
        If the server answers with an error status. Without this check the
        body of an error page would be saved and handed to ``tarfile``.
    """
    # exist_ok replaces the separate isdir() check and its check-then-create race
    os.makedirs(housing_path, exist_ok=True)
    tgz_path = os.path.join(housing_path, "housing.tgz")

    # stream the response into the local tar-file; the with-blocks release the
    # connection and the file handle even when the download fails half-way
    with requests.get(housing_url, stream=True) as response:
        response.raise_for_status()
        with open(tgz_path, 'wb') as fd:
            for chunk in response.iter_content(chunk_size=64 * 1024):
                fd.write(chunk)

    # open the local tar-file and extract its content in the same directory
    with tarfile.open(tgz_path) as housing_tgz:
        if hasattr(tarfile, "data_filter"):
            # extraction filters refuse members that would be written outside
            # housing_path; the hasattr check keeps older Pythons working
            housing_tgz.extractall(path=housing_path, filter="data")
        else:
            housing_tgz.extractall(path=housing_path)


In [None]:
# download only when the extracted csv is not on disk yet, so that re-running
# the notebook neither needs the network nor fetches the archive again
if not os.path.isfile(os.path.join(HOUSING_PATH, "housing.csv")):
    fetch_housing_data()

In [None]:
# define the function load_housing_data
# that constructs a pandas DataFrame from the local csv-file

import pandas as pd

def load_housing_data(housing_path=HOUSING_PATH):
    """Read ``housing.csv`` from ``housing_path`` and return it as a DataFrame."""
    return pd.read_csv(os.path.join(housing_path, "housing.csv"))
    
# numpy and matplotlib will be required in many cells
import numpy as np
import matplotlib.pyplot as plt 
    

In [None]:
# read the local csv into the DataFrame `housing` and display its first rows
housing = load_housing_data()
housing.head()   # last expression of the cell, so the rows are displayed


In [None]:
housing.info()   # prints the number of rows and, per column, its type and non-null count


In [None]:
housing["ocean_proximity"].value_counts()  # number of rows for each distinct value of this column

In [None]:
housing.describe()      # summary statistics of the numeric columns

In [None]:
# show one histogram (50 bins) for every numeric column in housing
housing.hist(bins=50, figsize=(20,15))
plt.show()


In [None]:
# split the dataset into a training set and a test set
# the following function randomly separates a given ratio of the data points;
# a constant seed makes sure that the same choice is made on the next run
import numpy as np

def split_random(data, testRatio, seed=42):
    """Randomly split ``data`` into a training and a test set.

    Parameters
    ----------
    data : pandas.DataFrame
        Rows to split; they are selected by position, so any index works.
    testRatio : float
        Fraction of the rows that goes to the test set (rounded down).
    seed : int, default 42
        Seed of the private random generator. The default gives the same
        permutation as ``np.random.seed(42)`` followed by
        ``np.random.permutation``.

    Returns
    -------
    (train, test) : tuple of pandas.DataFrame
    """
    # a private generator keeps the split repeatable without reseeding
    # numpy's global random state as a side effect of every call
    rng = np.random.RandomState(seed)
    shuffled = rng.permutation(len(data))
    testSize = int(len(data) * testRatio)
    return data.iloc[shuffled[testSize:]], data.iloc[shuffled[:testSize]]

# hold out 20% of the rows as test set and report the size of both parts
trainSet, testSet = split_random(housing, 0.2)
print(len(trainSet), "train and ", len(testSet), " test")


In [None]:
# to have a split that is also stable when part of the data is 
# deleted / completed / added it is a nice idea to base the split
# on the hash of an ID, that here we first construct from a stable attribute

import hashlib

# NOTE(review): this assignment binds a second name to the same DataFrame, it
# does not copy it, so the "id" column added below also appears in `housing`
housing_withID = housing
# id built from the coordinates: longitude * 1000 + latitude
housing_withID["id"] = housing["longitude"]*1000 + housing["latitude"]

# a function that returns true for a given proportion of ids
# this is called by the lambda expression below to filter an array of ids
def isInTest(id, testRatio, hash):
    """Decide from the hash of ``id`` whether a row belongs to the test set.

    ``id`` is converted with ``np.int64`` (a fractional part is dropped) and
    hashed with ``hash``, a hashlib-style constructor such as ``hashlib.md5``.
    The last byte of the digest, a value in 0..255, is compared with
    ``256 * testRatio``.

    Returns True when the id falls into the test set.
    """
    lastByte = hash(np.int64(id)).digest()[-1]
    threshold = 256 * testRatio
    return lastByte < threshold

def split_byID(data, testRatio, idColumn, hash=hashlib.md5):
    """Split ``data`` into training and test rows based on the hash of an id.

    Parameters
    ----------
    data : pandas.DataFrame
        Rows to split.
    testRatio : float
        Proportion of ids that is assigned to the test set.
    idColumn : str
        Name of the column in ``data`` that holds the ids.
    hash : callable, default hashlib.md5
        Hash constructor passed on to ``isInTest``.

    Returns
    -------
    (train, test) : tuple of pandas.DataFrame
    """
    ids = data[idColumn]
    # one boolean per row, aligned with the index of data; astype(bool) keeps
    # the mask boolean even when data has no rows
    inTest = ids.apply(lambda id_: isInTest(id_, testRatio, hash)).astype(bool)
    # "~" is the element-wise logical NOT of a boolean mask; unary "-" is not
    # defined for numpy booleans
    return data.loc[~inTest], data.loc[inTest]
    
# split the frame that was given the "id" column; about 30% of the ids go to the test set
trainSet, testSet = split_byID(housing_withID, 0.3, "id")
print(len(trainSet), "train and ", len(testSet), " test")
    

In [None]:
# a predefined function in sklearn does the same as our first, i.e. it 
# is not stable against data changes. It has the advantage that it can be applied
# to >1 table to select from each the same rows (not shown here)

from sklearn.model_selection import train_test_split
# hold out 25% of the rows; the fixed random_state makes the split repeatable
trainSet, testSet = train_test_split(housing, test_size=0.25, random_state=42)
print(len(trainSet), "train and ", len(testSet), " test")

In [None]:

# in order to have a test set that is (guaranteed) representative with respect
# to the income attribute, we first add the income as a category: the
# (relative) median income is divided by 1.5 and rounded up (np.ceil), and
# every category of 5 and above is merged into the category 5.0
income_cat = np.ceil(housing["median_income"] / 1.5)
# assign the capped values back as a whole column: calling
# where(..., inplace=True) on housing["income_cat"] acts on a temporary object
# under pandas' copy-on-write and would leave the frame unchanged
housing["income_cat"] = income_cat.where(income_cat < 5, 5.0)
housing["income_cat"].hist()
plt.show()


In [None]:
import sklearn
# import the class directly: attribute access on the bare package only works
# when the submodule sklearn.model_selection has already been imported
from sklearn.model_selection import StratifiedShuffleSplit

splitter = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
mySplit = splitter.split(housing, housing["income_cat"])
# mySplit is a generator: every iteration yields one (train_index, test_index)
# pair, and with n_splits=1 the loop body runs exactly once
for train_index, test_index in mySplit:
    # both index sets hold row positions, so iloc is used for train and test
    # alike (loc only gives the same rows while the row labels are 0..n-1)
    strat_train_set = housing.iloc[train_index]
    strat_test_set = housing.iloc[test_index]
    print("len train index:" , len(train_index))

# to prove the representative split, compare the share of every income
# category in both sets.
# watch out: value_counts() sorts its result by frequency, so we must use loc
# (and not iloc) to get our categories in the order that we expect
train = strat_train_set["income_cat"].value_counts() / len(strat_train_set)
test = strat_test_set["income_cat"].value_counts() / len(strat_test_set)
for i in [1.0, 2.0, 3.0, 4.0, 5.0]:
    print("%5d %10.4f %10.4f" % (i, train.loc[i], test.loc[i]))
    




In [None]:
# visualize data, first the geographical data
# work on a copy: columns added to `housing` during exploration must not end
# up in strat_train_set, from which the training data is prepared later
housing = strat_train_set.copy()
# marker size follows log(population), marker colour the median house value
housing.plot(kind="scatter", x="longitude", y="latitude", alpha=0.1,
             s=np.log(housing["population"]), c="median_house_value",
             cmap=plt.get_cmap("jet"), colorbar=True,
             label="population")


In [None]:
# calculate the correlation matrix for all numeric attributes, and sort for
# correlation with house_value. The text column ocean_proximity is left out
# explicitly, because corr() raises an error on it in pandas >= 2.0
correlation = housing.select_dtypes(include=[np.number]).corr()
correlation["median_house_value"].sort_values()

In [None]:
# produce the matrix of scatterplots, here only for three attributes
from pandas.plotting import scatter_matrix
# one scatter plot for every pair of the three selected attributes
attributes=["median_house_value", "median_income", "total_rooms"]
scatter_matrix(housing[attributes], figsize=(12,8))
plt.show()

In [None]:
# note that the most important scatter plot (income vs. value) shows quirks in
# the data, i.e. horizontal lines that must have an artificial origin
housing.plot(kind="scatter", x="median_income", y="median_house_value", alpha=0.1)

In [None]:
# since neither the total number of rooms / bedrooms (per district) nor the
# total population seem meaningful, we combine them with the number of
# households and recalculate the correlation to see that (obviously)
# rooms-per-house correlates and (surprisingly) bedrooms-per-room has a strong
# (negative) correlation.
# assign() returns a new frame, so the frame that `housing` referred to before
# (the stratified training set) does not receive the derived columns
housing = housing.assign(
    rooms_per_house=housing["total_rooms"] / housing["households"],
    bedrooms_per_room=housing["total_bedrooms"] / housing["total_rooms"],
    population_per_house=housing["population"] / housing["households"],
)
# only numeric columns: corr() raises on the text column ocean_proximity in pandas >= 2.0
housing.select_dtypes(include=[np.number]).corr()["median_house_value"].sort_values()



In [None]:
# Prepare the data: we start with our stratified split, and now we also
# separate the predictors ("x-data") from the labels ("y-value")
housing = strat_train_set.drop("median_house_value", axis=1)
housing_labels = strat_train_set["median_house_value"].copy()

# Most columns are complete, but for some districts the column total_bedrooms
# is missing. We could crop the complete column (with drop), use "dropna" to
# remove the districts that don't have this value, or use "fillna" to complete
# the missing values with the median of the others.
# (the correct approach is to later use the exact same median of the training
# data for missing test data)
housing_noNA = housing.dropna(subset=["total_bedrooms"])   # doesn't change housing itself
# fill only the column the median was computed from: a frame-wide fillna would
# write the bedroom median into the missing values of every other column too
bedrooms_median = housing["total_bedrooms"].median()
housing["total_bedrooms"] = housing["total_bedrooms"].fillna(bedrooms_median)
housing.info()

In [None]:
# the same processing is implemented in sklearn by the SimpleImputer class. We
# must remove ocean_proximity because it is non-numeric and would lead to an
# error; also the imputer produces a numpy-array which we must transform back
# into a dataframe. The important point: the imputer is a Transformer, and
# data-preparation is typically performed in a pipeline of different
# transformers, each of them is instantiated, then prepared with
# transformer.fit() (here: to calculate the medians) and applied with
# transformer.transform()

from sklearn.impute import SimpleImputer 
housing_num = housing.drop("ocean_proximity", axis=1)
imputer = SimpleImputer(strategy="median")
imputer.fit(housing_num)
# check: imputer.statistics_ holds the column medians (a bare expression in
# the middle of a cell would be computed and silently thrown away)
assert np.allclose(imputer.statistics_, housing_num.median().values)
# hand over the original row labels as well, otherwise the result is
# renumbered 0..n-1 and no longer lines up with housing / housing_labels
housing_fillna = pd.DataFrame(imputer.transform(housing_num),
                              columns=housing_num.columns,
                              index=housing_num.index)


In [None]:
# TODO: add ocean proximity
# factorize maps every distinct text value to an integer code and also
# returns the list of distinct values
prox_col, prox_cats = housing["ocean_proximity"].factorize()
prox_col     # array([0, 0, 1, 2, 0, 2, 0, 2, 0, 0, .....])
prox_cats    # Index(['<1H OCEAN', 'NEAR OCEAN', 'INLAND', 'NEAR BAY', 'ISLAND'], dtype='object')


# ML-routines would however treat neighbouring codes (e.g. 3 and 4) as more
# similar than distant ones (e.g. 3 and 5), which is not the case here. For
# unrelated categories a one-hot encoding is better: each value is replaced by
# a row with exactly one entry set, stored as a scipy sparse array.
from sklearn.preprocessing import OneHotEncoder
# categories='auto': the categories are taken from the data, so if all values
# are either 3 or 17 the one-hot encoding has length 2, not 17
encoder = OneHotEncoder(categories='auto')
prox_col = np.reshape(prox_col, (-1, 1))           # 2D array with one column
prox_col_oneHot = encoder.fit_transform(prox_col)

# the old method required converting text to ints first and then reshaping;
# the conversion to ints is no longer needed, the reshape (which requires a
# numpy.array) still is:
encoder = OneHotEncoder()
prox_text = np.array(housing["ocean_proximity"])
prox_col_oneHot = encoder.fit_transform(prox_text.reshape(-1, 1))

# toarray() produces a conventional dense array from the scipy sparse array,
# here for illustration purposes only:
print(prox_col_oneHot.toarray())
encoder.categories_




In [None]:
# TODO: continue from the section "Custom Transformers"