In [None]:
# define the function fetch_housing_data that downloads the housing.tgz
# archive from a url into a local directory and extracts housing.csv from it

import os         # create and read local files
import tarfile    # extract from tarfiles
import requests   # download from url

# Local directory (relative to the current working directory) for the dataset.
HOUSING_PATH = os.path.join("datasets", "housing")
# Root of the raw-file URLs of the ageron/handson-ml GitHub repository (master branch).
DOWNLOAD_ROOT = "https://raw.githubusercontent.com/ageron/handson-ml/master/"
# Full URL of the housing archive below that root.
HOUSING_URL = f"{DOWNLOAD_ROOT}datasets/housing/housing.tgz"

def fetch_housing_data(housing_url=HOUSING_URL, housing_path=HOUSING_PATH):
    """Download the housing archive and unpack it into a local directory.

    The archive is streamed to ``<housing_path>/housing.tgz`` and then
    extracted into ``housing_path`` (this yields ``housing.csv``).

    Args:
        housing_url: URL of the tar archive to download.
        housing_path: Local directory that receives the archive and its
            extracted contents; it is created if it does not exist.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
        requests.RequestException: On connection problems or a timeout.
        tarfile.TarError: If the downloaded file is not a readable archive.
    """
    os.makedirs(housing_path, exist_ok=True)
    tgz_path = os.path.join(housing_path, "housing.tgz")

    # Using the response as a context manager releases the connection even if
    # writing fails; the timeout keeps a stalled server from hanging the cell.
    with requests.get(housing_url, stream=True, timeout=30) as response:
        # Fail right away on 404/500 instead of saving an error page as housing.tgz.
        response.raise_for_status()
        with open(tgz_path, 'wb') as fd:
            for chunk in response.iter_content(chunk_size=8192):
                fd.write(chunk)

    # The with-block closes the archive even when extraction raises.
    with tarfile.open(tgz_path) as housing_tgz:
        if hasattr(tarfile, "data_filter"):
            # Interpreters that ship extraction filters: refuse members that
            # would be written outside housing_path.
            housing_tgz.extractall(path=housing_path, filter="data")
        else:
            # Older interpreters do not accept the filter argument.
            housing_tgz.extractall(path=housing_path)    # this extracts housing.csv


In [None]:
# download housing.tgz with the default url and unpack it under HOUSING_PATH
fetch_housing_data()

In [None]:
# define the function load_housing_data
# that constructs a pandas DataFrame from the local csv-file

import pandas as pd

def load_housing_data(housing_path=HOUSING_PATH):
    """Read ``housing.csv`` from a local directory into a pandas DataFrame.

    Args:
        housing_path: Directory that contains ``housing.csv``.

    Returns:
        The DataFrame produced by ``pd.read_csv`` for that file.
    """
    return pd.read_csv(os.path.join(housing_path, "housing.csv"))
    
    

In [None]:
# create the DataFrame housing from the local csv-file and show its first rows
housing = load_housing_data()
housing.head()   # head() without an argument shows the first 5 rows


In [None]:
housing.info()   # prints a summary: number of rows, and each column's dtype and non-null count

In [None]:
housing["ocean_proximity"].value_counts()  # counts how often each distinct value occurs in this column

In [None]:
housing.describe()      # summary statistics (count, mean, std, min, quartiles, max) of the numeric columns

In [None]:
# not part of the handson book:
# a first example using matplotlib
import matplotlib.pyplot as plt
import numpy as np

# Sample data: a sine wave shifted up by 1, evaluated every 0.01 on [0, 2).
t = np.arange(0.0, 2.0, 0.01)
s = np.sin(2 * np.pi * t) + 1

# plt.subplots() with no arguments is shorthand for
# fig = plt.figure() followed by ax = fig.add_subplot(111).
fig, ax = plt.subplots()
ax.plot(t, s)

# Label the axes and the figure one property at a time.
ax.set_xlabel('time (s)')
ax.set_ylabel('voltage (mV)')
ax.set_title('About as simple as it gets, folks')
# ax.grid()
# fig.savefig("test.png")
plt.show()

In [None]:
# show one histogram per numeric column in housing
housing.hist(bins=50, figsize=(20,15))   # 50 bins per histogram; figsize is (width, height) in inches
plt.show()


In [None]:
# split the dataset into a training set and a test set

# select 3 random indices of an array w. size=10 (i.e. 30%)
np.random.seed(42)                         # fixed seed, so the permutation is the same on every run
randomIndices = np.random.permutation(10)  # the integers 0..9 in shuffled order
first3 = randomIndices[:3]                 # the first 3 of the shuffled indices
print(first3)

#
