In [6]:
import numpy as np

#function for splitting data into train set and test set 
#test ratio is percentage of data we want to keep for testing, often 20% (0.2)
def split_train_test(data, test_ratio, random_state=None):
    """Randomly split ``data`` into a train set and a test set.

    Parameters
    ----------
    data : pandas.DataFrame
        Dataset to split; rows are selected by position via ``data.iloc``.
    test_ratio : float
        Fraction of the rows to keep for testing, between 0 and 1
        (often 0.2, i.e. 20%).
    random_state : int, optional
        Seed for a private random generator so that the same split is
        produced on every call. When ``None`` (the default) the global NumPy
        random state is used, exactly as before, so the split changes from
        run to run unless ``np.random.seed`` was called beforehand.

    Returns
    -------
    tuple
        ``(train_set, test_set)``; the test set holds
        ``int(len(data) * test_ratio)`` rows and the train set all the others.

    Raises
    ------
    ValueError
        If ``test_ratio`` is not within [0, 1].
    """
    #a negative ratio would turn the slices below into "all but the last k" selections
    if not 0 <= test_ratio <= 1:
        raise ValueError(f"test_ratio must be between 0 and 1, got {test_ratio!r}")

    #random permutation of the numbers 0 .. len(data) - 1, used as shuffled row positions
    if random_state is None:
        shuffled_indices = np.random.permutation(len(data))
    else:
        shuffled_indices = np.random.RandomState(random_state).permutation(len(data))

    #test set size = test_ratio (e.g. 20 per cent) of the dataset size, rounded down
    test_set_size = int(len(data) * test_ratio)

    test_indices = shuffled_indices[:test_set_size] #first 20% (test_ratio) of the shuffled indices
    train_indices = shuffled_indices[test_set_size:] #remaining 80% of the shuffled indices

    #"Purely integer-location based indexing for selection by position":
    #data.iloc takes the list of row positions and collects the districts found at those positions
    return data.iloc[train_indices], data.iloc[test_indices]

In [10]:
import os
import tarfile
import urllib

#base URL under which the dataset files are hosted
DOWNLOAD_ROOT = "https://raw.githubusercontent.com/ageron/handson-ml2/master/"
#relative local directory (datasets/housing) in which the housing data is stored
HOUSING_PATH = os.path.join("datasets", "housing")
#full URL of the compressed housing archive
HOUSING_URL = DOWNLOAD_ROOT + "datasets/housing/housing.tgz"

#calling this function creates a datasets/housing directory in my workspace, downloads the
#housing.tgz file into it and then extracts the archive's contents (housing.csv) into this directory

def fetch_housing_data(housing_url = HOUSING_URL, housing_path = HOUSING_PATH):
    """Download the housing archive and unpack it into ``housing_path``.

    Parameters
    ----------
    housing_url : str
        URL of the ``.tgz`` archive to download.
    housing_path : str
        Local directory that receives both the downloaded ``housing.tgz``
        and the files extracted from it. It is created if it does not exist.

    Returns
    -------
    None
        The function is called for its side effects on the file system.
    """
    #"import urllib" alone does not guarantee that the urllib.request submodule
    #is loaded, so import it explicitly before using it
    import urllib.request

    os.makedirs(housing_path, exist_ok = True)
    tgz_path = os.path.join(housing_path, "housing.tgz")
    urllib.request.urlretrieve(housing_url, tgz_path)

    #the with-block closes the archive even if the extraction raises
    #NOTE(review): extractall() trusts the member paths stored in the archive,
    #so only use this with archives from a trusted source
    with tarfile.open(tgz_path) as housing_tgz:
        housing_tgz.extractall(path = housing_path)
    

In [11]:
import pandas as pd

def load_housing_data(housing_path = HOUSING_PATH):
    """Read ``housing.csv`` from ``housing_path`` and return it as a pandas DataFrame.

    See https://pandas.pydata.org/docs/reference/api/pandas.read_csv.html
    for details on ``pd.read_csv``.
    """
    housing_frame = pd.read_csv(os.path.join(housing_path, "housing.csv"))
    return housing_frame



In [12]:
housing = load_housing_data()

#gives the top five rows of the pandas data frame object
#each row represents one district
housing.head() 



Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
0,-122.23,37.88,41.0,880.0,129.0,322.0,126.0,8.3252,452600.0,NEAR BAY
1,-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,358500.0,NEAR BAY
2,-122.24,37.85,52.0,1467.0,190.0,496.0,177.0,7.2574,352100.0,NEAR BAY
3,-122.25,37.85,52.0,1274.0,235.0,558.0,219.0,5.6431,341300.0,NEAR BAY
4,-122.25,37.85,52.0,1627.0,280.0,565.0,259.0,3.8462,342200.0,NEAR BAY


In [13]:
#each call of the function gives a new random split of the data into train and test
#(no seed is passed or set in this cell, so the rows in each set can change from run to run)
train_set, test_set = split_train_test(housing, 0.2)

In [14]:
len(train_set)


16512

In [15]:
len(test_set)


4128

In [16]:
from zlib import crc32 

#zlib is a library for data compression
#zlib.crc32() method computes the unsigned 32-bit checksum for given data

def test_set_check(identifier, test_ratio):
    return crc32(np.int64(identifier)) & 0xffffffff < test_ratio * 2**32

def split_train_test_by_id(data, test_ratio, id_column):
    """Split ``data`` into ``(train_set, test_set)`` based on an identifier column.

    Every value in ``data[id_column]`` is passed to ``test_set_check``; rows for
    which it returns True form the test set, all other rows form the train set.
    """
    def _belongs_to_test(row_id):
        return test_set_check(row_id, test_ratio)

    test_mask = data[id_column].apply(_belongs_to_test)
    train_part = data.loc[~test_mask]
    test_part = data.loc[test_mask]
    return train_part, test_part


In [17]:
#the housing dataset does not have an identifier column; simple solution: use the row index as the id
#caveat: new data must then always be appended to the end of the dataset and no row may ever be deleted

housing_with_id = housing.reset_index() #pandas function that adds the row index as an "index" column
train_set, test_set = split_train_test_by_id(housing_with_id, 0.2, "index")



In [18]:
#alternative: using the most stable features to build a unique identifier
#e.g. longitude/latitude will remain stable; we can combine them into an id like so:

housing_with_id["id"] = housing["longitude"] * 1000 + housing["latitude"]
#split on the newly built "id" column; passing "index" here would leave the new id unused
train_set, test_set = split_train_test_by_id(housing_with_id, 0.2, "id")



In [25]:
pip install scikit-learn


Collecting scikit-learn
  Downloading scikit_learn-1.5.2-cp310-cp310-win_amd64.whl (11.0 MB)
     ---------------------------------------- 0.0/11.0 MB ? eta -:--:--
     ---------------------------------------- 0.0/11.0 MB ? eta -:--:--
     --------------------------------------- 0.0/11.0 MB 325.1 kB/s eta 0:00:34
     --------------------------------------- 0.1/11.0 MB 491.5 kB/s eta 0:00:23
     --------------------------------------- 0.1/11.0 MB 653.6 kB/s eta 0:00:17
      -------------------------------------- 0.2/11.0 MB 610.0 kB/s eta 0:00:18
      -------------------------------------- 0.2/11.0 MB 808.4 kB/s eta 0:00:14
      -------------------------------------- 0.2/11.0 MB 719.7 kB/s eta 0:00:15
      -------------------------------------- 0.2/11.0 MB 719.7 kB/s eta 0:00:15
      -------------------------------------- 0.2/11.0 MB 719.7 kB/s eta 0:00:15
     - ------------------------------------- 0.3/11.0 MB 655.5 kB/s eta 0:00:17
     - ------------------------------------


[notice] A new release of pip is available: 23.0.1 -> 24.2
[notice] To update, run: python.exe -m pip install --upgrade pip


In [26]:
#scikit-learn provides some functions for data set splitting, simplest being train_test_split()
from sklearn.model_selection import train_test_split

#works similarly to split_train_test; additionally lets us set a seed (random_state) and pass multiple datasets
#with an identical number of rows, which it then splits on the same indices (useful if we have e.g. the labels separately)

train_set, test_set = train_test_split(housing, test_size = 0.2, random_state = 42)

In [27]:
#adding an "income_cat" attribute to housing
#pandas.cut() takes the values from the "median_income" column, sorts each value into one of the
#bins defined below and returns the label of the bin the value falls into
#https://pandas.pydata.org/docs/reference/api/pandas.cut.html
housing["income_cat"] = pd.cut(
                               #1D input array to be binned
                               housing["median_income"], 
                               #bins = sequence of scalars : defines bin edges allowing for non-uniform width
                               #np.inf as the last edge leaves the top bin open-ended
                               bins = [0, 1.5, 3, 4.5, 6, np.inf], 
                               #specifies the labels for the returned bins (one label per bin, in order)
                               labels = [1, 2, 3, 4, 5]
                        )


In [28]:
#OWN
#looking at "median_income" again just for clarification
housing["median_income"].max()

np.float64(15.0001)

In [29]:
#OWN
housing["median_income"].head(10)

0    8.3252
1    8.3014
2    7.2574
3    5.6431
4    3.8462
5    4.0368
6    3.6591
7    3.1200
8    2.0804
9    3.6912
Name: median_income, dtype: float64

In [30]:
#OWN
housing["income_cat"].head(10)
#just wanted to check that the income_cat column is there and what it looks like
#the output also shows the categories

0    5
1    5
2    5
3    4
4    3
5    3
6    3
7    3
8    2
9    3
Name: income_cat, dtype: category
Categories (5, int64): [1 < 2 < 3 < 4 < 5]