In [None]:
# import the required modules
import os
import ssl
import tarfile
import numpy as np
import pandas as pd
import urllib.request
import matplotlib.pyplot as plt
from sklearn.impute import SimpleImputer
from pandas.plotting import scatter_matrix
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedShuffleSplit

In [None]:
# set the root url for downloading the data
download_root = "https://raw.githubusercontent.com/ageron/handson-ml2/master"

# set the path for saving the housing data
housing_path = os.path.join("datasets")

# set the url for downloading the housing data
housing_url = download_root + "/datasets/housing/housing.tgz"

# function to fetch the housing data
def fetch_housing_data(housing_url = housing_url, housing_path = housing_path):
    """Download the housing archive and unpack it into housing_path.

    Parameters
    ----------
    housing_url : str
        url of the gzipped tar archive to download
    housing_path : str
        directory that receives both the downloaded "housing.tgz" file and
        the extracted members; it is created if it does not exist

    Returns
    -------
    None

    Raises
    ------
    urllib.error.URLError
        if the archive cannot be retrieved
    tarfile.TarError
        if the downloaded file cannot be read as a tar archive, or a member
        is rejected by the extraction filter
    """

    # create the housing directory if it does not exist
    os.makedirs(housing_path, exist_ok = True)

    # set the path for saving the housing data
    tgz_path = os.path.join(housing_path, "housing.tgz")

    # get the housing data from the url and save it
    urllib.request.urlretrieve(housing_url, tgz_path)

    # open the archive in a with block so it is closed even when extraction fails
    with tarfile.open(tgz_path) as housing_tgz:

        # the archive comes from the network, so use the "data" extraction filter
        # when this python version provides it: it rejects members that would be
        # written outside housing_path (absolute paths, "..", unsafe links)
        if hasattr(tarfile, "data_filter"):
            housing_tgz.extractall(path = housing_path, filter = "data")
        else:
            housing_tgz.extractall(path = housing_path)

# disable ssl certificate verification
# NOTE(review): this swaps the process-wide default https context through a private ssl hook,
# so later https requests in this kernel that rely on the default context also skip certificate
# validation, not only the housing download. presumably a workaround for local certificate
# errors - confirm it is still needed and prefer fixing the certificate store instead
ssl._create_default_https_context = ssl._create_unverified_context

# call the function to fetch the housing data
# the function has no "already downloaded" check, so the archive is fetched and unpacked on every run of this cell
fetch_housing_data()

In [None]:
# function to load the housing data into a pandas dataframe
def load_housing_data(housing_path = housing_path):
    """Read "housing.csv" from housing_path and return it as a pandas dataframe.

    Parameters
    ----------
    housing_path : str
        directory that contains the "housing.csv" file

    Returns
    -------
    pandas.DataFrame
        the parsed contents of the csv file
    """

    # build the csv location and parse it in one step
    return pd.read_csv(os.path.join(housing_path, "housing.csv"))

# load the housing data into the housing dataframe
housing = load_housing_data()

In [None]:
# display a summary of the housing dataframe (columns, non-null counts and dtypes)
# the non-null counts show which columns have missing values that the imputation step further down has to fill
housing.info()

In [None]:
# display the statistical summary (count, mean, spread and quartiles) of the numerical columns in the housing dataframe
housing.describe()

In [None]:
# display the first few rows of the housing dataframe to inspect the raw values
housing.head()

In [None]:
# generate histograms of each numerical column in the housing dataframe
# fifty bins per histogram, drawn on a 20 x 15 inch figure
housing.hist(bins = 50, figsize = (20, 15))

# display the plots
plt.show()

In [None]:
# split the dataset so that twenty percent is held out for testing
# random_state fixes the random seed so the same split is reproduced on every run
# NOTE: this is a purely random split; the cells below build a stratified split and continue with that one
train_set, test_set = train_test_split(housing, test_size = 0.2, random_state = 42)

In [None]:
# display the first few rows of the randomly sampled test set
test_set.head()

In [None]:
# bucket the continuous median income into five ordered categories so it can be used for stratified sampling
# the bin edges define the income range of each category, the last one being open-ended
income_bins = [0., 1.5, 3.0, 4.5, 6., np.inf]

# one label per bin, in the same order as the ranges above
income_labels = [1, 2, 3, 4, 5]

# store the category of every row as a new column of the housing dataframe
housing["income_category"] = pd.cut(housing["median_income"], bins = income_bins, labels = income_labels)

In [None]:
# display the count of values in each income category
housing["income_category"].value_counts()

In [None]:
# generate a histogram of the income category column to see how the rows are spread across the five categories
housing["income_category"].hist()

In [None]:
# initialise a stratified shuffle split object with one split and a test size of twenty percent with a random state of forty two for reproducibility
split = StratifiedShuffleSplit(n_splits = 1, test_size = 0.2, random_state = 42)

# n_splits is one, so take the single pair of train and test indices stratified on the income category
train_index, test_index = next(split.split(housing, housing["income_category"]))

# split() yields positional row numbers, so select the rows with iloc and not the label-based loc
# (loc only gives the same rows while the dataframe keeps its default 0..n-1 index)
# copy() makes both sets independent dataframes that can be modified without touching housing
strat_train_set = housing.iloc[train_index].copy()
strat_test_set = housing.iloc[test_index].copy()

# calculate the proportion of each income category in the stratified test set to understand the distribution of income categories in the test set
strat_test_set["income_category"].value_counts() / len(strat_test_set)

In [None]:
# calculate the proportion of each income category in the entire housing dataset
# comparing these with the proportions of the stratified test set computed in the previous cell shows how well the split preserved the distribution
housing["income_category"].value_counts() / len(housing)

In [None]:
# remove the income category column from the stratified train and test sets before training a machine learning model
# the column was only needed to stratify the split
# drop() returns new dataframes instead of mutating in place, and errors = "ignore" keeps the cell
# re-runnable: a second run finds the column already gone instead of raising a KeyError
strat_train_set = strat_train_set.drop(columns = "income_category", errors = "ignore")
strat_test_set = strat_test_set.drop(columns = "income_category", errors = "ignore")

In [None]:
# create a copy of the stratified train set to explore the data
# working on a copy keeps strat_train_set free of the columns added during exploration
# NOTE: this rebinds the name housing, which until now referred to the full dataset
housing = strat_train_set.copy()

In [None]:
# generate a scatter plot of the longitude and latitude columns
# a low alpha makes overlapping points darker, which reveals where the data points are dense
housing.plot(kind = "scatter", x = "longitude", y = "latitude", alpha = 0.1)

In [None]:
# visualize the geographical distribution of the housing data
# marker size follows the population, scaled down by one hundred
marker_sizes = housing["population"] / 100

# marker colour follows the median house value on the jet colour map
value_cmap = plt.get_cmap("jet")

housing.plot(
    kind = "scatter",
    x = "longitude",
    y = "latitude",
    alpha = 0.4,
    s = marker_sizes,
    label = "population",
    figsize = (10, 7),
    c = "median_house_value",
    cmap = value_cmap,
    colorbar = True,
)

# add the legend entry for the population label
plt.legend()

In [None]:
# derive ratio columns from the raw totals
# each entry is (new column, numerator column, denominator column)
ratio_definitions = (
    ("rooms_per_household", "total_rooms", "households"),
    ("bedrooms_per_room", "total_bedrooms", "total_rooms"),
    ("population_per_household", "population", "households"),
)

# add the ratios to the housing dataframe in the order listed above
for ratio_name, numerator, denominator in ratio_definitions:
    housing[ratio_name] = housing[numerator] / housing[denominator]

In [None]:
# calculate the correlation matrix to explore relationships between variables
# housing still contains the text ocean proximity column at this point; pandas 2.0 and later
# raise an error when corr() meets a non-numeric column, so restrict to the numeric columns first
# (select_dtypes is used rather than corr(numeric_only = True) because it also works on older pandas)
corr_matrix = housing.select_dtypes(include = "number").corr()

In [None]:
# sort the correlations of every column with the median house value in descending order
# coefficients close to one indicate a strong positive correlation, e.g. a high value for median income
# would mean the median house value tends to rise when the median income rises
corr_matrix["median_house_value"].sort_values(ascending = False)

In [None]:
# generate a scatter plot to explore the relationship between the median house value and the median income
# the low alpha keeps dense regions readable where many points overlap
housing.plot(kind = "scatter", x = "median_income", y = "median_house_value", alpha = 0.1)

In [None]:
# prepare the data for machine learning algorithms
# start again from the stratified train set and separate the target from the features,
# so a model can be trained on the features and evaluated against the labels

# the labels are an independent copy of the target column
housing_labels = strat_train_set["median_house_value"].copy()

# the features are every column except the target; drop returns a new dataframe, so strat_train_set is left unchanged
housing = strat_train_set.drop(columns = ["median_house_value"])

In [None]:
# impute missing values in the dataset with the median value of the column
# create an instance of the simple imputer class with the median strategy set
# nothing is computed yet: the medians are learned when fit is called in a later cell
imputer = SimpleImputer(strategy = "median")

In [None]:
# create a new dataframe that drops the text ocean proximity column
# a median can only be computed for numerical columns, so the imputer is given the numerical data only
housing_num = housing.drop("ocean_proximity", axis = 1)

In [None]:
# fit the imputer to the housing numerical data
# this computes the median of every column and stores it on the imputer
imputer.fit(housing_num)

In [None]:
# retrieve the learned statistics from the imputer (one median per column of the housing numerical data)
imputer.statistics_

In [None]:
# retrieve the median values of each column in the housing numerical data
# sanity check: these should match the statistics learned by the imputer in the previous cell
housing_num.median().values

In [None]:
# transform the housing numerical dataframe using the imputer
# missing values are replaced with the learned medians; the result is kept in x and wrapped in a dataframe in the next cell
x = imputer.transform(housing_num)

In [None]:
# create a new dataframe from the transformed data
# the column names and the row index are taken from housing_num so the imputed rows stay aligned with the original ones
housing_tr = pd.DataFrame(x, columns = housing_num.columns, index = housing_num.index)