In [None]:
import os

import kagglehub
import pandas as pd

from src.config import DATASET

# Fetch the latest version of the dataset; kagglehub stores it in its cache
# and hands back the local directory that contains the files.
path = kagglehub.dataset_download(DATASET)

print(f"Path to dataset files: {path}")


def load_housing_data(dataset_dir=None, filename="housing.csv"):
    """Read the housing CSV into a pandas DataFrame.

    Args:
        dataset_dir: Directory that contains the CSV file. When omitted, the
            notebook-level ``path`` assigned by the download step is used,
            which matches the previous zero-argument behaviour.
        filename: Name of the CSV file inside ``dataset_dir``.

    Returns:
        The DataFrame produced by ``pd.read_csv`` for that file.
    """
    if dataset_dir is None:
        # Fall back to the notebook global so existing calls keep working.
        dataset_dir = path
    return pd.read_csv(os.path.join(dataset_dir, filename))


# Load the downloaded CSV into `housing`, the DataFrame the following cells work on.
housing = load_housing_data()

In [None]:
def print_corr(data, target="median_house_value"):
    """Print the correlation matrix of the numeric columns and the target ranking.

    Previously the sorted correlations with the target were computed and then
    thrown away; they are now printed after the full matrix.

    Args:
        data: DataFrame to analyse. Non-numeric columns are ignored.
        target: Column whose correlations with every other numeric column are
            printed in descending order.

    Raises:
        KeyError: If ``target`` is not among the numeric columns of ``data``.
    """
    # "number" selects every numeric dtype, matching the np.number selection
    # used by the preprocessing cell.
    numeric_data = data.select_dtypes(include="number")

    corr_matrix = numeric_data.corr()
    # Looked up before printing so a missing target fails before any output.
    target_corr = corr_matrix[target].sort_values(ascending=False)

    print(corr_matrix)
    print(target_corr)

In [None]:
# Derived ratio features: each new column is one existing column divided by another.
ratio_features = {
    "rooms_per_house": ("total_rooms", "households"),
    "bedrooms_ratio": ("total_bedrooms", "total_rooms"),
    "people_per_house": ("population", "households"),
}
for feature_name, (numerator, denominator) in ratio_features.items():
    housing[feature_name] = housing[numerator] / housing[denominator]

# Show how the new features correlate with the rest of the numeric columns.
print_corr(housing)

In [None]:
import matplotlib.pyplot as plt
import numpy as np
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler

from utils.figures import save_fig

# Numeric preprocessing: fill missing values with the column median, then
# rescale every column to the [-1, 1] range.
# (The former bare `housing.reset_index()` call was removed: its result was
# discarded, so it never changed `housing`.)
num_pipeline = Pipeline(
    [
        ("impute", SimpleImputer(strategy="median")),  # fill in missing values
        ("scaler", MinMaxScaler(feature_range=(-1, 1))),  # scale numeric values
    ]
)

# NOTE(review): this selects every numeric column, so the target
# `median_house_value` is scaled together with the features, and the pipeline
# is fitted on the full dataset before any train/test split — confirm both
# are intended.
housing_num = housing.select_dtypes(include=[np.number])  # numeric columns only
housing_num_tarnsformed_arrays = num_pipeline.fit_transform(housing_num)

# Wrap the transformed array in a DataFrame that keeps the original column
# names and row index.
housing_num_transformed_df = pd.DataFrame(
    housing_num_tarnsformed_arrays, columns=housing_num.columns, index=housing_num.index
)


# Matplotlib text-size defaults, grouped by rc section.
rc_overrides = {
    "font": {"size": 14},
    "axes": {"labelsize": 14, "titlesize": 14},
    "legend": {"fontsize": 14},
    "xtick": {"labelsize": 10},
    "ytick": {"labelsize": 10},
}
for rc_group, rc_kwargs in rc_overrides.items():
    plt.rc(rc_group, **rc_kwargs)

# One histogram per scaled numeric column; the figure is saved before it is shown.
housing_num_transformed_df.hist(figsize=(12, 8), bins=50)
save_fig("attribute_histogram_plots_after_scaling")
plt.show()

In [None]:
# Keep the categorical attribute as a one-column DataFrame rather than a Series.
housing_cat = housing.loc[:, ["ocean_proximity"]]
housing_cat.head(n=8)

In [None]:
from sklearn.preprocessing import OneHotEncoder

# Fit the one-hot encoder on the categorical column, then encode that column.
cat_encoder = OneHotEncoder().fit(housing_cat)
housing_cat_1hot = cat_encoder.transform(housing_cat)
# Densify the encoder output into a NumPy array.
housing_cat_1hot_dense = housing_cat_1hot.toarray()
# Column names generated by the encoder for each category.
categories = cat_encoder.get_feature_names_out(input_features=housing_cat.columns)
# Rebuild a DataFrame that shares the row index of the categorical frame.
housing_cat_1hot_df = pd.DataFrame(
    data=housing_cat_1hot_dense,
    index=housing_cat.index,
    columns=categories,
)
# Place the scaled numeric columns and the one-hot columns side by side.
housing_transformed = pd.concat(
    objs=[housing_num_transformed_df, housing_cat_1hot_df],
    axis="columns",
)
# Number of missing median_income values left after preprocessing.
print(housing_transformed["median_income"].isna().sum())

#### Train Model


In [None]:
# set seed to ensure same output for multiple runs
import numpy as np
from sklearn.model_selection import train_test_split

# Seed for the randomized split below. It was previously defined but never
# used while the split hard-coded 42; it is kept at 42 so the existing
# train/test split (and everything trained on it) does not change.
random_seed = 42

# Bucket the scaled median income (range [-1, 1]) into five bands so the split
# can be stratified on them. The outer edges sit just beyond -1 and 1 so the
# extreme values fall inside the first and last bins.
housing_transformed["income_cat"] = pd.cut(
    housing_transformed["median_income"],
    bins=[-1.1, -0.6, -0.2, 0.2, 0.6, 1.1],
    labels=[1, 2, 3, 4, 5],
)

housing_transformed["income_cat"].value_counts().sort_index().plot.bar(rot=0, grid=True)
plt.xlabel("Income category")
plt.ylabel("Number of districts")
plt.show()

# Stratified split: both sets keep the income-category proportions.
strat_train_set, strat_test_set = train_test_split(
    housing_transformed,
    test_size=0.2,
    stratify=housing_transformed["income_cat"],
    random_state=random_seed,
)

# income_cat was only needed for stratification; drop it from both sets
# without mutating the frames in place.
strat_train_set = strat_train_set.drop(columns="income_cat")
strat_test_set = strat_test_set.drop(columns="income_cat")

In [None]:
from pickle import dump

from sklearn.compose import TransformedTargetRegressor
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler

from src.config import MODEL_PATH

# NOTE(review): strat_train_set comes from the preprocessed frame, so both
# median_income and median_house_value are already scaled to [-1, 1] here; the
# fitted model therefore predicts in scaled units — confirm this is intended.
housing_labels_train = strat_train_set["median_house_value"]

# Linear regression on median income alone; the target is standardized for
# fitting and mapped back automatically at prediction time.
model = TransformedTargetRegressor(LinearRegression(), transformer=StandardScaler())
model.fit(strat_train_set[["median_income"]], housing_labels_train)

model_file_path = os.path.join(MODEL_PATH, "price_estimator_model.pkl")

# Create the output directory first so a fresh checkout does not fail on open().
model_dir = os.path.dirname(model_file_path)
if model_dir:
    os.makedirs(model_dir, exist_ok=True)

with open(model_file_path, "wb") as file:
    # Pickle protocol 5 needs Python 3.8+ to load the file.
    dump(model, file, protocol=5)