In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

In [None]:
from sklearn.model_selection import cross_val_score, learning_curve, train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder, StandardScaler, MinMaxScaler, PolynomialFeatures
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor

In [None]:
plt.rcParams["figure.figsize"] = (20, 10)
sns.set_style("whitegrid")

# Let's take a look at the dataset

In [None]:
dataset = pd.read_csv("../input/california-housing-prices/housing.csv")

In [None]:
dataset

In [None]:
dataset.describe()

In [None]:
dataset.isnull().sum()

Let's create a heatmap to see which features are correlated the most with the label. We'll run it again after creating some interaction (also called synthetic) features.

In [None]:
# Correlate numeric columns only: "ocean_proximity" is categorical text at this
# point, and recent pandas versions raise on non-numeric columns in corr()
# instead of silently skipping them.
correlation_matrix = dataset.select_dtypes(include="number").corr()

In [None]:
sns.heatmap(correlation_matrix)

In [None]:
correlation_matrix["median_house_value"].drop(["median_house_value"]).sort_values(ascending=False)

In [None]:
#import matplotlib.image as mpimg

#california_map = mpimg.imread("./california_map.gif")

# Bounding box of the data in longitude/latitude. These values are only used by
# the commented-out map overlay (imshow with extent=...) in this cell.
min_x, max_x = dataset["longitude"].min(), dataset["longitude"].max()
min_y, max_y = dataset["latitude"].min(), dataset["latitude"].max()

# Geographic scatter: marker size scales with the population column and marker
# colour encodes median_house_value.
p = dataset.plot(kind="scatter",
                 x="longitude",
                 y="latitude",
                 s=dataset["population"]*0.05,
                 alpha=0.5,
                 c=dataset["median_house_value"],
                 cmap="jet")

#p.imshow(california_map, extent=[min_x, max_x, min_y, max_y], alpha=0.5)

In [None]:
import math

def hist_all(dataset):
    """Draw a histogram of every column of ``dataset`` on a two-column grid.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Frame whose columns are plotted, one histogram per column.

    Returns
    -------
    None
        The figure is created as a side effect.
    """
    n_grid_cols = 2
    n_plots = len(dataset.columns)
    # Ceiling division gives enough rows for every column; keep at least one
    # row so an empty frame does not make plt.subplots fail.
    n_grid_rows = max(1, -(-n_plots // n_grid_cols))
    # squeeze=False always yields a 2-D array of axes, whatever the grid shape.
    fig, axes = plt.subplots(nrows=n_grid_rows, ncols=n_grid_cols,
                             figsize=(20, 20), squeeze=False)
    axes = axes.flatten()
    for axis, col in zip(axes, dataset.columns):
        sns.histplot(dataset[col], ax=axis)
    # Hide grid cells left over when the number of columns is odd.
    for axis in axes[n_plots:]:
        axis.set_visible(False)

In [None]:
hist_all(dataset.drop(["longitude", "latitude"], axis=1))

In [None]:
def scatter_against_median_value(dataset):
    """Scatter every feature column against ``median_house_value``.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Frame containing a ``median_house_value`` column plus the feature
        columns to plot; one panel is drawn per feature column.

    Returns
    -------
    None
        The figure is created as a side effect.

    Raises
    ------
    KeyError
        If ``dataset`` has no ``median_house_value`` column.
    """
    feature_columns = dataset.columns.drop("median_house_value")
    # One panel per *feature*: sizing the grid by len(dataset.columns) would
    # leave an empty panel where the target column would have been.
    # squeeze=False keeps `axes` an array even when there is a single panel.
    fig, axes = plt.subplots(nrows=max(1, len(feature_columns)), ncols=1,
                             figsize=(20, 60), squeeze=False)
    for axis, col in zip(axes.flatten(), feature_columns):
        sns.scatterplot(data=dataset, x=col, y="median_house_value", ax=axis)

In [None]:
scatter_against_median_value(dataset)

It's worth noting a couple of things:

First off, what we have are mostly discrete variables. _ocean\_proximity_ is an obvious one (since it's a categorical variable), but also the age, number of rooms/bedrooms and households will be integers. Only population and median income are continuous. We'll look at latitude and longitude separately, since they present a different kind of information.

Secondly, we see there are some artifacts in the data. They are visible as horizontal lines in our scatterplots. It seems that some values were rounded to the nearest value. Also, the data seems to be "capped" or rounded down to 500k in median house value.

Let's copy the dataset into __dataset_expl__ and try some magic to make it more useful for us. But first, we'll create a baseline by training and evaluating a LinearRegression model, so that we can evaluate our future modifications against it.

So we are miscalculating the price by (roughly) 1/3rd of the average house value. Not awe inspiring, but it's just the baseline we'll (hopefully) improve on as we dial in our final model shape.

Let's continue by deleting all rows that have __median_house_value__ set to 500k. If that's a lot of rows, we might reconsider!

In [None]:
# Count the rows at or above the 500k cap, i.e. exactly the rows that the
# training filter `median_house_value < 500_000` discards later on. Testing for
# equality would miss any capped value recorded slightly above 500_000.
len(dataset[dataset["median_house_value"] >= 500_000])

Let's now look at outliers from the other (numerical) columns. We'll use the z-score for this.

In [None]:
# One histogram per numerical column with a red marker at mean + 3 standard
# deviations (the usual z-score outlier threshold).
fig, ax = plt.subplots(nrows=4, ncols=2, figsize=(20, 20))
for i, col in enumerate(dataset.drop(["longitude", "latitude", "ocean_proximity"], axis=1).columns):
    zscore_limit = np.mean(dataset[col]) + 3*(np.std(dataset[col]))
    graph = sns.histplot(dataset[col], ax=ax.flatten()[i])
    # axvline spans the full height of the axes, so the marker stays visible
    # whatever the bin counts are (a fixed 0..800 segment does not).
    graph.axvline(zscore_limit, color="r")

In [None]:
def boxplot_all(dataset):
    """Draw a box plot of every column of ``dataset`` on a two-column grid.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Frame whose columns are plotted, one box plot per column.

    Returns
    -------
    None
        The figure is created as a side effect.
    """
    n_grid_cols = 2
    n_plots = len(dataset.columns)
    # Integer ceiling division (no reliance on `math` being imported by another
    # cell); at least one row so an empty frame does not break plt.subplots.
    n_grid_rows = max(1, -(-n_plots // n_grid_cols))
    # squeeze=False always yields a 2-D array of axes, whatever the grid shape.
    fig, axes = plt.subplots(nrows=n_grid_rows, ncols=n_grid_cols,
                             figsize=(20, 20), squeeze=False)
    axes = axes.flatten()
    for axis, col in zip(axes, dataset.columns):
        sns.boxplot(data=dataset, x=col, ax=axis)
    # Hide grid cells left over when the number of columns is odd.
    for axis in axes[n_plots:]:
        axis.set_visible(False)

In [None]:
boxplot_all(dataset.drop(["ocean_proximity"], axis=1))

So those graphs give us two separate views of outliers.

As we can see, the boxplot (which uses IQR) is much more "strict" in deciding what is an outlier, as opposed to calculating the zscore and drawing the line at 3x standard deviations from the mean. We probably should decide between the two (or a variation, like choosing more/less than 3x the standard deviations for the zscore) separately for each variable. We might also decide not to remove any outliers.

The outliers don't seem to be concerning, as they do form a continuous distribution with the rest of the data. We will not be removing any data at this stage.

Let's try creating synthetic features now. Here are some ideas:
- median income per household
- median income per person
- people per household
- rooms per household
- bedrooms per household

We'll create all of those, and check the correlation matrix to see if they seem relevant.

In [None]:
dataset_synth = dataset.copy()

But first, we need to impute the missing values to avoid errors.

In [None]:
# Replace the categorical column with ordinal codes so every column is numeric.
dataset_synth["ocean_proximity"] = OrdinalEncoder().fit_transform(dataset_synth[["ocean_proximity"]])
# Fill missing values using SimpleImputer's default strategy. Assigning through
# [:] writes the imputed array back into the existing frame, keeping its labels.
dataset_synth[:] = SimpleImputer().fit_transform(dataset_synth)

In [None]:
# Each synthetic feature is the ratio of two existing columns:
# new column name -> (numerator column, denominator column).
ratio_features = {
    "income_per_household": ("median_income", "households"),
    "income_per_person": ("median_income", "population"),
    "population_per_household": ("population", "households"),
    "rooms_per_household": ("total_rooms", "households"),
    "bedrooms_per_household": ("total_bedrooms", "households"),
}

# Add the ratios to dataset_synth in the order listed above.
for ratio_name, (numerator, denominator) in ratio_features.items():
    dataset_synth[ratio_name] = dataset_synth[numerator] / dataset_synth[denominator]

In [None]:
correlation_matrix = dataset_synth.corr()

In [None]:
sns.heatmap(correlation_matrix)

In [None]:
correlation_matrix["median_house_value"].drop(["median_house_value"]).sort_values(ascending=False)

In [None]:
from sklearn.base import BaseEstimator, TransformerMixin

In [None]:
class ColumnSelector(BaseEstimator, TransformerMixin):
    """Stateless transformer that keeps only a fixed set of columns.

    Parameters
    ----------
    selected_columns
        Key passed to ``X[...]`` in ``transform``; in this notebook it is a
        list of DataFrame column names.
    """

    def __init__(self, selected_columns):
        # Stored under the constructor argument's own name.
        self.selected_columns = selected_columns
        
    def fit(self, X, y=None):
        """Nothing is learned from the data; return ``self`` unchanged."""
        return self
    
    def transform(self, X, y=None):
        """Return ``X`` indexed by ``selected_columns`` (``y`` is ignored)."""
        return X[self.selected_columns]

In [None]:
columns = list(dataset.drop(["median_house_value"], axis=1).columns)

# AddSyntheticFeatures runs after ColumnSelector(numerical_columns) and the
# imputer, so the array it receives holds the numerical columns only. Look the
# positions up in that same layout: indexing into `columns` (which still
# contains "ocean_proximity") is only correct while that column happens to be
# the last one in the frame.
_numeric_feature_columns = [col for col in columns if col != "ocean_proximity"]

income_idx = _numeric_feature_columns.index("median_income")
households_idx = _numeric_feature_columns.index("households")
population_idx = _numeric_feature_columns.index("population")
rooms_idx = _numeric_feature_columns.index("total_rooms")
bedrooms_idx = _numeric_feature_columns.index("total_bedrooms")


class AddSyntheticFeatures(BaseEstimator, TransformerMixin):
    """Append five ratio features to a 2-D array of numerical columns.

    The column positions are taken from the module-level ``*_idx`` constants
    defined in this cell, so the input columns must be in the same order as
    the numerical feature columns of ``dataset``.
    """

    def __init__(self):
        pass

    def fit(self, X, y=None):
        """Nothing is learned from the data; return ``self`` unchanged."""
        return self

    def transform(self, X, y=None):
        """Return ``X`` with the ratio columns appended on the right.

        Parameters
        ----------
        X : array indexable as ``X[:, i]``
            Numerical feature matrix.
        y : ignored

        Returns
        -------
        numpy.ndarray
            ``X`` followed by, in order: income per household, income per
            person, population per household, rooms per household and
            bedrooms per household.
        """
        households = X[:, households_idx]
        income = X[:, income_idx]
        population = X[:, population_idx]

        income_per_household = income / households
        income_per_person = income / population
        population_per_household = population / households
        rooms_per_household = X[:, rooms_idx] / households
        bedrooms_per_household = X[:, bedrooms_idx] / households

        return np.c_[X,
                     income_per_household, income_per_person,
                     population_per_household, rooms_per_household,
                     bedrooms_per_household]

In [None]:
# Split the feature names by kind: one categorical column, and every other
# column except the target counts as numerical.
categorical_columns = ["ocean_proximity"]
numerical_columns = dataset.columns.drop(["ocean_proximity", "median_house_value"]).tolist()

In [None]:
numerical_columns

In [None]:
# Numerical branch: pick the numerical columns, fill missing values with the
# column mean, append the ratio features, then standardise everything.
pipeline_num = Pipeline(
    steps=[
        ("select_columns", ColumnSelector(numerical_columns)),
        ("impute", SimpleImputer(strategy="mean")),
        ("add_synth_features", AddSyntheticFeatures()),
        ("scale", StandardScaler()),
    ])

# Categorical branch: pick the categorical columns and map each category to an
# ordinal code; categories not seen during fit are encoded as 10.
pipeline_cat = Pipeline(
    steps=[
        ("select_columns", ColumnSelector(categorical_columns)),
        ("ordinal_encode", OrdinalEncoder(handle_unknown="use_encoded_value", unknown_value=10))
    ])

In [None]:
from sklearn.pipeline import FeatureUnion

In [None]:
# Apply both branches to the same input and combine their outputs into a single
# feature matrix: numerical branch first, categorical branch second.
preprocess = FeatureUnion(n_jobs=-1,
                          transformer_list=[
                              ("num_columns", pipeline_num),
                              ("cat_columns", pipeline_cat)
                          ])

In [None]:
def make_predictor(model):
    """Build a pipeline that preprocesses the features and then fits ``model``.

    Parameters
    ----------
    model : estimator
        Final estimator placed in the ``"model"`` step.

    Returns
    -------
    sklearn.pipeline.Pipeline
        Pipeline with the steps ``"preprocess"`` and ``"model"``.
    """
    from sklearn.base import clone

    # Give every pipeline its own unfitted copy of the preprocessing steps.
    # Reusing the single module-level `preprocess` object would make all
    # pipelines share (and overwrite) one set of fitted transformers.
    _predictor = Pipeline(
        steps=[
            ("preprocess", clone(preprocess)),
            ("model", model)
        ])
    return _predictor

In [None]:
rf_pipeline = make_predictor(RandomForestRegressor(random_state=0))

In [None]:
gb_pipeline = make_predictor(GradientBoostingRegressor(random_state=0))

In [None]:
def score_model(model, X, y):
    """Return the mean per-fold RMSE of ``model`` under cross-validation.

    Parameters
    ----------
    model : estimator
        Estimator (or pipeline) handed to ``cross_val_score``.
    X, y
        Features and target to cross-validate on.

    Returns
    -------
    float
        Average over folds of the square root of each fold's MSE.
    """
    # The scorer reports negated MSE for each fold.
    neg_mse_per_fold = cross_val_score(model, X, y, scoring="neg_mean_squared_error")
    rmse_per_fold = np.sqrt(np.abs(neg_mse_per_fold))
    return np.mean(rmse_per_fold)

In [None]:
# Create a variable to stratify the dataset split by: median_income bucketed
# into 5 bins. NOTE: this adds a helper column to `dataset` itself; it is
# dropped again from the train/test frames right after the split.
dataset["median_income_class"] = pd.cut(dataset["median_income"], bins=5)

In [None]:
# Stratify on the income bucket so both parts share the same income mix.
# A fixed random_state makes the split (and every score derived from it)
# reproducible across kernel restarts; 0 matches the seed used for the models.
dataset_train, dataset_test = train_test_split(
    dataset,
    stratify=dataset[["median_income_class"]],
    random_state=0,
)
# The bucket column only exists to drive the stratification; remove it again.
dataset_train = dataset_train.drop(["median_income_class"], axis=1)
dataset_test = dataset_test.drop(["median_income_class"], axis=1)

We will be removing rows which have a price of 500k dollars, because (as we saw in the viz stage) there seems to be a lot of data that is either wrong, or caused by some sort of weird rounding/compression. We do this only for the training stage, though. We will not be removing those elements when testing our final model - it should still give a good, "generalised" outcome.

In [None]:
# Keep only the training rows priced below the 500k cap, then separate the
# features from the label.
train_below_cap = dataset_train.loc[dataset_train["median_house_value"] < 500_000]
X_train = train_below_cap.drop(columns=["median_house_value"])
y_train = train_below_cap["median_house_value"].copy()
# Mean training price, used to express errors as a percentage.
avg_price_train = y_train.mean()

In [None]:
scores = dict()

In [None]:
scores['rf'] = score_model(rf_pipeline, X_train, y_train)

In [None]:
scores['gb'] = score_model(gb_pipeline, X_train, y_train)

In [None]:
# Per-model cross-validation error, in dollars and as a percentage of the mean
# training price.
results = {
    "algo": scores.keys(),
    "score_usd": list(scores.values()),
    "score_vs_avg": [100 * (score / avg_price_train) for score in scores.values()],
}

In [None]:
pd.DataFrame(results).set_index("algo")

In [None]:
# There is a slight edge to a Random Forest, but we'll investigate some more.

In [None]:
from sklearn.model_selection import learning_curve

def draw_learning_curves(model, X, y, label=None):
    """Plot training and cross-validation RMSE against training-set size.

    Parameters
    ----------
    model : estimator
        Estimator (or pipeline) handed to ``learning_curve``.
    X, y
        Features and target used to compute the curves.
    label : str, optional
        Figure title; no title is set when ``None``.

    Returns
    -------
    None
        The figure is created as a side effect.
    """
    size, train_score, test_score = learning_curve(model, X, y, scoring="neg_mean_squared_error")

    _, ax = plt.subplots(nrows=1, ncols=1)

    # The scorer reports negated MSE per (training size, fold). Convert to RMSE
    # and summarise across folds as a mean line with a +/- one-std band.
    curves = (
        (train_score, "g", "Training"),
        (test_score, "r", "Cross-validation"),
    )
    for neg_mse, colour, curve_name in curves:
        rmse = np.sqrt(np.abs(neg_mse))
        rmse_mean = np.mean(rmse, axis=1)
        rmse_std = np.std(rmse, axis=1)
        ax.plot(size, rmse_mean, c=colour, label=curve_name)
        ax.fill_between(size, rmse_mean - rmse_std, rmse_mean + rmse_std, alpha=0.1, color=colour)

    if label is not None:
        ax.set_title(label)
    ax.set_xlabel("Training examples")
    # The plotted quantity is an error (lower is better), so name it explicitly
    # and label the two curves so they can be told apart.
    ax.set_ylabel("RMSE")
    ax.legend()

In [None]:
draw_learning_curves(model=rf_pipeline, X=X_train, y=y_train, label="Random Forest")

In [None]:
draw_learning_curves(model=gb_pipeline, X=X_train, y=y_train, label="Gradient Boosting Regressor")

In [None]:
# Hold-out label and features; unlike the training set, no rows are filtered
# out of the test frame here.
y_test = dataset_test["median_house_value"].copy()
X_test = dataset_test.drop(columns=["median_house_value"])
# Mean test price, used to express errors as a percentage.
avg_price_test = y_test.mean()

In [None]:
#rf_pipeline.set_params(model__max_depth=3)

In [None]:
gb_pipeline.fit(X_train, y_train)

In [None]:
rf_pipeline.fit(X_train, y_train)

In [None]:
# Score the pipeline that was fitted on the training data. cross_val_score
# would instead clone the pipeline and refit it on folds of the *test* set, so
# the reported number would not describe the trained model at all.
rf_test_residuals = y_test - rf_pipeline.predict(X_test)
score_rf = np.sqrt(np.mean(rf_test_residuals ** 2))

In [None]:
# Score the pipeline that was fitted on the training data. cross_val_score
# would instead clone the pipeline and refit it on folds of the *test* set, so
# the reported number would not describe the trained model at all.
gb_test_residuals = y_test - gb_pipeline.predict(X_test)
score_gb = np.sqrt(np.mean(gb_test_residuals ** 2))

In [None]:
# Report each model's test error in dollars and as a percentage of the mean
# test-set price.
print("Results on unseen data: ")
print("Random forest regressor ${:.2f} which is {:.2f}% of average price.".format(score_rf, (score_rf/avg_price_test)*100))
print("Gradient boosting regressor ${:.2f} which is {:.2f}% of average price.".format(score_gb, (score_gb/avg_price_test)*100))