In [2]:
import pandas as pd

from sklearn.datasets import fetch_california_housing
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import train_test_split

In [3]:
# Fetch the California housing dataset and put features and target in one frame.
data = fetch_california_housing()
df = pd.DataFrame(data.data, columns=data.feature_names).assign(target=data.target)

# Preview the first five rows.
print(df.head())

# Separate the target series from the feature matrix used for modelling.
y = df['target']
X = df.drop(columns='target')

   MedInc  HouseAge  AveRooms  AveBedrms  Population  AveOccup  Latitude  \
0  8.3252      41.0  6.984127   1.023810       322.0  2.555556     37.88   
1  8.3014      21.0  6.238137   0.971880      2401.0  2.109842     37.86   
2  7.2574      52.0  8.288136   1.073446       496.0  2.802260     37.85   
3  5.6431      52.0  5.817352   1.073059       558.0  2.547945     37.85   
4  3.8462      52.0  6.281853   1.081081       565.0  2.181467     37.85   

   Longitude  target  
0    -122.23   4.526  
1    -122.22   3.585  
2    -122.24   3.521  
3    -122.25   3.413  
4    -122.25   3.422  


In [4]:
# Hold out a validation split; random_state pins the shuffle so reruns give the same split.
train_X, val_X, train_y, val_y = train_test_split(X, y, random_state=1)

# Baseline: a random forest with default hyperparameters (seeded for reproducibility).
model = RandomForestRegressor(random_state=1)
model.fit(train_X, train_y)

predictions = model.predict(val_X)

# scikit-learn metrics take (y_true, y_pred). MAE is symmetric, so the value is the
# same either way, but this order matches the documented signature and the other
# mean_absolute_error calls in this notebook.
mae = mean_absolute_error(val_y, predictions)

print(mae)

0.3317725336821707


## First Optimization: Max number of leaves

In [5]:
def get_mae(max_number_of_leaves, train_X, val_X, train_y, val_y, random_state=1):
    """Fit a random forest with a capped leaf count and return its validation MAE.

    Parameters
    ----------
    max_number_of_leaves : int
        Forwarded to ``RandomForestRegressor`` as ``max_leaf_nodes``.
    train_X, train_y
        Features and target the forest is fitted on.
    val_X, val_y
        Held-out features and target the forest is scored on.
    random_state : int, default=1
        Seed forwarded to the forest. The default reproduces the value that
        used to be hard-coded, so existing calls give the same result.

    Returns
    -------
    float
        Mean absolute error of the predictions for ``val_X`` against ``val_y``.
    """
    forest = RandomForestRegressor(
        max_leaf_nodes=max_number_of_leaves,
        random_state=random_state,
    )
    forest.fit(train_X, train_y)
    return mean_absolute_error(val_y, forest.predict(val_X))

In [6]:
# Sweep the leaf cap over four orders of magnitude; map each cap to its validation MAE.
scores = {
    leaf_cap: get_mae(leaf_cap, train_X, val_X, train_y, val_y)
    for leaf_cap in (5, 50, 500, 5000)
}
print(scores)

{5: 0.6111935999227869, 50: 0.4367870719399872, 500: 0.3538815036221463, 5000: 0.3308410005839568}


In [7]:
# Probe a leaf cap above the largest value in the sweep (5000).
print(
    get_mae(
        max_number_of_leaves=10000,
        train_X=train_X,
        val_X=val_X,
        train_y=train_y,
        val_y=val_y,
    )
)

0.3307828782364343


In [8]:
# Try a leaf cap between the two previous probes (5000 and 10000).
print(
    get_mae(
        max_number_of_leaves=6000,
        train_X=train_X,
        val_X=val_X,
        train_y=train_y,
        val_y=val_y,
    )
)

0.33075140254771396


## Second Optimization: Data Cleaning
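The dataset has no missing values (see the null count below), so the median imputation in the next cell leaves the data unchanged and the MAE is identical to the `max_leaf_nodes=6000` run above.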

In [9]:
# Impute missing values with per-column medians computed on the training split only,
# then apply those same medians to the validation split. The model is then scored on
# data that went through the same preprocessing it was trained on, and no validation
# statistics are used when fitting.
train_medians = train_X.median()
train_X_clean = train_X.fillna(train_medians)
val_X_clean = val_X.fillna(train_medians)

model = RandomForestRegressor(max_leaf_nodes=6000, random_state=1)
model.fit(train_X_clean, train_y)
predictions = model.predict(val_X_clean)

print(mean_absolute_error(val_y, predictions))

0.33075140254771396


In [10]:
# Total number of missing cells in the training features:
# per-column counts first, then their grand total.
print(
    train_X
    .isna()   # alias of isnull()
    .sum()
    .sum()
)

0


## Third Optimization: Feature Engineering

In [11]:
# Derive a bedrooms-per-room ratio on each split. assign() returns a new frame,
# so the original train_X / val_X are left untouched.
train_X_fe = train_X.assign(Bedroom_Ratio=train_X['AveBedrms'] / train_X['AveRooms'])
val_X_fe = val_X.assign(Bedroom_Ratio=val_X['AveBedrms'] / val_X['AveRooms'])

# Same forest configuration as the 6000-leaf run, now fitted on the extended features.
model = RandomForestRegressor(max_leaf_nodes=6000, random_state=1).fit(train_X_fe, train_y)
predictions = model.predict(val_X_fe)

mae_fe = mean_absolute_error(val_y, predictions)
print(f"MAE with Feature Engineering: {mae_fe}")

MAE with Feature Engineering: 0.3330304050278228


In [12]:
# Bare last expression: the notebook renders the whole frame (features plus 'target');
# the captured output elides the middle rows.
df

Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude,target
0,8.3252,41.0,6.984127,1.023810,322.0,2.555556,37.88,-122.23,4.526
1,8.3014,21.0,6.238137,0.971880,2401.0,2.109842,37.86,-122.22,3.585
2,7.2574,52.0,8.288136,1.073446,496.0,2.802260,37.85,-122.24,3.521
3,5.6431,52.0,5.817352,1.073059,558.0,2.547945,37.85,-122.25,3.413
4,3.8462,52.0,6.281853,1.081081,565.0,2.181467,37.85,-122.25,3.422
...,...,...,...,...,...,...,...,...,...
20635,1.5603,25.0,5.045455,1.133333,845.0,2.560606,39.48,-121.09,0.781
20636,2.5568,18.0,6.114035,1.315789,356.0,3.122807,39.49,-121.21,0.771
20637,1.7000,17.0,5.205543,1.120092,1007.0,2.325635,39.43,-121.22,0.923
20638,1.8672,18.0,5.329513,1.171920,741.0,2.123209,39.43,-121.32,0.847


## Fourth Optimization: Better Algorithm (Gradient Boosting)

In [14]:
# Gradient boosting with many small trees, each contributing only a little:
#   n_estimators=1000  -> number of boosting stages (small trees)
#   learning_rate=0.05 -> shrinks the contribution of each tree
# GradientBoostingRegressor is imported in the notebook's import cell at the top.
gb_model = GradientBoostingRegressor(n_estimators=1000, learning_rate=0.05, random_state=1)

# fit() is left as the last expression so the notebook displays the fitted estimator's parameters.
gb_model.fit(train_X, train_y)

0,1,2
,"loss  loss: {'squared_error', 'absolute_error', 'huber', 'quantile'}, default='squared_error' Loss function to be optimized. 'squared_error' refers to the squared error for regression. 'absolute_error' refers to the absolute error of regression and is a robust loss function. 'huber' is a combination of the two. 'quantile' allows quantile regression (use `alpha` to specify the quantile). See :ref:`sphx_glr_auto_examples_ensemble_plot_gradient_boosting_quantile.py` for an example that demonstrates quantile regression for creating prediction intervals with `loss='quantile'`.",'squared_error'
,"learning_rate  learning_rate: float, default=0.1 Learning rate shrinks the contribution of each tree by `learning_rate`. There is a trade-off between learning_rate and n_estimators. Values must be in the range `[0.0, inf)`.",0.05
,"n_estimators  n_estimators: int, default=100 The number of boosting stages to perform. Gradient boosting is fairly robust to over-fitting so a large number usually results in better performance. Values must be in the range `[1, inf)`.",1000
,"subsample  subsample: float, default=1.0 The fraction of samples to be used for fitting the individual base learners. If smaller than 1.0 this results in Stochastic Gradient Boosting. `subsample` interacts with the parameter `n_estimators`. Choosing `subsample < 1.0` leads to a reduction of variance and an increase in bias. Values must be in the range `(0.0, 1.0]`.",1.0
,"criterion  criterion: {'friedman_mse', 'squared_error'}, default='friedman_mse' The function to measure the quality of a split. Supported criteria are ""friedman_mse"" for the mean squared error with improvement score by Friedman, ""squared_error"" for mean squared error. The default value of ""friedman_mse"" is generally the best as it can provide a better approximation in some cases. .. versionadded:: 0.18",'friedman_mse'
,"min_samples_split  min_samples_split: int or float, default=2 The minimum number of samples required to split an internal node: - If int, values must be in the range `[2, inf)`. - If float, values must be in the range `(0.0, 1.0]` and `min_samples_split`  will be `ceil(min_samples_split * n_samples)`. .. versionchanged:: 0.18  Added float values for fractions.",2
,"min_samples_leaf  min_samples_leaf: int or float, default=1 The minimum number of samples required to be at a leaf node. A split point at any depth will only be considered if it leaves at least ``min_samples_leaf`` training samples in each of the left and right branches. This may have the effect of smoothing the model, especially in regression. - If int, values must be in the range `[1, inf)`. - If float, values must be in the range `(0.0, 1.0)` and `min_samples_leaf`  will be `ceil(min_samples_leaf * n_samples)`. .. versionchanged:: 0.18  Added float values for fractions.",1
,"min_weight_fraction_leaf  min_weight_fraction_leaf: float, default=0.0 The minimum weighted fraction of the sum total of weights (of all the input samples) required to be at a leaf node. Samples have equal weight when sample_weight is not provided. Values must be in the range `[0.0, 0.5]`.",0.0
,"max_depth  max_depth: int or None, default=3 Maximum depth of the individual regression estimators. The maximum depth limits the number of nodes in the tree. Tune this parameter for best performance; the best value depends on the interaction of the input variables. If None, then nodes are expanded until all leaves are pure or until all leaves contain less than min_samples_split samples. If int, values must be in the range `[1, inf)`.",3
,"min_impurity_decrease  min_impurity_decrease: float, default=0.0 A node will be split if this split induces a decrease of the impurity greater than or equal to this value. Values must be in the range `[0.0, inf)`. The weighted impurity decrease equation is the following::  N_t / N * (impurity - N_t_R / N_t * right_impurity  - N_t_L / N_t * left_impurity) where ``N`` is the total number of samples, ``N_t`` is the number of samples at the current node, ``N_t_L`` is the number of samples in the left child, and ``N_t_R`` is the number of samples in the right child. ``N``, ``N_t``, ``N_t_R`` and ``N_t_L`` all refer to the weighted sum, if ``sample_weight`` is passed. .. versionadded:: 0.19",0.0


In [16]:
# Score the boosted model on the same validation split used for the random forests.
gb_predictions = gb_model.predict(val_X)
gb_mae = mean_absolute_error(y_true=val_y, y_pred=gb_predictions)
print(f"Gradient Boosting MAE: {gb_mae}")

Gradient Boosting MAE: 0.3256569523895364
