In [4]:
from sklearn.datasets import fetch_california_housing
from sklearn.model_selection import train_test_split

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [2]:
# Load the California housing dataset; as_frame=True asks scikit-learn for pandas objects.
obj = fetch_california_housing(as_frame=True)

In [7]:
# Keep only the feature columns; the regression target is read separately from obj.target.
cali_df = obj.data

In [15]:
# Bare last expression: the notebook renders the frame as a table (middle rows are elided).
cali_df

Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude
0,8.3252,41.0,6.984127,1.023810,322.0,2.555556,37.88,-122.23
1,8.3014,21.0,6.238137,0.971880,2401.0,2.109842,37.86,-122.22
2,7.2574,52.0,8.288136,1.073446,496.0,2.802260,37.85,-122.24
3,5.6431,52.0,5.817352,1.073059,558.0,2.547945,37.85,-122.25
4,3.8462,52.0,6.281853,1.081081,565.0,2.181467,37.85,-122.25
...,...,...,...,...,...,...,...,...
20635,1.5603,25.0,5.045455,1.133333,845.0,2.560606,39.48,-121.09
20636,2.5568,18.0,6.114035,1.315789,356.0,3.122807,39.49,-121.21
20637,1.7000,17.0,5.205543,1.120092,1007.0,2.325635,39.43,-121.22
20638,1.8672,18.0,5.329513,1.171920,741.0,2.123209,39.43,-121.32


In [18]:
# Hold out part of the data for evaluation; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    cali_df,
    obj.target,
    random_state=1,
)

In [54]:
def test(model, X_train, y_train, X_test, y_test):
  return f'Training Set R^2 score: {model.score(X_train, y_train)}\nTesting Set R^2 score: {model.score(X_test, y_test)}\n'

Testing with Linear Regression

In [55]:
from sklearn.linear_model import LinearRegression

# Ordinary least-squares baseline with default settings.
lr = LinearRegression()
lr.fit(X_train, y_train)
print(test(lr, X_train, y_train, X_test, y_test))

Training Set R^2 score: 0.6102859678113064
Testing Set R^2 score: 0.5929869285760099



Testing with Ridge Regression

In [47]:
from sklearn.linear_model import Ridge

# Ridge regression with the library's default settings (no arguments passed).
ridge = Ridge()
ridge.fit(X_train, y_train)
print(test(ridge, X_train, y_train, X_test, y_test))

Training Set R^2 score: 0.6102859256477542
Testing Set R^2 score: 0.5929846671576573


Testing with Lasso

In [56]:
from sklearn.linear_model import Lasso

# Very small alpha, i.e. a weak penalty. max_iter is set high -- presumably so the
# solver has room to converge at this alpha; TODO confirm it is actually needed.
lasso = Lasso(alpha=0.0001, max_iter=100000)
lasso.fit(X_train, y_train)
print(test(lasso, X_train, y_train, X_test, y_test))

Training Set R^2 score: 0.610285606728068
Testing Set R^2 score: 0.5929820246192208



It seems the linear models' R^2 score maxes out at about 0.6, which is not very good at all. Let's try other methods:

Testing with Decision Tree Regressor

In [57]:
from sklearn.tree import DecisionTreeRegressor

# Unpruned tree: no depth or leaf limit is set, so it is free to fit the training set
# exactly. random_state is pinned (as the gradient-boosting cells in this notebook
# already do) because scikit-learn permutes the features at every split, so ties
# between equally good splits -- and hence the scores -- can otherwise differ
# between runs. Re-run the cell to refresh the recorded output.
dtr = DecisionTreeRegressor(random_state=0).fit(X_train, y_train)
print(test(dtr, X_train, y_train, X_test, y_test))

Training Set R^2 score: 1.0
Testing Set R^2 score: 0.5988470412669491



We are overfitting by a lot... let's tune it better by pre-pruning the tree

In [75]:
# Pre-prune by capping the number of leaves. The seed is pinned so the reported
# scores are reproducible across kernel restarts (tree fitting breaks ties randomly).
dtr2 = DecisionTreeRegressor(max_leaf_nodes=500, random_state=0).fit(X_train, y_train)
print(test(dtr2, X_train, y_train, X_test, y_test))

Training Set R^2 score: 0.8655030234866915
Testing Set R^2 score: 0.6871344785110707



Still not good at all, let's try using random forests:

In [65]:
from sklearn.ensemble import RandomForestRegressor

# Random forests are stochastic (bootstrap samples and random feature choices), so an
# unseeded fit gives slightly different scores on every run. Pin random_state, matching
# the gradient-boosting cells in this notebook, to make the result reproducible.
forest = RandomForestRegressor(random_state=0).fit(X_train, y_train)

In [76]:
# Report train/test R^2 for the random forest fitted in the previous cell.
print(test(forest, X_train, y_train, X_test, y_test))

Training Set R^2 score: 0.9730899132765595
Testing Set R^2 score: 0.8022974276991698



This is the best score we have got so far, but the model still seems to be overfitting. Let's see if we can improve this using gradient-boosted regression trees:

In [84]:
from sklearn.ensemble import GradientBoostingRegressor
# Gradient-boosted trees with default hyperparameters and a fixed seed.
gbrt = GradientBoostingRegressor(random_state=0)
gbrt.fit(X_train, y_train);  # trailing ';' keeps the cell output empty, as before

In [85]:
# Report train/test R^2 for the default gradient-boosting model.
print(test(gbrt, X_train, y_train, X_test, y_test))

Training Set R^2 score: 0.8082302521762545
Testing Set R^2 score: 0.7768984468669105



Seems like we're underfitting now! Let's try tuning some parameters:

In [90]:
# Same model with many more boosting stages (10000); expect this cell to be slow.
gbrt2 = GradientBoostingRegressor(n_estimators=10000, random_state=0)
gbrt2.fit(X_train, y_train);  # trailing ';' keeps the cell output empty, as before

In [91]:
# Report train/test R^2 for the 10000-stage gradient-boosting model.
print(test(gbrt2, X_train, y_train, X_test, y_test))

Training Set R^2 score: 0.9888423563839455
Testing Set R^2 score: 0.8346027189540158



An R^2 score of about 0.83 on unseen data is the highest we achieved for the California housing dataset in this notebook!