<a href="https://colab.research.google.com/github/samipn/autogluon/blob/main/02_kaggle_california_housing_autogluon.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# California Housing Prices with AutoGluon (Regression)

*Colab-ready | Last prepared: 2025-10-14*

This notebook downloads the **California Housing Prices** Kaggle dataset and trains an AutoGluon regressor.


In [1]:
# Install
!pip -q install -U pip setuptools wheel
!pip -q install -U autogluon kaggle xgboost lightgbm catboost


[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/1.8 MB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━[0m[90m╺[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.1/1.8 MB[0m [31m3.3 MB/s[0m eta [36m0:00:01[0m[2K   [91m━━━━━━━━━━[0m[90m╺[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.5/1.8 MB[0m [31m6.5 MB/s[0m eta [36m0:00:01[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m[90m━━━━━━━━━━━[0m [32m1.3/1.8 MB[0m [31m12.3 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.8/1.8 MB[0m [31m14.1 MB/s[0m eta [36m0:00:00[0m
[?25h[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/1.2 MB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.2/1.2 MB[0m [31m66.7 MB/s[0m eta [36m0:00:00[0m
[?25h[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is

In [2]:
# Kaggle auth (same flow as other notebook)
import os, json, pathlib
from pathlib import Path
KAGGLE_DIR = Path('~/.kaggle').expanduser(); KAGGLE_DIR.mkdir(parents=True, exist_ok=True)
kaggle_json = KAGGLE_DIR / 'kaggle.json'
if not kaggle_json.exists():
    try:
        from google.colab import files  # type: ignore
        print("Upload your kaggle.json (Account > Create API Token)")
        uploaded = files.upload()
        if 'kaggle.json' in uploaded:
            with open(kaggle_json, 'wb') as f: f.write(uploaded['kaggle.json'])
            os.chmod(kaggle_json, 0o600)
            print("✅ kaggle.json configured.")
    except Exception as e:
        print("If not in Colab, place kaggle.json into ~/.kaggle/kaggle.json and re-run.")
else:
    os.chmod(kaggle_json, 0o600)
    print("✅ kaggle.json ready.")


Upload your kaggle.json (Account > Create API Token)


Saving kaggle.json to kaggle.json
✅ kaggle.json configured.


In [3]:
# Download dataset and unzip
!mkdir -p data/california
!kaggle datasets download -d camnugent/california-housing-prices -p data/california -q
import glob, zipfile, os
for z in glob.glob('data/california/*.zip'):
    with zipfile.ZipFile(z, 'r') as f:
        f.extractall('data/california')
    os.remove(z)
print("✅ Data ready in data/california")
!ls -l data/california | head -n 5


Dataset URL: https://www.kaggle.com/datasets/camnugent/california-housing-prices
License(s): CC0-1.0
✅ Data ready in data/california
total 1392
-rw-r--r-- 1 root root 1423529 Oct 22 20:32 housing.csv


In [6]:
# Load, split, train
import pandas as pd
from autogluon.tabular import TabularPredictor
from sklearn.model_selection import train_test_split

df = pd.read_csv('data/california/housing.csv')
label = 'median_house_value'

# Fill missing values in 'total_bedrooms' with the mean
df['total_bedrooms'].fillna(df['total_bedrooms'].mean(), inplace=True)

train_df, val_df = train_test_split(df, test_size=0.2, random_state=42)
predictor = TabularPredictor(label=label, problem_type='regression', eval_metric='rmse', path='ag_california/')
predictor.fit(train_df, time_limit=600, presets='medium_quality_faster_train')

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['total_bedrooms'].fillna(df['total_bedrooms'].mean(), inplace=True)
Preset alias specified: 'medium_quality_faster_train' maps to 'medium_quality'.
Verbosity: 2 (Standard Logging)
AutoGluon Version:  1.4.0
Python Version:     3.12.12
Operating System:   Linux
Platform Machine:   x86_64
Platform Version:   #1 SMP Thu Oct  2 10:42:05 UTC 2025
CPU Count:          8
Memory Avail:       48.48 GB / 50.99 GB (95.1%)
Disk Space Avail:   178.91 GB / 225.83 GB (79.2%)
Presets specified: ['medium_quality_faster_train']
Using hyperparameters preset: hyperparameters='default'
Beginning AutoGluon training ... Time limit

[1000]	valid_set's rmse: 49409.6
[2000]	valid_set's rmse: 48518.2
[3000]	valid_set's rmse: 48360
[4000]	valid_set's rmse: 48312.8
[5000]	valid_set's rmse: 48307.8


	-48253.7613	 = Validation score   (-root_mean_squared_error)
	4.18s	 = Training   runtime
	0.17s	 = Validation runtime
Fitting model: LightGBM ... Training model for up to 595.45s of the 595.44s of remaining time.
	Fitting with cpus=4, gpus=0, mem=0.0/48.5 GB


[1000]	valid_set's rmse: 45813.7


	-45803.2746	 = Validation score   (-root_mean_squared_error)
	1.31s	 = Training   runtime
	0.04s	 = Validation runtime
Fitting model: RandomForestMSE ... Training model for up to 594.07s of the 594.07s of remaining time.
	Fitting with cpus=8, gpus=0
	-50122.5793	 = Validation score   (-root_mean_squared_error)
	6.2s	 = Training   runtime
	0.1s	 = Validation runtime
Fitting model: CatBoost ... Training model for up to 585.39s of the 585.39s of remaining time.
	Fitting with cpus=4, gpus=0
	-43677.8424	 = Validation score   (-root_mean_squared_error)
	76.82s	 = Training   runtime
	0.01s	 = Validation runtime
Fitting model: ExtraTreesMSE ... Training model for up to 508.53s of the 508.53s of remaining time.
	Fitting with cpus=8, gpus=0
	-53232.3101	 = Validation score   (-root_mean_squared_error)
	1.54s	 = Training   runtime
	0.08s	 = Validation runtime
Fitting model: NeuralNetFastAI ... Training model for up to 504.79s of the 504.79s of remaining time.
	Fitting with cpus=4, gpus=0, mem=0

[1000]	valid_set's rmse: 44753.9
[2000]	valid_set's rmse: 44647.9


	-44643.2277	 = Validation score   (-root_mean_squared_error)
	6.06s	 = Training   runtime
	0.1s	 = Validation runtime
Fitting model: WeightedEnsemble_L2 ... Training model for up to 360.00s of the 277.98s of remaining time.
	Ensemble Weights: {'CatBoost': 0.6, 'LightGBMLarge': 0.3, 'NeuralNetTorch': 0.1}
	-43144.54	 = Validation score   (-root_mean_squared_error)
	0.01s	 = Training   runtime
	0.0s	 = Validation runtime
AutoGluon training complete, total runtime = 322.05s ... Best model: WeightedEnsemble_L2 | Estimated inference throughput: 13725.0 rows/s (1652 batch size)
TabularPredictor saved. To load, use: predictor = TabularPredictor.load("/content/ag_california")


<autogluon.tabular.predictor.predictor.TabularPredictor at 0x7ad67e0033e0>

In [7]:
# Evaluate and show leaderboard
predictor.leaderboard(val_df, silent=True)


Unnamed: 0,model,score_test,score_val,eval_metric,pred_time_test,pred_time_val,fit_time,pred_time_test_marginal,pred_time_val_marginal,fit_time_marginal,stack_level,can_infer,fit_order
0,WeightedEnsemble_L2,-43472.896109,-43144.539973,root_mean_squared_error,0.379012,0.120364,288.727706,0.007651,0.000332,0.009621,2,True,10
1,CatBoost,-43950.593121,-43677.842399,root_mean_squared_error,0.039811,0.00782,76.815349,0.039811,0.00782,76.815349,1,True,4
2,LightGBM,-44926.744799,-45803.274648,root_mean_squared_error,0.088154,0.036058,1.306887,0.088154,0.036058,1.306887,1,True,2
3,LightGBMLarge,-45320.1199,-44643.227709,root_mean_squared_error,0.307005,0.098414,6.058413,0.307005,0.098414,6.058413,1,True,9
4,XGBoost,-46484.38201,-45914.015008,root_mean_squared_error,0.109428,0.040472,2.046815,0.109428,0.040472,2.046815,1,True,7
5,LightGBMXT,-47277.394953,-48253.761251,root_mean_squared_error,0.475646,0.174954,4.182359,0.475646,0.174954,4.182359,1,True,1
6,RandomForestMSE,-50579.924515,-50122.57932,root_mean_squared_error,2.286667,0.099165,6.197082,2.286667,0.099165,6.197082,1,True,3
7,NeuralNetTorch,-52008.665126,-50744.589428,root_mean_squared_error,0.024545,0.013798,205.844323,0.024545,0.013798,205.844323,1,True,8
8,ExtraTreesMSE,-53115.276395,-53232.310093,root_mean_squared_error,0.219906,0.077186,1.542593,0.219906,0.077186,1.542593,1,True,5
9,NeuralNetFastAI,-53240.262832,-52692.636403,root_mean_squared_error,0.077688,0.019984,12.312518,0.077688,0.019984,12.312518,1,True,6


In [8]:
# Predict on validation and show RMSE/R2
predictor.evaluate(val_df)


{'root_mean_squared_error': np.float64(-43472.896108689085),
 'mean_squared_error': -1889892696.0768745,
 'mean_absolute_error': -27954.984219809834,
 'r2': 0.8557784075461787,
 'pearsonr': 0.9250970048185715,
 'median_absolute_error': np.float64(-17068.0703125)}