Skip to content

Commit

Permalink
Merge 95c597e into 9648351
Browse files Browse the repository at this point in the history
  • Loading branch information
lesteve committed Dec 16, 2015
2 parents 9648351 + 95c597e commit 001a225
Show file tree
Hide file tree
Showing 2 changed files with 72 additions and 79 deletions.
112 changes: 51 additions & 61 deletions examples/ensemble/plot_partial_dependence.py
Original file line number Diff line number Diff line change
Expand Up @@ -59,64 +59,54 @@
from sklearn.datasets.california_housing import fetch_california_housing


def main():
# fetch California housing dataset
try:
cal_housing = fetch_california_housing()
except HTTPError:
print("Failed downloading california housing data.")
return

# split 80/20 train-test
X_train, X_test, y_train, y_test = train_test_split(cal_housing.data,
cal_housing.target,
test_size=0.2,
random_state=1)
names = cal_housing.feature_names

print('_' * 80)
print("Training GBRT...")
clf = GradientBoostingRegressor(n_estimators=100, max_depth=4,
learning_rate=0.1, loss='huber',
random_state=1)
clf.fit(X_train, y_train)
print("done.")

print('_' * 80)
print('Convenience plot with ``partial_dependence_plots``')
print

features = [0, 5, 1, 2, (5, 1)]
fig, axs = plot_partial_dependence(clf, X_train, features, feature_names=names,
n_jobs=3, grid_resolution=50)
fig.suptitle('Partial dependence of house value on nonlocation features\n'
'for the California housing dataset')
plt.subplots_adjust(top=0.9) # tight_layout causes overlap with suptitle

print('_' * 80)
print('Custom 3d plot via ``partial_dependence``')
print
fig = plt.figure()

target_feature = (1, 5)
pdp, (x_axis, y_axis) = partial_dependence(clf, target_feature,
X=X_train, grid_resolution=50)
XX, YY = np.meshgrid(x_axis, y_axis)
Z = pdp.T.reshape(XX.shape).T
ax = Axes3D(fig)
surf = ax.plot_surface(XX, YY, Z, rstride=1, cstride=1, cmap=plt.cm.BuPu)
ax.set_xlabel(names[target_feature[0]])
ax.set_ylabel(names[target_feature[1]])
ax.set_zlabel('Partial dependence')
# pretty init view
ax.view_init(elev=22, azim=122)
plt.colorbar(surf)
plt.suptitle('Partial dependence of house value on median age and '
'average occupancy')
plt.subplots_adjust(top=0.9)

plt.show()


if __name__ == "__main__":
main()
cal_housing = fetch_california_housing()

# split 80/20 train-test
X_train, X_test, y_train, y_test = train_test_split(cal_housing.data,
cal_housing.target,
test_size=0.2,
random_state=1)
names = cal_housing.feature_names

print('_' * 80)
print("Training GBRT...")
clf = GradientBoostingRegressor(n_estimators=100, max_depth=4,
learning_rate=0.1, loss='huber',
random_state=1)
clf.fit(X_train, y_train)
print("done.")

print('_' * 80)
print('Convenience plot with ``partial_dependence_plots``')
print

features = [0, 5, 1, 2, (5, 1)]
fig, axs = plot_partial_dependence(clf, X_train, features, feature_names=names,
n_jobs=3, grid_resolution=50)
fig.suptitle('Partial dependence of house value on nonlocation features\n'
'for the California housing dataset')
plt.subplots_adjust(top=0.9) # tight_layout causes overlap with suptitle

print('_' * 80)
print('Custom 3d plot via ``partial_dependence``')
print
fig = plt.figure()

target_feature = (1, 5)
pdp, (x_axis, y_axis) = partial_dependence(clf, target_feature,
X=X_train, grid_resolution=50)
XX, YY = np.meshgrid(x_axis, y_axis)
Z = pdp.T.reshape(XX.shape).T
ax = Axes3D(fig)
surf = ax.plot_surface(XX, YY, Z, rstride=1, cstride=1, cmap=plt.cm.BuPu)
ax.set_xlabel(names[target_feature[0]])
ax.set_ylabel(names[target_feature[1]])
ax.set_zlabel('Partial dependence')
# pretty init view
ax.view_init(elev=22, azim=122)
plt.colorbar(surf)
plt.suptitle('Partial dependence of house value on median age and '
'average occupancy')
plt.subplots_adjust(top=0.9)

plt.show()
39 changes: 21 additions & 18 deletions sklearn/datasets/california_housing.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,16 +21,17 @@
# Authors: Peter Prettenhofer
# License: BSD 3 clause

from io import BytesIO
import os
from os.path import exists
from os import makedirs
from zipfile import ZipFile
import tarfile

try:
# Python 2
from urllib2 import urlopen
import urllib.request as urllib
except ImportError:
# Python 3+
from urllib.request import urlopen
import urllib

import numpy as np

Expand All @@ -39,8 +40,8 @@
from ..externals import joblib


DATA_URL = "http://lib.stat.cmu.edu/modules.php?op=modload&name=Downloads&"\
"file=index&req=getit&lid=83"
DATA_URL = "http://www.dcc.fc.up.pt/~ltorgo/Regression/cal_housing.tgz"
ARCHIVE_NAME = "cal_housing.tgz"
TARGET_FILENAME = "cal_housing.pkz"

# Grab the module-level docstring to use as a description of the
Expand Down Expand Up @@ -89,18 +90,20 @@ def fetch_california_housing(data_home=None, download_if_missing=True):
makedirs(data_home)
filepath = _pkl_filepath(data_home, TARGET_FILENAME)
if not exists(filepath):
print('downloading Cal. housing from %s to %s' % (DATA_URL, data_home))
fhandle = urlopen(DATA_URL)
buf = BytesIO(fhandle.read())
zip_file = ZipFile(buf)
try:
cadata_fd = zip_file.open('cadata.txt', 'r')
cadata = BytesIO(cadata_fd.read())
# skip the first 27 lines (documentation)
cal_housing = np.loadtxt(cadata, skiprows=27)
joblib.dump(cal_housing, filepath, compress=6)
finally:
zip_file.close()
archive_path = os.path.join(data_home, ARCHIVE_NAME)
print('downloading Cal. housing from %s to %s' % (DATA_URL, archive_path))
urllib.urlretrieve(DATA_URL, archive_path)
tarfile.open(archive_path, "r:gz").extractall(path=data_home)
os.remove(archive_path)

data_path = os.path.join(data_home, 'CaliforniaHousing',
'cal_housing.data')
cal_housing = np.loadtxt(data_path, delimiter=',')
# Columns are not in the same order compared to the previous
# URL resource on lib.stat.cmu.edu
columns_index = [8, 7, 2, 3, 4, 5, 6, 1, 0]
cal_housing = cal_housing[:, columns_index]
joblib.dump(cal_housing, filepath, compress=6)
else:
cal_housing = joblib.load(filepath)

Expand Down

0 comments on commit 001a225

Please sign in to comment.