Merge 95c597e into 9648351

scikit-learn · Dec 16, 2015 · 001a225 · 001a225
2 parents 9648351 + 95c597e
commit 001a225
Show file tree

Hide file tree

Showing 2 changed files with 72 additions and 79 deletions.
diff --git a/examples/ensemble/plot_partial_dependence.py b/examples/ensemble/plot_partial_dependence.py
@@ -59,64 +59,54 @@
 from sklearn.datasets.california_housing import fetch_california_housing
 
 
-def main():
-    # fetch California housing dataset
-    try:
-        cal_housing = fetch_california_housing()
-    except HTTPError:
-        print("Failed downloading california housing data.")
-        return
-
-    # split 80/20 train-test
-    X_train, X_test, y_train, y_test = train_test_split(cal_housing.data,
-                                                        cal_housing.target,
-                                                        test_size=0.2,
-                                                        random_state=1)
-    names = cal_housing.feature_names
-
-    print('_' * 80)
-    print("Training GBRT...")
-    clf = GradientBoostingRegressor(n_estimators=100, max_depth=4,
-                                    learning_rate=0.1, loss='huber',
-                                    random_state=1)
-    clf.fit(X_train, y_train)
-    print("done.")
-
-    print('_' * 80)
-    print('Convenience plot with ``partial_dependence_plots``')
-    print
-
-    features = [0, 5, 1, 2, (5, 1)]
-    fig, axs = plot_partial_dependence(clf, X_train, features, feature_names=names,
-                                       n_jobs=3, grid_resolution=50)
-    fig.suptitle('Partial dependence of house value on nonlocation features\n'
-                 'for the California housing dataset')
-    plt.subplots_adjust(top=0.9)  # tight_layout causes overlap with suptitle
-
-    print('_' * 80)
-    print('Custom 3d plot via ``partial_dependence``')
-    print
-    fig = plt.figure()
-
-    target_feature = (1, 5)
-    pdp, (x_axis, y_axis) = partial_dependence(clf, target_feature,
-                                               X=X_train, grid_resolution=50)
-    XX, YY = np.meshgrid(x_axis, y_axis)
-    Z = pdp.T.reshape(XX.shape).T
-    ax = Axes3D(fig)
-    surf = ax.plot_surface(XX, YY, Z, rstride=1, cstride=1, cmap=plt.cm.BuPu)
-    ax.set_xlabel(names[target_feature[0]])
-    ax.set_ylabel(names[target_feature[1]])
-    ax.set_zlabel('Partial dependence')
-    #  pretty init view
-    ax.view_init(elev=22, azim=122)
-    plt.colorbar(surf)
-    plt.suptitle('Partial dependence of house value on median age and '
-                 'average occupancy')
-    plt.subplots_adjust(top=0.9)
-
-    plt.show()
-
-
-if __name__ == "__main__":
-    main()
+cal_housing = fetch_california_housing()
+
+# hold out 20% of the samples as a test set (fixed seed for repeatability)
+X_train, X_test, y_train, y_test = train_test_split(cal_housing.data,
+                                                    cal_housing.target,
+                                                    test_size=0.2,
+                                                    random_state=1)
+names = cal_housing.feature_names
+
+print('_' * 80)
+print("Training GBRT...")
+clf = GradientBoostingRegressor(n_estimators=100, max_depth=4,
+                                learning_rate=0.1, loss='huber',
+                                random_state=1)
+clf.fit(X_train, y_train)
+print("done.")
+
+print('_' * 80)
+print('Convenience plot with ``partial_dependence_plots``')
+print  # NOTE(review): no-op on Python 3; only the Py2 statement prints a newline
+
+features = [0, 5, 1, 2, (5, 1)]
+fig, axs = plot_partial_dependence(clf, X_train, features, feature_names=names,
+                                   n_jobs=3, grid_resolution=50)
+fig.suptitle('Partial dependence of house value on nonlocation features\n'
+             'for the California housing dataset')
+plt.subplots_adjust(top=0.9)  # tight_layout causes overlap with suptitle
+
+print('_' * 80)
+print('Custom 3d plot via ``partial_dependence``')
+print  # NOTE(review): no-op on Python 3; only the Py2 statement prints a newline
+fig = plt.figure()
+
+target_feature = (1, 5)
+pdp, (x_axis, y_axis) = partial_dependence(clf, target_feature,
+                                           X=X_train, grid_resolution=50)
+XX, YY = np.meshgrid(x_axis, y_axis)
+Z = pdp.T.reshape(XX.shape).T
+ax = Axes3D(fig)
+surf = ax.plot_surface(XX, YY, Z, rstride=1, cstride=1, cmap=plt.cm.BuPu)
+ax.set_xlabel(names[target_feature[0]])
+ax.set_ylabel(names[target_feature[1]])
+ax.set_zlabel('Partial dependence')
+# set the initial elevation/azimuth the 3d axes are viewed from
+ax.view_init(elev=22, azim=122)
+plt.colorbar(surf)
+plt.suptitle('Partial dependence of house value on median age and '
+             'average occupancy')
+plt.subplots_adjust(top=0.9)
+
+plt.show()
diff --git a/sklearn/datasets/california_housing.py b/sklearn/datasets/california_housing.py
@@ -21,16 +21,17 @@
 # Authors: Peter Prettenhofer
 # License: BSD 3 clause
 
-from io import BytesIO
+import os
 from os.path import exists
 from os import makedirs
-from zipfile import ZipFile
+import tarfile
+
 try:
-    # Python 2
-    from urllib2 import urlopen
+    # Python 3+
+    import urllib.request as urllib
 except ImportError:
-    # Python 3+
-    from urllib.request import urlopen
+    # Python 2
+    import urllib
 
 import numpy as np
 
@@ -39,8 +40,8 @@
 from ..externals import joblib
 
 
-DATA_URL = "http://lib.stat.cmu.edu/modules.php?op=modload&name=Downloads&"\
-           "file=index&req=getit&lid=83"
+DATA_URL = "http://www.dcc.fc.up.pt/~ltorgo/Regression/cal_housing.tgz"
+ARCHIVE_NAME = "cal_housing.tgz"
 TARGET_FILENAME = "cal_housing.pkz"
 
 # Grab the module-level docstring to use as a description of the
@@ -89,18 +90,20 @@ def fetch_california_housing(data_home=None, download_if_missing=True):
         makedirs(data_home)
     filepath = _pkl_filepath(data_home, TARGET_FILENAME)
     if not exists(filepath):
-        print('downloading Cal. housing from %s to %s' % (DATA_URL, data_home))
-        fhandle = urlopen(DATA_URL)
-        buf = BytesIO(fhandle.read())
-        zip_file = ZipFile(buf)
-        try:
-            cadata_fd = zip_file.open('cadata.txt', 'r')
-            cadata = BytesIO(cadata_fd.read())
-            # skip the first 27 lines (documentation)
-            cal_housing = np.loadtxt(cadata, skiprows=27)
-            joblib.dump(cal_housing, filepath, compress=6)
-        finally:
-            zip_file.close()
+        archive_path = os.path.join(data_home, ARCHIVE_NAME)
+        print('downloading Cal. housing from %s to %s' % (DATA_URL, archive_path))
+        urllib.urlretrieve(DATA_URL, archive_path)
+        tarfile.open(archive_path, "r:gz").extractall(path=data_home)
+        os.remove(archive_path)
+
+        data_path = os.path.join(data_home, 'CaliforniaHousing',
+                                 'cal_housing.data')
+        cal_housing = np.loadtxt(data_path, delimiter=',')
+        # The columns of this archive are ordered differently from the
+        # previous lib.stat.cmu.edu resource; they are reindexed just below.
+        columns_index = [8, 7, 2, 3, 4, 5, 6, 1, 0]
+        cal_housing = cal_housing[:, columns_index]
+        joblib.dump(cal_housing, filepath, compress=6)
     else:
         cal_housing = joblib.load(filepath)