WIP: Progress logger #1171

Closed

Commits (33)
655ce61  ENH: first sketch of logging framework (GaelVaroquaux, Sep 15, 2012)
ff9ed10  ENH: use logging framework in grid_search (GaelVaroquaux, Sep 16, 2012)
d73b792  ENH: make logging framework pickeable (GaelVaroquaux, Sep 16, 2012)
ee5ff98  ENH: pickling of our loggers (GaelVaroquaux, Sep 17, 2012)
09cc6ea  COSMIT: ProgressLog -> ProgressLogger (GaelVaroquaux, Sep 17, 2012)
6480b83  BUG: fix renaming (GaelVaroquaux, Sep 17, 2012)
7b67fe0  ENH: Add logging to k_means (GaelVaroquaux, Sep 17, 2012)
4f48a4c  ENH: add logger to affinity_propagation (GaelVaroquaux, Sep 18, 2012)
f80e1e1  API: use *msg_vars in logger.progress (GaelVaroquaux, Sep 20, 2012)
f964b76  EHH: Logger in SVM (GaelVaroquaux, Sep 20, 2012)
0a50131  ENH: logger in DPGMM (GaelVaroquaux, Sep 20, 2012)
979e1f9  ENH: short_message in logger (GaelVaroquaux, Sep 20, 2012)
cffa19e  ENH: add logging to __init__ (GaelVaroquaux, Sep 20, 2012)
881828c  BUG: fix bug in previous commit (GaelVaroquaux, Sep 20, 2012)
3330310  BUG: fix bug in logging + SVM (GaelVaroquaux, Sep 20, 2012)
5512f02  ENH: better verbosity in lars_path (GaelVaroquaux, Sep 20, 2012)
ff43901  ENH: add logging to graph_lasso (GaelVaroquaux, Sep 20, 2012)
e25073c  ENH: Logger in lfw (GaelVaroquaux, Sep 20, 2012)
725a16b  MISC: Address @ogrisel's comment on info (GaelVaroquaux, Sep 20, 2012)
83fdce1  MISC: simpler verbosity control (GaelVaroquaux, Sep 20, 2012)
b6b336a  COSMIT: remove 'I:' from logging message (GaelVaroquaux, Sep 20, 2012)
562558f  COSMIT: comment (GaelVaroquaux, Sep 20, 2012)
550b805  COSMIT: less noise in tests (GaelVaroquaux, Sep 20, 2012)
c90433f  COSMIT: More clean up verbosity in tests (GaelVaroquaux, Sep 20, 2012)
b4d75c1  ENH: logging framework in MDS (GaelVaroquaux, Sep 21, 2012)
dd6b88d  MISC: use assert_equal (GaelVaroquaux, Sep 21, 2012)
53f0362  ENH: logger in naive_bayes (GaelVaroquaux, Sep 21, 2012)
5dafb44  ENH: logger in coordinate_descent (GaelVaroquaux, Sep 21, 2012)
80ca50b  ENH: logging in gaussian_process (GaelVaroquaux, Sep 21, 2012)
1060455  ENH: log errors in Gaussian process (GaelVaroquaux, Sep 23, 2012)
70ffd2b  ENH: Logger framework in decomposition (GaelVaroquaux, Sep 23, 2012)
17c921d  ENH: add logger to random forest (GaelVaroquaux, Sep 23, 2012)
0881534  ENH: logger in robust_covariance (GaelVaroquaux, Sep 25, 2012)
58 changes: 58 additions & 0 deletions examples/internal/object_log.py
@@ -0,0 +1,58 @@
"""
Small example showing recursive logging in an object hierarchy.
"""

from time import sleep
import itertools

from sklearn.progress_logger import HasLogger
from sklearn.externals.joblib import Parallel, delayed

FIRST_NAMES = itertools.cycle(['Jane', 'Joe', 'Jack'])

def do_work(logger, msg):
logger.progress('Working', short_message=msg)


class Employee(HasLogger):

def __init__(self, name='Joe Average', verbose=False):
self.name = name
self.verbose = verbose

def work(self, chore_msg):
log = self._get_logger()
sleep(.2)
Parallel(n_jobs=-1)(delayed(do_work)(log, '.')
for _ in range(10))
log.progress('%s says "Done my chores %s"',
self.name, chore_msg)


class Boss(HasLogger):

def __init__(self, n_employees=3, verbose=False):
self.verbose = verbose
self.n_employees = n_employees

def yell(self):
log = self._get_logger()
log.progress('Get to work!!')
employes = [Employee(name='%s Average' % n,
verbose=log.clone())
for _, n in zip(range(self.n_employees),
FIRST_NAMES)]

for employe in employes:
employe.work('code')


if __name__ == '__main__':
boss = Boss(verbose=2)
boss.yell()

from sklearn.progress_logger import setup_logger
import logging
setup_logger('__main__', level=logging.DEBUG, display_name=True,
time_stamp=True)
boss.yell()
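For orientation, the call sites touched in this pull request suggest that the logger accepts printf-style message arguments and an optional short_message, and that get_logger maps the familiar verbose parameter onto a verbosity level. A minimal sketch of direct use, assuming this branch is installed (sklearn.progress_logger is introduced by this PR and does not exist in released scikit-learn):

from sklearn.progress_logger import get_logger

# Sketch only: get_logger(verbosity=...) mirrors the `verbose` estimator
# parameter; sklearn/__init__.py below uses verbosity=1000 to force its
# seeding message through regardless of the configured level.
logger = get_logger(verbosity=2)
logger.progress('Iteration %i, inertia %s', 3, 0.42)   # printf-style arguments
logger.progress('Working', short_message='.')          # compact progress marks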
4 changes: 3 additions & 1 deletion sklearn/__init__.py
@@ -77,12 +77,14 @@ def setup_module(module):
    import os
    import numpy as np
    import random
    from .progress_logger import get_logger

    # It could have been provided in the environment
    _random_seed = os.environ.get('SKLEARN_SEED', None)
    if _random_seed is None:
        _random_seed = np.random.uniform()*(2**31-1)
    _random_seed = int(_random_seed)
    print "I: Seeding RNGs with %r" % _random_seed
    get_logger(verbosity=1000).progress("I: Seeding RNGs with %r",
                                        _random_seed)
Review comment (Member):
I would use a simple info instead of progress here. The zoomable progress report does not make sense in that context.
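A minimal sketch of that suggestion, assuming the logger exposes an info method analogous to the standard library's logging.Logger.info (only progress appears in this diff, so the method name is an assumption):

    # Hypothetical follow-up to the comment above, not part of this PR:
    # emit a plain informational message instead of a zoomable progress report.
    get_logger(verbosity=1000).info("Seeding RNGs with %r", _random_seed)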

    np.random.seed(_random_seed)
    random.seed(_random_seed)
3 changes: 2 additions & 1 deletion sklearn/base.py
@@ -8,6 +8,7 @@
from scipy import sparse

from .metrics import r2_score
from .progress_logger import HasLogger


###############################################################################
@@ -149,7 +150,7 @@ def _pprint(params, offset=0, printer=repr):


###############################################################################
class BaseEstimator(object):
class BaseEstimator(HasLogger):
"""Base class for all estimators in scikit-learn

Notes
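Since BaseEstimator now inherits from HasLogger, every estimator gains the _get_logger() helper used throughout the diffs below. A minimal sketch of the resulting pattern, under the assumption that _get_logger() reads the estimator's verbose attribute (the actual behaviour lives in the new sklearn/progress_logger.py, which is not part of this excerpt):

from sklearn.base import BaseEstimator


class ToyEstimator(BaseEstimator):
    """Hypothetical estimator illustrating the HasLogger pattern."""

    def __init__(self, n_iter=3, verbose=False):
        self.n_iter = n_iter
        self.verbose = verbose

    def fit(self, X, y=None):
        # Same pattern as KMeans.fit and MiniBatchKMeans.fit in this PR:
        # fetch a logger from the mixin and report progress through it.
        log = self._get_logger()
        for i in range(self.n_iter):
            log.progress('Iteration %i/%i', i + 1, self.n_iter)
        return self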
8 changes: 4 additions & 4 deletions sklearn/cluster/affinity_propagation_.py
@@ -11,6 +11,7 @@
import warnings

from ..base import BaseEstimator, ClusterMixin
from ..progress_logger import get_logger
from ..utils import as_float_array
from ..metrics import euclidean_distances

@@ -70,6 +71,7 @@ def affinity_propagation(S, preference=None, p=None, convergence_iter=15,
    Brendan J. Frey and Delbert Dueck, "Clustering by Passing Messages
    Between Data Points", Science Feb. 2007
    """
    logger = get_logger(verbosity=verbose)
    S = as_float_array(S, copy=copy)
    if convit is not None:
        warnings.warn("``convit`` is deprecated and will be removed in"
@@ -151,12 +153,10 @@ def affinity_propagation(S, preference=None, p=None, convergence_iter=15,
            unconverged = np.sum((se == convergence_iter) +\
                                 (se == 0)) != n_samples
            if (not unconverged and (K > 0)) or (it == max_iter):
                if verbose:
                    print "Converged after %d iterations." % it
                logger.progress("Converged after %d iterations.", it)
                break
    else:
        if verbose:
            print "Did not converged"
        logger.progress("Did not converged")

    I = np.where(np.diag(A + R) > 0)[0]
    K = I.size  # Identify exemplars
61 changes: 29 additions & 32 deletions sklearn/cluster/k_means_.py
@@ -19,6 +19,7 @@
from ..base import BaseEstimator, ClusterMixin
from ..metrics.pairwise import euclidean_distances
from ..utils.sparsefuncs import mean_variance_axis0
from ..progress_logger import get_logger
from ..utils import check_arrays
from ..utils import check_random_state
from ..utils import atleast2d_or_csr
@@ -229,6 +230,7 @@ def k_means(X, n_clusters, init='k-means++', precompute_distances=True,

"""
random_state = check_random_state(random_state)
logger = get_logger(verbose)

if not k is None:
n_clusters = k
@@ -267,7 +269,7 @@ def k_means(X, n_clusters, init='k-means++', precompute_distances=True,
        for it in range(n_init):
            # run a k-means once
            labels, inertia, centers = _kmeans_single(
                X, n_clusters, max_iter=max_iter, init=init, verbose=verbose,
                X, n_clusters, max_iter=max_iter, init=init, verbose=logger,
                precompute_distances=precompute_distances, tol=tol,
                x_squared_norms=x_squared_norms, random_state=random_state)
            # determine if these results are the best so far
@@ -280,7 +282,7 @@ def k_means(X, n_clusters, init='k-means++', precompute_distances=True,
        seeds = random_state.randint(np.iinfo(np.int32).max, size=n_init)
        results = Parallel(n_jobs=n_jobs, verbose=0)(
            delayed(_kmeans_single)(X, n_clusters, max_iter=max_iter,
                                    init=init, verbose=verbose, tol=tol,
                                    init=init, verbose=logger, tol=tol,
                                    precompute_distances=precompute_distances,
                                    x_squared_norms=x_squared_norms,
                                    # Change seed to ensure variety
@@ -361,14 +363,14 @@ def _kmeans_single(X, n_clusters, max_iter=300, init='k-means++',
        the closest centroid for all observations in the training set).
    """
    random_state = check_random_state(random_state)
    logger = get_logger(verbosity=verbose)
    if x_squared_norms is None:
        x_squared_norms = _squared_norms(X)
    best_labels, best_inertia, best_centers = None, None, None
    # init
    centers = _init_centroids(X, n_clusters, init, random_state=random_state,
                              x_squared_norms=x_squared_norms)
    if verbose:
        print 'Initialization complete'
    logger.progress('Initialization complete')

    # Allocate memory to store the distances for each sample to its
    # closer center for reallocation in case of ties
@@ -386,17 +388,16 @@ def _kmeans_single(X, n_clusters, max_iter=300, init='k-means++',
        # computation of the means is also called the M-step of EM
        centers = _centers(X, labels, n_clusters, distances)

        if verbose:
            print 'Iteration %i, inertia %s' % (i, inertia)
        logger.progress('Iteration %i, inertia %s', i, inertia)

        if best_inertia is None or inertia < best_inertia:
            best_labels = labels.copy()
            best_centers = centers.copy()
            best_inertia = inertia

        if np.sum((centers_old - centers) ** 2) < tol:
            if verbose:
                print 'Converged to similar centers at iteration', i
            logger.progress('Converged to similar centers at iteration %i',
                            i)
            break
    return best_labels, best_inertia, best_centers

@@ -751,7 +752,7 @@ def fit(self, X, y=None):

        self.cluster_centers_, self.labels_, self.inertia_ = k_means(
            X, n_clusters=n_clusters, init=self.init, n_init=self.n_init,
            max_iter=self.max_iter, verbose=self.verbose,
            max_iter=self.max_iter, verbose=self._get_logger(),
            precompute_distances=self.precompute_distances,
            tol=self.tol, random_state=self.random_state, copy_x=self.copy_x,
            n_jobs=self.n_jobs)
@@ -899,6 +900,7 @@ def _mini_batch_convergence(model, iteration_idx, n_iterations, tol,
                            n_samples, centers_squared_diff, batch_inertia,
                            context, verbose=0):
    """Helper function to encapsulte the early stopping logic"""
    logger = get_logger(verbosity=verbose)
    # Normalize inertia to be able to compare values when
    # batch_size changes
    batch_inertia /= model.batch_size
@@ -920,20 +922,17 @@ def _mini_batch_convergence(model, iteration_idx, n_iterations, tol,
    ewa_inertia = ewa_inertia * (1 - alpha) + batch_inertia * alpha

    # Log progress to be able to monitor convergence
    if verbose:
        progress_msg = (
            'Minibatch iteration %d/%d:'
            'mean batch inertia: %f, ewa inertia: %f ' % (
    logger.progress(
        'iteration %d/%d: batch inertia: %.3f, ewa inertia: %.3f',
        iteration_idx + 1, n_iterations, batch_inertia,
                ewa_inertia))
        print progress_msg
        ewa_inertia,
    )

    # Early stopping based on absolute tolerance on squared change of
    # centers postion (using EWA smoothing)
    if tol > 0.0 and ewa_diff < tol:
        if verbose:
            print 'Converged (small centers change) at iteration %d/%d' % (
                iteration_idx + 1, n_iterations)
        logger.progress('Converged (small centers change) at iteration %d/%d',
                        iteration_idx + 1, n_iterations)
        return True

    # Early stopping heuristic due to lack of improvement on smoothed inertia
@@ -946,11 +945,10 @@ def _mini_batch_convergence(model, iteration_idx, n_iterations, tol,
        no_improvement += 1

    if (model.max_no_improvement is not None
        and no_improvement >= model.max_no_improvement):
        if verbose:
            print ('Converged (lack of improvement in inertia)'
                   ' at iteration %d/%d' % (
                       iteration_idx + 1, n_iterations))
            and no_improvement >= model.max_no_improvement):
        logger.progress('Converged (no improvement in inertia)'
                        ' at iteration %d/%d',
                        iteration_idx + 1, n_iterations)
        return True

    # update the convergence context to maintain state across sucessive calls:
@@ -1068,6 +1066,7 @@ def fit(self, X, y=None):
            Coordinates of the data points to cluster
        """
        self.random_state = check_random_state(self.random_state)
        logger = self._get_logger()
        X = check_arrays(X, sparse_format="csr", copy=False,
                         check_ccontiguous=True, dtype=np.float64)[0]
        n_samples, n_features = X.shape
@@ -1112,9 +1111,8 @@ def fit(self, X, y=None):
        # perform several inits with random sub-sets
        best_inertia = None
        for init_idx in range(self.n_init):
            if self.verbose:
                print "Init %d/%d with method: %s" % (
                    init_idx + 1, self.n_init, self.init)
            logger.progress("Init %d/%d with method: %s",
                            init_idx + 1, self.n_init, self.init)
            counts = np.zeros(self.n_clusters, dtype=np.int32)

            # TODO: once the `k_means` function works with sparse input we
@@ -1138,9 +1136,8 @@ def fit(self, X, y=None):
            # the common validation set
            _, inertia = _labels_inertia(X_valid, x_squared_norms_valid,
                                         cluster_centers)
            if self.verbose:
                print "Inertia for init %d/%d: %f" % (
                    init_idx + 1, self.n_init, inertia)
            logger.progress("Inertia for init %d/%d: %f",
                            init_idx + 1, self.n_init, inertia)
            if best_inertia is None or inertia < best_inertia:
                self.cluster_centers_ = cluster_centers
                self.counts_ = counts
@@ -1167,12 +1164,12 @@ def fit(self, X, y=None):
            if _mini_batch_convergence(
                self, iteration_idx, n_iterations, tol, n_samples,
                centers_squared_diff, batch_inertia, convergence_context,
                verbose=self.verbose):
                verbose=logger):
                break

        if self.compute_labels:
            if self.verbose:
                print 'Computing label assignements and total inertia'
            logger.progress(
                'Computing label assignements and total inertia')
            self.labels_, self.inertia_ = _labels_inertia(
                X, x_squared_norms, self.cluster_centers_)

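One design point visible in the k_means changes above: the same verbose argument is reused to thread a live logger into helper functions (verbose=logger in the calls to _kmeans_single and _mini_batch_convergence, verbose=self._get_logger() in KMeans.fit), which suggests get_logger accepts either an integer verbosity or an already-built logger. A hedged sketch of that convention; the pass-through behaviour is assumed rather than shown in this excerpt:

from sklearn.progress_logger import get_logger


def _toy_helper(n_steps, verbose=0):
    # `verbose` may be an int (public API) or a logger handed down by the
    # caller; get_logger is assumed to return loggers unchanged in that case.
    logger = get_logger(verbosity=verbose)
    for step in range(n_steps):
        logger.progress('step %d/%d', step + 1, n_steps)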