Merge pull request #98 from leesteinberg/master
Incorporate bounds into covering class, satisfying Issue #97. This also incorporates commits from PR #87.
Nathaniel Saul committed Jul 12, 2018
2 parents 974c9fb + 46a9c77 commit 27f7167
Showing 3 changed files with 170 additions and 91 deletions.
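The headline change: `Cover` now accepts a `limits` argument fixing the cover's bounds per dimension. A minimal usage sketch (the 1-d lens and values below are illustrative, not taken from this commit):

import numpy as np
from kmapper.cover import Cover

# A toy 1-d lens, with the index column that define_bins expects in column 0.
lens = np.random.rand(100, 1)
data = np.c_[np.arange(lens.shape[0]), lens]

# Pin the cover to [0, 1] regardless of the observed lens range; an
# np.float('inf') entry would instead fall back to that dimension's data min/max.
cover = Cover(n_cubes=10, perc_overlap=0.2, limits=np.array([[0.0, 1.0]]))
cubes = list(cover.define_bins(data))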
65 changes: 61 additions & 4 deletions kmapper/cover.py
@@ -16,7 +16,19 @@ def __init__(self,
perc_overlap=0.2,
# Deprecated parameters:
nr_cubes=None,
overlap_perc=None):
overlap_perc=None,
limits=None):
"""
limits: Numpy Array (n_dim,2)
(lower bound, upper bound) for every dimension
If a value is set to np.float('inf'), that bound is assumed to be the min/max value of the dimension
If limits == None, the limits are defined by the minimum and maximum values of the lens
in every dimension, i.e.
[[min_1, max_1],
[min_2, max_2],
[min_3, max_3]]
"""

self.n_cubes = nr_cubes if nr_cubes else n_cubes
self.perc_overlap = overlap_perc if overlap_perc else perc_overlap
@@ -25,6 +37,15 @@ def __init__(self,
warnings.warn(
"Arguements `overlap_perc` and `nr_cubes` have been replaced with `perc_overlap` and `n_cubes`. Use `perc_overlap` and `n_cubes` instead. They will be removed in future releases.", DeprecationWarning)

self.limits = limits

# Check limits can actually be handled and are set appropriately
NoneType = type(None)
assert isinstance(self.limits, (list, np.ndarray, NoneType)), 'limits should either be an array or None'
if isinstance(self.limits, (list, np.ndarray)):
self.limits = np.array(self.limits)
assert self.limits.shape[1] == 2, 'limits should be (n_dim,2) in shape'

def define_bins(self, data):
"""Returns an iterable of all bins in the cover.
@@ -36,8 +57,36 @@ def define_bins(self, data):
"""

indexless_data = data[:, 1:]
bounds = (np.min(indexless_data, axis=0),
np.max(indexless_data, axis=0))

# Find upper and lower bounds of bins using self.limits
# If array, use the values in the array
# If None, use the maximum and minimum values in the lens

# If self.limits is array-like
if isinstance(self.limits, np.ndarray):
# limits_array is used so we can swap np.float('inf') entries of self.limits for the data min/max
limits_array = np.zeros(self.limits.shape)
limits_array[:, 0] = np.min(indexless_data, axis=0)
limits_array[:, 1] = np.max(indexless_data, axis=0)
limits_array[self.limits != np.float('inf')] = 0
self.limits[self.limits == np.float('inf')] = 0
bounds_arr = self.limits + limits_array
""" bounds_arr[i,j] = self.limits[i,j] if self.limits[i,j] == inf
bounds_arr[i,j] = max/min(indexless_data[i]) if self.limits == inf """
bounds = (bounds_arr[:, 0], bounds_arr[:, 1])

# Check new bounds are actually sensible - do they cover the range of values in the dataset?
if not ((np.min(indexless_data, axis=0) >= bounds_arr[:, 0]).all() and
(np.max(indexless_data, axis=0) <= bounds_arr[:, 1]).all()):
warnings.warn('The limits given do not cover the entire range of the lens functions\n' + \
'Actual Minima: %s\tInput Minima: %s\n' % (
np.min(indexless_data, axis=0), bounds_arr[:, 0]) + \
'Actual Maxima: %s\tInput Maxima: %s\n' % (
np.max(indexless_data, axis=0), bounds_arr[:, 1]))

else: # It must be None, as we checked to see if it is array-like or None in __init__
bounds = (np.min(indexless_data, axis=0),
np.max(indexless_data, axis=0))

# We chop up the min-max column ranges into 'n_cubes' parts
self.chunk_dist = (bounds[1] - bounds[0]) / self.n_cubes
@@ -47,6 +96,8 @@ def define_bins(self, data):

# We find our starting point
self.d = bounds[0]
# And our ending point (for testing)
self.end = bounds[1]

# Use a dimension index array on the projected X
# (For now this uses the entire dimensionality, but we keep it for experimentation)
@@ -62,7 +113,6 @@ def define_bins(self, data):

coordinates = map(np.asarray, product(
*(range(i) for i in cubes)))

return coordinates

def find_entries(self, data, cube, verbose=0):
@@ -94,3 +144,10 @@ def find_entries(self, data, cube, verbose=0):
hypercube = data[np.invert(np.any(entries == False, axis=1))]

return hypercube


class CubicalCover(Cover):
"""
Explicit definition of a cubical cover as the default behavior of the cover class
"""
pass
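The zero-and-add trick above is compact but opaque; the following standalone sketch (not code from this commit) computes the same bounds with np.isinf:

import numpy as np

limits = np.array([[np.inf, np.inf], [-10.0, 100.0]])
data_min = np.array([0.0, 1.0])    # per-dimension lens minima (toy values)
data_max = np.array([38.0, 39.0])  # per-dimension lens maxima (toy values)

bounds_arr = limits.copy()
inf_lo = np.isinf(limits[:, 0])
inf_hi = np.isinf(limits[:, 1])
bounds_arr[inf_lo, 0] = data_min[inf_lo]  # inf lower bounds -> data minima
bounds_arr[inf_hi, 1] = data_max[inf_hi]  # inf upper bounds -> data maxima
# bounds_arr is now [[0., 38.], [-10., 100.]]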
59 changes: 30 additions & 29 deletions kmapper/kmapper.py
@@ -35,11 +35,20 @@ class KeplerMapper(object):
"""

def __init__(self, verbose=0):
"""
Inputs
======
verbose: int, default is 0
Logging level. Currently 3 levels (0,1,2) are supported.
- for no logging, set `verbose=0`,
- for some logging, set `verbose=1`,
- for complete logging, set `verbose=2`
"""


# TODO: move as many of the arguments from fit_transform and map into here.
self.verbose = verbose
self.chunk_dist = []
self.overlap_dist = []
self.d = []
self.projection = None
self.scaler = None
self.cover = None
@@ -59,7 +68,10 @@ def project(self, X, projection="sum", scaler=preprocessing.MinMaxScaler(), dist
Projection parameter is either a string, a Scikit-learn class with fit_transform, like manifold.TSNE(), or a list of dimension indices. A string from ["sum", "mean", "median", "max", "min", "std", "dist_mean", "l2norm", "knn_distance_n"]. If using knn_distance_n write the number of desired neighbors in place of n: knn_distance_5 for summed distances to 5 nearest neighbors. Default = "sum".
scaler :
Scikit-Learn API compatible scaler. Scaler of the data applied before mapping. Use None for no scaling. Default = preprocessing.MinMaxScaler() if None, do no scaling, else apply scaling to the projection. Default: Min-Max scaling distance_matrix: False or any of: ["braycurtis", "canberra", "chebyshev", "cityblock", "correlation", "cosine", "dice", "euclidean", "hamming", "jaccard", "kulsinski", "mahalanobis", "matching", "minkowski", "rogerstanimoto", "russellrao", "seuclidean", "sokalmichener", "sokalsneath", "sqeuclidean", "yule"]. If False do nothing, else create a squared distance matrix with the chosen metric, before applying the projection.
Scikit-Learn API compatible scaler applied to the data before mapping. Use None for no scaling. Default: preprocessing.MinMaxScaler() (Min-Max scaling).
distance_matrix:
False or any of: ["braycurtis", "canberra", "chebyshev", "cityblock", "correlation", "cosine", "dice", "euclidean", "hamming", "jaccard", "kulsinski", "mahalanobis", "matching", "minkowski", "rogerstanimoto", "russellrao", "seuclidean", "sokalmichener", "sokalsneath", "sqeuclidean", "yule"]. If False do nothing, else create a square distance matrix with the chosen metric, before applying the projection.
Returns
-------
@@ -139,26 +151,6 @@ def dist_mean(X, axis=1):
if projection in projection_funcs.keys():
X = projection_funcs[projection](X, axis=1).reshape((X.shape[0], 1))

# if projection == "sum": # sum of row
# X = np.sum(X, axis=1).reshape((X.shape[0], 1))
# if projection == "mean": # mean of row
# X = np.mean(X, axis=1).reshape((X.shape[0], 1))
# if projection == "median": # median of row
# X = np.median(X, axis=1).reshape((X.shape[0], 1))
# if projection == "max": # max of row
# X = np.max(X, axis=1).reshape((X.shape[0], 1))
# if projection == "min": # min of row
# X = np.min(X, axis=1).reshape((X.shape[0], 1))
# if projection == "std": # std of row
# X = np.std(X, axis=1).reshape((X.shape[0], 1))
# if projection == "l2norm":
# X = np.linalg.norm(X, axis=1).reshape((X.shape[0], 1))

# if projection == "dist_mean": # Distance of x to mean of X
# X_mean = np.mean(X, axis=0)
# X = np.sum(np.sqrt((X - X_mean)**2),
# axis=1).reshape((X.shape[0], 1))

if "knn_distance_" in projection:
n_neighbors = int(projection.split("_")[2])
if self.distance_matrix: # We use the distance matrix for finding neighbors
@@ -249,6 +241,7 @@ def map(self,
clusterer=cluster.DBSCAN(eps=0.5, min_samples=3),
cover=Cover(n_cubes=10, perc_overlap=0.1),
nerve=GraphNerve(),
precomputed=False,

# These arguments are all deprecated
overlap_perc=None,
@@ -273,15 +266,19 @@ def map(self,
nerve: kmapper.Nerve
Nerve builder implementing `__call__(nodes)` API
precomputed : Boolean
Tell Mapper whether the data that you are clustering on is a precomputed distance matrix. If set to
`True`, the assumption is that you are also telling your `clusterer` that `metric='precomputed'` (which
is an argument for DBSCAN among others); the clusterer will then be fit on the square distance
matrix restricted to the members of each hypercube.
nr_cubes: Int (Deprecated)
The number of intervals/hypercubes to create. Default = 10. (DeprecationWarning: define Cover explicitly in future versions)
overlap_perc: Float (Deprecated)
The percentage of overlap "between" the intervals/hypercubes. Default = 0.1. (DeprecationWarning: define Cover explicitly in future versions)
Returns
=======
simplicial_complex : dict
@@ -370,12 +367,16 @@ def map(self,

# If at least min_cluster_samples samples inside the hypercube
if hypercube.shape[0] >= min_cluster_samples:

# Cluster the data point(s) in the cube, skipping the id-column
# Note that we apply clustering on the inverse image (original data samples) that fall inside the cube.
ids = [int(nn) for nn in hypercube[:, 0]]
X_cube = X[ids]

X_cube = X[[int(nn) for nn in hypercube[:, 0]]]

clusterer.fit(X_cube[:, 1:])
fit_data = X_cube[:, 1:]
if precomputed:
fit_data = fit_data[:, ids]
clusterer.fit(fit_data)

if self.verbose > 1:
print("Found %s clusters in cube_%s\n" % (
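The new `precomputed` flag is meant to pair with a clusterer configured with metric='precomputed'. A hedged sketch of the intended call pattern (toy data; the exact top-level API is assumed here, not shown in this diff):

import numpy as np
from sklearn import cluster
from sklearn.metrics import pairwise_distances
import kmapper as km

X = np.random.rand(100, 3)
D = pairwise_distances(X, metric="euclidean")  # square distance matrix

mapper = km.KeplerMapper()
lens = mapper.project(D, projection="sum")
graph = mapper.map(lens, D,
                   clusterer=cluster.DBSCAN(eps=0.5, min_samples=3,
                                            metric="precomputed"),
                   precomputed=True)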
137 changes: 79 additions & 58 deletions test/test_coverer.py
@@ -7,6 +7,73 @@
from kmapper.cover import Cover




@pytest.mark.parametrize('CoverClass', [Cover])
class TestCoverBasic():
def test_cube_dim(self, CoverClass):

data = np.arange(30).reshape(10, 3)
c = CoverClass(n_cubes=10)
cubes = c.define_bins(data)

assert all(len(cube) == 2 for cube in cubes)

def test_cube_count(self, CoverClass):
data = np.arange(30).reshape(10, 3)
c = CoverClass(n_cubes=10)
cubes = c.define_bins(data)

assert len(list(cubes)) == 10**2, "idx column is ignored"

def test_single_dim(self, CoverClass):
data = np.arange(20).reshape(10, 2)
c = CoverClass(n_cubes=10)
cubes = c.define_bins(data)

assert all(len(cube) == 1 for cube in cubes)

def test_nr_dimensions(self, CoverClass):
data = np.arange(30).reshape(10, 3)

c = CoverClass(n_cubes=10)
_ = c.define_bins(data)
assert c.nr_dimensions == 2

def test_entries_even(self, CoverClass):
data = np.arange(40).reshape(20, 2)

cover = CoverClass(n_cubes=10)
cubes = cover.define_bins(data)

for cube in cubes:
entries = cover.find_entries(data, cube)
assert len(entries) >= 2

def test_cubes_overlap(self, CoverClass):
data = np.arange(40).reshape(20, 2)

cover = CoverClass(n_cubes=10)
cubes = cover.define_bins(data)

entries = []
for cube in cubes:
# turn singleton lists into individual elements
res = [i[0] for i in cover.find_entries(data, cube)]
entries.append(res)

for i, j in zip(range(9), range(1, 10)):
assert set(entries[i]).union(set(entries[j]))
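# NOTE: union() is truthy whenever either cube is non-empty; an
# intersection() check would assert that neighboring cubes actually share entries.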

def test_complete_pipeline(self, CoverClass):
# TODO: add a mock that asserts the cover was called appropriately.. or test number of cubes etc.
data, _ = datasets.make_circles()

data = data.astype(np.float64)
mapper = KeplerMapper()
graph = mapper.map(data, coverer=CoverClass())
mapper.visualize(graph)

class TestCover():
def test_diff_overlap_per_dim(self):
data = np.random.rand(100, 10)
@@ -24,27 +91,6 @@ def test_find_entries_runs_with_diff_bins(self):
cubes = list(c.define_bins(data))
_ = c.find_entries(data, cubes[0])

def test_cube_count(self):
data = np.arange(30).reshape(10, 3)
c = Cover(n_cubes=10)
cubes = c.define_bins(data)

assert len(list(cubes)) == 10**2, "idx column is ignored"

def test_cube_dim(self):

data = np.arange(30).reshape(10, 3)
c = Cover(n_cubes=10)
cubes = c.define_bins(data)

assert all(len(cube) == 2 for cube in cubes)

def test_single_dim(self):
data = np.arange(20).reshape(10, 2)
c = Cover(n_cubes=10)
cubes = c.define_bins(data)

assert all(len(cube) == 1 for cube in cubes)

def test_chunk_dist(self):
data = np.arange(20).reshape(10, 2)
@@ -55,31 +101,13 @@ def test_chunk_dist(self):
# TODO: this test is really fragile and has magic numbers, fix.
assert all(i == 1.8 for i in chunks)

def test_nr_dimensions(self):
data = np.arange(30).reshape(10, 3)

c = Cover(n_cubes=10)
_ = c.define_bins(data)
assert c.nr_dimensions == 2

def test_bound_is_min(self):
data = np.arange(30).reshape(10, 3)
cov = Cover(n_cubes=10)
_ = cov.define_bins(data)
bounds = list(zip(cov.d, range(1, 10)))
assert all(b[0] == b[1] for b in bounds)

def test_entries_even(self):
data = np.arange(40).reshape(20, 2)

cover = Cover(n_cubes=10)
cubes = cover.define_bins(data)

for cube in cubes:
entries = cover.find_entries(data, cube)

assert len(entries) >= 2

def test_entries_in_correct_cubes(self):
# TODO: this test is a little hacky

@@ -95,29 +123,22 @@ def test_entries_in_correct_cubes(self):
assert data[2 * i] in entries[i]
assert data[2 * i + 1] in entries[i]

def test_cubes_overlap(self):
data = np.arange(40).reshape(20, 2)

cover = Cover(n_cubes=10)
cubes = cover.define_bins(data)

entries = []
for cube in cubes:
# turn singleton lists into individual elements
res = [i[0] for i in cover.find_entries(data, cube)]
entries.append(res)
class TestCoverBounds:
def test_bounds(self):
data_vals = np.arange(40).reshape(20, 2)
data = np.zeros((20, 3))
data[:, 0] = np.arange(20, dtype=int)  # index column
data[:, 1:3] = data_vals

limits = np.array([[np.float('inf'), np.float('inf')], [-10, 100]])
cover = Cover(n_cubes=10, limits=limits)
cubes = cover.define_bins(data)

start = cover.d
end = cover.end
assert np.array_equal(np.array([start, end]), np.array([[0, -10], [38, 100]]))

def test_BasicCover():
# TODO: add a mock that asserts the cover was called appropriately.. or test number of cubes etc.
data, _ = datasets.make_circles()

data = data.astype(np.float64)
mapper = KeplerMapper()
graph = mapper.map(data)
mapper.visualize(graph)
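The out-of-range warning added to define_bins can be exercised directly; a minimal sketch (toy values, a hypothetical test that is not part of this commit):

import warnings
import numpy as np
from kmapper.cover import Cover

data = np.c_[np.arange(20), np.arange(20, dtype=float)]  # lens spans [0, 19]
cover = Cover(n_cubes=5, limits=np.array([[5.0, 15.0]]))  # deliberately too narrow

with warnings.catch_warnings(record=True) as caught:
    warnings.simplefilter("always")
    _ = list(cover.define_bins(data))
assert any("do not cover" in str(w.message) for w in caught)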

