/
mds.py
276 lines (225 loc) · 8.44 KB
/
mds.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
"""
Non-metric Multidimensional Scaling
"""
import numpy as np
from ..base import BaseEstimator
from ..metrics import euclidean_distances
# XXX FIXME: should we use proximities or similarities ??
def PAV(distances, similarities, copy=False, verbose=False):
    """
    Pool adjacent violators

    Performs a monotonic (isotonic) regression of the distances on the
    rank order of the similarities: runs of distances that violate the
    ordering are pooled, i.e. replaced by their block mean.

    Parameters
    ----------
    distances: ndarray, shape (n,)
        distances to be made monotonic; modified in place unless `copy`
        is True
    similarities: ndarray, shape (n,)
        similarities defining the target rank order
    copy: boolean, optional, default: False
        if True, operate on a copy of `distances` and leave the input
        array untouched
    verbose: boolean, optional, default: False
        unused; kept for API compatibility

    Returns
    -------
    distances: ndarray, shape (n,)
        the pooled, monotonic distances
    """
    # BUG FIX: `copy` was previously accepted but ignored, so the caller's
    # array was always mutated in place even with copy=True.
    if copy:
        distances = distances.copy()
    # First approach for ties: ignore them. The multidimensional scaling
    # won't enforce that points with equal similarity be at equal distance.
    dtype = [('x', float), ('y', float)]
    el = np.array([(i, j) for i, j in zip(similarities, distances)],
                  dtype=dtype)
    indxs = el.argsort(order=['x', 'y'])
    # `new_blocks` holds the start index of each current block; initially
    # every element is its own block.  (list(...) so that `[:]` copies.)
    new_blocks = list(range(len(indxs)))
    block = []
    sort = True
    while sort:
        sort = False
        blocks = new_blocks[:]
        new_blocks = []
        block = []
        # adjacent comparison with a float tolerance: True where the pair
        # of block heads is already in non-decreasing order
        # (np.float alias was removed from numpy; plain `float` is correct)
        el = distances[indxs[blocks[:-1]]] <= distances[indxs[blocks[1:]]] + \
            np.finfo(float).resolution
        for i, element in enumerate(el):
            if not element:
                # monotonicity violated: open or extend a pooling block
                sort = True
                block.append(blocks[i])
            elif element and block:
                # end of a violating run: pool it to its mean
                tmp = np.arange(block[0], blocks[i + 1])
                distances[indxs[tmp]] = distances[indxs[tmp]].mean()
                new_blocks.append(block[0])
                block = []
            else:
                new_blocks.append(blocks[i])
        # The last element
        if block:
            tmp = np.arange(block[0], len(similarities))
            distances[indxs[tmp]] = distances[indxs[tmp]].mean()
            new_blocks.append(block[0])
        else:
            new_blocks.append(len(similarities) - 1)
    return distances
def guttman_image_ranking(distances, similarities, verbose=False):
    """
    Guttman image ranking

    Builds disparities by assigning the sorted distances to the positions
    given by the rank order of the similarities (ties broken by distance).
    """
    pair_dtype = [('x', float), ('y', float)]
    pairs = np.array(list(zip(similarities, distances)), dtype=pair_dtype)
    sim_order = pairs.argsort(order=['x', 'y'])
    dist_order = distances.argsort()
    disparities = distances.copy()
    for target, source in zip(sim_order, dist_order):
        disparities[target] = distances[source]
    return disparities
def _smacof_single(similarities, metric=True, p=2, init=None,
                   max_iter=300, verbose=0, eps=1e-3):
    """
    Computes multidimensional scaling using SMACOF algorithm

    Parameters
    ----------
    similarities: symmetric ndarray, shape [n * n]
        similarities between the points
    metric: boolean, optional, default: True
        compute metric or nonmetric SMACOF algorithm
    p: int, optional, default: 2
        number of dimension in which to immerse the similarities
        overridden if initial array is provided.
    init: {None or ndarray}
        if None, randomly chooses the initial configuration
        if ndarray, initialize the SMACOF algorithm with this array
    max_iter: int, optional, default: 300
        Maximum number of iterations of the SMACOF algorithm for a single run
    verbose: int, optional, default: 0
        level of verbosity
    eps: float, optional, default: 1e-3
        absolute tolerance on the decrease of the stress to declare
        convergence

    Returns
    -------
    X, stress: ndarray (n, p), float
        coordinates of the n points in a p-space, and the final stress
    """
    n = similarities.shape[0]
    if similarities.shape[0] != similarities.shape[1]:
        raise ValueError("similarities must be a square array (shape=%d)" % \
            n)
    if np.any(similarities != similarities.T):
        raise ValueError("similarities must be symmetric")
    # flattened strict upper triangle; zero entries are treated as missing
    sim_flat = ((1 - np.tri(n)) * similarities).flatten()
    sim_flat_w = sim_flat[sim_flat != 0]
    if init is None:
        # Randomly choose initial configuration
        X = np.random.random(size=(n, p))
    else:
        # overrides the parameter p
        p = init.shape[1]
        if n != init.shape[0]:
            raise ValueError("init matrix should be of shape (%d, %d)" % \
                (n, p))
        X = init
    old_stress = None
    for it in range(max_iter):
        # Compute distance and monotonic regression
        dis = euclidean_distances(X)
        if metric:
            # metric MDS: the disparities are the similarities themselves
            disparities = similarities
        else:
            dis_flat = dis.flatten()
            # similarities with 0 are considered as missing values
            dis_flat_w = dis_flat[sim_flat != 0]
            # Compute the disparities using a monotonic regression
            disparities_flat = PAV(dis_flat_w, sim_flat_w)
            disparities = dis_flat.copy()
            disparities[sim_flat != 0] = disparities_flat
            disparities = disparities.reshape((n, n))
            # rescale so the sum of squared disparities equals the number
            # of off-diagonal pairs n(n-1)/2
            disparities *= np.sqrt((n * (n - 1) / 2) / \
                (disparities ** 2).sum())
        # Compute stress: half the sum of squared residuals
        stress = ((dis.flatten() - \
            disparities.flatten()) ** 2).sum() / 2
        # Update X using the Guttman transform
        ratio = disparities / dis
        # the diagonal (and coincident points) gives 0/0 -> treat as 0
        ratio[np.isinf(ratio) | np.isnan(ratio)] = 0
        B = - ratio + np.diag(ratio.sum(axis=1))
        X = 1. / n * np.dot(B, X)
        # BUG FIX: the two print statements below used Python 2 syntax,
        # which is a SyntaxError under Python 3.
        if verbose == 2:
            print('it: %d, stress %s' % (it, stress))
        if old_stress is not None:
            if (old_stress - stress) < eps:
                if verbose:
                    print('breaking at iteration %d with stress %s' % (
                        it, stress))
                break
        old_stress = stress
    return X, stress
def smacof(similarities, metric=True, p=2, init=None, n_init=8, n_jobs=1,
           max_iter=300, verbose=0, eps=1e-3, random_state=None):
    """
    Computes multidimensional scaling using SMACOF algorithm

    Parameters
    ----------
    similarities: symmetric ndarray, shape [n * n]
        similarities between the points
    metric: boolean, optional, default: True
        compute metric or nonmetric SMACOF algorithm
    p: int, optional, default: 2
        number of dimension in which to immerse the similarities
        overridden if initial array is provided.
    init: {None or ndarray}
        if None, randomly chooses the initial configuration
        if ndarray, initialize the SMACOF algorithm with this array
    n_init: int, optional, default: 8
        Number of time the smacof algorithm will be run with different
        initialisation. The final results will be the best output of the
        n_init consecutive runs in terms of stress.
    n_jobs: int, optional, default: 1
        number of jobs; parallel execution is not implemented yet, so
        every value currently results in a serial run
    max_iter: int, optional, default: 300
        Maximum number of iterations of the SMACOF algorithm for a single run
    verbose: int, optional, default: 0
        level of verbosity
    eps: float, optional, default: 1e-3
        absolute tolerance on the decrease of the stress to declare
        convergence
    random_state: optional
        currently unused; reserved for seeding the parallel runs

    Returns
    -------
    X: ndarray (n, p)
        coordinates of the n points in a p-space
    stress: float
        final stress of the best run
    """
    best_pos, best_stress = None, None
    if hasattr(init, '__array__'):
        # work on a private copy so the caller's init array is not mutated
        init = np.asarray(init).copy()
    # BUG FIX: the former `n_jobs != 1` branch only drew random seeds
    # (crashing with AttributeError when random_state was None) and then
    # fell through, returning (None, None).  Until parallel execution is
    # wired in, all values of n_jobs run the n_init restarts serially;
    # the n_jobs == 1 behaviour is unchanged.
    for it in range(n_init):
        pos, stress = _smacof_single(similarities, metric=metric, p=p,
                                     init=init, max_iter=max_iter,
                                     verbose=verbose, eps=eps)
        if best_stress is None or stress < best_stress:
            best_stress = stress
            best_pos = pos.copy()
    return best_pos, best_stress
class MDS(BaseEstimator):
"""
Multidimensional scaling
Parameters
----------
Notes
-----
"""
# TODO
def __init__(self, p=2, metric=True, init=None, n_init=8,
             max_iter=300, verbose=0, eps=1e-3, n_jobs=1):
    """
    Parameters
    ----------
    p: int, optional, default: 2
        number of dimensions in which to immerse the similarities
    metric: boolean, optional, default: True
        metric (True) or non-metric (False) SMACOF
    init: {None or ndarray}, optional
        initial configuration forwarded to smacof
    n_init: int, optional, default: 8
        number of SMACOF restarts
    max_iter: int, optional, default: 300
        maximum number of iterations per run
    verbose: int, optional, default: 0
        level of verbosity
    eps: float, optional, default: 1e-3
        convergence tolerance on the stress
    n_jobs: int, optional, default: 1
        stored but not currently forwarded by fit
    """
    # store the hyper-parameters untouched; `fit` forwards them to smacof
    self.p = p
    self.metric = metric
    self.eps = eps
    self.max_iter = max_iter
    self.init = init
    self.n_init = n_init
    self.verbose = verbose
    self.n_jobs = n_jobs
def fit(self, X, y=None):
    """
    Computes the embedding positions by running smacof on X.

    Parameters
    ----------
    X: symmetric ndarray, shape [n * n]
        similarities between the points (passed to smacof)
    y: ignored
    """
    pos, stress = smacof(X, metric=self.metric, p=self.p,
                         init=self.init, n_init=self.n_init,
                         max_iter=self.max_iter, verbose=self.verbose,
                         eps=self.eps)
    self.X = pos
    self.stress = stress
    return self
def predict(self, X):
"""
"""
# TODO