"""
Implements deduplication based on clustering string distance matrices.
"""
from collections.abc import Sequence
import numpy as np
import pandas as pd
from joblib import Parallel, delayed
from numpy.typing import NDArray
from scipy.cluster.hierarchy import fcluster, linkage
from scipy.spatial.distance import pdist, squareform
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import silhouette_score


def compute_ngram_distance(
    unique_words: Sequence[str] | NDArray,
    ngram_range: tuple[int, int] = (2, 4),
    analyzer: str = "char_wb",
) -> NDArray:
    """Compute the condensed pairwise n-gram distance between `unique_words`.

    Parameters
    ----------
    unique_words : sequence of str
        Sequence or array of unique words from the original data.
    ngram_range : 2-tuple of int, default=(2, 4)
        The lower and upper boundaries of the range of n-values for different
        n-grams used in the string similarity. All values of `n` such
        that ``min_n <= n <= max_n`` will be used.
    analyzer : str, default='char_wb'
        Analyzer used to extract n-grams.

    Returns
    -------
    ndarray
        A condensed distance array of length ``n * (n - 1) / 2`` containing
        the pairwise n-gram TF-IDF distances between the ``n`` elements of
        `unique_words`.

    Notes
    -----
    Extracts n-grams of all elements in `unique_words`, calculates the
    term frequency-inverse document frequency (TF-IDF) for each n-gram, then
    computes the pairwise Euclidean distance between elements based on their
    n-gram TF-IDF representation.
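
    Examples
    --------
    A minimal, illustrative check; only shapes are shown, since the exact
    distances depend on the fitted TF-IDF vocabulary:

    >>> d = compute_ngram_distance(['black', 'blacks', 'white'])
    >>> d.shape
    (3,)
    >>> squareform(d).shape
    (3, 3)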
"""
encoded = TfidfVectorizer(ngram_range=ngram_range, analyzer=analyzer).fit_transform(
unique_words
)
distance_mat = pdist(encoded.todense(), metric="euclidean")
return distance_mat


def _get_silhouette_avg(Z: NDArray, n_clust: int, redundant_dist: NDArray) -> float:
    # Cut the linkage tree into `n_clust` flat clusters and score the
    # resulting partition with the mean silhouette coefficient over the
    # precomputed (square) distance matrix.
    labels = fcluster(Z, n_clust, criterion="maxclust")
    silhouette_avg = silhouette_score(redundant_dist, labels, metric="precomputed")
    return silhouette_avg


def _guess_clusters(
    Z: NDArray, distance_mat: NDArray, n_jobs: int | None = None
) -> int:
    """Find the number of clusters that maximizes the silhouette score
    when clustering `distance_mat`.

    Parameters
    ----------
    Z : numpy ndarray
        Hierarchical linkage matrix; specifies which clusters to merge.
    distance_mat : numpy ndarray
        Distance matrix in condensed form, as returned by
        :func:`scipy.spatial.distance.pdist`.
    n_jobs : int, optional
        The number of jobs to run in parallel when computing silhouette
        scores.

    Returns
    -------
    int
        Number of clusters that maximizes the silhouette score.
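
    Examples
    --------
    A minimal sketch on synthetic one-dimensional points with one obvious
    pair and one outlier (illustrative only, not part of the original tests):

    >>> points = np.array([[0.0], [0.1], [5.0]])
    >>> dist = pdist(points)
    >>> Z = linkage(dist, method='average')
    >>> int(_guess_clusters(Z, dist))
    2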
"""
max_clusters = distance_mat.shape[0]
n_clusters = np.arange(2, max_clusters)
# silhouette score needs a redundant distance matrix
redundant_dist = squareform(distance_mat)
silhouette_scores = Parallel(n_jobs=n_jobs, prefer="processes")(
delayed(_get_silhouette_avg)(Z, n_clust, redundant_dist)
for n_clust in n_clusters
)
return n_clusters[np.argmax(silhouette_scores)]


def _create_spelling_correction(
    unique_words: Sequence[str] | NDArray[np.str_],
    counts: Sequence[int] | NDArray[np.int_],
    clusters: Sequence[int],
) -> pd.Series:
    """Create a pandas Series that maps each cluster member to the most
    frequent member of its cluster. The assumption is that the most common
    spelling is the correct one.

    Parameters
    ----------
    unique_words : sequence of str
        A sequence or array of unique words in the original data.
    counts : sequence of int
        A sequence or array of counts of how often each unique word appears in
        the original data.
    clusters : sequence of int
        A sequence of ints, indicating cluster membership of each unique word
        in `unique_words`.

    Returns
    -------
    pd.Series
        Series with unique (original) words as its index and the (estimated)
        corrected spelling of each word as its values.
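
    Examples
    --------
    A small, hypothetical call: 'blck' and 'black' sit in one cluster and
    'black' is the more frequent spelling, so 'blck' is mapped to 'black':

    >>> _create_spelling_correction(
    ...     np.array(['blck', 'black', 'white']),
    ...     np.array([1, 10, 5]),
    ...     np.array([1, 1, 2]),
    ... )
    black    black
    blck     black
    white    white
    dtype: object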
"""
count_series = pd.Series(counts, index=unique_words)
original_spelling: list[str] = []
corrected_spelling: list[str] = []
for cluster in np.unique(clusters):
sorted_spellings = (
count_series.loc[clusters == cluster]
.sort_values(ascending=False)
.index.tolist()
)
original_spelling.extend(sorted_spellings)
# assumes spelling that occurs most frequently in cluster is correct
corrected_spelling.extend(sorted_spellings[:1] * len(sorted_spellings))
pd_spell_correct = pd.Series(corrected_spelling, index=original_spelling)
return pd_spell_correct


def deduplicate(
    X,
    *,
    n_clusters=None,
    ngram_range=(2, 4),
    analyzer="char_wb",
    linkage_method="average",
    n_jobs=None,
):
    """Deduplicate categorical data by hierarchically clustering similar strings.

    This works best if there is a number of underlying categories that
    sometimes appear in the data with small variations and/or misspellings.

    Parameters
    ----------
    X : sequence of str
        The data to be deduplicated.
    n_clusters : int, default=None
        Number of clusters to use for hierarchical clustering. If `None`, use
        the number of clusters that leads to the highest silhouette score.
    ngram_range : 2-tuple of int, default=(2, 4)
        The lower and upper boundaries of the range of n-values for different
        n-grams used in the string similarity. All values of `n` such
        that ``min_n <= n <= max_n`` will be used.
    analyzer : {'word', 'char', 'char_wb'}, default='char_wb'
        Analyzer parameter for the TfidfVectorizer
        used for the string similarities.
        Describes whether the similarity should be based on word counts
        or character n-gram counts.
        Option `char_wb` creates character n-grams only from text inside word
        boundaries; n-grams at the edges of words are padded with space.
    linkage_method : {'single', 'complete', 'average', 'centroid', 'median', 'ward'},
            default='average'
        Linkage method parameter to use for merging clusters via
        :func:`scipy.cluster.hierarchy.linkage`.
        Option 'average' calculates the distance between two clusters as the
        average distance between data points in the first and second cluster.
    n_jobs : int, default=None
        The number of jobs to run in parallel when guessing the number of
        clusters.

    Returns
    -------
    pd.Series
        The deduplicated data: a Series of corrected spellings, indexed by
        the original values of `X`.

    See Also
    --------
    GapEncoder :
        Encodes dirty categories (strings) by constructing latent topics with
        continuous encoding.
    MinHashEncoder :
        Encodes string columns as a numeric array with the minhash method.
    SimilarityEncoder :
        Encodes string columns as a numeric array with n-gram string similarity.

    Notes
    -----
    Deduplication is done by first computing the n-gram distance between unique
    categories in the data, then performing hierarchical clustering on this
    distance matrix, and finally choosing the most frequent element in each
    cluster as the 'correct' spelling. This method works best if the true
    number of categories is significantly smaller than the number of observed
    spellings.

    Examples
    --------
    >>> from skrub.datasets import make_deduplication_data
    >>> duplicated = make_deduplication_data(examples=['black', 'white'],
    ...                                      entries_per_example=[5, 5],
    ...                                      prob_mistake_per_letter=0.3,
    ...                                      random_state=42)
    >>> duplicated
    ['blacs', 'black', 'black', 'black', 'black', \
'uhibe', 'white', 'white', 'white', 'white']

    To deduplicate the data, we can build a correspondence table:

    >>> deduplicate_correspondence = deduplicate(duplicated)
    >>> deduplicate_correspondence
    blacs    black
    black    black
    black    black
    black    black
    black    black
    uhibe    white
    white    white
    white    white
    white    white
    white    white
    dtype: object

    The correspondence table above is a Series giving the deduplicated
    values, indexed by the original values. A deduplicated version of the
    initial list can easily be created:

    >>> deduplicated = list(deduplicate_correspondence)
    >>> deduplicated
    ['black', 'black', 'black', 'black', 'black', \
'white', 'white', 'white', 'white', 'white']
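
    If the number of true categories is known beforehand, it can be passed
    directly and the silhouette-based search is skipped (a quick check that
    reuses the data above):

    >>> deduplicate(duplicated, n_clusters=2).nunique()
    2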
"""
unique_words, counts = np.unique(X, return_counts=True)
distance_mat = compute_ngram_distance(
unique_words, ngram_range=ngram_range, analyzer=analyzer
)
Z = linkage(distance_mat, method=linkage_method, optimal_ordering=True)
if n_clusters is None:
n_clusters = _guess_clusters(Z, distance_mat, n_jobs)
clusters = fcluster(Z, n_clusters, criterion="maxclust")
translation_table = _create_spelling_correction(unique_words, counts, clusters)
unrolled_corrections = translation_table[X]
return unrolled_corrections