Skip to content

Commit

Permalink
perf(eda): use approximate nunique
Browse files Browse the repository at this point in the history
  • Loading branch information
yuzhenmao authored and jinglinpeng committed Apr 12, 2021
1 parent 647a987 commit 6030064
Show file tree
Hide file tree
Showing 3 changed files with 12 additions and 8 deletions.
2 changes: 1 addition & 1 deletion dataprep/eda/distribution/compute/overview.py
Original file line number Diff line number Diff line change
Expand Up @@ -117,7 +117,7 @@ def _cont_calcs(srs: dd.Series, cfg: Config) -> Dict[str, Any]:
data["norm"] = normaltest(data["hist"][0])
data["skew"] = skewtest(data["hist"][0])
data["nneg"] = (srs < 0).sum() # number of negative values
data["nuniq"] = srs.nunique() # number of unique values
data["nuniq"] = srs.nunique_approx() # number of unique values
data["nzero"] = (srs == 0).sum() # number of zeros

return data
Expand Down
10 changes: 7 additions & 3 deletions dataprep/eda/distribution/compute/univariate.py
Original file line number Diff line number Diff line change
Expand Up @@ -220,7 +220,7 @@ def cont_comps(srs: dd.Series, cfg: Config) -> Dict[str, Any]:
if cfg.stats.enable:
data["min"] = srs.min()
data["max"] = srs.max()
data["nuniq"] = srs.nunique()
data["nuniq"] = srs.nunique_approx()
data["nreals"] = srs.shape[0]
data["nzero"] = (srs == 0).sum()
data["nneg"] = (srs < 0).sum()
Expand Down Expand Up @@ -353,10 +353,14 @@ def calc_stats_dt(srs: dd.Series) -> Dict[str, str]:
"""
size = srs.shape[0] # include nan
count = srs.count() # exclude nan
uniq_count = srs.nunique()
# nunique_approx() raises an error on datetime data, so fall back to the exact nunique()
try:
uniq_count = srs.nunique_approx()
except: # pylint: disable=W0702
uniq_count = srs.nunique()
overview_dict = {
"Distinct Count": uniq_count,
"Unique (%)": uniq_count / count,
"Approximate Unique (%)": uniq_count / count,
"Missing": size - count,
"Missing (%)": 1 - (count / size),
"Memory Size": srs.memory_usage(deep=True),
Expand Down
8 changes: 4 additions & 4 deletions dataprep/eda/distribution/render.py
Original file line number Diff line number Diff line change
Expand Up @@ -1248,8 +1248,8 @@ def format_num_stats(data: Dict[str, Any]) -> Dict[str, Dict[str, str]]:
Format numerical statistics
"""
overview = {
"Distinct Count": data["nuniq"],
"Unique (%)": data["nuniq"] / data["npres"],
"Approximate Distinct Count": data["nuniq"],
"Approximate Unique (%)": data["nuniq"] / data["npres"],
"Missing": data["nrows"] - data["npres"],
"Missing (%)": 1 - (data["npres"] / data["nrows"]),
"Infinite": (data["npres"] - data["nreals"]),
Expand Down Expand Up @@ -1301,8 +1301,8 @@ def format_cat_stats(
Format categorical statistics
"""
ov_stats = {
"Distinct Count": stats["nuniq"],
"Unique (%)": stats["nuniq"] / stats["npres"],
"Approximate Distinct Count": stats["nuniq"],
"Approximate Unique (%)": stats["nuniq"] / stats["npres"],
"Missing": stats["nrows"] - stats["npres"],
"Missing (%)": 1 - stats["npres"] / stats["nrows"],
"Memory Size": stats["mem_use"],
Expand Down

0 comments on commit 6030064

Please sign in to comment.