Skip to content

Commit

Permalink
fix(eda): fix bugs in log transformation
Browse files: browse the repository at this point in the history
  • Loading branch information
Waterpine committed Apr 17, 2021
1 parent 42d0ae9 commit 209d7d0
Show file tree
Hide file tree
Showing 2 changed files with 27 additions and 8 deletions.
33 changes: 25 additions & 8 deletions dataprep/eda/distribution/render.py
Expand Up @@ -339,14 +339,31 @@ def hist_viz(
)

tooltips = [("Bin", "@intvl"), ("Frequency", "@freq"), ("Percent", "@pct{0.2f}%")]
fig = Figure(
plot_height=plot_height,
plot_width=plot_width,
title=col,
toolbar_location=None,
y_axis_type=yscale,
)
bottom = 0 if yscale == "linear" or df.empty else df["freq"].min() / 2
if yscale == "linear" or df.empty:
bottom = 0.0
elif yscale == "log" and df["freq"].min() == 0: # freq >= 1 so we set 0.1 as lower bound
bottom = 0.1
else:
bottom = df["freq"].min() / 2

if yscale == "linear":
fig = Figure(
plot_height=plot_height,
plot_width=plot_width,
title=col,
toolbar_location=None,
y_axis_type=yscale,
)
else:
fig = Figure(
plot_height=plot_height,
plot_width=plot_width,
title=col,
toolbar_location=None,
y_axis_type=yscale,
y_range=(bottom, df["freq"].max()),
)

fig.quad(
source=df,
left="left",
Expand Down
2 changes: 2 additions & 0 deletions dataprep/tests/eda/test_plot.py
Expand Up @@ -35,6 +35,7 @@ def simpledf() -> dd.DataFrame:
df["e"] = pd.to_datetime(df["e"])
# test when column is object but some cells are numerical
df["g"] = pd.Series([0, "x"] * 500)
df["h"] = pd.Series(np.ones(1000))

idx = np.arange(1000)
np.random.shuffle(idx)
Expand Down Expand Up @@ -62,6 +63,7 @@ def test_sanity_compute_univariate(simpledf: dd.DataFrame) -> None:

# Smoke test: calls plot() on the whole `simpledf` fixture frame twice, once
# with the default configuration and once with the "hist.yscale" option set to
# "log". There are no assertions; the test passes as long as neither call raises.
def test_sanity_compute_overview(simpledf: dd.DataFrame) -> None:
plot(simpledf)
# Second call exercises the log-scale histogram configuration path.
plot(simpledf, config={"hist.yscale": "log"})


def test_sanity_compute_bivariate(simpledf: dd.DataFrame) -> None:
Expand Down

0 comments on commit 209d7d0

Please sign in to comment.