Skip to content

Commit

Permalink
fix(eda.create_report): univar datetime analysis
Browse files — browse the repository at this point in the history
  • Loading branch information
Brandon Lockhart committed Nov 30, 2020
1 parent 45d0770 commit 4632852
Show file tree
Hide file tree
Showing 3 changed files with 21 additions and 7 deletions.
22 changes: 18 additions & 4 deletions dataprep/eda/create_report/formatter.py
Original file line number | Diff line number | Diff line change
Expand Up @@ -15,12 +15,14 @@
from ..correlation.compute.nullivariate import correlation_nxn
from ..data_array import DataArray
from ..distribution import render
from ..distribution.compute.common import _calc_line_dt
from ..distribution.compute.overview import calc_stats
from ..distribution.compute.univariate import cont_comps, nom_comps
from ..distribution.render import format_cat_stats, format_num_stats, format_ov_stats
from ..distribution.compute.univariate import cont_comps, nom_comps, calc_stats_dt
from ..distribution.render import format_cat_stats, format_num_stats, format_ov_stats, stats_viz_dt
from ..dtypes import (
CATEGORICAL_DTYPES,
Continuous,
DateTime,
Nominal,
detect_dtype,
is_dtype,
Expand Down Expand Up @@ -111,14 +113,21 @@ def format_basic(df: dd.DataFrame) -> Dict[str, Any]:
stats: Any = None # needed for pylint
if is_dtype(detect_dtype(df[col]), Continuous()):
itmdt = Intermediate(col=col, data=data[col], visual_type="numerical_column")
rndrd = render(itmdt, plot_height_lrg=250, plot_width_lrg=280)["layout"]
stats = format_num_stats(data[col])
elif is_dtype(detect_dtype(df[col]), Nominal()):
itmdt = Intermediate(col=col, data=data[col], visual_type="categorical_column")
rndrd = render(itmdt, plot_height_lrg=250, plot_width_lrg=280)["layout"]
stats = format_cat_stats(
data[col]["stats"], data[col]["len_stats"], data[col]["letter_stats"]
)
elif is_dtype(detect_dtype(df[col]), DateTime()):
itmdt = Intermediate(
col=col,
data=data[col]["stats"],
line=data[col]["line"],
visual_type="datetime_column",
)
stats = stats_viz_dt(data[col]["stats"])
rndrd = render(itmdt, plot_height_lrg=250, plot_width_lrg=280)["layout"]
figs: List[Figure] = []
for tab in rndrd:
try:
Expand Down Expand Up @@ -214,6 +223,11 @@ def basic_computations(df: dd.DataFrame) -> Tuple[Dict[str, Any], Dict[str, Any]
data[col] = nom_comps(
df.frame[col], first_rows[col], 10, True, 10, 20, True, False, False
)
elif is_dtype(detect_dtype(df.frame[col]), DateTime()):
data[col] = {}
data[col]["stats"] = calc_stats_dt(df.frame[col])
data[col]["line"] = dask.delayed(_calc_line_dt)(df.frame[[col]], "auto")

# overview
data["ov"] = calc_stats(df.frame, None)
# interactions
Expand Down
4 changes: 2 additions & 2 deletions dataprep/eda/distribution/compute/univariate.py
Original file line number | Diff line number | Diff line change
Expand Up @@ -451,15 +451,15 @@ def calc_stats_dt(srs: dd.Series) -> Dict[str, str]:
Dict[str, str]
Dictionary that contains Overview
"""
size = len(srs) # include nan
size = srs.shape[0] # include nan
count = srs.count() # exclude nan
uniq_count = srs.nunique()
overview_dict = {
"Distinct Count": uniq_count,
"Unique (%)": uniq_count / count,
"Missing": size - count,
"Missing (%)": 1 - (count / size),
"Memory Size": srs.memory_usage(),
"Memory Size": srs.memory_usage(deep=True),
"Minimum": srs.min(),
"Maximum": srs.max(),
}
Expand Down
2 changes: 1 addition & 1 deletion dataprep/tests/eda/test_create_report.py
Original file line number | Diff line number | Diff line change
Expand Up @@ -27,7 +27,7 @@ def simpledf() -> pd.DataFrame:
)
# df = pd.concat([df, pd.Series(np.zeros(1000))], axis=1)
df.columns = ["a", "b", "c", "d", "e", "f"]
# df["e"] = pd.to_datetime(df["e"])
df["g"] = pd.to_datetime(df["f"])

idx = np.arange(1000)
np.random.shuffle(idx)
Expand Down

0 comments on commit 4632852

Please sign in to comment.