Skip to content

Commit

Permalink
fix(eda): fix column name and index related issues
Browse files Browse the repository at this point in the history
  • Loading branch information
jinglinpeng committed Apr 7, 2021
1 parent e83f95b commit 40a89b9
Show file tree
Hide file tree
Showing 7 changed files with 85 additions and 13 deletions.
1 change: 1 addition & 0 deletions dataprep/eda/correlation/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -72,6 +72,7 @@ def plot_correlation(
This function only supports numerical or categorical data,
and it is better to drop None, Nan and Null value before using it
"""

cfg = Config.from_dict(display, config)

with ProgressBar(minimum=1, disable=not progress):
Expand Down
2 changes: 2 additions & 0 deletions dataprep/eda/correlation/compute/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
from ...configs import Config
from ...data_array import DataArray, DataFrame
from ...intermediate import Intermediate
from ...utils import preprocess_dataframe
from .bivariate import _calc_bivariate
from .overview import _calc_overview
from .univariate import _calc_univariate
Expand Down Expand Up @@ -55,6 +56,7 @@ def compute_correlation(
elif not cfg:
cfg = Config()

df = preprocess_dataframe(df)
if x is None and y is None: # pylint: disable=no-else-return
with catch_warnings():
filterwarnings(
Expand Down
6 changes: 2 additions & 4 deletions dataprep/eda/create_report/formatter.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,13 +33,12 @@
Nominal,
detect_dtype,
is_dtype,
string_dtype_to_object,
)
from ..intermediate import Intermediate
from ..missing import render_missing
from ..missing.compute.nullivariate import compute_missing_nullivariate
from ..progress_bar import ProgressBar
from ..utils import to_dask
from ..utils import preprocess_dataframe


def format_report(
Expand Down Expand Up @@ -70,8 +69,7 @@ def format_report(
This variable acts like an API in passing data to the template engine.
"""
with ProgressBar(minimum=1, disable=not progress):
df = to_dask(df)
df = string_dtype_to_object(df)
df = preprocess_dataframe(df)
if mode == "basic":
comps = format_basic(df, cfg)
# elif mode == "full":
Expand Down
6 changes: 2 additions & 4 deletions dataprep/eda/distribution/compute/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@
from ...configs import Config
from ...dtypes import DTypeDef, string_dtype_to_object
from ...intermediate import Intermediate
from ...utils import to_dask
from ...utils import preprocess_dataframe
from .bivariate import compute_bivariate
from .overview import compute_overview
from .trivariate import compute_trivariate
Expand Down Expand Up @@ -59,9 +59,7 @@ def compute(
"""
# pylint: disable=too-many-arguments

df = to_dask(df)
df.columns = df.columns.astype(str)
df = string_dtype_to_object(df)
df = preprocess_dataframe(df)

if isinstance(cfg, dict):
cfg = Config.from_dict(display, cfg)
Expand Down
5 changes: 3 additions & 2 deletions dataprep/eda/missing/compute/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,8 @@

from ...configs import Config
from ...data_array import DataArray, DataFrame
from ...dtypes import DTypeDef, string_dtype_to_object
from ...dtypes import DTypeDef
from ...utils import preprocess_dataframe
from ...intermediate import Intermediate
from .bivariate import compute_missing_bivariate
from .nullivariate import compute_missing_nullivariate
Expand Down Expand Up @@ -61,7 +62,7 @@ def compute_missing(
>>> plot_missing(df, "HDI_for_year")
>>> plot_missing(df, "HDI_for_year", "population")
"""
df = string_dtype_to_object(df)
df = preprocess_dataframe(df)
df = DataArray(df)

# pylint: disable=no-else-raise
Expand Down
55 changes: 55 additions & 0 deletions dataprep/eda/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,11 +3,14 @@
import logging
from math import ceil
from typing import Any, Dict, List, Optional, Tuple, Union, cast
from collections import Counter


import dask
import dask.dataframe as dd
import numpy as np
import pandas as pd
from pandas.api.types import is_object_dtype
from bokeh.models import Legend, FuncTickFormatter
from bokeh.plotting import Figure
from scipy.stats import gaussian_kde as gaussian_kde_
Expand All @@ -34,6 +37,58 @@ def to_dask(df: Union[pd.DataFrame, dd.DataFrame]) -> dd.DataFrame:
return dd.from_pandas(df, npartitions=npartitions)


def preprocess_dataframe(
    org_df: Union[pd.DataFrame, dd.DataFrame],
    used_columns: Optional[Union[List[str], List[object]]] = None,
) -> dd.DataFrame:
    """
    Make a dask dataframe with only used_columns.

    This function will do the following:
        1. keep only used_columns (the original column order is preserved).
        2. transform column names to strings (avoid object column names) and
           rename duplicate column names in the form of {col}_{id}.
        3. reset index.
        4. transform object columns to string columns (note that an object
           column can contain cells of different types).
        5. transform to dask dataframe via ``to_dask``.

    Parameters
    ----------
    org_df
        The input dataframe. It is not modified; a copy or a column
        selection of it is processed instead.
    used_columns
        Column names to keep. A column is kept when either the column label
        itself or its string form appears in this list. If None, all
        columns are kept.

    Returns
    -------
    dd.DataFrame
        The preprocessed dataframe, as returned by ``to_dask``.
    """
    if used_columns is None:
        df = org_df.copy()
    else:
        # Process the case when used_columns are string column names,
        # but the org_df column names are arbitrary objects.
        used_columns_set = set(used_columns)
        # Select with an ordered list of distinct labels rather than a set:
        # a set indexer has no defined order (so the resulting column order
        # would be arbitrary) and is rejected by recent pandas versions.
        # Labels are de-duplicated because indexing with a label already
        # returns every column carrying that label.
        selected = list(
            dict.fromkeys(
                col
                for col in org_df.columns
                if str(col) in used_columns_set or col in used_columns_set
            )
        )
        df = org_df[selected]

    # Stringify first and detect duplicates on the *string* form, so that
    # distinct labels with the same string form (e.g. 1 and "1") are also
    # recognised as duplicates.
    names = [f"{col}" for col in df.columns]
    name_count = Counter(names)

    # Resolve duplicate names in columns.
    # Duplicate names will be renamed as {col}_{id}; ids that would clash
    # with a name that is already taken are skipped.
    taken = {name for name, count in name_count.items() if count == 1}
    current_id: Dict[str, int] = dict()
    for i, name in enumerate(names):
        if name_count[name] == 1:
            continue
        suffix = current_id.get(name, 0) + 1
        candidate = f"{name}_{suffix}"
        while candidate in taken:
            suffix += 1
            candidate = f"{name}_{suffix}"
        current_id[name] = suffix
        taken.add(candidate)
        names[i] = candidate

    df.columns = names
    df = df.reset_index(drop=True)

    # Since an object column could contain multiple types
    # in different cells, transform object columns to string.
    for col in df.columns:
        if is_object_dtype(df[col].dtype):
            df[col] = df[col].astype(str)
    return to_dask(df)


def sample_n(arr: np.ndarray, n: int) -> np.ndarray: # pylint: disable=C0103
"""Sample n values uniformly from the range of the `arr`,
not from the distribution of `arr`'s elems."""
Expand Down
23 changes: 20 additions & 3 deletions dataprep/tests/eda/random_data_generator.py
Original file line number Diff line number Diff line change
Expand Up @@ -83,6 +83,7 @@ def gen_random_series(
size: int,
dtype: str = "object",
na_ratio: float = 0.0,
str_max_len: int = 100,
random_state: Union[int, np.random.RandomState] = 0,
) -> pd.Series:
"""
Expand All @@ -97,6 +98,8 @@ def gen_random_series(
Chosen from 'int', 'float', 'boolean', 'datetime', 'string' and 'object'.
na_ratio: float
The ratio of NA values in the series. Should be in [0.0, 1.0]
str_max_len: int
The max len of random string
seed: int
generator seed
"""
Expand All @@ -118,7 +121,10 @@ def gen_random_series(
population_list = []
for curr_type in gen_func:
if dtype in [curr_type, "object"]:
rand_series = gen_func[curr_type](size, random_state=rand)
if curr_type != "string":
rand_series = gen_func[curr_type](size, random_state=rand)
else:
rand_series = gen_func[curr_type](size, max_len=str_max_len, random_state=rand)
population_list.append(rand_series)
object_population = pd.concat(population_list, ignore_index=True)
object_series = pd.Series(rand.choice(object_population, size=size))
Expand All @@ -134,6 +140,7 @@ def gen_random_dataframe(
nrows: int = 30,
ncols: int = 30,
na_ratio: float = 0.0,
str_col_name_max_len: int = 100,
random_state: Union[int, np.random.RandomState] = 0,
) -> pd.DataFrame:
"""
Expand All @@ -148,6 +155,8 @@ def gen_random_dataframe(
Number of rows of the generated dataframe.
na_ratio:
Ratio of NA values.
str_col_name_max_len:
max length of string column name
ncols: int
Number of columns of the generated dataframe.
seed: int
Expand All @@ -166,7 +175,15 @@ def gen_random_dataframe(
df = pd.DataFrame(series_list)

# Generate random column names and index.
col_names = gen_random_series(size=ncols, dtype="object", na_ratio=0.1, random_state=rand)
col_names = gen_random_series(
size=ncols,
dtype="object",
na_ratio=0.1,
str_max_len=str_col_name_max_len,
random_state=rand,
)
df.columns = col_names
df.index = gen_random_series(df.index.shape[0], na_ratio=0.1, random_state=rand)
df.index = gen_random_series(
df.index.shape[0], na_ratio=0.1, str_max_len=str_col_name_max_len, random_state=rand
)
return df

0 comments on commit 40a89b9

Please sign in to comment.