Skip to content

Commit

Permalink
docs(eda): update legacy documentations for eda
Browse files Browse the repository at this point in the history
Update the user guide, edit the readme, include four more datasets to
load_dataset(), fix a bug for plot(df,x,y)
  • Loading branch information
dylanzxc authored and dovahcrow committed Mar 4, 2021
1 parent 0fecad8 commit 8f948e0
Show file tree
Hide file tree
Showing 23 changed files with 132,543 additions and 8,159 deletions.
2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -114,7 +114,7 @@ DataPrep.EDA is the **_only_** task-centric EDA system in Python. It is carefull

- **Task-Centric API Design**: You can declaratively specify a wide range of EDA tasks in different granularities with a single function call. All needed visualizations will be automatically and intelligently generated for you.
- **Auto-Insights**: DataPrep.EDA automatically detects and highlights the insights (e.g., a column has many outliers) to facilitate pattern discovery about the data.
- **How-to Guide** (available soon): A how-to guide is provided to show the configuration of each plot function. With this feature, you can easily customize the generated visualizations.
- **[How-to Guide](https://sfu-db.github.io/dataprep/user_guide/eda/how_to_guide.html)**: A how-to guide is provided to show the configuration of each plot function. With this feature, you can easily customize the generated visualizations.

#### Understand the Titanic dataset with Task-Centric API:

Expand Down
2 changes: 1 addition & 1 deletion dataprep/datasets/_base.py
Original file line number Diff line number Diff line change
Expand Up @@ -50,7 +50,7 @@ def load_dataset(name: str) -> pd.DataFrame:
Get all available dataset names:
>>> from dataprep.datasets import get_dataset_names
>>> get_dataset_names()
['iris', 'titanic']
['iris', 'titanic', 'adult', 'house_prices_train', 'house_prices_test']
"""

# Remove suffix 'csv' and transform to lower case
Expand Down
48,843 changes: 48,843 additions & 0 deletions dataprep/datasets/data/adult.csv

Large diffs are not rendered by default.

1,460 changes: 1,460 additions & 0 deletions dataprep/datasets/data/house_prices_test.csv

Large diffs are not rendered by default.

1,461 changes: 1,461 additions & 0 deletions dataprep/datasets/data/house_prices_train.csv

Large diffs are not rendered by default.

1,784 changes: 892 additions & 892 deletions dataprep/datasets/data/titanic.csv

Large diffs are not rendered by default.

1,600 changes: 1,600 additions & 0 deletions dataprep/datasets/data/wine-quality-red.csv

Large diffs are not rendered by default.

106 changes: 85 additions & 21 deletions dataprep/eda/configs.py
Original file line number Diff line number Diff line change
Expand Up @@ -51,6 +51,15 @@
"CDF": "cdf",
}

# Report section titles used by create_report, keyed to the name of the Config
# attribute that enables or disables the matching section. Each attribute name
# is simply the title lower-cased with its spaces removed.
DISPLAY_REPORT_MAP = {
    title: title.replace(" ", "").lower()
    for title in (
        "Overview",
        "Variables",
        "Interactions",
        "Correlations",
        "Missing Values",
    )
}


class Plot(BaseModel):
"""
Expand All @@ -61,7 +70,6 @@ class Plot(BaseModel):
height: Union[int, None] = None
bins: Union[int, None] = None
ngroups: Union[int, None] = None
grid_column: int = 3
report: bool = False


Expand All @@ -79,32 +87,33 @@ class Insight(BaseModel):
enable: bool, default True
Whether to create this element
duplicates__threshold: int, default 1
The threshold for duplicated row counts
similar_distribution__threshold:int, default 0.05
Warn if the percent of duplicated values is above this threshold
similar_distribution__threshold: float, default 0.05
The significance level for Kolmogorov–Smirnov test
uniform__threshold: int, default 0.999
uniform__threshold: float, default 0.999
The p-value threshold for chi-square test
missing__threshold: int, default 1
The threshold for missing values count
skewed__threshold: int, default 1e-5
The threshold for skewness statistics
Warn if the percent of missing values is above this threshold
skewed__threshold: float, default 1e-5
The p-value for the scipy.skewtest which tests whether the skew is
different from the normal distribution
infinity__threshold: int, default 1
The threshold for infinity count
Warn if the percent of infinites is above this threshold
zeros__threshold: int, default 5
The threshold for zeros count
Warn if the percent of zeros is above this threshold
negatives__threshold: int, default 1
The threshold for negatives count
normal__threshold: int, default 0.99
The p-value threshold for normaltest, it is based on D’Agostino and Pearson’s test that
Warn if the percent of negatives is above this threshold
normal__threshold: float, default 0.99
The p-value threshold for normal test, it is based on D’Agostino and Pearson’s test that
combines skew and kurtosis to produce an omnibus test of normality
high_cardinality__threshold: int, default 50
The threshold for unique values count, count larger than threshold yields high cardinality
constant__threshold: int, default 1
The threshold for unique values count, count equals to threshold yields constant value
outstanding_no1__threshold: int, default 1.5
outstanding_no1__threshold: float, default 1.5
The threshold for outstanding no1 insight, measures the ratio of the largest category count
to the second-largest category count
attribution__threshold: int, default 0.5
attribution__threshold: float, default 0.5
The threshold for the attribution insight, measures the percentage of the top 2 categories
high_word_cardinality__threshold: int, default 1000
The threshold for the high word cardinality insight, which measures the number of words of
Expand All @@ -113,7 +122,7 @@ class Insight(BaseModel):
The threshold for the outstanding no1 word threshold, which measures the ratio of the most
frequent word count to the second most frequent word count
outlier__threshold: int, default 0
The threshold for the outlier count in the box plot, default 0
The threshold for the outlier count in the box plot
"""

# pylint: disable=too-many-instance-attributes
Expand Down Expand Up @@ -893,7 +902,7 @@ class PDF(BaseModel):
"""
enable: bool, default True
Whether to create this element
sample_size:
sample_size: int, default 100
Number of evenly spaced samples between the minimum and maximum values to compute the pdf at
height: int, default "auto"
Height of the plot
Expand Down Expand Up @@ -960,6 +969,52 @@ def _form(val: Any) -> Any:
return f"'{val}'" if isinstance(val, str) else val


# The following five classes are for create_report
class Overview(BaseModel):
    """
    Switch for the Overview section of the report built by create_report.

    enable: bool, default True
        Whether to create this section
    """

    enable: bool = True


class Variables(BaseModel):
    """
    Switch for the Variables section of the report built by create_report.

    enable: bool, default True
        Whether to create this section
    """

    enable: bool = True


class Interactions(BaseModel):
    """
    Switch for the Interactions section of the report built by create_report.

    enable: bool, default True
        Whether to create this section
    """

    enable: bool = True


class Correlations(BaseModel):
    """
    Switch for the Correlations section of the report built by create_report.

    enable: bool, default True
        Whether to create this section
    """

    enable: bool = True


class MissingValues(BaseModel):
    """
    Switch for the Missing Values section of the report built by create_report.

    enable: bool, default True
        Whether to create this section
    """

    enable: bool = True


class Config(BaseModel):
"""
Configuration class
Expand Down Expand Up @@ -991,6 +1046,11 @@ class Config(BaseModel):
pdf: PDF = Field(default_factory=PDF)
cdf: CDF = Field(default_factory=CDF)
plot: Plot = Field(default_factory=Plot)
overview: Overview = Field(default_factory=Overview)
variables: Variables = Field(default_factory=Variables)
interactions: Interactions = Field(default_factory=Interactions)
correlations: Correlations = Field(default_factory=Correlations)
missingvalues: MissingValues = Field(default_factory=MissingValues)

@classmethod
def from_dict(
Expand All @@ -1000,12 +1060,16 @@ def from_dict(
Converts a dictionary instance into a config class
"""
cfg = cls()

if display:
display = [DISPLAY_MAP[disp] for disp in display]
# set all plots not in display list to enable=False except for Plot class
for plot in set(vars(cfg).keys()) - set(display) - {"plot"}:
setattr(getattr(cfg, plot), "enable", False)
try:
display = [DISPLAY_MAP[disp] for disp in display]
# set all plots not in display list to enable=False except for Plot class
for plot in set(vars(cfg).keys()) - set(display) - {"plot"}:
setattr(getattr(cfg, plot), "enable", False)
except KeyError:
display = [DISPLAY_REPORT_MAP[disp] for disp in display]
for plot in set(DISPLAY_REPORT_MAP.values()) - set(display):
setattr(getattr(cfg, plot), "enable", False)

if config:
# get the global parameters from config
Expand Down

0 comments on commit 8f948e0

Please sign in to comment.