docs(eda): update legacy documentations for eda

Update the user guide, edit the readme, include four more datasets to load_dataset(), fix a bug for plot(df,x,y)
sfu-db · Mar 4, 2021 · 8f948e0 · 8f948e0
1 parent 0fecad8
commit 8f948e0
Show file tree

Hide file tree

Showing 23 changed files with 132,543 additions and 8,159 deletions.
diff --git a/README.md b/README.md
@@ -114,7 +114,7 @@ DataPrep.EDA is the **_only_** task-centric EDA system in Python. It is carefull
 
 - **Task-Centric API Design**: You can declaratively specify a wide range of EDA tasks in different granularities with a single function call. All needed visualizations will be automatically and intelligently generated for you.
 - **Auto-Insights**: DataPrep.EDA automatically detects and highlights the insights (e.g., a column has many outliers) to facilitate pattern discovery about the data.
-- **How-to Guide** (available soon): A how-to guide is provided to show the configuration of each plot function. With this feature, you can easily customize the generated visualizations.
+- **[How-to Guide](https://sfu-db.github.io/dataprep/user_guide/eda/how_to_guide.html)**: A how-to guide is provided to show the configuration of each plot function. With this feature, you can easily customize the generated visualizations.
 
 #### Understand the Titanic dataset with Task-Centric API:
 

diff --git a/dataprep/datasets/_base.py b/dataprep/datasets/_base.py
@@ -50,7 +50,7 @@ def load_dataset(name: str) -> pd.DataFrame:
     Get all available dataset names:
     >>> from dataprep.datasets import get_dataset_names
     >>> get_dataset_names()
-    ['iris', 'titanic']
+    ['iris', 'titanic', 'adult', 'house_prices_train', 'house_prices_test']
     """
 
     # Remove suffix 'csv' and transform to lower case

diff --git a/dataprep/datasets/data/adult.csv b/dataprep/datasets/data/adult.csv
diff --git a/dataprep/datasets/data/house_prices_test.csv b/dataprep/datasets/data/house_prices_test.csv
diff --git a/dataprep/datasets/data/house_prices_train.csv b/dataprep/datasets/data/house_prices_train.csv
diff --git a/dataprep/datasets/data/titanic.csv b/dataprep/datasets/data/titanic.csv
diff --git a/dataprep/datasets/data/wine-quality-red.csv b/dataprep/datasets/data/wine-quality-red.csv
diff --git a/dataprep/eda/configs.py b/dataprep/eda/configs.py
@@ -51,6 +51,15 @@
     "CDF": "cdf",
 }
 
+# This dictionary maps report section names to Config attribute names; used for section control in create_report
+DISPLAY_REPORT_MAP = {
+    "Overview": "overview",
+    "Variables": "variables",
+    "Interactions": "interactions",
+    "Correlations": "correlations",
+    "Missing Values": "missingvalues",
+}
+
 
 class Plot(BaseModel):
     """
@@ -61,7 +70,6 @@ class Plot(BaseModel):
     height: Union[int, None] = None
     bins: Union[int, None] = None
     ngroups: Union[int, None] = None
-    grid_column: int = 3
     report: bool = False
 
 
@@ -79,32 +87,33 @@ class Insight(BaseModel):
     enable: bool, default True
         Whether to create this element
     duplicates__threshold: int, default 1
-        The threshold for duplicated row counts
-    similar_distribution__threshold:int, default 0.05
+        Warn if the percent of duplicated values is above this threshold
+    similar_distribution__threshold: float, default 0.05
         The significance level for Kolmogorov–Smirnov test
-    uniform__threshold: int, default 0.999
+    uniform__threshold: float, default 0.999
         The p-value threshold for chi-square test
     missing__threshold: int, default 1
-        The threshold for missing values count
-    skewed__threshold: int, default 1e-5
-        The threshold for skewness statistics
+        Warn if the percent of missing values is above this threshold
+    skewed__threshold: float, default 1e-5
+        The p-value threshold for scipy.stats.skewtest, which tests whether the skewness
+        is different from that of a normal distribution
     infinity__threshold: int, default 1
-        The threshold for infinity count
+        Warn if the percent of infinite values is above this threshold
     zeros__threshold: int, default 5
-        The threshold for zeros count
+        Warn if the percent of zeros is above this threshold
     negatives__threshold: int, default 1
-        The threshold for negatives count
-    normal__threshold: int, default 0.99
-        The p-value threshold for normaltest, it is based on D’Agostino and Pearson’s test that
+        Warn if the percent of negatives is above this threshold
+    normal__threshold: float, default 0.99
+        The p-value threshold for normal test, it is based on D’Agostino and Pearson’s test that
         combines skew and kurtosis to produce an omnibus test of normality
     high_cardinality__threshold: int, default 50
         The threshold for unique values count, count larger than threshold yields high cardinality
     constant__threshold: int, default 1
         The threshold for unique values count, count equals to threshold yields constant value
-    outstanding_no1__threshold: int, default 1.5
+    outstanding_no1__threshold: float, default 1.5
         The threshold for outstanding no1 insight, measures the ratio of the largest category count
         to the second-largest category count
-    attribution__threshold: int, default 0.5
+    attribution__threshold: float, default 0.5
         The threshold for the attribution insight, measures the percentage of the top 2 categories
     high_word_cardinality__threshold: int, default 1000
         The threshold for the high word cardinality insight, which measures the number of words of
@@ -113,7 +122,7 @@ class Insight(BaseModel):
         The threshold for the outstanding no1 word threshold, which measures the ratio of the most
         frequent word count to the second most frequent word count
     outlier__threshold: int, default 0
-        The threshold for the outlier count in the box plot, default 0
+        The threshold for the outlier count in the box plot
     """
 
     # pylint: disable=too-many-instance-attributes
@@ -893,7 +902,7 @@ class PDF(BaseModel):
     """
     enable: bool, default True
         Whether to create this element
-    sample_size:
+    sample_size: int, default 100
         Number of evenly spaced samples between the minimum and maximum values to compute the pdf at
     height: int, default "auto"
         Height of the plot
@@ -960,6 +969,52 @@ def _form(val: Any) -> Any:
     return f"'{val}'" if isinstance(val, str) else val
 
 
+# The following five classes are for create_report
+class Overview(BaseModel):
+    """
+    enable: bool, default True
+        Whether to create this element
+    """
+
+    enable: bool = True
+
+
+class Variables(BaseModel):
+    """
+    enable: bool, default True
+        Whether to create this element
+    """
+
+    enable: bool = True
+
+
+class Interactions(BaseModel):
+    """
+    enable: bool, default True
+        Whether to create this element
+    """
+
+    enable: bool = True
+
+
+class Correlations(BaseModel):
+    """
+    enable: bool, default True
+        Whether to create this element
+    """
+
+    enable: bool = True
+
+
+class MissingValues(BaseModel):
+    """
+    enable: bool, default True
+        Whether to create this element
+    """
+
+    enable: bool = True
+
+
 class Config(BaseModel):
     """
     Configuration class
@@ -991,6 +1046,11 @@ class Config(BaseModel):
     pdf: PDF = Field(default_factory=PDF)
     cdf: CDF = Field(default_factory=CDF)
     plot: Plot = Field(default_factory=Plot)
+    overview: Overview = Field(default_factory=Overview)
+    variables: Variables = Field(default_factory=Variables)
+    interactions: Interactions = Field(default_factory=Interactions)
+    correlations: Correlations = Field(default_factory=Correlations)
+    missingvalues: MissingValues = Field(default_factory=MissingValues)
 
     @classmethod
     def from_dict(
@@ -1000,12 +1060,16 @@ def from_dict(
         Converts a dictionary instance into a config class
         """
         cfg = cls()
-
         if display:
-            display = [DISPLAY_MAP[disp] for disp in display]
-            # set all plots not in display list to enable=False except for Plot class
-            for plot in set(vars(cfg).keys()) - set(display) - {"plot"}:
-                setattr(getattr(cfg, plot), "enable", False)
+            try:
+                display = [DISPLAY_MAP[disp] for disp in display]
+                # set all plots not in display list to enable=False except for Plot class
+                for plot in set(vars(cfg).keys()) - set(display) - {"plot"}:
+                    setattr(getattr(cfg, plot), "enable", False)
+            except KeyError:
+                display = [DISPLAY_REPORT_MAP[disp] for disp in display]
+                for plot in set(DISPLAY_REPORT_MAP.values()) - set(display):
+                    setattr(getattr(cfg, plot), "enable", False)
 
         if config:
             # get the global parameters from config