test(eda): add test for config

sfu-db · Jul 20, 2021 · ab3172f · ab3172f
1 parent d6a6af6
commit ab3172f
Showing 1 changed file with 46 additions and 35 deletions.
diff --git a/dataprep/tests/eda/test_config.py b/dataprep/tests/eda/test_config.py
@@ -1,39 +1,50 @@
 """
     This module for testing config parameter
 """
+import dask.dataframe as dd
+import pandas as pd
+import numpy as np
+import random
+import pytest
 
-# from ...eda.basic.configs import BarChart, Histogram, WordCloud
-#
-#
-# def test_hist() -> None:
-#     dict_data = {"bins": 20, "agg": "mean", "value_range": [0.1, 0.5]}
-#     histogram = Histogram()
-#     histogram = histogram.from_dict(dict_data)
-#     for key in dict_data.keys():
-#         assert histogram.__dict__[key] == dict_data[key]
-#
-#
-# def test_bar() -> None:
-#     dict_data = {
-#         "ngroups": 20,
-#         "largest": False,
-#         "nsubgroups": 10,
-#         "top_words": 50,
-#         "stopword": True,
-#         "lemmatize": False,
-#         "stem": False,
-#         "sort_by": "alphabet",
-#         "sort_ascending": False,
-#     }
-#     barchart = BarChart()
-#     barchart = barchart.from_dict(dict_data)
-#     for key in dict_data.keys():
-#         assert barchart.__dict__[key] == dict_data[key]
-#
-#
-# def test_word() -> None:
-#     dict_data = {"top_words": 20, "stopword": False, "lemmatize": False, "stem": False}
-#     wordcloud = WordCloud()
-#     wordcloud = wordcloud.from_dict(dict_data)
-#     for key in dict_data.keys():
-#         assert wordcloud.__dict__[key] == dict_data[key]
+from ...eda import plot, plot_correlation, plot_missing
+from ...eda.utils import to_dask
+
+
+@pytest.fixture(scope="module")  # module scope: built once, shared by all tests in this file
+def simpledf() -> dd.DataFrame:  # random 1000-row frame: float cols a, b, c + string cols d, e
+    df = pd.DataFrame(np.random.rand(1000, 3), columns=["a", "b", "c"])
+    df = pd.concat([df, pd.Series(np.random.choice(["a", "b", "c"], 1000, replace=True))], axis=1)
+    df = pd.concat([df, pd.Series(np.random.choice(["a", "d"], 1000, replace=True))], axis=1)
+    df.columns = ["a", "b", "c", "d", "e"]  # name the two appended (unnamed) Series "d" and "e"
+    idx = np.arange(1000)
+    np.random.shuffle(idx)  # NOTE(review): NumPy RNG is never seeded here, data differs per run
+    df.iloc[idx[:500], 0] = None  # make column "a" missing in 500 randomly chosen rows
+    df = to_dask(df)  # project helper; presumably pandas -> dask conversion, confirm in eda.utils
+    return df
+
+
+def test_sanity_compute_1(simpledf: dd.DataFrame) -> None:  # smoke test: no asserts, must not raise
+    for _ in range(5):  # five rounds, each drawing fresh config values
+        hist_bins = random.randint(20, 50)  # randint is inclusive on both ends
+        bar_bars = random.randint(20, 50)
+        kde_bins = random.randint(20, 50)
+        wordfreq_top_words = random.randint(20, 50)
+        heatmap_ngroups = random.randint(20, 50)  # NOTE(review): `random` is unseeded in this file
+        plot(simpledf, config={"hist.bins": hist_bins, "hist.yscale": "log"})
+        plot(simpledf, config={"bar.bars": bar_bars, "bar.yscale": "log", "bar.color": "#123456"})
+        plot(simpledf, config={"kde.bins": kde_bins, "kde.yscale": "log"})
+        plot(simpledf, config={"wordfreq.top_words": wordfreq_top_words})
+        plot(simpledf, config={"heatmap.ngroups": heatmap_ngroups})  # return values are discarded
+
+
+def test_sanity_compute_2(simpledf: dd.DataFrame) -> None:  # smoke test: no asserts, must not raise
+    for _ in range(5):  # five rounds with a freshly drawn bin count each time
+        spectrum_bins = random.randint(5, 20)  # inclusive bounds; NOTE(review): RNG is unseeded
+        plot_missing(simpledf, config={"spectrum.bins": spectrum_bins})  # result is discarded
+
+
+def test_sanity_compute_3(simpledf: dd.DataFrame) -> None:  # smoke test: no asserts, must not raise
+    for _ in range(5):  # five rounds with a freshly drawn sample size each time
+        sample_size = random.randint(200, 1000)  # inclusive bounds; upper bound equals the row count
+        plot_correlation(simpledf, "a", "b", config={"scatter.sample_size": sample_size})  # "a" is half missing