Skip to content

Commit

Permalink
test(eda): add test for config
Browse files Browse the repository at this point in the history
  • Loading branch information
Waterpine committed Jul 20, 2021
1 parent d6a6af6 commit ab3172f
Showing 1 changed file with 46 additions and 35 deletions.
81 changes: 46 additions & 35 deletions dataprep/tests/eda/test_config.py
Original file line number Diff line number Diff line change
@@ -1,39 +1,50 @@
"""
This module tests the config parameter of the EDA plotting functions.
"""
import dask.dataframe as dd
import pandas as pd
import numpy as np
import random
import pytest

# from ...eda.basic.configs import BarChart, Histogram, WordCloud
#
#
# def test_hist() -> None:
# dict_data = {"bins": 20, "agg": "mean", "value_range": [0.1, 0.5]}
# histogram = Histogram()
# histogram = histogram.from_dict(dict_data)
# for key in dict_data.keys():
# assert histogram.__dict__[key] == dict_data[key]
#
#
# def test_bar() -> None:
# dict_data = {
# "ngroups": 20,
# "largest": False,
# "nsubgroups": 10,
# "top_words": 50,
# "stopword": True,
# "lemmatize": False,
# "stem": False,
# "sort_by": "alphabet",
# "sort_ascending": False,
# }
# barchart = BarChart()
# barchart = barchart.from_dict(dict_data)
# for key in dict_data.keys():
# assert barchart.__dict__[key] == dict_data[key]
#
#
# def test_word() -> None:
# dict_data = {"top_words": 20, "stopword": False, "lemmatize": False, "stem": False}
# wordcloud = WordCloud()
# wordcloud = wordcloud.from_dict(dict_data)
# for key in dict_data.keys():
# assert wordcloud.__dict__[key] == dict_data[key]
from ...eda import plot, plot_correlation, plot_missing
from ...eda.utils import to_dask


@pytest.fixture(scope="module")
def simpledf() -> dd.DataFrame:
    """Build the shared random test frame used by the config sanity tests.

    The frame has 1000 rows and five columns:

    * ``a``, ``b``, ``c`` -- values drawn with ``np.random.rand``; 500
      randomly chosen rows of ``a`` are then overwritten with ``None``.
    * ``d`` -- strings sampled from ``["a", "b", "c"]``.
    * ``e`` -- strings sampled from ``["a", "d"]``.

    The pandas frame is passed through ``to_dask`` before being returned.
    """
    n_rows = 1000

    # Draw order (rand -> choice -> choice -> shuffle) is kept fixed so the
    # global NumPy RNG is consumed in the same sequence.
    numeric_part = pd.DataFrame(np.random.rand(n_rows, 3), columns=["a", "b", "c"])
    three_level = pd.Series(np.random.choice(["a", "b", "c"], n_rows, replace=True))
    two_level = pd.Series(np.random.choice(["a", "d"], n_rows, replace=True))

    frame = pd.concat([numeric_part, three_level, two_level], axis=1)
    frame.columns = ["a", "b", "c", "d", "e"]

    # Blank out column "a" (position 0) on a random half of the rows.
    row_order = np.arange(n_rows)
    np.random.shuffle(row_order)
    frame.iloc[row_order[:500], 0] = None

    return to_dask(frame)


def test_sanity_compute_1(simpledf: dd.DataFrame) -> None:
    """Smoke-test ``plot`` with randomly chosen per-chart config values.

    Runs five rounds; each round draws five integers in ``[20, 50]`` and
    calls ``plot`` once per config dict. The test passes when no call raises.
    """
    for _round in range(5):
        # Draw every value before plotting, in the order
        # hist -> bar -> kde -> wordfreq -> heatmap.
        n_hist, n_bar, n_kde, n_words, n_groups = [
            random.randint(20, 50) for _draw in range(5)
        ]

        configs = [
            {"hist.bins": n_hist, "hist.yscale": "log"},
            {"bar.bars": n_bar, "bar.yscale": "log", "bar.color": "#123456"},
            {"kde.bins": n_kde, "kde.yscale": "log"},
            {"wordfreq.top_words": n_words},
            {"heatmap.ngroups": n_groups},
        ]
        for cfg in configs:
            plot(simpledf, config=cfg)


def test_sanity_compute_2(simpledf: dd.DataFrame) -> None:
    """Smoke-test ``plot_missing`` with a random ``spectrum.bins`` setting.

    Runs five rounds, each with a fresh integer in ``[5, 20]``. The test
    passes when no call raises.
    """
    rounds = 5
    for _round in range(rounds):
        plot_missing(simpledf, config={"spectrum.bins": random.randint(5, 20)})


def test_sanity_compute_3(simpledf: dd.DataFrame) -> None:
    """Smoke-test ``plot_correlation`` on columns ``a`` and ``b``.

    Runs five rounds, each with a random ``scatter.sample_size`` in
    ``[200, 1000]``. The test passes when no call raises.
    """
    rounds = 5
    for _round in range(rounds):
        cfg = {"scatter.sample_size": random.randint(200, 1000)}
        plot_correlation(simpledf, "a", "b", config=cfg)

0 comments on commit ab3172f

Please sign in to comment.