In [1]:
import pandas as pd
import altair as alt

# Binary size units in bytes, used to render byte counts as "…G" / "…M" labels.
G = 1024 ** 3  # bytes per gibibyte
M = 1024 ** 2  # bytes per mebibyte

In [49]:
# Display order of the benchmark datasets. A dataset's position in this list is
# looked up with `order.index(...)` to build sortable "order" columns, so each
# name must appear exactly once (a repeated entry is unreachable via `.index`
# and silently shifts the rank of everything after it).
order = [
    "heart",
    "diabetes",
    "automobile",
    "titanic",
    "women",
    "credit",
    "solar",
    "suicide",
    "diamonds",
    "chess",
    "adult",
    "basketball",
    "conflicts",
    "rain",
    "hotel",
]

In [3]:
# Long dataset identifier -> short label. Several of the short labels
# (e.g. "women", "conflicts", "rain", "hotel") also appear in `order`.
dataset_map = dict(
    automobile="automobile",
    titanic="titanic",
    top_women_chess_players="women",
    suicide="suicide",
    chess="chess",
    adult="adult",
    asia_conflicts="conflicts",
    rain_australia="rain",
    hotel_bookings="hotel",
    poi_database="poi",
    historical_trades_cryptocurrency="crypto",
    car_ads="ads",
    nypd_arrests="arrests",
    commodity_trade="trade",
    yelp_academic_dataset_review="yelp",
)

In [50]:
# Load the benchmark records (JSON Lines) and attach the derived columns:
#   DVM            - ratio of the `mem_size` column to the `memory` column
#   MachineMem     - `memory` rendered in units of G, e.g. "16G"
#   DatasetMemSize - `mem_size` rendered in units of M, e.g. "12.3M"
#   order          - position of the row's dataset in `order`
#                    (`order.index` raises ValueError for an unlisted dataset)
df = pd.read_json("results/dpvspp.1.json", lines=True).assign(
    DVM=lambda raw: raw["mem_size"] / raw["memory"],
    MachineMem=lambda raw: raw["memory"].apply(lambda nbytes: f"{nbytes / G:.0f}G"),
    DatasetMemSize=lambda raw: raw["mem_size"].apply(lambda nbytes: f"{nbytes / M:.1f}M"),
    order=lambda raw: raw["dataset"].apply(order.index),
)

# Mean elapsed time for every (dataset, name) pair.
gdf = df.groupby(["dataset", "name"])["elapsed"].mean()

# Wide layout: one row per dataset, one column per value of `name`.
pivot = gdf.reset_index().pivot(index="dataset", columns="name", values="elapsed")

# Append the PandasProfilingReport / DataPrepReport ratio, formatted like "8.8x".
# The ratio Series keeps the name "elapsed", so that is the label of the joined
# column even though it holds a ratio rather than a time.
acc = pivot.join(
    (
        gdf.xs("PandasProfilingReport", level="name")
        / gdf.xs("DataPrepReport", level="name")
    ).apply(lambda ratio: f"{ratio:.1f}x")
)
acc["order"] = acc.index.to_series().apply(order.index)

# Cell output: the comparison table arranged by the position in `order`.
acc.sort_values(by="order")

Unnamed: 0_level_0,DataPrepReport,PandasProfilingReport,elapsed,order
dataset,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
heart,1.994412,17.645971,8.8x,0
diabetes,1.563198,28.37663,18.2x,1
automobile,3.905547,38.671975,9.9x,2
titanic,2.122861,17.389683,8.2x,3
women,2.269493,20.37448,9.0x,4
credit,6.018265,127.283982,21.1x,5
solar,2.757923,24.80295,9.0x,6
suicide,2.86156,21.217045,7.4x,7
diamonds,3.063028,28.327528,9.2x,8
chess,4.367211,23.977657,5.5x,9


In [12]:
# Faceted comparison of elapsed time per value of `name`, one panel per dataset.
# All three layers share the same data frame, title and panel width.
base = alt.Chart(df, title="create_report(df) Comparison", width=240)

(
    # Layer 1: one bar per `name`, length = mean elapsed time.
    base.mark_bar().encode(
        y=alt.Y("name", title=""),
        x=alt.X("mean(elapsed)", title="Elapsed (s)"),
        color=alt.Color("name:N", legend=None),
        tooltip=[
            alt.Tooltip("name:N"),
            # Must be aggregated like the x channel: in an aggregated encoding
            # every un-aggregated field becomes a group-by key, so a raw
            # "elapsed:Q" here would split the mean bar into one overlapping
            # bar per individual run.
            alt.Tooltip("mean(elapsed):Q", title="Elapsed (s)", format=".2s"),
            # NOTE(review): these two are also group-by keys; this assumes they
            # are constant within a (dataset, name) group - confirm.
            alt.Tooltip("MachineMem"),
            alt.Tooltip("DatasetMemSize"),
        ],
    )
    # Layer 2: standard-deviation error bar computed from the raw elapsed values.
    + base.mark_errorbar(color="black", extent="stdev").encode(
        x=alt.X("elapsed", title="Elapsed (s)"),
        y=alt.Y("name:N", title=None),
    )
    # Layer 3: the mean value as white text, nudged left so it sits inside the bar.
    + base.mark_text(color="white", dx=-15, dy=1).encode(
        y=alt.Y("name", title=""),
        x=alt.X("mean(elapsed)", title="Elapsed (s)"),
        text=alt.Text("mean(elapsed)", format=".1f"),
    )
).facet(
    alt.Facet(
        "dataset",
        title="Dataset",
        header=alt.Header(
            labelAngle=0, labelAlign="left", labelPadding=0, titlePadding=0
        ),
        # A list passed to `sort` is an explicit ordering of the facet VALUES,
        # so it has to be the list of dataset names itself; ["order"] named a
        # single non-existent dataset and left the panels in data order.
        sort=order,
    ),
    columns=2,
).resolve_scale(
    # Each panel gets its own x range because timings differ widely per dataset.
    x="independent"
)