In [1]:
# Load IPython's autoreload extension and switch it to mode 2, so that edits to
# imported modules (presumably the local `utils` helper — confirm) are picked up
# without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
import os
from pathlib import Path
from typing import List, Union

import numpy as np
import pandas as pd
import plotly.graph_objects as go
from plotly.subplots import make_subplots
from utils import process_viztrace_json

In [3]:
def _merge_run(df_acc: pd.DataFrame, df_run: pd.DataFrame) -> pd.DataFrame:
    """As-of merge two index-aligned run frames into one wide frame.

    The frame whose last index value is the largest is used as the left
    operand, so the merged result is driven by the longest run seen so far.
    Rows are matched on the index with a tolerance of 100 ms.

    :param df_acc: the runs merged so far
    :param df_run: the new run; its columns must not clash with ``df_acc``
    :return: the merged frame
    """
    left, right = sorted(
        [df_acc, df_run], key=lambda df: df.index[-1], reverse=True
    )
    return pd.merge_asof(
        left,
        right,
        left_index=True,
        right_index=True,
        tolerance=pd.Timedelta("100ms"),
    )


# benchmark name -> [df_mem, df_cpu], each holding one column (set) per run
benchmark_dict = {}
# benchmark name -> last memory-trace index value of every run
benchmark_endtimes = {}
n_runs = 20
for benchmark_name in [
    "tsfresh_sequential",
    "tsfresh_mp",
    "seglearn",
    "tsfel_sequential",
    "tsfel_mp",
    "tsflex_sequential",
    "tsflex_mp",
]:
    df_cpu, df_mem = None, None
    for i in range(n_runs):
        df_new_cpu, df_new_mem = process_viztrace_json(
            f"benchmark_jsons/{benchmark_name}_{i}.json"
        )

        benchmark_endtimes.setdefault(benchmark_name, []).append(
            df_new_mem.index[-1]
        )

        if df_cpu is None:
            df_cpu = df_new_cpu
        else:
            # Give every run its own column name. The previous code called
            # `Series.rename` and threw the result away, so each merge relied
            # on the automatic `_x`/`_y` suffixes, which collide (and error in
            # recent pandas versions) once more than a few runs are merged.
            df_new_cpu = df_new_cpu[["args.cpu_percent"]].rename(
                columns={"args.cpu_percent": f"args.cpu_percent_{i}"}
            )
            df_cpu = _merge_run(df_cpu, df_new_cpu)

        if df_mem is None:
            df_mem = df_new_mem
        else:
            df_new_mem = df_new_mem.filter(like="args.").rename(
                columns={"args.rss": f"args.rss{i}", "args.vms": f"args.vms{i}"}
            )
            df_mem = _merge_run(df_mem, df_new_mem)

    # Per-timestamp memory statistics over the runs. The selection is taken
    # once: the summary column names below do not contain "args.rss", so they
    # would never be part of it anyway.
    rss_runs = df_mem.filter(like="args.rss")
    df_mem["mean_rss"] = rss_runs.mean(axis=1)
    df_mem["std_rss"] = rss_runs.std(axis=1)
    df_mem["max_rss"] = rss_runs.max(axis=1)

    # Per-timestamp CPU statistics over the runs.
    cpu_runs = df_cpu.filter(like="args.cpu_percent")
    df_cpu["mean_usage"] = cpu_runs.mean(axis=1)
    df_cpu["std"] = cpu_runs.std(axis=1)

    benchmark_dict[benchmark_name] = [df_mem, df_cpu]

  df_cpu = pd.merge_asof(
  df_cpu = pd.merge_asof(


In [4]:
# Per configuration: take the maximum of every "args.rss" run column, then
# report the mean and standard deviation of those maxima, divided by 1e6.
for configuration, (df_mem, df_cpu) in benchmark_dict.items():
    peak_per_run = df_mem.filter(like="args.rss").max()
    peak_mean = round(peak_per_run.mean() / 1e6, 1)
    peak_std = round(peak_per_run.std() / 1e6, 1)
    print(f"{configuration:<20}\t", peak_mean, "+/-", peak_std)

tsfresh_sequential  	 4040.6 +/- 2243.0
tsfresh_mp          	 4043.8 +/- 14.4
seglearn            	 435.3 +/- 1.5
tsfel_sequential    	 2.9 +/- 0.1
tsfel_mp            	 2.9 +/- 0.1
tsflex_sequential   	 1.2 +/- 0.1
tsflex_mp           	 1.3 +/- 0.1


In [5]:
# Per configuration: the end time of a run is the last index value at which its
# "args.rss" column is not NaN; report mean and (population) std in seconds.
for configuration, (df_mem, df_cpu) in benchmark_dict.items():
    valid_masks = df_mem.filter(like="args.rss").notna().values
    # Iterate over however many run columns are present instead of assuming 20.
    end_times = np.array(
        [
            df_mem.index[valid_masks[:, run]][-1].total_seconds()
            for run in range(valid_masks.shape[1])
        ]
    )
    end_time_mean = end_times.mean()
    end_time_std = end_times.std()
    print(f"{configuration:<20}", '\t', round(end_time_mean, 1), "+/-", round(end_time_std, 1))

tsfresh_sequential   	 167.8 +/- 12.1
tsfresh_mp           	 99.4 +/- 2.4
seglearn             	 9.5 +/- 0.5
tsfel_sequential     	 14.9 +/- 0.8
tsfel_mp             	 1.7 +/- 0.0
tsflex_sequential    	 4.1 +/- 0.1
tsflex_mp            	 0.7 +/- 0.0


In [6]:
# benchmark_dict["tsfel_sequential"][0].filter(like="args.rss").mean(
#     axis=1
# ) + benchmark_dict["tsfel_sequential"][0].filter(like="args.rss").rolling(
#     "1s"
# ).std().mean(
#     axis=1
# )

# np.max((benchmark_dict['tsfel_sequential'][0].filter(like='args.rss').mean(axis=1) + benchmark_dict['tsfel_sequential'][0].filter(like='args.rss').rolling('1s').std().mean(axis=1), benchmark_dict['tsfel_sequential'][0].filter(like='args.rss').max(axis=1)), axis=0)

# benchmark_dict['tsfel_sequential'][0].filter(like='args.rss').mean(axis=1).plot()
# (benchmark_dict['tsfel_sequential'][0].filter(like='args.rss').mean(axis=1) +
# (benchmark_dict['tsfel_sequential'][0].filter(like='args.rss').mean(axis=1)

In [7]:
# One column per benchmark, one row per run; convert every end time to seconds.
df_endtime = pd.DataFrame(benchmark_endtimes).apply(
    lambda col: col.dt.total_seconds()
)
df_endtime.info()

# df_endtime.plot(kind='box', logy=True, figsize=(30, 10))

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20 entries, 0 to 19
Data columns (total 7 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   tsfresh_sequential  20 non-null     float64
 1   tsfresh_mp          20 non-null     float64
 2   seglearn            20 non-null     float64
 3   tsfel_sequential    20 non-null     float64
 4   tsfel_mp            20 non-null     float64
 5   tsflex_sequential   20 non-null     float64
 6   tsflex_mp           20 non-null     float64
dtypes: float64(7)
memory usage: 1.2 KB


In [8]:
# Summary statistics of the end times, one row per benchmark, ordered by mean.
pd.DataFrame(benchmark_endtimes).describe().transpose().sort_values("mean")

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
tsflex_mp,20,0 days 00:00:00.602292004,0 days 00:00:00.033384340,0 days 00:00:00.549251754,0 days 00:00:00.571975953,0 days 00:00:00.594903754,0 days 00:00:00.613093018,0 days 00:00:00.669906652
tsfel_mp,20,0 days 00:00:01.660263948,0 days 00:00:00.039593228,0 days 00:00:01.603609586,0 days 00:00:01.626708392,0 days 00:00:01.654917294,0 days 00:00:01.691268790,0 days 00:00:01.743647473
tsflex_sequential,20,0 days 00:00:03.975466918,0 days 00:00:00.166150739,0 days 00:00:03.788163312,0 days 00:00:03.889922103,0 days 00:00:03.967451314,0 days 00:00:04.007876491,0 days 00:00:04.501543516
seglearn,20,0 days 00:00:09.364222149,0 days 00:00:00.514929612,0 days 00:00:09.031694152,0 days 00:00:09.132701652,0 days 00:00:09.197534895,0 days 00:00:09.280590795,0 days 00:00:11.288741096
tsfel_sequential,20,0 days 00:00:14.839587192,0 days 00:00:00.876230021,0 days 00:00:12.646683863,0 days 00:00:14.247700638,0 days 00:00:15.250473998,0 days 00:00:15.467680240,0 days 00:00:15.975508738
tsfresh_mp,20,0 days 00:01:39.308772079,0 days 00:00:02.486743827,0 days 00:01:35.956229607,0 days 00:01:37.613468141,0 days 00:01:39.220649958,0 days 00:01:40.765660068,0 days 00:01:45.264795967
tsfresh_sequential,20,0 days 00:02:47.744687639,0 days 00:00:12.470041607,0 days 00:01:55.700742031,0 days 00:02:48.440210144,0 days 00:02:50.673559181,0 days 00:02:51.474298922,0 days 00:02:54.789461963


In [None]:
def _axis_scale_menu(axis: str, title: str, y: float) -> dict:
    """Build a dropdown that switches one axis between log and linear scale.

    :param axis: "x" or "y"; selects the ``xaxis`` / ``yaxis`` layout entry
    :param title: axis title that is set again on every switch
    :param y: vertical position of the dropdown
    :return: a dict usable as one entry of ``layout.updatemenus``
    """
    return dict(
        buttons=[
            dict(
                args=[{f"{axis}axis": {"type": scale, "title": title}}],
                label=f"{axis.upper()}-axis: {scale.capitalize()}",
                method="relayout",
            )
            for scale in ("log", "linear")
        ],
        direction="down",
        showactive=True,
        pad={"r": 10, "t": 10},
        yanchor="top",
        y=y,
    )


# Tick positions for the log axes: the given multipliers for every power of ten.
y_tickvals = [mult * (10 ** pw) for pw in range(-1, 4) for mult in (1, 3)]
x_tickvals = [mult * (10 ** pw) for pw in range(-2, 3) for mult in (1, 2, 5)]

# One colour per library, assigned in order of first appearance.
colors = [
    "#1f78b4",
    "#33a02c",
    "#6a3d9a",
    "#f0027f",
    "#bf5b17",
    "#666666",
]

fig = make_subplots(
    shared_xaxes=True,
    subplot_titles=[
        f"Strided-window feature extraction - averaged over {n_runs} runs"
    ],
)

fig.update_layout(height=500)
fig.update_yaxes(type="log", tickvals=y_tickvals, title_text="Memory usage (MB)")
fig.update_xaxes(type="log", tickvals=x_tickvals, title_text="Runtime (s)")

library_list = []
for benchmark_name, (df_mem, df_cpu) in benchmark_dict.items():
    # The library is the part of the benchmark name before the first "_".
    library = benchmark_name.split("_")[0]
    uses_mp = "_mp" in benchmark_name.lower()

    if library not in library_list:
        library_list.append(library)
    color_str = colors[library_list.index(library)]

    # Multiprocessing variants share the library colour but use a dashed line.
    line_kwargs = {"line_dash": "dash"} if uses_mp else {}
    seconds = df_mem.index.total_seconds()

    # Mean memory usage over all runs.
    fig.add_trace(
        go.Scatter(
            x=seconds,
            y=df_mem["mean_rss"] / 1e6,
            name=benchmark_name,
            legendgroup=library,
            line_color=color_str,
            **line_kwargs,
        ),
        row=1,
        col=1,
    )

    # Invisible line at the per-timestamp maximum; "tonexty" fills the area
    # between it and the mean trace added just above, in a translucent version
    # of the library colour ("#rrggbb" -> "rgba(r, g, b, 0.1)").
    hex_digits = color_str.lstrip("#")
    rgb_vals = [str(int(hex_digits[k : k + 2], 16)) for k in (0, 2, 4)]
    fig.add_trace(
        go.Scatter(
            name="upper memory bound",
            x=seconds,
            y=df_mem["max_rss"] / 1e6,
            marker=dict(color="#444"),
            line=dict(width=0),
            mode="lines",
            fillcolor=f"rgba({', '.join(rgb_vals)}, 0.1)",
            fill="tonexty",
            showlegend=False,
            legendgroup=library,
        ),
        row=1,
        col=1,
    )

# Dropdowns to toggle each axis between logarithmic and linear scale.
updatemenus = [
    _axis_scale_menu("y", "Memory usage (MB)", y=0.9),
    _axis_scale_menu("x", "Runtime (s)", y=1.05),
]

fig.update_layout(updatemenus=updatemenus)
fig.show()

In [10]:
# !pip3 install datapane

In [11]:
def figs_to_html(
    figs: List["go.Figure"],
    html_path: Union[Path, str],
    append=False,
    include_plotlyjs=True,
):
    """Save a list of figures in a single HTML file.

    Each figure is rendered as an HTML fragment (``full_html=False``) and the
    fragments are written one after the other.

    :param figs: A list of plotly figures
    :param html_path: the HTML path where the figures will be saved; missing
        parent directories are created
    :param append: if True, the fragments are appended to an existing file,
        otherwise the file is overwritten
    :param include_plotlyjs: forwarded unchanged to ``to_html`` of every figure
    """
    html_path = Path(html_path)

    # exist_ok avoids the check-then-create race of `exists()` + `makedirs()`.
    html_path.parent.mkdir(parents=True, exist_ok=True)

    # Write UTF-8 explicitly so the output does not depend on the locale's
    # default encoding.
    with open(html_path, "a" if append else "w", encoding="utf-8") as f:
        for fig in figs:
            f.write(fig.to_html(full_html=False, include_plotlyjs=include_plotlyjs))


# Export the benchmark figure as a single HTML file.
figs_to_html(figs=[fig], html_path="benchmark.html", include_plotlyjs=True)

# Embed using datapane

In [12]:
#!pip3 install -U datapane

In [13]:
api_token = "<API TOKEN>"
!datapane login --token={api_token}

[32mConnected successfully to https://datapane.com as jonasvdd[0m


In [14]:
import datapane as dp 

# Wrap the figure in a datapane plot block and create a report from it.
plot_block = dp.Plot(fig)
report = dp.Report(plot_block)

In [15]:
# Publish the report under a fixed name.
report.upload(name="tsflex_benchmark_v2")

Uploading report and associated data - *please wait...*

Your report only contains a single element - did you know you can include additional plots, tables and text in a single report? Check out [the docs](https://docs.datapane.com/reports/blocks/layout-pages-and-selects) for more info

Report successfully uploaded, click [here](https://datapane.com/u/jonasvdd/reports/dkjVy5k/tsflex-benchmark-v2/) to view and share your report.