# ALI Performance Tests on Blake/Waterman
Performance comparison dashboard. Must be viewed in Jupyter to interact with plots.

In [None]:
import datetime as dt
import glob
import numpy as np
import pandas as pd
import json
import multiprocessing
import sys

import plotly.graph_objects as go

# Import scripts
sys.path.insert(0,'kcshan-perf-analysis')
from json2timeline import json2dataframe
from models import find_chgpts, single_ts_chgpts
from basicstats import *
from utils import *
from plotutils import *

In [None]:
# Load configuration file
with open('config.json') as jf:
    config = json.load(jf)
check_config(config)
# Expose every configuration entry as a notebook-level variable. The code
# below and in later cells reads json_regex, cases, names, timers, metadata,
# threshold and recency from this namespace.
# Direct namespace assignment is used instead of exec() so that text coming
# from the configuration file is never executed as Python code.
globals().update(config)

# Extract file names and collect data
files = glob.glob(json_regex)
all_df = json2dataframe(files, cases, names, timers, metadata)


# Log-transform the data before modeling
def xform(x):
    """Map raw measurements onto the (natural log) scale used for modeling."""
    return np.log(x)


def inv_xform(x):
    """Undo xform: map modeled values back to the original scale."""
    return np.exp(x)


# Add other metrics to name list: the two memory columns are treated as
# metrics from here on, so they are moved from `metadata` to `names`.
# list.remove raises ValueError if an entry is missing from `metadata`.
names.append('max host memory')
names.append('max kokkos memory')
metadata.remove('max host memory')
metadata.remove('max kokkos memory')

# Filter data by date if desired
#import datetime as dt
#all_df = all_df[all_df['date'] < dt.datetime.strptime('20200301', '%Y%m%d')]
#all_df = all_df[all_df['date'] > dt.datetime.strptime('20200101', '%Y%m%d')]

In [None]:
# Print the configured test cases, metrics and metadata fields, one indented
# entry per line, followed by the changepoint model threshold.
for heading, entries in (('Test cases:', cases),
                         ('Metrics:', names),
                         ('Metadata:', metadata)):
    print(heading)
    for entry in entries:
        print('  '+entry)
print("Model threshold: %f" % threshold)

In [None]:
# Turn numpy floating-point warnings into exceptions
np.seterr(all='raise')

# Find changepoints and format data to work nicely with plots
# seqs[case][metric] holds one DataFrame per test case and metric;
# events maps a changepoint date to {case: [metrics that changed]}.
seqs = {case:{} for case in cases}
most_recent = all_df['date'].max()
events = {}

print('Finding changepoints:')
# The detection for each metric runs in a pool of 4 worker processes. The
# context manager shuts the workers down when the loop finishes or raises;
# previously the pool was never closed and leaked processes on every re-run.
with multiprocessing.Pool(4) as pool:
    for case in cases:
        print(case)
        for name in names:
            # Select this case's series for one metric together with its
            # metadata, dropping rows where the metric was not recorded
            cols = ['date', name] + list(metadata)
            data = all_df.loc[all_df['case']==case, cols].dropna(subset=[name])
            data.reset_index(drop=True, inplace=True)
            # Every metric is stored under the common column name 'time'
            data.rename(columns={name:'time'}, inplace=True)
            data['time'] = xform(data['time'])
            seqs[case][name] = data

        # Each result is consumed as a (metric name, changepoints) pair
        pool_inputs = [(k, v, threshold) for k,v in seqs[case].items()]
        chgpts = dict(pool.map(single_ts_chgpts, pool_inputs))

        for name in names:
            # Calculate mean/std between changepoints
            seqs[case][name] = add_regime_stats(seqs[case][name], chgpts[name])

            # Build dictionary of changepoints, keyed by the date of the row
            # at each changepoint position
            for d in seqs[case][name]['date'].iloc[chgpts[name]]:
                events.setdefault(d, {}).setdefault(case, []).append(name)
clear_output()

# Sort and print results
events = {k:events[k] for k in sorted(events.keys())}
print('Events in the most recent %d days:' % recency)
recent_events = print_events(events, most_recent, recency)

In [None]:
################################################################################
#                               PLOT BOTH SERIES                               #
################################################################################

# Column of each series to draw, with the drawing mode and dash style that
# go with it (zipped position by position below).
lines = ['time', 'mean'] # 'upper', 'lower']
colors = ['darkred', 'midnightblue']
modes = ['markers', 'lines', 'lines', 'lines']
dashes = ['solid', 'solid', 'dot', 'dot']

fig = go.FigureWidget()
# Create series on plot: for every drawn column, one trace for each of the
# first two test cases, showing the first metric in `names`.
for series_key, draw_mode, dash_style in zip(lines, modes, dashes):
    for case_name, case_color in zip(cases[:2], colors):
        is_first_case = case_name == cases[0]
        series = seqs[case_name][names[0]]

        # Arguments shared by the raw-data trace and the summary-line traces
        trace_args = dict(
            x=series['date'],
            y=inv_xform(series[series_key]),
            mode=draw_mode,
            line=dict(color=case_color, dash=dash_style, width=1.5),
            legendgroup='g1' if is_first_case else 'g2',
        )
        if series_key == 'time':
            # Raw observations: named after the case, with hover details.
            # NOTE(review): customdata[2] and customdata[3] assume the
            # Albany and Trilinos commits sit at those positions of
            # ['date'] + metadata -- confirm against the configuration.
            trace_args.update(
                marker_symbol='square' if is_first_case else 'circle',
                name=case_name,
                customdata=series[['date']+list(metadata)],
                hovertemplate=(
                    "Date: %{customdata[0]}<br>" +
                    "Albany commit: %{customdata[2]}<br>" +
                    "Trilinos commit: %{customdata[3]}" +
                    "<extra></extra>"
                ),
            )
        else:
            # Summary lines: named after the column, no hover box
            trace_args.update(name=series_key, hoverinfo='skip')
        fig.add_trace(go.Scatter(**trace_args))

# The result is assigned back so the cell does not end in a bare expression.
fig = fig.update_layout(
    title='Performance Tests (nightly data)',
    xaxis_title='Simulation Date',
    yaxis_title='Wall-clock Time (s) or Memory (MiB)',
    margin=dict(l=30, r=30, b=30, t=30)
)


In [None]:
################################################################################
#                            PLOT PAIRED COMPARISON                            #
################################################################################
# Level passed to add_regime_stats(..., std_error=True, alpha=alpha) in
# paired_data and used for the 100*(1-alpha)% confidence interval printed by
# update_textbox.
alpha = 0.01

def paired_data(c1, c2, name):
    """Build the paired-difference series between two test cases.

    The series of `c1` and `c2` for metric `name` are inner-joined on date,
    so only dates present in both cases are kept. The 'time' column of the
    result is ``time(c1) - time(c2)`` on the transformed scale stored in
    `seqs`; the date and metadata columns are taken from `c1`.

    Parameters:
        c1: name of the first (baseline) case, a key of `seqs`.
        c2: name of the second (comparison) case, a key of `seqs`.
        name: metric name, a key of `seqs[c1]` and `seqs[c2]`.

    Returns:
        The DataFrame returned by add_regime_stats for the difference
        series, using the changepoints found by find_chgpts.
    """
    df1, df2 = seqs[c1][name].set_index('date'), seqs[c2][name].set_index('date')
    df = df1.join(df2, lsuffix='_c1', rsuffix='_c2', how='inner')
    df.reset_index(inplace=True)
    # 'date' was the join key and so has no suffix; give it the '_c1' suffix
    # so it can be selected together with the first case's metadata columns
    df.rename(columns={'date':'date_c1'}, inplace=True)
    timediff = (df['time_c1'] - df['time_c2'])
    # Take an explicit copy of the column subset: assigning a new column to
    # the bare selection triggers pandas' SettingWithCopyWarning.
    df = df[[c+'_c1' for c in ['date']+list(metadata)]].copy()
    df.columns = ['date']+list(metadata)
    df['time'] = timediff
    pts = find_chgpts(df['time'], alpha=threshold)[0]

    # Calculate mean/std between changepoints
    df = add_regime_stats(df, pts, std_error=True, alpha=alpha)
    return df

# Paired series for the default selection: first two cases, first metric
pair_df = paired_data(cases[0], cases[1], names[0])
lines_all = ['time', 'mean', 'upper', 'lower']
pair_color = 'rebeccapurple'

diff_fig = go.FigureWidget()
# One trace per column of the paired frame, all drawn in the same color
for series_key, draw_mode, dash_style in zip(lines_all, modes, dashes):
    # Arguments shared by the raw-data trace and the summary-line traces
    trace_args = dict(
        x=pair_df['date'],
        y=inv_xform(pair_df[series_key]),
        mode=draw_mode,
        line=dict(color=pair_color, dash=dash_style, width=1.5),
    )
    if series_key == 'time':
        # Paired observations, with hover details.
        # NOTE(review): customdata[2] and customdata[3] assume the Albany
        # and Trilinos commits sit at those positions of ['date'] + metadata
        # -- confirm against the configuration.
        trace_args.update(
            marker_symbol='circle',
            name='Ratio',
            customdata=pair_df[['date']+list(metadata)],
            hovertemplate=(
                "Date: %{customdata[0]}<br>" +
                "Albany commit: %{customdata[2]}<br>" +
                "Trilinos commit: %{customdata[3]}" +
                "<extra></extra>"
            ),
        )
    else:
        # Summary lines: named after the column, no hover box
        trace_args.update(name=series_key, hoverinfo='skip')
    diff_fig.add_trace(go.Scatter(**trace_args))

# NOTE(review): the plotted values are inv_xform(...) of a log difference,
# i.e. ratios whose "no difference" value is 1, yet the reference line is
# requested at 0 -- confirm what hv_line('h', 0) draws.
# The result is assigned back so the cell does not end in a bare expression.
diff_fig = diff_fig.update_layout(
    shapes=hv_line('h',0),
    title='Performance Comparison',
    xaxis_title='Simulation Date',
    yaxis_title='Relative Performance (speedup, slowdown)',
    margin=dict(l=30, r=30, b=30, t=30)
)

In [None]:
################################################################################
#                        HISTOGRAM OF PAIRED COMPARISON                        #
################################################################################
def latest_data(df):
    """Return the observations belonging to the most recent regime.

    The most recent regime is the trailing run of rows whose 'mean' equals
    the 'mean' of the last row.

    Parameters:
        df: DataFrame with 'date', 'time' and 'mean' columns, ordered from
            oldest to newest.

    Returns:
        (start_date, values): the 'date' of the first row of that run, and a
        numpy array with the run's 'time' values ordered newest first.

    Raises:
        ValueError: if `df` has no rows.
    """
    n = df.shape[0]
    if n == 0:
        raise ValueError('latest_data requires at least one observation')

    means = df['mean'].to_numpy()
    # Walk backwards from the last row to the first row of the final run.
    # Stopping at row 0 handles a series without any changepoint, where the
    # previous implementation reported the date of the second row (and
    # failed outright on a single-row frame).
    start = n - 1
    while start > 0 and means[start - 1] == means[n - 1]:
        start -= 1

    # Newest-first ordering is kept for backward compatibility
    pts = df['time'].to_numpy()[start:][::-1]
    return df['date'].iloc[start], np.array(pts)

# Histogram of the paired values observed since the latest changepoint of
# the paired series, transformed back with inv_xform.
hist = go.FigureWidget()
hist.add_trace(
    go.Histogram(
        x=inv_xform(latest_data(pair_df)[1]),
        name='Difference in ' + names[0],
        marker_color=pair_color,
    )
)
# NOTE(review): the histogram shows inv_xform(...) values (ratios), but the
# reference line is requested at 0 -- confirm what hv_line('v', 0) draws.
# The result is assigned back so the cell does not end in a bare expression.
hist = hist.update_layout(
    shapes=hv_line('v',0),
    barmode='overlay',
    title='Histogram of the relative performance since latest changepoint',
    xaxis_title='Relative Performance (speedup, slowdown)',
    yaxis_title='Count',
    legend_orientation='h',
    legend=dict(x=0, y=1.11, bgcolor=None),
    margin=dict(l=30, r=30, b=30, t=30)
)

In [None]:
################################################################################
#                              SUMMARY STATISTICS                              #
################################################################################
from ipywidgets import Output

# Bordered output area that receives the printed summary statistics and
# t-test results; it is cleared and refilled by update_textbox.
textbox = Output(layout=dict(border='1px solid black', width='40%'))


def date2str(date):
    """Format a date as abbreviated month name and day, e.g. 'Mar 01'."""
    return dt.datetime.strftime(date, '%b %d')

def print_summary(x, indent=2, pct=False):
    """Print the sample size, mean and standard deviation of `x`.

    The mean and standard deviation are the two values returned by
    trimmed_stats(x, var=False).

    Parameters:
        x: sequence of observations.
        indent: number of spaces placed before each of the three printed
            lines. Previously only the first line honored this value; the
            other two were always indented by two spaces. Output for the
            default indent is unchanged.
        pct: when true, the mean and standard deviation are multiplied by
            100 and printed with a trailing percent sign.
    """
    N = len(x)
    mean, std = trimmed_stats(x, var=False)
    pad = ' '*indent
    if pct:
        rows = ['N   : %d' % N,
                'mean: %.2f%%' % (100*mean),
                'std : %.2f%%' % (100*std)]
    else:
        rows = ['N   : %d' % N,
                'mean: %.2f' % mean,
                'std : %.2f' % std]
    print('\n'.join(pad + row for row in rows))

def update_textbox(c1=cases[0], c2=cases[1], n=names[0]):
    """Rewrite the summary textbox for baseline `c1`, comparison `c2`, metric `n`.

    Prints summary statistics for each case since its own latest changepoint
    and for the paired series since its latest changepoint. When the paired
    series has more than two recent observations, the standard error, the
    t-test p-value and a 100*(1-alpha)% confidence interval are printed too.
    """
    def signif(pval):
        # One asterisk for each of the thresholds 0.05, 0.01, 0.001 passed
        return '*'*(int(pval<0.05)+int(pval<0.01)+int(pval<0.001))

    with textbox:
        textbox.clear_output()
        base_start, base_vals = latest_data(seqs[c1][n])
        comp_start, comp_vals = latest_data(seqs[c2][n])
        pair_start, pair_vals = latest_data(paired_data(c1, c2, n))

        # Summary statistics
        print('Data since latest changepoints')
        print('Baseline (since %s):\n  %s' % (date2str(base_start), c1))
        print_summary(inv_xform(base_vals))
        print('Comparison (since %s):\n  %s' % (date2str(comp_start), c2))
        print_summary(inv_xform(comp_vals))

        print('Paired observations (since %s):' % date2str(pair_start))
        print_summary(inv_xform(pair_vals))
        if len(pair_vals) <= 2:
            print('  Not enough data for confidence interval')
        else:
            # Statistics are computed on the transformed scale; only the
            # interval bounds are mapped back with inv_xform when printed.
            _, paired_pval = ttest(pair_vals, with_pval=True)
            pair_mean, pair_std = trimmed_stats(pair_vals, var=False)
            se_paired = pair_std/np.sqrt(len(pair_vals))
            # NOTE(review): len-2 degrees of freedom are used here; confirm
            # this is intended to match how trimmed_stats treats the data.
            half_width = se_paired*tdist.isf(alpha/2, len(pair_vals)-2)

            print('  SE  : %.2g' % (se_paired))
            print('  t-test p-value: %.2g%s' % (paired_pval, signif(paired_pval)))
            print('  %g%% CI: (%.2f, %.2f)' %
                  (100*(1-alpha),
                   inv_xform(pair_mean-half_width),
                   inv_xform(pair_mean+half_width)))


# Fill the textbox once for the default selection
update_textbox()

In [None]:
################################################################################
#                         FUNCTIONS TO UPDATE FIGURES                          #
################################################################################

def update_figdata(figdata, **kwargs):
    """Assign every keyword argument to `figdata` by item assignment.

    Each property is set with ``figdata[key] = value``, so a passed value
    replaces whatever was stored under that key.
    """
    for prop in kwargs:
        figdata[prop] = kwargs[prop]

# Function that will update all chart elements based on dropdowns
def update(Baseline=cases[0], Comparison=cases[1], Timer=names[0]):
    """Refresh all chart elements for the selected dropdown values.

    Parameters:
        Baseline: name of the baseline test case.
        Comparison: name of the comparison test case.
        Timer: name of the metric to display.

    Updates the traces of `fig`, `diff_fig` and `hist` in place and rewrites
    the summary textbox.
    """
    c1, c2, n = Baseline, Comparison, Timer
    pair_df = paired_data(c1, c2, n)
    meta_cols = ['date']+list(metadata)

    with fig.batch_update():
        # fig holds, for each drawn column, one trace for the baseline
        # followed by one for the comparison; walk them in that order.
        trace_no = 0
        for line, _mode, _dash in zip(lines, modes, dashes):
            for case in (c1, c2):
                series = seqs[case][n]
                update_figdata(
                    fig.data[trace_no],
                    x=series['date'],
                    y=inv_xform(series[line]),
                    # The first two traces are named after their case, the
                    # remaining ones after the column they draw
                    name=case if trace_no<2 else line,
                    customdata=series[meta_cols],
                )
                trace_no += 1

    with diff_fig.batch_update():
        for pos, line in enumerate(lines_all):
            update_figdata(
                diff_fig.data[pos],
                x=pair_df['date'],
                y=inv_xform(pair_df[line]),
                customdata=pair_df[meta_cols],
            )

    with hist.batch_update():
        # xbins is overwritten with an empty dict together with the new data
        update_figdata(
            hist.data[0],
            x=inv_xform(latest_data(pair_df)[1]),
            xbins={},
        )

    update_textbox(c1, c2, n)

In [None]:
################################################################################
#                               DASHBOARD LAYOUT                               #
################################################################################

from ipywidgets import interactive, HBox, VBox, HTML, Layout

# One selector per argument of update(), offering every case / metric
widget = interactive(
    update,
    Baseline=list(cases),
    Comparison=list(cases),
    Timer=list(names),
)
# All children except the last go into the control row
# (presumably the last child is the interactive output area -- confirm).
controls = HBox(widget.children[:-1], layout=Layout(flex_flow='row wrap'))

# Dashboard: controls on top, the two time-series figures below, then the
# histogram and summary textbox side by side.
report = VBox([
    controls,
    fig,
    diff_fig,
    HBox([hist, textbox]),
])

# Draw everything once with the default selection, then display the dashboard
update()
report

### Performance tests (nightly data)
Changepoints are estimated using a generalized likelihood ratio method on each timer. 
* Markers: recorded wall-clock time or memory
* Solid line: average wall-clock time or memory between changepoints

### Performance Comparison
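Observations from the two cases are joined by date, and we take the difference between the log of the baseline and the log of the comparison (log baseline − log comparison). A generalized likelihood ratio test is used to determine changepoints in this difference. The difference is then transformed back with the exponential, so the plot shows relative performance as the ratio baseline/comparison: values above 1 mean the comparison case used less time or memory than the baseline (speedup), and values below 1 mean it used more (slowdown).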
* Markers: relative performance (baseline/comparison ratio) of each paired observation
* Solid line: average relative performance between changepoints (averaged on the log scale)
* Dotted lines: upper and lower bounds of a 99% confidence interval for the average. Note: the confidence interval is based on a t-statistic, so for very small amounts of data (<5), the interval may be very large.

### Histogram of relative performance since latest changepoint
Using data since the most recently detected changepoint, we plot a histogram of the relative performance.

### Statistical findings
Summary statistics for the individual timers are shown since their most recent changepoint. For relative performance, we consider data since the most recent changepoint in the difference time series. We use a t-test to evaluate whether the difference in performance is statistically significant. One, two, and three asterisks indicate significance levels of 0.05, 0.01, and 0.001, respectively. We also include a confidence interval for the relative performance; this interval may not be centered on the mean, since the interval is constructed based on the logarithm of times, and then transformed to a relative performance.