## How to Use

1. Run the job with `-B <rate>`.
2. Create `bags.csv` by running the following command in the log directory, then pass the path of the resulting file to `read_data(...)` in the loading cell below:
```
r2t2/scripts/parse_worker_info.py -i . -t BAG -o bags.csv
```

**Make sure** you have `numpy`, `matplotlib` and `pandas` installed.

In [None]:
import os
import sys
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from IPython.display import display

# Notebook-wide defaults: 12 x 10 figures, font size 16, and at most 20 rows
# shown when pandas renders a frame.
plt.rcParams.update({'figure.figsize': (12, 10), 'font.size': 16})
pd.set_option('display.max_rows', 20)

def show_more(d, lines):
    """Display ``d`` with pandas' row limit temporarily set to ``lines``.

    Args:
        d: Object handed to ``display`` (in this notebook, a DataFrame).
        lines: Value used for ``display.max_rows`` while ``d`` is rendered.

    The previous ``display.max_rows`` value is put back afterwards.
    """
    # option_context restores the previous setting on exit, including when
    # display() raises, so a failed render cannot leave the limit changed.
    with pd.option_context('display.max_rows', lines):
        display(d)
    
def show_all(d):
    """Display every row of ``d`` by raising the row limit to ``len(d)``."""
    row_count = len(d)
    return show_more(d, row_count)

def read_data(path):
    """Load a bag-event CSV and rebase its timestamps so the earliest is 0.

    Args:
        path: Location of the CSV file. It must contain a ``timestamp``
            column; the ``bag`` column is read as ``object`` so bag ids are
            kept as text instead of being parsed as numbers.

    Returns:
        A DataFrame in which ``timestamp`` has been replaced by
        ``timestamp_ms``, the offset of each row from the smallest timestamp
        in the file (presumably milliseconds, going by the column name --
        the unit is whatever the input file uses).
    """
    data = pd.read_csv(path, dtype={'bag': object})
    # Series.min() is vectorized and skips missing values. The builtin min()
    # iterates in Python and returns NaN when the first timestamp is missing,
    # which would turn every offset into NaN.
    earliest = data['timestamp'].min()
    data['timestamp_ms'] = data['timestamp'] - earliest
    return data.drop(columns='timestamp')

def show_bag(data, bag_id):
    """Display every event recorded for one bag, ordered by time.

    Args:
        data: Event table with ``bag`` and ``timestamp_ms`` columns.
        bag_id: Value of the ``bag`` column to select.
    """
    matching = data[data.bag == bag_id]
    ordered = matching.sort_values(by=['timestamp_ms'])
    # Renumber rows from 0 so the displayed index follows the time order.
    ordered = ordered.reset_index(drop=True)

    # NOTE: a row highlighter keyed on even ``hop`` values (applied through
    # ``ordered.style.apply(..., axis=1)``) was prototyped here and is
    # currently disabled.
    show_more(ordered, len(ordered))

In [None]:
# Load the bag event log used by every cell below.
# NOTE(review): the instructions at the top produce `bags.csv`, while this
# reads a hard-coded `/mnt/data/bags2.csv` -- adjust the path to your file.
bags = read_data("/mnt/data/bags2.csv")

In [None]:
def time_to_finish(x):
    """Return the gap between the first two values of ``x``.

    Args:
        x: Series-like object exposing ``to_numpy()``; values are taken in
            the order given, so callers sort beforehand.

    Returns:
        ``second - first``, or ``None`` when ``x`` holds fewer than two
        values. Entries after the second one are ignored.
    """
    values = x.to_numpy()
    if len(values) >= 2:
        first, second = values[0], values[1]
        return second - first
    return None

# Per bag: take its Created/Sealed/Submitted events in time order, then
# reduce them to the gap between the first two events (time_to_finish) and
# the largest recorded size.
d = (bags[bags.action.isin(["Created", "Sealed", "Submitted"])]
         .sort_values(by=['timestamp_ms'])
         .groupby("bag")
         .agg({'timestamp_ms': time_to_finish, 'size': 'max'}))

sizes = d['size'].to_numpy()
times = d.timestamp_ms.to_numpy()

# Very low alpha so dense regions show up as darker areas.
plt.scatter(times, sizes, marker='+', alpha=0.005, s=10)
plt.xlim(0,250)
plt.gcf().set_facecolor('white')
plt.ylabel('sizes (bytes)')
plt.xlabel('time (ms)')
# Format the Axes that was just drawn on. plt.axes() with no arguments adds
# a new, empty Axes to the figure instead of returning the current one.
plt.gca().ticklabel_format(scilimits=(0,0))

In [None]:
# Empirical CDF (normalized, cumulative histogram) of the largest recorded
# size of every bag whose id contains "T0/".
plt.hist(bags[bags.bag.str.contains("T0/")].groupby("bag").agg({'size': 'max'}).to_numpy(),
         density=True, cumulative=True, bins=200)
plt.gcf().set_facecolor('white')
plt.xlabel('Bag Size (bytes)')
plt.ylabel('CDF')
# Format the Axes that was just drawn on. plt.axes() with no arguments adds
# a new, empty Axes to the figure instead of returning the current one.
plt.gca().ticklabel_format(scilimits=(0,0))

In [None]:
# Per bag whose id contains "T0/": latest "Submitted" timestamp and largest
# recorded size.
d = (bags[bags.action.isin(["Submitted"]) & bags.bag.str.contains("T0/")]
         .sort_values(by=['timestamp_ms'])
         .groupby("bag")
         .agg({'timestamp_ms': 'max', 'size': 'max'}))

sizes = d['size'].to_numpy()
times = d.timestamp_ms.to_numpy()

plt.gcf().set_facecolor('white')
# Low alpha so dense regions show up as darker areas.
plt.scatter(times, sizes, alpha=0.01, marker='+')
plt.ylabel('sizes (bytes)')
plt.xlabel('time (ms)')
# Format the Axes that was just drawn on. plt.axes() with no arguments adds
# a new, empty Axes to the figure instead of returning the current one.
plt.gca().ticklabel_format(scilimits=(0,0))

In [None]:
# Leave the loaded event table as the cell's last expression so the notebook
# renders it.
bags