# Compare forward slices

Compare the sizes of the forward slices produced by nbslicer (called `nbsafety` in the code and result files below) and by nbgather.

In [1]:
import ast
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import matplotlib as mpl
import seaborn as sns
import sqlite3

In [3]:
# Read results

def _read_stats_rows(path):
    """Return the comma-split fields of every data row of a stats file.

    The first line of the file is skipped.  Every remaining line has its
    first character and its last two characters dropped before it is split
    on commas (presumably the wrapping parentheses and the trailing newline
    -- confirm against whatever writes these files).
    """
    # Read-only mode is sufficient (the data is never written back), and the
    # context manager closes the handle as soon as the rows are loaded.
    with open(path, "r") as stats_file:
        return [row[1:-2].split(",") for row in stats_file.readlines()[1:]]


# Read nbsafety file
nbsafety_rows = _read_stats_rows("../eda/forward/results/nbsafety_stats.txt")

# (trace, session) -> the last three integer fields of the row
nbsafety_processed_trace_sessions = {
    (int(row[0].strip()), int(row[1].strip())): (
        int(row[-3].strip()),
        int(row[-2].strip()),
        int(row[-1].strip()),
    )
    for row in nbsafety_rows
}

# (trace, session) -> comma-separated text taken from the middle fields of the
# row (everything between the two ids and the last three integers), with the
# first two and the last character removed.  It is pasted verbatim into the
# SQL "IN (...)" list below.
program_cells = {
    (int(row[0].strip()), int(row[1].strip())): ",".join(row[2:-3])[2:-1]
    for row in nbsafety_rows
}

# Read nbgather file
nbgather_rows = _read_stats_rows("../eda/forward/results/nbgather_stats.txt")

# (trace, session) -> integer stored in the third field of the row
nbgather_processed_trace_sessions = {
    (int(row[0].strip()), int(row[1].strip())): int(row[2].strip())
    for row in nbgather_rows
}

conn = sqlite3.connect('../data/traces.sqlite', isolation_level=None)
cursor = conn.cursor()
program_lengths = {}  # (trace, session) -> number of non-blank source lines
stmt_lengths = {}     # (trace, session) -> number of top-level statements

for k, parsed_cells in program_cells.items():
    TRACE_ID, SESSION_ID = k
    # NOTE(review): the query is assembled by string interpolation; the
    # values come from the local stats file read above.
    df = pd.read_sql_query(f"SELECT cell_execs.* from cell_execs WHERE trace={TRACE_ID} and session={SESSION_ID} and counter IN ({parsed_cells})", conn)
    # Build the program text once and reuse it for both measurements.
    program_source = '\n'.join(df['source'].to_list())
    source_lines = [elem for elem in program_source.split('\n') if elem.strip() != '']
    program_lengths[k] = len(source_lines)
    try:
        stmt_lengths[k] = len(ast.parse(program_source).body)
    except (SyntaxError, ValueError):
        # The source is not parseable as plain Python; fall back to the
        # non-blank line count.  Only parse failures are caught so that
        # interrupts and unrelated errors still surface.
        stmt_lengths[k] = len(source_lines)

In [11]:
# Display the parsed nbgather stats: (trace, session) -> integer from the third field of each row.
nbgather_processed_trace_sessions

{(9, 16): 1,
 (9, 75): 1,
 (9, 81): 17,
 (9, 85): 9,
 (9, 138): 1,
 (9, 178): 2,
 (9, 193): 6,
 (9, 195): 1,
 (9, 196): 1,
 (9, 332): 1,
 (9, 455): 2,
 (9, 507): 1,
 (11, 861): 18,
 (26, 179): 1,
 (26, 305): 1,
 (31, 13): 23,
 (32, 1): 14,
 (36, 7): 1,
 (36, 13): 8,
 (36, 122): 43,
 (73, 25): 1,
 (80, 35): 1,
 (88, 153): 1,
 (114, 36): 95,
 (121, 25): 19,
 (135, 57): 1,
 (135, 330): 5,
 (135, 331): 8,
 (135, 334): 2,
 (135, 349): 1,
 (135, 445): 9,
 (135, 708): 4,
 (135, 710): 7,
 (135, 711): 8,
 (135, 774): 1,
 (135, 816): 46,
 (135, 831): 14,
 (156, 49): 5,
 (165, 153): 8,
 (182, 25): 3,
 (183, 952): 1,
 (183, 1696): 4,
 (191, 118): 1,
 (191, 220): 1,
 (191, 226): 1,
 (195, 5): 4,
 (198, 1061): 35,
 (198, 1122): 2,
 (204, 5): 1,
 (215, 4): 4,
 (215, 25): 1,
 (217, 25): 10,
 (217, 50): 1,
 (217, 95): 2,
 (217, 141): 18,
 (217, 142): 25,
 (217, 147): 21,
 (217, 151): 8,
 (217, 170): 1,
 (217, 171): 2,
 (217, 184): 18,
 (217, 185): 1,
 (217, 186): 38,
 (217, 189): 1,
 (217, 195): 1,
 (2

In [13]:
# Count number of statements in the slices

import os
import ast


def _iter_slice_programs(results_dir):
    """Yield ``((trace, session), program_text)`` for each slice file.

    Only directory entries whose name contains "txt" are considered.  The
    trace id is the part of the file name before the first underscore; the
    session id is the part after it, up to the first dot.
    """
    for session_filename in os.listdir(results_dir):
        if "txt" not in session_filename:
            continue
        name_parts = session_filename.split('_')
        trace = int(name_parts[0])
        session = int(name_parts[1].split('.')[0])
        # Read-only access is enough, and the context manager closes the
        # handle instead of leaking one descriptor per slice file.
        with open(os.path.join(results_dir, session_filename), "r") as slice_file:
            program = slice_file.read()
        yield (trace, session), program


# (trace, session) -> number of top-level statements in the nbsafety slice.
# A slice that fails to parse raises here (no fallback is defined for it).
nbsafety_num_stmts = {}
for key, program in _iter_slice_programs("../eda/forward/results/nbsafety"):
    nbsafety_num_stmts[key] = len(ast.parse(program).body)

# (trace, session) -> number of top-level statements in the nbgather slice.
nbgather_num_stmts = {}
for key, program in _iter_slice_programs("../eda/forward/results/nbgather"):
    try:
        nbgather_num_stmts[key] = len(ast.parse(program).body)
    except (SyntaxError, ValueError):
        # Unparseable slice: fall back to the value recorded in the nbgather
        # stats file when there is one; otherwise the session gets no entry.
        if key in nbgather_processed_trace_sessions:
            nbgather_num_stmts[key] = nbgather_processed_trace_sessions[key]

In [15]:
# One two-column frame per measurement, all keyed by the (trace, session) tuple.
nbsafety_df = pd.DataFrame({"key": nbsafety_processed_trace_sessions.keys(), "nbsafety_line_size": nbsafety_processed_trace_sessions.values()})
nbgather_df = pd.DataFrame({"key": nbgather_processed_trace_sessions.keys(), "nbgather_line_size": nbgather_processed_trace_sessions.values()})
line_length_df = pd.DataFrame({"key": program_lengths.keys(), "num_lines": program_lengths.values()})
stmt_length_df = pd.DataFrame({"key": stmt_lengths.keys(), "num_stmts": stmt_lengths.values()})

nbsafety_stmt_df = pd.DataFrame({"key": nbsafety_num_stmts.keys(), "nbsafety_num_stmts": nbsafety_num_stmts.values()})
nbgather_stmt_df = pd.DataFrame({"key": nbgather_num_stmts.keys(), "nbgather_num_stmts": nbgather_num_stmts.values()})

# Keep only the sessions present in every table.  Each table is merged exactly
# once: merging nbgather_df a second time only duplicated its column under
# the suffixed names "nbgather_line_size_x" / "nbgather_line_size_y".
full_df = (
    nbsafety_df
    .merge(nbgather_df, on=['key'], how='inner')
    .merge(line_length_df, on=['key'], how='inner')
    .merge(stmt_length_df, on=['key'], how='inner')
    .merge(nbsafety_stmt_df, on=['key'], how='inner')
    .merge(nbgather_stmt_df, on=['key'], how='inner')
)

# "nbsafety_line_size" holds a 3-tuple per row at this point; spread it over
# three columns (the third element overwrites "nbsafety_line_size" itself).
full_df[['cell_to_rerun', 'nbsafety_cell_size', 'nbsafety_line_size']] = pd.DataFrame(full_df['nbsafety_line_size'].to_list(), index=full_df.index)

In [16]:
# Cap nbgather's statement count at the statement count of the whole program:
# rows where the program is not strictly larger get the program's count.

full_df["nbgather_num_stmts"] = full_df["nbgather_num_stmts"].where(
    full_df["num_stmts"] > full_df["nbgather_num_stmts"],
    full_df["num_stmts"],
)

In [17]:
# Slice size of each tool as a fraction of the program's statement count.
full_df['nbsafety_slice_frac'] = full_df['nbsafety_num_stmts'].div(full_df['num_stmts'])
full_df['nbgather_slice_frac'] = full_df['nbgather_num_stmts'].div(full_df['num_stmts'])
# Positive values mean the nbgather slice is the larger of the two.
full_df['diff_frac'] = (
    full_df['nbgather_num_stmts']
    .sub(full_df['nbsafety_num_stmts'])
    .div(full_df['num_stmts'])
)

In [18]:
# Summary statistics of the three fraction columns.
full_df.loc[:, ["nbsafety_slice_frac", "nbgather_slice_frac", "diff_frac"]].describe()

Unnamed: 0,nbsafety_slice_frac,nbgather_slice_frac,diff_frac
count,328.0,328.0,328.0
mean,0.093295,0.201735,0.10844
std,0.202751,0.268158,0.199596
min,0.003205,0.003205,-0.238636
25%,0.015625,0.019003,0.0
50%,0.021739,0.071429,0.0
75%,0.052632,0.296474,0.133333
max,1.0,1.0,0.95


In [19]:
from scipy.stats import wilcoxon

# Wilcoxon test on the per-session slice fractions, nbgather first and
# nbsafety second; rows are paired by session.
w, p = wilcoxon(
    full_df['nbgather_slice_frac'].values,
    full_df['nbsafety_slice_frac'].values,
)

# Test statistic, then p-value, one per line.
print(w, p, sep="\n")

282.0
8.77793219758263e-24


In [20]:
# Configure Matplotlib for LaTeX-rendered figures written through the pgf backend.
plt.rcParams.update({
    "font.family": "serif",  # use serif/main font for text elements
    "text.usetex": True,     # use inline math for ticks
    "pgf.rcfonts": False,    # don't setup fonts from rc parameters
    "font.size": 12,
    "axes.labelsize": "large",
    "pgf.texsystem": "pdflatex",
    # The preamble is given as ONE newline-separated string: passing a list
    # here is deprecated by Matplotlib, which expects a single string.
    "pgf.preamble": "\n".join([
        r'\usepackage[T1]{fontenc}',
        r'\usepackage[utf8]{inputenc}',
        r'\usepackage{amsmath}',
        r'\usepackage{newtxtext}',
        r'\usepackage{newtxmath}',
#         r'\usepackage[lite,subscriptcorrection,slantedGreek,nofontinfo,amsbb,eucal]{mtpro2}'
    ]),
})
sns.set_style('ticks')
# Switch to the pgf backend so that savefig can emit .pgf files.
mpl.use('pgf')

  self[key] = other[key]


In [21]:
def plot_num_lines_histogram(df, filename=None):
    """Draw overlaid histograms of the nbgather and nbsafety slice fractions.

    Despite the name, the plotted quantities are the ``nbgather_slice_frac``
    and ``nbsafety_slice_frac`` columns of ``df``, binned into 49 equal-width
    bins over [0, 1].

    Args:
        df: DataFrame providing the two ``*_slice_frac`` columns.
        filename: where to save the figure; when falsy nothing is saved.
    """
    plt.clf()
    plt.figure(figsize=(6, 3))
    plt.grid(linestyle=':')

    # 50 edges -> 49 bins shared by both histograms so the bars line up.
    bin_edges = np.linspace(0, 1, 50)
    for column in ('nbgather_slice_frac', 'nbsafety_slice_frac'):
        plt.hist(df[column], bins=bin_edges, alpha=0.3)

    # Labels follow the order in which the histograms were drawn.
    legend_labels = (r'\textrm{\texttt{nbgather}}', r'\textrm{\textsc{nbslicer}}')
    plt.legend(legend_labels, loc='upper right')
    plt.xlabel(r"\textrm{Slice size}")
    plt.ylabel(r"\textrm{Count}")
    plt.tight_layout()

    if not filename:
        return
    plt.savefig(filename)
        
# Plot the slice-fraction histogram for the rows of full_df and save it as "slice_size_hist_forward.pgf".
plot_num_lines_histogram(full_df, "slice_size_hist_forward.pgf")