# Transformers Batching Performance Testing

This notebook is intended for testing the pseudonymization performance for different transformers batch sizes.

Make sure to disable Windows Hibernate/Sleep when running long tests; otherwise the measured times will be distorted.

In [None]:
import mailcom.inout
import mailcom.parse
import pandas as pd
import time
import datetime
import matplotlib.pyplot as plt
import difflib

Below is the function for a single performance test. The model is loaded from scratch every iteration, and the csv file is reread every time, to create equal conditions for every batching size.

The test returns the total email processing time averaged over all emails, as well as a list of all email output dictionaries as created in the pseudonymization:

Email dict structure:

- `content`: Original email text
- `pseudo_content`: Pseudonymized email text

In [None]:
def performance_test(csv_file, batch_size):
    """Run one timed pseudonymization pass over all emails of a CSV file.

    The CSV file is re-read and the pseudonymization object is re-created on
    every call, so each batch size is measured under the same conditions.
    A figure with two subplots is drawn: stacked per-email times
    (pre-processing + pseudonymization) and a summary of CSV reading time,
    model loading time and the average total time per email.

    Args:
        csv_file: CSV file (anything accepted by ``pandas.read_csv``) that
            provides a ``message`` column holding the email texts.
        batch_size: Value handed to ``Pseudonymize.set_sentence_batch_size``.

    Returns:
        A tuple ``(average_email_time, out_list)``. ``average_email_time`` is
        the total processing time per email in seconds, averaged over all
        processed emails. ``out_list`` holds one dict per processed email
        with the keys ``content`` and ``pseudo_content``.

    Raises:
        ValueError: If the CSV file contains no non-empty message, because
            no average can be computed in that case.
    """
    print("-----------------------------------")
    print("Starting performance test for batch size", batch_size)
    # create t0 timestamp
    t0 = time.time()

    # import files from csv file
    email_list = pd.read_csv(csv_file)
    t_csv_read = time.time()

    # create pseudonymization object
    ps = mailcom.parse.Pseudonymize()
    ps.init_spacy("fr")
    ps.init_transformers()
    ps.set_sentence_batch_size(batch_size)
    # time stamp after model loading
    t_model_loaded = time.time()

    # loop over mails and pseudonymize them
    out_list = []
    # Index labels and durations are recorded only for emails that were
    # actually processed, so all plotted lists always have the same length.
    idx_list = []
    email_ppr_list = []
    email_duration_list = []
    email_total_list = []
    t_last_email_end = t_model_loaded
    for idx, row in email_list.iterrows():
        # email start time
        ts_email_start = time.time()
        text = row["message"]
        # read_csv turns empty cells into NaN, which is truthy, so it has to
        # be checked explicitly in addition to the empty string
        if pd.isna(text) or not text:
            continue
        email_dict = {"content": text}
        # Pseudonymization is usually done using ps.pseudonymize
        # For performance analysis the process is split into its subprocesses here
        ps.reset()
        sentences = ps.get_sentences(text)
        batches = ps.split_batches(sentences)
        ts_email_ppr_done = time.time()  # preprocessing complete
        pseudonymized_batches = []
        for batch in batches:
            batch = ps.concatenate(batch)
            batch = ps.pseudonymize_email_addresses(batch)
            ner = ps.get_ner(batch)
            ps_sent = " ".join(ps.pseudonymize_ne(ner, batch)) if ner else batch
            ps_sent = ps.pseudonymize_numbers(ps_sent)
            pseudonymized_batches.append(ps_sent)
        output_text = ps.concatenate(pseudonymized_batches)

        # add output to dict
        email_dict["pseudo_content"] = output_text
        out_list.append(email_dict)

        # timestamp after this email
        ts_email_end = time.time()
        idx_list.append(idx)
        email_ppr_list.append(ts_email_ppr_done - ts_email_start)
        email_duration_list.append(ts_email_end - ts_email_ppr_done)
        email_total_list.append(ts_email_end - ts_email_start)
        t_last_email_end = ts_email_end

    if not out_list:
        raise ValueError(f"No non-empty messages found in {csv_file!r}")

    # display timestamps
    plt.figure(figsize=(10, 4), dpi=80)

    # plot 1: stacked bars for each individual email
    email_bar_height = {
        "Pre-Processing": email_ppr_list,
        "Pseudonymization": email_duration_list,
    }
    bt = [0] * len(idx_list)
    plt.subplot(1, 2, 1)
    for key, height in email_bar_height.items():
        plt.bar(idx_list, height, 0.5, label=key, bottom=bt)
        # raise the baseline so that the next series is stacked on top
        bt = [bi + hi for (bi, hi) in zip(bt, height)]
    plt.xlabel("Email")
    plt.ylabel("t [s]")
    plt.title("Computation times for emails, model loading and file reading")
    plt.legend()

    # plot 2: file reading, model loading and average email time
    bar_x = ["CSV Reading", "Model Loading", "Average Email Time"]
    average_email_time = sum(email_total_list) / len(email_total_list)
    # model loading is measured from the end of the CSV read, so the CSV
    # reading time is not counted twice
    bar_y = [t_csv_read - t0, t_model_loaded - t_csv_read, average_email_time]
    plt.subplot(1, 2, 2)
    plt.bar(bar_x, bar_y, 0.5)
    # the label is set after the subplot is selected so it lands on plot 2
    plt.ylabel("t [s]")

    # Total time from model loading to the end of the last email. It is
    # formatted by hand because a duration is not a point in time: going
    # through datetime.fromtimestamp would apply the local UTC offset and
    # wrap around after 60 minutes.
    minutes, seconds = divmod(int(t_last_email_end - t_model_loaded), 60)
    print("Total time:", f"{minutes:02d}:{seconds:02d}")

    # plt.savefig("out/mailcom_batching_performance_n_" + str(batch_size) + datetime.datetime.fromtimestamp(t0).strftime('%H%M%S') + ".png")
    print("-----------------------------------")

    return average_email_time, out_list

Below the testing setup is configured. The tested batching sizes are set in `batching_sizes`. For each batching size, `n_samples` independent runs are executed and the performances are averaged.

In [None]:
# Test configuration: input file, batch sizes under test and the number of
# independent runs per batch size.
csv_file = "../mailcom/test/data/mails_lb_sg.csv"
batching_sizes = [-1, 1, 2, 3, 4, 6, 8, 10]
# batching_sizes = [1, 10]
n_samples = 5

# Collected results, one dict per (batch size, sample) run.
outputs = []

# Warm-up run whose result is discarded, since there seem to be some
# inconsistencies when loading for the first time.
_ = performance_test(csv_file, -1)

# Actual measurements: n_samples runs for every batch size.
for batch_size in batching_sizes:
    for sample_id in range(n_samples):
        avg_time, email_outputs = performance_test(csv_file, batch_size)
        outputs.append({
            "batch_size": batch_size,
            "sample": sample_id,
            "email_outputs": email_outputs,
            "average_email_time": avg_time,
        })


An error-bar plot displaying the average email processing time for each batch size, averaged over all samples.

In [None]:
# Average the per-run email times over the n_samples runs of every batch size
# and plot them with their sample standard deviation as error bars.

# list for email-averaged total processing time, now to be averaged over n_samples
# (one entry per batch size, in the order of batching_sizes)
average_email_times_for_batches = [0]*len(batching_sizes)
# corresponding sample standard deviations (stay 0 if n_samples == 1)
std_email_times_for_batches = [0]*len(batching_sizes)
# manual computation of the average over n_samples
for output in outputs:
    # add times to list position corresponding to batch size
    average_email_times_for_batches[batching_sizes.index(output["batch_size"])] += output["average_email_time"]
# divide by the number of samples
average_email_times_for_batches = [avt/n_samples for avt in average_email_times_for_batches]

# if n_samples > 1, calculate sample standard deviation
if n_samples > 1:
    for output in outputs:
        # add squared time deviations to list position corresponding to batch size
        ix = batching_sizes.index(output["batch_size"])
        std_email_times_for_batches[ix] += (output["average_email_time"] - average_email_times_for_batches[ix])**2
    # s = sqrt(sum of squared deviations / (n - 1)): the division has to
    # happen inside the square root, otherwise the result is not a standard
    # deviation
    std_email_times_for_batches = [(sq_sum/(n_samples-1))**0.5 for sq_sum in std_email_times_for_batches]

# plot
plt.errorbar(batching_sizes, average_email_times_for_batches, yerr=std_email_times_for_batches, linestyle='None', marker='.', capsize=2, elinewidth=1)
plt.xlabel("batch size n")
plt.ylabel("Average Email Time [s]")
plt.ylim(bottom=0)
plt.title("Average email time for different batch sizes")
# dashed vertical line at x = 0, drawn well past the highest data point
plt.vlines(0, 0, (max(average_email_times_for_batches) + max(std_email_times_for_batches) + 10), colors="gray", linestyles="--")
plt.grid(which='major', color='#666666', linestyle='--', alpha = 0.8)
plt.grid(which='minor', color='#666666', linestyle='--', alpha = 0.3)
plt.minorticks_on()

Testing for quality differences by displaying differences in the pseudonymized text using ``difflib``. Deltas are only printed if there are differences between the pseudonymized texts. Matching ratios are calculated using ``difflib.SequenceMatcher().ratio()``

In [None]:
# Reference result: the pseudonymized texts of the first recorded run with the
# standard batch size. Every run is compared against these texts.
standard_batch_size = 1 # batch size with best qualitative results
standard_output = next(output for output in outputs if output["batch_size"] == standard_batch_size)
standard_ps_texts = [email_dict["pseudo_content"] for email_dict in standard_output["email_outputs"]]

# Walk through all runs, print the differing lines and store the
# SequenceMatcher ratio averaged over the emails of each run.
for output in outputs:
    print(f"----- Comparing batch size {output['batch_size']} sample {output['sample']} to standard {standard_batch_size}: -----")
    # pseudonymized texts produced by this run
    ps_texts = [email_dict["pseudo_content"] for email_dict in output["email_outputs"]]
    # running sum of the matching ratios of this run
    ratio_sum = 0.
    for idx, (text, stdtext) in enumerate(zip(ps_texts, standard_ps_texts)):
        print(f"--- Comparing email text {idx} ---")
        # keep only the lines that ndiff marks as added or removed
        changed_lines = (
            diff_line
            for diff_line in difflib.ndiff(stdtext.splitlines(keepends=True), text.splitlines(keepends=True))
            if diff_line.startswith(('+ ', '- '))
        )
        for diff_line in changed_lines:
            print(f"Delta in batch size {output['batch_size']} at sample {output['sample']}:")
            print(diff_line, end='')
        # similarity of the complete texts, 1.0 means identical
        rt = difflib.SequenceMatcher(None, stdtext, text).ratio()
        ratio_sum += rt
        if rt != 1.0:
            print(f"Delta in batch size {output['batch_size']} at sample {output['sample']}: Matching ratio is {rt}")

    # average over all emails of this run
    average_sqm_ratio = ratio_sum / len(ps_texts)
    output["average_sqm_ratio"] = average_sqm_ratio

In [None]:
# average the SequenceMatcher ratios over n_samples for all batching sizes
# (one entry per batch size, in the order of batching_sizes)
average_sqm_ratio_for_batches = [0.]*len(batching_sizes)
# manually average the ratios over n_samples
for output in outputs:
    average_sqm_ratio_for_batches[batching_sizes.index(output["batch_size"])] += output["average_sqm_ratio"]
average_sqm_ratio_for_batches = [asr/n_samples for asr in average_sqm_ratio_for_batches]

# if n_samples > 1, calculate the sample standard deviation (stays 0 otherwise)
std_sqm_ratio_for_batches = [0.]*len(batching_sizes)
if n_samples > 1:
    for output in outputs:
        # add squared deviations to list position corresponding to batch size
        ix = batching_sizes.index(output["batch_size"])
        std_sqm_ratio_for_batches[ix] += (output["average_sqm_ratio"] - average_sqm_ratio_for_batches[ix])**2
    # s = sqrt(sum of squared deviations / (n - 1)): the division has to
    # happen inside the square root
    std_sqm_ratio_for_batches = [(sq_sum/(n_samples-1))**0.5 for sq_sum in std_sqm_ratio_for_batches]

In [None]:
# Error-bar plot of the average SequenceMatcher ratio per batch size.
last_batch_size = batching_sizes[-1]

plt.errorbar(
    batching_sizes,
    average_sqm_ratio_for_batches,
    yerr=std_sqm_ratio_for_batches,
    linestyle='None',
    marker='.',
    capsize=2,
    elinewidth=1,
)
plt.title("Average SequenceMatcher Ratio compared to Standard for different batch sizes")
plt.xlabel("batch size n")
plt.ylabel("Average SequenceMatcher Ratio")

# reference lines: ratio 1 (identical to the standard) and x = 0
plt.hlines(y=1, xmin=-2, xmax=last_batch_size+1, colors="black")
plt.vlines(x=0, ymin=1.1, ymax=0, colors="gray", linestyles="--")

# fixed axis ranges
plt.xlim(-1.5, last_batch_size+0.5)
plt.ylim(0, 1.1)

# major grid lines more prominent than minor ones
for grid_kind, grid_alpha in (('major', 0.8), ('minor', 0.3)):
    plt.grid(which=grid_kind, color='#666666', linestyle='--', alpha=grid_alpha)
plt.minorticks_on()