In [None]:
import datetime
import html
import re
import time

import matplotlib.pyplot as plt
import pandas as pd
from IPython.display import display, HTML

import mailcom.inout
import mailcom.parse

In [None]:
# function for displaying the result using HTML
def highlight_ne(text, per_list, org_list, loc_list, misc_list):
    """Return ``text`` as HTML with every listed named entity colour-highlighted.

    Colours: locations green, organisations blue, miscellaneous yellow,
    persons red. If the same string appears in several lists, the later
    category in that order wins (persons have the highest priority).

    All entities are substituted in a single left-to-right pass over the
    original text, so generated markup is never re-scanned. This avoids
    corrupting the ``<span>`` tags when an entity happens to be a substring of
    the markup itself (e.g. "red", "span", "color") or of another entity, and
    avoids nested spans for duplicated list entries.

    Text outside and inside the highlights is HTML-escaped (``&``, ``<``,
    ``>``), so raw e-mail content cannot be interpreted as markup.

    Args:
        text: Original (un-pseudonymized) text.
        per_list: Person entity strings found in ``text``.
        org_list: Organisation entity strings found in ``text``.
        loc_list: Location entity strings found in ``text``.
        misc_list: Miscellaneous entity strings found in ``text``.

    Returns:
        str: HTML fragment suitable for ``IPython.display.HTML``.
    """
    # Later categories override earlier ones for identical strings; empty
    # strings are dropped because they would match at every position.
    color_by_entity = {}
    for entity_list, color in (
        (loc_list, "green"),
        (org_list, "blue"),
        (misc_list, "yellow"),
        (per_list, "red"),
    ):
        for entity in entity_list or ():
            if entity:
                color_by_entity[entity] = color

    if not color_by_entity:
        return html.escape(text, quote=False)

    # Longest alternatives first so that "New York" is preferred over "York"
    # when both start at the same position.
    alternatives = sorted(color_by_entity, key=len, reverse=True)
    pattern = re.compile("|".join(re.escape(entity) for entity in alternatives))

    parts = []
    cursor = 0
    for match in pattern.finditer(text):
        entity = match.group(0)
        parts.append(html.escape(text[cursor:match.start()], quote=False))
        parts.append(
            f"<span style=\"background-color:{color_by_entity[entity]}\">"
            f"{html.escape(entity, quote=False)}</span>"
        )
        cursor = match.end()
    parts.append(html.escape(text[cursor:], quote=False))

    return "".join(parts)

In [None]:
def performance_test(batch_size):
    """Time the pseudonymization pipeline on the test e-mails and plot the result.

    Reads the test CSV, loads the spaCy and transformers models, pseudonymizes
    every e-mail sentence by sentence, displays the pseudonymized text and the
    highlighted original, and draws a two-panel figure: stacked per-e-mail
    times (pre-processing + pseudonymization) and a summary of CSV reading,
    model loading and average e-mail time.

    Args:
        batch_size: Label of this run; it is printed in the log output.
            NOTE(review): the value is not passed to the pseudonymizer
            anywhere in this function, so every run processes the e-mails the
            same way - confirm whether batching should be wired into ``ps``.

    Returns:
        float: Mean wall-clock seconds per processed e-mail (pre-processing
        plus pseudonymization). ``nan`` if no e-mail could be processed.
    """
    print("-----------------------------------")
    print("Starting performance test for batch size", batch_size)
    t0 = time.time()

    # import files from csv file
    email_list = pd.read_csv("../mailcom/test/data/mails_lb_sg_copy.csv")
    t_csv_read = time.time()

    # create pseudonymization object and load the models
    ps = mailcom.parse.Pseudonymize()
    ps.init_spacy("fr")
    ps.init_transformers()
    t_model_loaded = time.time()

    # Indices and timestamps are collected together so that skipped rows can
    # never make the bar-plot x and height lists differ in length.
    processed_idx = []
    ts_list = []
    for idx, row in email_list.iterrows():
        text = row["message"]
        # pandas yields NaN (a truthy float) for empty cells, so a plain
        # ``if not text`` would let them through to the NLP pipeline.
        if not isinstance(text, str) or not text:
            continue

        ts_email_start = time.time()

        # Pseudonymization is usually done using ps.pseudonymize.
        # For performance analysis the process is split into its subprocesses.
        ps.reset()
        sentences = ps.get_sentences(text)
        ts_email_ppr_done = time.time()

        pseudonymized_sentences = []
        for sent in sentences:
            sent = ps.pseudonymize_email_addresses(sent)
            ner = ps.get_ner(sent)
            ps_sent = " ".join(ps.pseudonymize_ne(ner, sent)) if ner else sent
            ps_sent = ps.pseudonymize_numbers(ps_sent)
            pseudonymized_sentences.append(ps_sent)
        output_text = ps.concatenate(pseudonymized_sentences)

        # timestamp after this email (taken before rendering, so display cost
        # is not attributed to pseudonymization)
        ts_email_end = time.time()
        processed_idx.append(idx)
        ts_list.append([ts_email_start, ts_email_ppr_done, ts_email_end])

        # display the pseudonymized text
        display(HTML(output_text))

        # display original text and highlight found and replaced NEs
        highlighted_html = highlight_ne(text, ps.per_list, ps.org_list, ps.loc_list, ps.misc_list)
        display(HTML(highlighted_html))

    if not ts_list:
        # Nothing to average or plot; avoid ZeroDivisionError / IndexError.
        print("No emails were processed.")
        print("-----------------------------------")
        return float("nan")

    # per-email processing times
    email_ppr_list = [ts[1] - ts[0] for ts in ts_list]
    email_duration_list = [ts[2] - ts[1] for ts in ts_list]
    email_total_list = [ts[2] - ts[0] for ts in ts_list]
    average_email_time = sum(email_total_list) / len(email_total_list)

    fig, (ax_emails, ax_summary) = plt.subplots(1, 2, figsize=(10, 4), dpi=80)
    # The title describes both panels, so it belongs to the figure.
    fig.suptitle("Computation times for emails, model loading and file reading")

    # plot 1: stacked bar for each individual email
    email_bar_height = {
        "Pre-Processing": email_ppr_list,
        "Pseudonymization": email_duration_list,
    }
    bottoms = [0.0] * len(processed_idx)
    for label, heights in email_bar_height.items():
        ax_emails.bar(processed_idx, heights, 0.5, label=label, bottom=bottoms)
        bottoms = [b + h for b, h in zip(bottoms, heights)]
    ax_emails.set_xlabel("Email")
    ax_emails.set_ylabel("t [s]")
    ax_emails.legend()

    # plot 2: file reading, model loading and average email time
    bar_x = ["CSV Reading", "Model Loading", "Average Email Time"]
    # Model loading is measured from the end of the CSV read, not from t0,
    # otherwise the CSV time would be counted twice.
    bar_y = [t_csv_read - t0, t_model_loaded - t_csv_read, average_email_time]
    ax_summary.bar(bar_x, bar_y, 0.5)
    ax_summary.set_ylabel("t [s]")
    fig.tight_layout()

    # Total time from "models loaded" to the end of the last email. Formatted
    # by hand: datetime.fromtimestamp() on a duration depends on the local
    # timezone offset and wraps around after one hour.
    total_seconds = int(ts_list[-1][2] - t_model_loaded)
    minutes, seconds = divmod(total_seconds, 60)
    print("Total time:", f"{minutes:02d}:{seconds:02d}")

    # plt.savefig("out/mailcom_batching_performance_n_" + str(batch_size) + datetime.datetime.fromtimestamp(t0).strftime('%H%M%S') + ".png")
    print("-----------------------------------")

    return average_email_time

In [None]:
# Batch sizes to benchmark. The leading -1 acts as a warm-up run: its result
# is ignored later because the first run shows inconsistent timings while
# everything is loaded for the first time.
# For a quick single run use: batching_sizes = [1]
batching_sizes = [-1, -1, 1, 2, 3, 6, 10]
# number of repetitions per batch size that are averaged
n_samples = 1

av_email_times_for_batches = []
for bs in batching_sizes:
    # collect one average-email-time measurement per repetition
    sample_times = [performance_test(bs) for _ in range(n_samples)]
    av_email_times_for_batches.append(sum(sample_times) / n_samples)


In [None]:
# Drop the first entry: it is the warm-up run and not representative.
measured_sizes = batching_sizes[1:]
measured_times = av_email_times_for_batches[1:]

plt.bar(measured_sizes, measured_times, 0.5)
plt.xlabel("n batches")
plt.ylabel("Average Email Time [s]")
plt.title("Average email time for different batch sizes")