In [None]:
import mailcom.inout
import mailcom.parse
import pandas as pd
import time
import datetime
import matplotlib.pyplot as plt
import mailcom.utils
import mailcom.lang_detector
import mailcom.time_detector
import mailcom.main

In [None]:
# create t0 timestamp: wall-clock reference (seconds since the epoch, via
# time.time()); the plotting cell subtracts it from t_csv_read
t0 = time.time()

In [None]:
# Load the workflow settings, read the input emails and build the helper
# objects used by the processing loop. Two timestamps are recorded here:
#   t_csv_read     - right after the CSV has been read
#   t_class_loaded - right after all helper objects have been constructed
setting_path = "../mailcom/settings.json"
workflow_settings = mailcom.main.get_workflow_settings(setting_path)

# use default settings
# i.e. enable all steps
lang = ""  # placeholder; the processing loop overwrites it with the detected language
pseudo_emailaddresses = True
pseudo_ne = True
pseudo_numbers = True
pseudo_first_names = workflow_settings.get("pseudo_first_names", {})
lang_lib = "langid"
lang_pipeline = None
spacy_model = "default"
ner_pipeline = None

out_file = "../data/out/performance_demo.csv"
in_file = "../data/mails_lb_sg.csv"

# Re-anchor t0 directly before the read: the plotting cell reports
# `t_csv_read - t0` as "CSV Reading", so the interval must not include the
# settings loading above or any idle time between cell executions.
t0 = time.time()
# import data from csv file
email_list = pd.read_csv(in_file)
t_csv_read = time.time()

# init necessary objects
spacy_loader = mailcom.utils.SpacyLoader()
trans_loader = mailcom.utils.TransformerLoader()
pseudonymizer = mailcom.parse.Pseudonymize(pseudo_first_names, trans_loader, spacy_loader)
lang_detector = mailcom.lang_detector.LangDetector(trans_loader)
parsing_type = "strict"
time_detector = mailcom.time_detector.TimeDetector(parsing_type, spacy_loader)
# time stamp after loading all necessary objects
t_class_loaded = time.time()

In [None]:
# Run every email through clean-up, language detection, date/time detection
# and pseudonymization, taking a wall-clock timestamp after each stage.
# out_list collects one result dict per email; ts_list collects
# [start, language detected, date/time detected, end] per email.
out_list = []
ts_list = []
for text in email_list["message"]:
    email = {"content": text}
    ts_email_start = time.time()

    # clean-up runs after the start timestamp, so its cost is part of the
    # interval that ends at ts_lang_detected
    email_content, _ = mailcom.utils.clean_up_content(text)
    email["cleaned_content"] = email_content

    # language detection: keep only the first detected language, without
    # its probability
    det_langs = lang_detector.get_detections(
        email_content,
        lang_lib=lang_lib,
        pipeline_info=lang_pipeline,
    )
    lang = det_langs[0][0]
    email["lang"] = lang
    ts_lang_detected = time.time()

    # date/time detection: store only the first element of each detection
    # (the matched string)
    detected_time = time_detector.get_date_time(email_content, lang, model=spacy_model)
    email["detected_datetime"] = [item[0] for item in detected_time]
    ts_datetime_detected = time.time()

    # pseudonymization, reusing the date strings found above
    pseudo_content = pseudonymizer.pseudonymize(
        email_content,
        lang,
        model=spacy_model,
        pipeline_info=ner_pipeline,
        detected_dates=email["detected_datetime"],
        pseudo_emailaddresses=pseudo_emailaddresses,
        pseudo_ne=pseudo_ne,
        pseudo_numbers=pseudo_numbers,
    )
    email["pseudo_content"] = pseudo_content
    # NOTE(review): this stores a reference to pseudonymizer.ne_list, not a
    # copy - confirm the pseudonymizer creates a fresh list for every email
    email["ne_list"] = pseudonymizer.ne_list
    out_list.append(email)

    ts_email_time_end = time.time()
    # collect time stamps
    ts_list.append(
        [ts_email_start, ts_lang_detected, ts_datetime_detected, ts_email_time_end]
    )

In [None]:
# Collect the per-email result dicts into a DataFrame (one row per email,
# one column per dict key) and show it via the notebook's rich display
# instead of a plain-text print.
df = pd.DataFrame(out_list)
df

In [None]:
# display timestamps

# Per-email stage durations, derived from consecutive entries of each
# ts_list row: [start, language detected, date/time detected, end].
idx_list = list(email_list.index)
email_pseudo_list = [ts[3] - ts[2] for ts in ts_list]
email_datetime_list = [ts[2] - ts[1] for ts in ts_list]
email_lang_list = [ts[1] - ts[0] for ts in ts_list]
email_total_list = [ts[3] - ts[0] for ts in ts_list]
email_bar_height = {
    "Lang Det.": email_lang_list,
    "Date Time Det.": email_datetime_list,
    "Pseudonymization": email_pseudo_list,
}
bt = [0 for _ in idx_list]

# Explicit figure/axes handles so every label is attached to the panel it
# belongs to (with the pyplot state machine the second y label ended up on
# the first panel).
fig, (ax_emails, ax_summary) = plt.subplots(1, 2, figsize=(10, 4), dpi=80)
# the title describes both panels, so it is set on the figure
fig.suptitle("Computation times for emails, object initializing and file reading")

# plot 1: stacked bar per email, one segment per processing stage
for key, height in email_bar_height.items():
    ax_emails.bar(idx_list, height, 0.5, label=key, bottom=bt)
    # raise the baseline so the next stage is stacked on top of this one
    bt = [bi + hi for (bi, hi) in zip(bt, height)]
ax_emails.set_xlabel("Email")
ax_emails.set_ylabel("t [s]")
ax_emails.legend()

# plot 2: class loading and file reading,
# as well as average email processing time
bar_x = ["CSV Reading", "Class Loading", "Average Email Time"]
# guard against an empty run (no emails processed)
average_email_time = (
    sum(email_total_list) / len(email_total_list) if email_total_list else 0.0
)
bar_y = [t_csv_read - t0, t_class_loaded - t_csv_read, average_email_time]
ax_summary.bar(bar_x, bar_y, 0.5)
ax_summary.set_ylabel("t [s]")

fig.tight_layout()

# Total time: from the end of object loading to the end of the last email.
# The value is a duration, not a point in time, so it is formatted with
# divmod rather than datetime.fromtimestamp (which applies the local time
# zone offset and wraps around after 60 minutes).
if ts_list:
    total_minutes, total_seconds = divmod(int(ts_list[-1][3] - t_class_loaded), 60)
    print("Total time:", f"{total_minutes:02d}:{total_seconds:02d}")