In [None]:
import datetime
import os
import time

import matplotlib.pyplot as plt
import pandas as pd

import mailcom.inout
import mailcom.parse
import mailcom.utils

In [None]:
# reference time stamp, taken before any file is read or any model is loaded;
# the later time stamps are compared against it
t0 = time.time()

In [None]:
# path of the csv file the language detection results are written to
out_file = "../data/out/lang_detection_from_csv.csv"
# import the emails from the csv file
email_list = pd.read_csv("../mailcom/test/data_extended/mails_lb_sg.csv")
# time stamp directly after reading, so that the interval t0 -> t_csv_read
# covers the file reading only and no console output
t_csv_read = time.time()

# create pseudonymization object
ps = mailcom.parse.Pseudonymize()
ps.init_spacy("fr")
ps.init_transformers()
# create language detection object
ld = mailcom.utils.LangDetector()
ld.init_transformers()
# time stamp after model loading
t_model_loaded = time.time()

# show the loaded emails only now, so that printing them is part of neither
# the csv reading time nor the model loading time
print(email_list)

In [None]:
# loop over mails and detect their languages
#
# Results of this cell:
#   lang_out_list - one dict per processed email, holding the original text
#                   under "content" and one entry per detection library
#   lang_ts_list  - one [start, pre-processing done, end] time stamp triple
#                   per processed email
lang_libs = ["langid", "langdetect", "trans"]
lang_out_list = []
lang_ts_list = []
for text in email_list["message"]:
    ts_email_start = time.time()
    # pandas.read_csv stores empty cells as NaN, and NaN is truthy, so a plain
    # "not text" check lets missing messages through; test for NaN explicitly
    if pd.isna(text) or not text:
        continue
    email_dict = {"content": text}
    # Test functionality of LangDetector class
    # (there is no separate pre-processing step here, so this time stamp
    # follows the start time stamp almost immediately)
    ts_email_ppr_done = time.time()
    for lang_lib in lang_libs:
        det_lang = ld.get_detections(text, lang_lib)
        # only the first detection is kept, stored as "<item 0>-<item 1>"
        # NOTE(review): presumably (language, score) - confirm against
        # LangDetector.get_detections
        email_dict[lang_lib] = "{}-{}".format(det_lang[0][0], det_lang[0][1])

    lang_out_list.append(email_dict)

    # timestamp after this email
    ts_email_end = time.time()
    lang_ts_list.append([ts_email_start, ts_email_ppr_done, ts_email_end])

In [None]:
# collect the language detection results (one row per processed email)
# in a pandas data frame and show it
df = pd.DataFrame(data=lang_out_list)
print(df)

In [None]:
# store the results in a csv file
# DataFrame.to_csv does not create missing folders, so make sure the target
# folder exists before writing (a path without folder part needs no folder)
out_dir = os.path.dirname(out_file)
if out_dir:
    os.makedirs(out_dir, exist_ok=True)
df.to_csv(out_file, index=False)

In [None]:
# print the "trans" detections of a fixed sample of rows, one per line
sample_rows = slice(90, 104)
for detection in df["trans"][sample_rows]:
    print(detection)

In [None]:
# loop over mails and pseudonymize them
#
# Results of this cell:
#   out_list - one dict per processed email, holding the original text under
#              "content" and the pseudonymized text under "pseudo_content"
#   ts_list  - one [start, pre-processing done, end] time stamp triple per
#              processed email
out_list = []
ts_list = []
for text in email_list["message"]:
    ts_email_start = time.time()
    # pandas.read_csv stores empty cells as NaN, and NaN is truthy, so a plain
    # "not text" check lets missing messages through; test for NaN explicitly
    if pd.isna(text) or not text:
        continue
    email_dict = {"content": text}
    # Test functionality of Pseudonymize class
    # Pseudonymization is usually done using ps.pseudonymize
    # For performance analysis the process is split into its subprocesses here
    ps.reset()
    sentences = ps.get_sentences(text)
    # everything up to here (reset + sentence splitting) counts as pre-processing
    ts_email_ppr_done = time.time()
    pseudonymized_sentences = []
    for sent in sentences:
        sent = ps.pseudonymize_email_addresses(sent)
        ner = ps.get_ner(sent)
        # without any named entity the sentence is passed on unchanged
        ps_sent = " ".join(ps.pseudonymize_ne(ner, sent)) if ner else sent
        ps_sent = ps.pseudonymize_numbers(ps_sent)
        pseudonymized_sentences.append(ps_sent)
    output_text = ps.concatenate(pseudonymized_sentences)

    # add output to dict
    email_dict["pseudo_content"] = output_text
    out_list.append(email_dict)

    # timestamp after this email
    ts_email_end = time.time()
    ts_list.append([ts_email_start, ts_email_ppr_done, ts_email_end])

In [None]:
# collect the pseudonymization results (one row per processed email)
# in a pandas data frame and show it
df = pd.DataFrame(data=out_list)
print(df)

In [None]:
# display timestamps
#
# Reads ts_list (one [start, pre-processing done, end] triple per processed
# email) as well as t0, t_csv_read and t_model_loaded.

# processing times of each individual email
email_ppr_list = [ts[1] - ts[0] for ts in ts_list]
email_duration_list = [ts[2] - ts[1] for ts in ts_list]
email_total_list = [ts[2] - ts[0] for ts in ts_list]
email_bar_height = {
    "Pre-Processing": email_ppr_list,
    "Pseudonymization": email_duration_list,
}
# one bar per *processed* email: mails skipped by the loop have no entry in
# ts_list, so the x positions are derived from ts_list instead of email_list
# to keep x positions and bar heights at the same length
idx_list = list(range(len(ts_list)))

fig, (ax_emails, ax_summary) = plt.subplots(1, 2, figsize=(10, 4), dpi=80)
fig.suptitle("Computation times for emails, model loading and file reading")

# plot 1: stacked bar plot for each individual email
bt = [0] * len(idx_list)
for key, height in email_bar_height.items():
    ax_emails.bar(idx_list, height, 0.5, label=key, bottom=bt)
    # the next category is stacked on top of the bars drawn so far
    bt = [bi + hi for (bi, hi) in zip(bt, height)]
ax_emails.set_xlabel("Email")
ax_emails.set_ylabel("t [s]")
ax_emails.legend()

# plot 2: model loading and file reading, as well as average email time
bar_x = ["CSV Reading", "Model Loading", "Average Email Time"]
# without any processed email there is nothing to average
average_email_time = (
    sum(email_total_list) / len(email_total_list) if email_total_list else 0.0
)
# model loading starts once the csv file is read, so it is measured from
# t_csv_read; measuring from t0 would count the csv reading time twice
bar_y = [t_csv_read - t0, t_model_loaded - t_csv_read, average_email_time]
ax_summary.bar(bar_x, bar_y, 0.5)
ax_summary.set_ylabel("t [s]")

fig.tight_layout()

# Total time
# Measured from the end of model loading to the end of the last pseudonymized
# email, so it contains everything that ran in between.
# The duration is split into minutes and seconds by hand: going through
# datetime.fromtimestamp would apply the local time zone offset and wrap
# around after 60 minutes.
if ts_list:
    total_minutes, total_seconds = divmod(int(ts_list[-1][2] - t_model_loaded), 60)
    print("Total time:", "{:02d}:{:02d}".format(total_minutes, total_seconds))
else:
    print("Total time: no emails were processed")