# Notebook for testing Corpus 200 emails
*Scientific Software Center, University of Heidelberg, April 2025*

The dataset `Corpus 200 emails` contains 200 multilingual emails (Spanish, English, and Portuguese/Galician) formatted in accordance with the RFC2822 specification. Download the dataset [here](https://figshare.com/articles/dataset/Corpus_200_Emails/1326662?file=1936502).

This notebook will create an evaluation dataset for `mailcom` using 30 emails from `Corpus 200 emails` (10 emails per language, with Portuguese and Galician counted together as one group).

For each email in the dataset, we record:
* email content
* email language
* detected dates in the email
* list of named entities (NE)
* pseudo content

In [None]:
# email numbers selected from the corpus, grouped by language
# numbering starts with 1; numbers below 10 are written as zero-padded
# strings so that the generated names keep the leading zero ("01.eml")
gl_emails = ["01", "02", "03", "04", 10, 12, 15]
pt_emails = [30, 36, 66]
es_emails = ["05", "06", "07", "09", 11, 23, 28, 31, 33, 34]
en_emails = [13, 14, 19, 20, 22, 24, 32, 35, 37, 38]

# turn every email number into its "<number>.eml" file name
gl_files = [f"{number}.eml" for number in gl_emails]
pt_files = [f"{number}.eml" for number in pt_emails]
es_files = [f"{number}.eml" for number in es_emails]
en_files = [f"{number}.eml" for number in en_emails]

# all selected files, in the order gl, pt, es, en
chosen_files = [*gl_files, *pt_files, *es_files, *en_files]

# sanity check: 30 distinct files were selected
assert len(set(chosen_files)) == 30

In [None]:
# directory that is searched for the corpus "*.eml" files when copying
source_dir = "../../../../eval_data_mailcom"
# directory that receives the chosen emails and is later read by mailcom
input_dir = "../../../mailcom/test/data_extended/200_eml"

#### Copy files

In [None]:
# run when needed!
# copy the chosen emails from source_dir to input_dir
from pathlib import Path
import shutil

# make sure the destination is a directory: if it did not exist,
# shutil.copy would treat input_dir as a file name and every copy would
# overwrite that single file
target_dir = Path(input_dir)
target_dir.mkdir(parents=True, exist_ok=True)

# set lookup instead of scanning the list for every source file
wanted_names = set(chosen_files)
copied_names = set()

source_files = Path(source_dir).glob("*.eml")
for source_file in source_files:
    if source_file.name in wanted_names:
        shutil.copy(source_file, target_dir)
        copied_names.add(source_file.name)
        print(f"Copied {source_file.name} to {input_dir}")

# report chosen emails that were not present in source_dir, so an
# incomplete evaluation set does not go unnoticed
missing_names = sorted(wanted_names - copied_names)
if missing_names:
    print("Not found in source directory:", ", ".join(missing_names))

#### Create a draft version of the dataset

First, we use the language detection, date detection, and pseudonymization features of `mailcom` to build the draft version of the dataset. Each email will be manually checked for validation later.

In [None]:
import mailcom

In [None]:
# activate language detection
# NOTE(review): an empty "default_lang" presumably makes mailcom detect the
# language of each email instead of assuming a fixed one -- confirm against
# the mailcom settings documentation
new_settings = {"default_lang": ""}
# save_updated_settings=False is passed, presumably so the modified settings
# are only used in this session and not written anywhere -- confirm
workflow_settings = mailcom.get_workflow_settings(new_settings=new_settings, 
                                                  save_updated_settings=False)

In [None]:
# import files from input_dir
# the handler is created for a directory input (in_type="dir"); it is used
# below to obtain the email list and to write the output
input_handler = mailcom.get_input_handler(in_path=input_dir, in_type="dir")

In [None]:
# process the input data
# the emails come from the input handler, the options from workflow_settings
# NOTE(review): the return value is discarded, so the results are presumably
# stored on the email entries held by input_handler -- confirm
mailcom.process_data(input_handler.get_email_list(), workflow_settings)

In [None]:
# write output to csv
# this file is the unchecked draft that the manual review below starts from
# NOTE(review): overwrite=True presumably replaces an existing draft -- confirm
mailcom.write_output_data(input_handler, "../../../data/eval_data_200_eml.csv", overwrite=True)

#### Manually check and modify each email

In [None]:
import pandas as pd

In [None]:
def _parse_cell(value):
    """Return *value* as a Python object.

    Cells read back from the CSV hold the text form of a Python list and are
    evaluated; values that are already parsed (e.g. lists assigned to the
    DataFrame during this session) are returned unchanged.
    """
    if isinstance(value, str):
        # NOTE(review): eval executes arbitrary code. It is kept because the
        # CSV is written by this notebook itself; consider ast.literal_eval
        # if the file could ever come from another source.
        return eval(value)
    return value


def print_email(email: dict) -> None:
    """Print all recorded fields of one email for manual inspection.

    Args:
        email: one row of the evaluation data. Only key lookup is used, so a
            dict or a pandas Series both work. The keys read are
            "file_name", "cleaned_content", "lang", "detected_datetime",
            "ne_sent", "ne_list", "sentences" and "pseudo_content".
            "ne_sent", "ne_list" and "sentences" may be lists or the text
            form of lists.
    """
    print("file name:", email["file_name"])
    print("= Email cleaned content =======\n", email["cleaned_content"])
    print("= Email language =======\n", email["lang"])
    print("= Detected dates =======\n", email["detected_datetime"])
    print("= NE list =======")
    ne_sent = _parse_cell(email["ne_sent"])
    ne_list = _parse_cell(email["ne_list"])
    for idx, (sent_idx, ne) in enumerate(zip(ne_sent, ne_list)):
        # single quotes inside the f-string keep this line valid on Python
        # versions older than 3.12, where reusing the outer quote is an error
        print(
            f"  {idx}- sentence {sent_idx}, {ne['word']} - {ne['entity_group']}"
            f" - {ne['start']} - {ne['end']} - {ne['pseudonym']}"
        )
    print("= Sentences =======\n")
    for i, sent in enumerate(_parse_cell(email["sentences"])):
        print(f"  {i}- {sent}")
    print("= Pseudo content =======\n", email["pseudo_content"])

In [None]:
def check_email_lang(file_name, lang) -> bool:
    """Tell whether the detected language matches the expected one.

    The expectation comes from the module-level lists ``gl_files``,
    ``pt_files``, ``es_files`` and ``en_files``: a file listed in one of
    them is expected to have the language code of that list.

    Args:
        file_name: name of the email file, e.g. "01.eml".
        lang: detected language code ("gl", "pt", "es" or "en").

    Returns:
        True when ``file_name`` is listed under ``lang``; otherwise a
        message is printed and False is returned.
    """
    files_by_lang = {
        "gl": gl_files,
        "pt": pt_files,
        "es": es_files,
        "en": en_files,
    }
    expected_files = files_by_lang.get(lang)
    if expected_files is not None and file_name in expected_files:
        return True

    print("Incorrect detected language for file:", file_name)
    return False

In [None]:
# rows of the csv that have already been checked by hand (0 through 10)
checked_rows = list(range(11))
# the last checked row, or None when nothing has been checked yet
old_row_idx = checked_rows[-1] if checked_rows else None

In [None]:
# read output from last updated csv
if old_row_idx is None:
    # nothing has been checked yet: start from the draft csv
    df = pd.read_csv("../../../data/eval_data_200_eml.csv")
else:
    # load the newest checkpoint whose index is at or below old_row_idx
    for tmp_row_idx in range(old_row_idx, -1, -1):
        checkpoint_path = f"../../../data/eval_data_200_eml_idx{tmp_row_idx}.csv"
        try:
            df = pd.read_csv(checkpoint_path)
        except FileNotFoundError:
            # no checkpoint was saved for this row, try the previous one
            continue
        print("Read file:", checkpoint_path)
        break
    else:
        # without this, df would stay undefined (or keep a stale value from
        # an earlier run) and the following cells would work on wrong data
        raise FileNotFoundError(
            "No checkpoint file ../../../data/eval_data_200_eml_idx<N>.csv "
            f"found for any N from 0 to {old_row_idx}"
        )

In [None]:
# manually check each email
# continue with the row after the last checked one (row 0 on the first run)
row_idx = old_row_idx + 1 if old_row_idx is not None else 0
email = df.iloc[row_idx]
# check_email_lang prints the mismatch message itself, so only the success
# case is reported here (the message used to be printed twice)
if check_email_lang(email["file_name"], email["lang"]):
    print("Correct detected language for file:", email["file_name"])
print_email(email)

In [None]:
# mark incorrect indices of NE -- compared to the original
# keys are row indices written as strings; each value maps a category to a
# list of NE indices. The "r" list is removed from the NE list before new
# NEs are added, the "r-c" list in a second pass afterwards.
# NOTE(review): "u" and "a" presumably stand for updated and added NEs; no
# code in this notebook reads them -- confirm
incorrect_ne_indices = {
    "0": {"u": [2]},
    # the "r" and "a" cases are updated manually
    "1": {"u": [4, 8], "r": [7], "a": [2]},
    "2": {"u": [2]},
    "4": {"u": [6, 8, 9], "r": [10], "a": [4, 5, 6]},
    "5": {"u": [9], "r": [], "a": []},
    "6": {"u": [6], "r": [5], "a": []},
    "7": {"u": [], "r": [10, 11], "a": [22]},
    "8": {
        "u": [1, 4, 5, 6, 7, 9, 10, 15, 20, 22, 27],
        "r": [23, 31],
        "a": [8],
        "r-c": [2, 3],
    },
    "9": {"u": [17], "r": [], "a": [], "r-c": []},
    "10": {"u": [1, 19], "r": [23], "a": [], "r-c": [13, 20]},
}

In [None]:
# update incorrect ne manually, if any
# both cells are evaluated as Python expressions to get the NE list and the
# matching sentence indices of the current email as Python objects
# NOTE(review): eval executes arbitrary code; acceptable only as long as the
# csv is the one written by this notebook
ne_list = eval(email["ne_list"])
ne_sent = eval(email["ne_sent"])

def find_left_offset(old_word, new_word):
    """Return how far the start of an entity moves when its text changes.

    One of the two words must contain the other. The result is added to the
    old start position to get the start position of ``new_word``.

    Args:
        old_word: entity text before the correction.
        new_word: entity text after the correction.

    Returns:
        A positive number when ``new_word`` begins inside ``old_word`` (the
        start moves right), a negative number when ``new_word`` extends
        ``old_word`` to the left, and 0 when both begin at the same place.

    Raises:
        ValueError: if neither word contains the other, in which case no
            offset can be derived (previously a bogus offset of -1 or 1 was
            returned silently).
    """
    if len(old_word) > len(new_word):
        # the entity shrinks: look for the new text inside the old one
        container, part, sign = old_word, new_word, 1
    else:
        # the entity grows (or keeps its length): look for the old text
        # inside the new one; the start then moves to the left
        container, part, sign = new_word, old_word, -1

    position = container.find(part)
    if position < 0:
        raise ValueError(
            f"Cannot derive an offset: neither {old_word!r} nor "
            f"{new_word!r} contains the other"
        )
    return sign * position

In [None]:
# skip below lines in this cell if no update is needed
# update for every email
# each pair is (index of the NE in ne_list, corrected word)
manual_word_fixes = [
    (1, "Expert Systems With Applications"),
    (19, "OX5 1GB"),
]

for item_idx, corrected_word in manual_word_fixes:
    entry = ne_list[item_idx]
    old_word = entry["word"]
    entry["word"] = corrected_word
    # shift the start so that it points at the corrected word, then derive
    # the end from the new start and the new length
    l_offset = find_left_offset(old_word, entry["word"])
    entry["start"] += l_offset
    entry["end"] = entry["start"] + len(entry["word"])

In [None]:
def remove_incorrect_ne_indices(ne_list, ne_sent, incorrect_ne_indices):
    """Drop the named entities at the given positions.

    Args:
        ne_list: named entities of one email.
        ne_sent: sentence index of each named entity, parallel to ``ne_list``.
        incorrect_ne_indices: positions in ``ne_list`` to remove.

    Returns:
        A tuple ``(new_ne_list, new_ne_sent)`` holding the remaining
        entities and their sentence indices in the original order. The
        inputs are not modified.
    """
    # remember the position next to each kept entity so that the matching
    # sentence index can be looked up afterwards
    kept = [
        (position, entity)
        for position, entity in enumerate(ne_list)
        if position not in incorrect_ne_indices
    ]
    new_ne_list = [entity for _, entity in kept]
    new_ne_sent = [ne_sent[position] for position, _ in kept]
    return new_ne_list, new_ne_sent

In [None]:
# remove incorrect indices from ne_list and ne_sent of email
# rows without an entry in incorrect_ne_indices, or without an "r" list,
# have nothing to remove; .get avoids a KeyError for them
r_indices = incorrect_ne_indices.get(str(row_idx), {}).get("r", [])
ne_list, ne_sent = remove_incorrect_ne_indices(ne_list, ne_sent, r_indices)

In [None]:
# add new ne, if any
# update for every email
# n_word = "elsevier"
# n_pseudonym = "[organization]"
# n_entity_group = "ORG"
# sent_idx = 10
# insert_idx = 22
# new_ne = {"entity_group": n_entity_group, "word": n_word, "pseudonym": n_pseudonym}
# new_ne["start"] = eval(email["sentences"])[sent_idx].find(new_ne["word"])
# new_ne["end"] = new_ne["start"] + len(new_ne["word"])
# ne_list.insert(insert_idx, new_ne)
# ne_sent.insert(insert_idx, sent_idx)

In [None]:
# second removal pass on ne_list and ne_sent of email, using the "r-c" list;
# it runs after the optional additions, so the indices refer to the lists as
# they are at this point
# rows without an entry in incorrect_ne_indices, or without an "r-c" list,
# have nothing to remove; .get avoids a KeyError for them
rc_indices = incorrect_ne_indices.get(str(row_idx), {}).get("r-c", [])
ne_list, ne_sent = remove_incorrect_ne_indices(ne_list, ne_sent, rc_indices)

In [None]:
# update email in df
# store the corrected lists back into the row; these two cells now hold the
# Python lists themselves, not the text that was passed to eval earlier
df.at[row_idx, "ne_list"] = ne_list
df.at[row_idx, "ne_sent"] = ne_sent

In [None]:
# generate new pseudo content
import mailcom
from mailcom.parse import Pseudonymize

# get workflow settings
new_settings = {"default_lang": ""}
workflow_settings = mailcom.get_workflow_settings(
    new_settings=new_settings,
    save_updated_settings=False,
)
first_name_pseudonyms = workflow_settings.get("pseudo_first_names", {})
pseudonymizer = Pseudonymize(first_name_pseudonyms)

# re-read the row so that the values stored in df are the ones used
email = df.iloc[row_idx]

# group the NEs by the (stringified) index of the sentence they belong to
ne_sent_dict = {}
for sent_idx, ne in zip(email["ne_sent"], email["ne_list"]):
    ne_sent_dict.setdefault(str(sent_idx), []).append(ne)

# these two cells are evaluated as Python expressions before being passed on
sentences = eval(email["sentences"])
detected_dates = eval(email["detected_datetime"])

updated_pseudo_content = pseudonymizer.pseudonymize_with_updated_ne(
    sentences,
    ne_sent_dict,
    email["lang"],
    detected_dates,
    workflow_settings.get("pseudo_emailaddresses", True),
    workflow_settings.get("pseudo_ne", True),
    workflow_settings.get("pseudo_numbers", True),
)

In [None]:
# show the regenerated pseudo content for a manual look
updated_pseudo_content

In [None]:
# update pseudo content in df
# the pseudo content of this row is replaced by the regenerated version
df.at[row_idx, "pseudo_content"] = updated_pseudo_content

In [None]:
# other updates, if any
# df.at[row_idx, "detected_datetime"] = [""]

In [None]:
# double check
# display the whole row as it will be saved
df.iloc[row_idx]

In [None]:
# save updated df to csv
# the file name carries row_idx, so every checked row gets its own
# checkpoint file; the loading cell looks for these "_idx<N>" files
# index=False keeps the DataFrame index out of the file
df.to_csv(f"../../../data/eval_data_200_eml_idx{row_idx}.csv", index=False)