# Notebook for testing Corpus 200 emails
*Scientific Software Center, University of Heidelberg, April 2025*

The dataset `Corpus 200 emails` contains 200 multilingual emails (Spanish, English, and Portuguese/Galician) formatted in accordance with the RFC 2822 specification. Download the dataset [here](https://figshare.com/articles/dataset/Corpus_200_Emails/1326662?file=1936502).

This notebook will create an evaluation dataset for `mailcom` using 30 emails from `Corpus 200 emails`: 10 Spanish, 10 English, and 10 Portuguese/Galician emails.

For each email in the dataset, we record:
* email content
* email language
* detected dates in the email
* list of named entities (NE)
* pseudo content

In [None]:
# Email numbers selected for each language group.
# Numbering starts with 1; single-digit numbers are written as zero-padded
# strings so that the derived file names read "01.eml", "02.eml", ...
gl_emails = ["01", "02", "03", "04", 10, 12, 15]
pt_emails = [30, 36, 66]
es_emails = ["05", "06", "07", "09", 11, 23, 28, 31, 33, 34]
en_emails = [13, 14, 19, 20, 22, 24, 32, 35, 37, 38]

# Turn every email number into its "<number>.eml" file name.
gl_files = [f"{number}.eml" for number in gl_emails]
pt_files = [f"{number}.eml" for number in pt_emails]
es_files = [f"{number}.eml" for number in es_emails]
en_files = [f"{number}.eml" for number in en_emails]

# All selected files, in the order gl, pt, es, en.
chosen_files = [*gl_files, *pt_files, *es_files, *en_files]
# Sanity check: the selection holds 30 distinct file names.
assert len(set(chosen_files)) == 30

In [None]:
# directory that is searched for the *.eml files to copy
source_dir = "../../../../eval_data_mailcom"
# destination of the copied emails; later passed to mailcom.get_input_handler
input_dir = "../../../mailcom/test/data_extended/200_eml"

In [None]:
# copy the chosen emails from source_dir to input_dir
# run when needed
from pathlib import Path
import shutil

# Make sure the destination exists as a directory first. shutil.copy only
# copies *into* the destination when it is an existing directory; otherwise
# it treats the path as a file name, so the copy either fails (missing
# parent folder) or every email overwrites one and the same file.
target_dir = Path(input_dir)
target_dir.mkdir(parents=True, exist_ok=True)

# A set gives constant-time membership tests inside the loop.
wanted_names = set(chosen_files)

source_files = Path(source_dir).glob("*.eml")
for source_file in source_files:
    if source_file.name in wanted_names:
        shutil.copy(source_file, target_dir)
        print(f"Copied {source_file.name} to {input_dir}")

#### Create a draft version of the dataset

First, we use the language detection, date detection, and pseudonymization features of `mailcom` to build the draft version of the dataset. Each email will be manually checked for validation later.

In [None]:
import mailcom

In [None]:
# An empty default language activates language detection.
new_settings = {"default_lang": ""}
# save_updated_settings=False: presumably keeps the change in memory only
# instead of writing it to a settings file (confirm in the mailcom docs).
workflow_settings = mailcom.get_workflow_settings(
    new_settings=new_settings,
    save_updated_settings=False,
)

In [None]:
# import files from input_dir
# in_type="dir" marks in_path as a directory; presumably mailcom then loads
# the emails found there (confirm in the mailcom docs)
input_handler = mailcom.get_input_handler(in_path=input_dir, in_type="dir")

In [None]:
# process the input data
# runs mailcom on the handler's email list with the settings built above
# NOTE(review): the return value is unused, so the results are presumably
# stored on the emails in place - confirm
mailcom.process_data(input_handler.get_email_list(), workflow_settings)

In [None]:
# write output to csv
# overwrite=True: judging by the flag name, an existing file at this path is
# replaced (confirm in the mailcom docs)
mailcom.write_output_data(input_handler, "../../../data/eval_data_200_eml.csv", overwrite=True)

#### Manually check and modify each email

In [None]:
import pandas as pd

In [None]:
def print_email(email: dict) -> None:
    """Print one row of the evaluation dataset for manual review.

    Args:
        email: Mapping-like row (a dict or a pandas Series) with the keys
            ``file_name``, ``cleaned_content``, ``lang``,
            ``detected_datetime``, ``ne_sent``, ``ne_list``, ``sentences``
            and ``pseudo_content``. The list fields ``ne_sent``, ``ne_list``
            and ``sentences`` may be given either as real lists or as the
            string form of a list, as read from the CSV file.
    """
    import ast

    def as_list(value):
        # CSV cells hold the textual form of a list. literal_eval parses
        # Python literals only, so it cannot run arbitrary code like eval.
        if isinstance(value, str):
            return ast.literal_eval(value)
        # Already parsed, e.g. a row whose cells were updated in memory.
        return value

    print("file name:", email["file_name"])
    print("= Email cleaned content =======\n", email["cleaned_content"])
    print("= Email language =======\n", email["lang"])
    print("= Detected dates =======\n", email["detected_datetime"])
    print("= NE list =======")
    ne_pairs = zip(as_list(email["ne_sent"]), as_list(email["ne_list"]))
    for idx, (sent_idx, ne) in enumerate(ne_pairs):
        # Single quotes inside the f-string: reusing the outer double quotes
        # is only valid syntax from Python 3.12 on.
        print(
            f"  {idx}- sentence {sent_idx}, {ne['word']} - {ne['entity_group']}"
            f" - {ne['start']} - {ne['end']} - {ne['pseudonym']}"
        )
    print("= Sentences =======\n")
    for idx, sent in enumerate(as_list(email["sentences"])):
        print(f"  {idx}- {sent}")
    print("= Pseudo content =======\n", email["pseudo_content"])

In [None]:
def check_email_lang(file_name, lang) -> bool:
    """Tell whether the detected language fits the file's language group.

    Args:
        file_name: Name of the email file, e.g. "01.eml".
        lang: Detected language code ("gl", "pt", "es" or "en").

    Returns:
        True when ``file_name`` is listed in the file list that belongs to
        ``lang``; otherwise a message is printed and False is returned.
    """
    # Language code paired with the files expected to be in that language.
    expected = (
        ("gl", gl_files),
        ("pt", pt_files),
        ("es", es_files),
        ("en", en_files),
    )
    for code, files in expected:
        if file_name in files and lang == code:
            return True

    print("Incorrect detected language for file:", file_name)
    return False

In [None]:
# read output from last updated csv
# old_email_idx selects which "..._idx<N>.csv" file is loaded; the save cell
# at the end of this notebook writes files that follow the same naming pattern
old_email_idx = 0
df = pd.read_csv(f"../../../data/eval_data_200_eml_idx{old_email_idx}.csv")

In [None]:
# manually check each email
# row position of the email under review
email_idx = 0
email = df.iloc[email_idx]
print_email(email)

# compare the detected language with the language group of the file
lang_is_correct = check_email_lang(email["file_name"], email["lang"])
if lang_is_correct:
    print("Correct detected language for file:", email["file_name"])

In [None]:
# mark incorrect indices of NE
# the numbers are the positions printed in front of each entry under
# "= NE list =" by print_email
incorrect_ne_indices = [1, 2, 3, 4, 5, 6, 7, 8]

In [None]:
def remove_incorrect_ne_indices(email, incorrect_ne_indices):
    """Return the email's NE list and NE sentence list without some entries.

    Args:
        email: Mapping-like row (a dict or a pandas Series) with the keys
            ``ne_list`` and ``ne_sent``. Each value is either a list or the
            string form of a list, as read from the CSV file. The two lists
            are expected to be parallel (same length, same order).
        incorrect_ne_indices: Positions of the entries to drop. Duplicates
            are ignored and negative positions count from the end.

    Returns:
        A tuple ``(ne_list, ne_sent)`` of new lists with the marked entries
        removed. The data stored in ``email`` is left untouched.

    Raises:
        IndexError: If a position lies outside ``ne_list``.
    """
    import ast

    def as_list(value):
        # CSV cells hold the textual form of a list. literal_eval parses
        # Python literals only, so it cannot run arbitrary code like eval.
        if isinstance(value, str):
            return ast.literal_eval(value)
        # Already a list: work on a copy so the caller's data is not mutated.
        return list(value)

    ne_list = as_list(email["ne_list"])
    ne_sent = as_list(email["ne_sent"])

    # Normalise to unique, non-negative positions. Deleting a duplicated
    # position twice would otherwise remove an unrelated neighbouring entry.
    size = len(ne_list)
    positions = set()
    for idx in incorrect_ne_indices:
        if not -size <= idx < size:
            raise IndexError(f"NE index {idx} is out of range for {size} entries")
        positions.add(idx % size)

    # remove from the end to avoid shifting indices
    for idx in sorted(positions, reverse=True):
        del ne_list[idx]
        del ne_sent[idx]
    return ne_list, ne_sent

In [None]:
# remove incorrect indices from ne_list and ne_sent of email
# the function returns the cleaned lists; df itself is not changed by this call
email_ne_list, email_ne_sent = remove_incorrect_ne_indices(email, incorrect_ne_indices)

In [None]:
# update email in df
# NOTE(review): the cleaned values are stored as Python lists, whereas the
# cells read from the CSV hold the string form of a list - confirm that this
# mixed column content is intended
df.at[email_idx, "ne_list"] = email_ne_list
df.at[email_idx, "ne_sent"] = email_ne_sent

In [None]:
# display the row of the current email to check the update
df.iloc[email_idx]

In [None]:
# save updated df to csv
# the file name carries email_idx, so every checked email gets its own file;
# index=False leaves the DataFrame index out of the CSV
df.to_csv(f"../../../data/eval_data_200_eml_idx{email_idx}.csv", index=False)