# Notebook for creating eval. data from Corpus 200 emails
*Scientific Software Center, University of Heidelberg, April 2025*

The dataset `Corpus 200 emails` contains 200 multilingual emails (Spanish, English, and Portuguese/Galician) formatted in accordance with the RFC2822 specification. Download the dataset [here](https://figshare.com/articles/dataset/Corpus_200_Emails/1326662?file=1936502)

This notebook will create an evaluation dataset for `mailcom` using 30 emails from `Corpus 200 emails` (10 emails per language).

For each email in the dataset, we record:
* email content
* email language
* detected dates in the email
* list of named entities (NE)
* pseudo content
* list of sentences
* list of sentences after email pseudonymization

To run the notebook in one go, toggle the following sections to collapse their cells:
* `General settings`
* `Copy files (run once)`
* `Create a draft version of the dataset (run once)`
* `Manually check and modify each email`

Then run all cells in each section at once.

All `csv` files are created under `"../../../data/"`, including:
* parse results obtained using `mailcom` (`eval_data_200_eml.csv`)
* updated data for each `eml` file (`eval_data_200_eml_idx{i}.csv` with `i` $\in [0..num\_of\_files)$)
* concatenated `csv` of updated data for all `eml` files (`checked_eval_data_200_eml_{num_of_files}_emails.csv`)

#### General settings

In [None]:
# Email numbers selected for each language (numbering starts with 1).
# Numbers below 10 are written as zero-padded strings, presumably to match
# the corpus file names such as "01.eml".
gl_emails = ["01", "02", "03", "04", 10, 12, 15]
pt_emails = [30, 36, 66]  # 30 is pt according to Google Translate but it is actually es
es_emails = ["05", "06", "07", "09", 11, 23, 28, 31, 33, 34]
en_emails = [13, 14, 19, 20, 22, 24, 32, 35, 37, 38]

# Derive the "<number>.eml" file name for every selected email.
gl_files = [f"{number}.eml" for number in gl_emails]
pt_files = [f"{number}.eml" for number in pt_emails]
es_files = [f"{number}.eml" for number in es_emails]
en_files = [f"{number}.eml" for number in en_emails]

# All selected files; the check guards against an email being listed twice.
chosen_files = [*gl_files, *pt_files, *es_files, *en_files]
assert len(set(chosen_files)) == 30

In [None]:
# directory that is searched for the corpus *.eml files in the copy step
source_dir = "../../../../eval_data_mailcom"
# directory the chosen *.eml files are copied to; mailcom reads its input from here
input_dir = "../../../mailcom/test/data_extended/200_eml"

In [None]:
# entity-group labels and the pseudonym placeholder that is stored together
# with each label when an NE is corrected or added manually below
misc_group = "MISC"
misc_pseudo = "[misc]"
org_group = "ORG"
org_pseudo = "[organization]"
loc_group = "LOC"
loc_pseudo = "[location]"

# repeated words, SonarQube
# (presumably kept in one list so the literals are not duplicated in the
# cells below -- the two entries differ only in the capitalisation of "With")
repeated_words = ["Expert Systems With Applications", 
                  "Expert Systems with Applications"]

#### Copy files (run once)

In [None]:
# run when needed!
# copy the chosen *.eml files from source_dir to input_dir
from pathlib import Path
import shutil

# shutil.copy() treats a destination that does not exist as a *file name*.
# Without this step every email would be written to (and overwrite) one
# single file named like input_dir, so make sure the directory exists first.
Path(input_dir).mkdir(parents=True, exist_ok=True)

source_files = Path(source_dir).glob("*.eml")
for source_file in source_files:
    # only the emails selected in the general settings are copied
    if source_file.name in chosen_files:
        shutil.copy(source_file, input_dir)
        print(f"Copied {source_file.name} to {input_dir}")

#### Create a draft version of the dataset (run once)

First, we use the language detection, date detection, and pseudonymization from `mailcom` to build the draft version of the dataset. Each email will be manually checked for validation later.

In [None]:
import mailcom

In [None]:
# activate language detection
# NOTE(review): an empty "default_lang" presumably makes mailcom detect the
# language of each email instead of assuming a fixed one -- confirm against
# the mailcom documentation.
# save_updated_settings=False: judging by the argument name, the modified
# settings are not written back to disk -- confirm.
new_settings = {"default_lang": ""}
workflow_settings = mailcom.get_workflow_settings(new_settings=new_settings, 
                                                  save_updated_settings=False)

In [None]:
# import files from input_dir
# (in_type="dir" because in_path points to a directory of *.eml files)
input_handler = mailcom.get_input_handler(in_path=input_dir, in_type="dir")

In [None]:
# process the input data
# runs mailcom on the handler's email list with the settings defined above;
# the return value is not used here, so the results are presumably stored on
# the email entries themselves -- confirm against the mailcom documentation
mailcom.process_data(input_handler.get_email_list(), workflow_settings)

In [None]:
# write output to csv
# this file is the draft dataset that is read again in "Preparing data";
# overwrite=True presumably replaces a file left over from an earlier run
mailcom.write_output_data(input_handler, "../../../data/eval_data_200_eml.csv", overwrite=True)

#### Manually check and modify each email

##### Preparing data

In [None]:
# define necessary functions

import pandas as pd
import mailcom
from mailcom.parse import Pseudonymize


# get workflow settings
# (same override as for the draft dataset: empty default language, and
# save_updated_settings=False)
new_settings = {"default_lang": ""}
workflow_settings = mailcom.get_workflow_settings(new_settings=new_settings, 
                                                  save_updated_settings=False)
# shared Pseudonymize instance used by add_new_ne and get_new_pseudo_content;
# it is built from the "pseudo_first_names" setting (empty dict if missing)
pseudonymizer = Pseudonymize(workflow_settings.get("pseudo_first_names", {}))


def print_email(email: dict):
    """Print one row of the draft dataset for manual inspection.

    Args:
        email: Mapping (e.g. a DataFrame row) with the keys ``file_name``,
            ``cleaned_content``, ``lang``, ``detected_datetime``,
            ``ne_sent``, ``ne_list``, ``sentences`` and ``pseudo_content``.
            ``ne_sent``, ``ne_list`` and ``sentences`` must be the string
            representations of lists, as read from the csv file.
    """
    print("file name:", email["file_name"])
    print("= Email cleaned content =======\n", email["cleaned_content"])
    print("= Email language =======\n", email["lang"])
    print("= Detected dates =======\n", email["detected_datetime"])
    print("= NE list =======")
    # NOTE(review): eval() executes whatever is stored in the csv. This is only
    # acceptable because the file is produced locally by this notebook; consider
    # ast.literal_eval if the stored values are plain Python literals.
    ne_sentences = eval(email["ne_sent"])
    named_entities = eval(email["ne_list"])
    for idx, (sent_idx, ne) in enumerate(zip(ne_sentences, named_entities)):
        # single quotes for the keys: reusing the outer quote character inside
        # an f-string is a SyntaxError before Python 3.12
        print(
            f"  {idx}- sentence {sent_idx}, {ne['word']} - {ne['entity_group']}"
            f" - {ne['start']} - {ne['end']} - {ne['pseudonym']}"
        )
    print("= Sentences =======\n")
    for i, sent in enumerate(eval(email["sentences"])):
        print(f"  {i}- {sent}")
    print("= Pseudo content =======\n", email["pseudo_content"])


def check_email_lang(file_name, lang) -> bool:
    """Tell whether the detected language matches the expected one.

    The expected language of a file is given by the list (``gl_files``,
    ``pt_files``, ``es_files`` or ``en_files``) that contains its name.

    Args:
        file_name: Name of the email file, e.g. ``"01.eml"``.
        lang: Language code detected for that email.

    Returns:
        True if ``file_name`` is listed for ``lang``; otherwise a message is
        printed and False is returned.
    """
    expected_by_lang = (
        ("gl", gl_files),
        ("pt", pt_files),
        ("es", es_files),
        ("en", en_files),
    )
    for code, files in expected_by_lang:
        if file_name in files and lang == code:
            return True

    print("Incorrect detected language for file:", file_name)
    return False


def find_left_offset(old_word, new_word):
    """Return how far the start index of an NE moves when its word changes.

    The shorter word is searched inside the longer one (when both have the
    same length, ``old_word`` is searched inside ``new_word``).

    Args:
        old_word: Word currently stored for the NE.
        new_word: Corrected word.

    Returns:
        The number of characters to add to the NE's ``start``: positive when
        ``new_word`` begins inside ``old_word`` (start moves right), zero or
        negative when ``old_word`` lies inside ``new_word`` (start moves left).

    Raises:
        ValueError: If neither word contains the other. Previously the ``-1``
            sentinel of ``str.find`` was returned as an offset of -1 / +1,
            which silently shifted the NE by one character.
    """
    old_longer = len(old_word) > len(new_word)
    if old_longer:
        position = old_word.find(new_word)
    else:
        position = new_word.find(old_word)

    if position < 0:
        raise ValueError(
            f"Cannot derive an offset: neither {old_word!r} nor {new_word!r} "
            "contains the other"
        )

    return position if old_longer else -position


def update_ne_word(item_idx, new_word, ne_list):
    """Replace the word of one NE in place and realign its span.

    Args:
        item_idx: Position of the NE in ``ne_list``.
        new_word: Corrected word.
        ne_list: List of NE dicts; the entry at ``item_idx`` is modified.

    The ``start`` index is shifted by the offset between the old and the new
    word (see ``find_left_offset``) and ``end`` is recomputed from the new
    ``start`` and the length of ``new_word``.
    """
    entry = ne_list[item_idx]
    previous_word = entry["word"]
    entry["word"] = new_word

    shift = find_left_offset(previous_word, entry["word"])
    entry["start"] += shift
    entry["end"] = entry["start"] + len(entry["word"])


def update_ne_group(item_idx, new_group, new_pseudo, ne_list):
    """Set the entity group and the pseudonym of one NE in place.

    Args:
        item_idx: Position of the NE in ``ne_list``.
        new_group: New value for the ``entity_group`` key.
        new_pseudo: New value for the ``pseudonym`` key.
        ne_list: List of NE dicts; the entry at ``item_idx`` is modified.
    """
    entry = ne_list[item_idx]
    entry.update(entity_group=new_group, pseudonym=new_pseudo)


def remove_incorrect_ne_indices(ne_list, ne_sent, incorrect_ne_indices):
    """Return copies of ``ne_list`` and ``ne_sent`` without the given positions.

    Args:
        ne_list: List of NEs.
        ne_sent: Sentence index of each NE, parallel to ``ne_list``.
        incorrect_ne_indices: Positions (into ``ne_list``) to drop.

    Returns:
        A tuple ``(new_ne_list, new_ne_sent)``; the inputs are not modified.
    """
    kept_positions = [
        pos for pos, _ in enumerate(ne_list) if pos not in incorrect_ne_indices
    ]
    new_ne_list = [ne_list[pos] for pos in kept_positions]
    new_ne_sent = [ne_sent[pos] for pos in kept_positions]
    return new_ne_list, new_ne_sent


def find_nth_occurrence(sentence, word, occurrence):
    """Return the start index of the n-th occurrence of ``word`` in ``sentence``.

    Occurrences are the non-overlapping literal matches found from left to
    right (``word`` is escaped, so it is never interpreted as a regex).

    Args:
        sentence: Text to search in.
        word: Literal text to look for.
        occurrence: 1-based number of the wanted occurrence.

    Returns:
        The start index of that occurrence, or -1 if there are fewer
        occurrences or ``occurrence`` is smaller than 1.
    """
    import re

    # occurrence is 1-based; 0 or a negative value used to index the match
    # list from the end and returned the wrong match (or raised IndexError)
    if occurrence < 1:
        return -1

    matches = re.finditer(re.escape(word), sentence)
    for count, match in enumerate(matches, start=1):
        if count == occurrence:
            return match.start()
    return -1


def add_new_ne(insert_idx, sent_idx, n_word, n_pseudonym, n_e_group, sentence, ocurrence, ne_list, ne_sent):
    """Insert a manually identified NE into ``ne_list`` / ``ne_sent`` in place.

    Args:
        insert_idx: Position at which the new NE is inserted in both lists.
        sent_idx: Index of the sentence that contains the NE.
        n_word: Text of the NE.
        n_pseudonym: Pseudonym stored for the NE.
        n_e_group: Entity group stored for the NE.
        sentence: Original sentence containing the NE.
        ocurrence: 1-based occurrence of ``n_word`` in the sentence to use.
        ne_list: List of NE dicts, modified in place.
        ne_sent: Sentence indices parallel to ``ne_list``, modified in place.

    Raises:
        ValueError: If the requested occurrence of ``n_word`` does not exist
            in the sentence after email pseudonymization. Previously such an
            NE was inserted with ``start == -1``, corrupting the data.
    """
    # important!
    # indices are determined after email pseudonymization!
    sentence = pseudonymizer.pseudonymize_email_addresses(sentence)

    # find the start and end of the new ne
    start = find_nth_occurrence(sentence, n_word, ocurrence)
    if start < 0:
        # fail before touching the lists so they stay consistent
        raise ValueError(
            f"Occurrence {ocurrence} of {n_word!r} not found in sentence "
            f"{sent_idx}: {sentence!r}"
        )

    new_ne = {
        "entity_group": n_e_group,
        "word": n_word,
        "pseudonym": n_pseudonym,
        "start": start,
        "end": start + len(n_word),
    }
    ne_list.insert(insert_idx, new_ne)
    ne_sent.insert(insert_idx, sent_idx)


def get_new_pseudo_content(df, row_idx):
    """Recompute the pseudonymized content of one email from its updated NEs.

    Args:
        df: DataFrame holding the dataset. For the row in question, ``ne_list``
            and ``ne_sent`` are iterated directly (they are expected to be the
            updated lists, not their string form), while
            ``sentences_after_email`` and ``detected_datetime`` are still
            strings and are passed through ``eval``.
        row_idx: Positional index of the email's row.

    Returns:
        The result of ``pseudonymizer.pseudonymize_with_updated_ne``.
    """
    # get updated values
    email = df.iloc[row_idx]

    # group the NEs by the (stringified) index of their sentence
    ne_sent_dict = {}
    for sent_idx, ne in zip(email["ne_sent"], email["ne_list"]):
        ne_sent_dict.setdefault(str(sent_idx), []).append(ne)

    # note. the indices of NE are only correct after the email is pseudonymized!
    sentences = eval(email["sentences_after_email"])
    lang = email["lang"]
    detected_dates = eval(email["detected_datetime"])
    pseudo_emailaddresses = workflow_settings.get("pseudo_emailaddresses", True)
    pseudo_ne = workflow_settings.get("pseudo_ne", True)
    pseudo_numbers = workflow_settings.get("pseudo_numbers", True)

    return pseudonymizer.pseudonymize_with_updated_ne(
        sentences,
        ne_sent_dict,
        lang,
        detected_dates,
        pseudo_emailaddresses,
        pseudo_ne,
        pseudo_numbers,
    )

In [None]:
# read the original csv
# (the draft dataset written by mailcom.write_output_data; list-valued columns
# such as ne_list are read back as strings, which is why the cells below
# eval() them)
df = pd.read_csv("../../../data/eval_data_200_eml.csv")

In [None]:
# mark incorrect indices of NE -- compared to the original
# Keys of the outer dict are row indices (as strings). Each value maps a code
# to a list of NE positions, as the codes are used in the cells below:
#   "u"          positions whose word/group is updated (the item_idx values)
#   "r"          positions passed to remove_incorrect_ne_indices
#   "a"          positions at which new NEs are inserted (the insert_idx values)
#   "r-c", "a-c", "r-c-c", "a-c-c"
#                presumably "continued" remove/add steps that are applied after
#                the previous steps have already shifted the positions
# Only the "r"/"r-c" lists are read by code; the other lists serve as a record
# of the manual changes.
incorrect_ne_indices = {}
incorrect_ne_indices["0"] = {"u": [2]}
incorrect_ne_indices["1"] = {"u": [4],
                             "r": [7],
                             "a": [2]} # the "r" and "a" cases are updated manually
incorrect_ne_indices["2"] = {"u": [2]}
incorrect_ne_indices["3"] = {"u": [1],
                             "r": [3]}
incorrect_ne_indices["4"] = {"u": [6, 8, 9],
                             "r": [10],
                             "a": [6, 4, 5]}
incorrect_ne_indices["5"] = {"u": [9]}
incorrect_ne_indices["6"] = {"u": [6],
                             "r": [5],
                             "a": [2]}
incorrect_ne_indices["7"] = {"a": [22],
                             "r": [10, 11]}
incorrect_ne_indices["8"] = {"u": [1, 4, 5, 6 ,7, 9, 10, 15, 20, 22, 27],
                             "r": [23, 31],
                             "a": [8],
                             "r-c": [2, 3]}
incorrect_ne_indices["9"] = {"u": [17]}
incorrect_ne_indices["10"] = {"u": [1, 19],
                              "r": [13, 20, 23]}
incorrect_ne_indices["11"] = {"u": [6, 9, 11, 13, 18, 20, 21],
                              "a": [25],
                              "r": [24],
                              "a-c": [12, 7],
                              "r-c": [5],
                              "a-c-c": [4, 2, 1]}
incorrect_ne_indices["12"] = {"u": [0, 5],
                              "a": [8, 9],
                              "r": [6],
                              "a-c": [1, 0]}
incorrect_ne_indices["13"] = {"r": [3]}
incorrect_ne_indices["14"] = {"u": [4, 10],
                              "r": [11, 6],
                              "a": [2, 1]}
incorrect_ne_indices["15"] = {"u": [11, 13],
                              "r": [12, 14],
                              "a": [3, 3]}
incorrect_ne_indices["16"] = {"u": [9, 16, 28, 40, 42, 44, 45, 51, 52, 55, 81, 82, 84, 87, 88],
                              "r": [89],
                              "a": [88, 88, 60, 58, 56, 56, 55, 53, 51, 41, 41, 40, 40],
                              "r-c": [22, 13],
                              "a-c": [10, 4]}
incorrect_ne_indices["17"] = {"u": [6, 11, 26, 30, 33, 35, 36, 39, 41],
                              "r": [37, 34, 27, 9, 7]}
incorrect_ne_indices["18"] = {"u": [9],
                              "a": [13, 13, 9]}
incorrect_ne_indices["19"] = {"u": [3],
                              "a": [3]}
incorrect_ne_indices["20"] = {"u": [1, 2],
                              "a": [4, 1, 1]}
incorrect_ne_indices["21"] = {"u": [1, 2]}
incorrect_ne_indices["22"] = {"u": [4, 6, 9, 10, 24, 25, 26, 27, 28, 29, 31],
                              "r": [33, 22, 14, 12, 8]}
incorrect_ne_indices["23"] = {"u": [2, 5, 8, 9, 11, 14],
                              "r": [1, 3, 6, 7, 12, 13, 16]}
incorrect_ne_indices["24"] = {"u": [0, 1, 3, 7, 9, 11, 13, 15, 17, 19, 21, 22, 23, 25, 27, 29, 38, 51, 54, 55, 57, 59],
                              "r": [60, 58, 41, 39, 35],
                              "a": [35],
                              "r-c": [30, 28, 26, 24, 18, 16, 14, 12, 10]}
incorrect_ne_indices["25"] = {"u": [1, 2, 6],
                              "r": [5, 4],
                              "a": [3, 2]}
incorrect_ne_indices["26"] = {"u": [4],
                              "r": [12],
                              "a": [9],
                              "r-c": [8],
                              "a-c": [7],
                              "r-c-c": [6, 5, 3]}
incorrect_ne_indices["27"] = {}
incorrect_ne_indices["28"] = {"u": [2, 3, 8, 11, 12],
                              "r": [13]}
incorrect_ne_indices["29"] = {"u": [1, 3],
                              "a": [4]}

##### Row idx 0

In [None]:
# select the row to check; row_idx and email are reused by all cells of this section
row_idx = 0
email = df.iloc[row_idx]
# prints a message if the detected language does not match the expected one
check_email_lang(email["file_name"], email["lang"])
# show content, language, dates, NEs, sentences and pseudo content for review
print_email(email)

In [None]:
# update incorrect ne manually, if any
# the csv stores both lists as strings, so they are turned back into Python
# lists here
# NOTE(review): eval() executes the csv content; fine for the locally produced
# file, but do not use it on data from an untrusted source
ne_list = eval(email["ne_list"])
ne_sent = eval(email["ne_sent"])

In [None]:
# update incorrect ne
# NE at position 2 (listed under "u" for row "0" in incorrect_ne_indices):
# replace its word; update_ne_word also realigns start and end
item_idx = 2
new_word = "Galiza"
update_ne_word(item_idx, new_word, ne_list)

# remove incorrect indices from ne_list and ne_sent of email

# add new ne, if any

# continue removing, if any

In [None]:
# update email dict
# the corrected lists are stored as Python lists (not strings);
# get_new_pseudo_content iterates them directly
df.at[row_idx, "ne_list"] = ne_list
df.at[row_idx, "ne_sent"] = ne_sent

# get new pseudo content
updated_pseudo_content = get_new_pseudo_content(df, row_idx)
# last expression of the cell: the notebook displays it for a visual check
updated_pseudo_content

In [None]:
# update pseudo content in df
# (done in a separate cell, after the displayed result has been checked)
df.at[row_idx, "pseudo_content"] = updated_pseudo_content

In [None]:
# other updates, if any

In [None]:
# double check df using data wrangler

In [None]:
# save updated df to csv
# df.iloc[[row_idx]] (list indexer) keeps a one-row DataFrame, so the file has
# the same columns as the original csv; the row index is not written
df.iloc[[row_idx]].to_csv(f"../../../data/eval_data_200_eml_idx{row_idx}.csv", index=False)

##### Row idx 1

In [None]:
row_idx = 1
email = df.iloc[row_idx]
check_email_lang(email["file_name"], email["lang"])
print_email(email)

In [None]:
# update incorrect ne manually, if any
ne_list = eval(email["ne_list"])
ne_sent = eval(email["ne_sent"])

In [None]:
# update incorrect ne
item_idx = 4
new_word = "Proxecto Fin de Carreira"
update_ne_word(item_idx, new_word, ne_list)

# remove incorrect indices from ne_list and ne_sent of email
ne_list, ne_sent = remove_incorrect_ne_indices(ne_list, ne_sent, incorrect_ne_indices[str(row_idx)]["r"])

# add new ne, if any
insert_idx = 2
sent_idx = 0
n_word = "Sala de Xuntas"
n_pseudonym = loc_pseudo
n_entity_group = loc_group
occurrence = 1
add_new_ne(insert_idx, sent_idx, n_word, n_pseudonym, n_entity_group,
            eval(email["sentences"])[sent_idx], occurrence, ne_list, ne_sent)

# continue removing, if any

In [None]:
# update email in df
df.at[row_idx, "ne_list"] = ne_list
df.at[row_idx, "ne_sent"] = ne_sent

# get new pseudo content
updated_pseudo_content = get_new_pseudo_content(df, row_idx)
updated_pseudo_content

In [None]:
# update pseudo content in df
df.at[row_idx, "pseudo_content"] = updated_pseudo_content

In [None]:
# other updates, if any

In [None]:
# double check df using data wrangler

In [None]:
# save updated df to csv
df.iloc[[row_idx]].to_csv(f"../../../data/eval_data_200_eml_idx{row_idx}.csv", index=False)

##### Row idx 2

In [None]:
row_idx = 2
email = df.iloc[row_idx]
check_email_lang(email["file_name"], email["lang"])
print_email(email)

In [None]:
# update incorrect ne manually, if any
ne_list = eval(email["ne_list"])
ne_sent = eval(email["ne_sent"])

In [None]:
# update incorrect ne
item_idx = 2
update_ne_group(item_idx, org_group, org_pseudo, ne_list)

# remove incorrect indices from ne_list and ne_sent of email

# add new ne, if any

# continue removing, if any

In [None]:
# update email in df
df.at[row_idx, "ne_list"] = ne_list
df.at[row_idx, "ne_sent"] = ne_sent

# get new pseudo content
updated_pseudo_content = get_new_pseudo_content(df, row_idx)
updated_pseudo_content

In [None]:
# update pseudo content in df
df.at[row_idx, "pseudo_content"] = updated_pseudo_content

In [None]:
# other updates, if any

In [None]:
# double check df using data wrangler

In [None]:
# save updated df to csv
df.iloc[[row_idx]].to_csv(f"../../../data/eval_data_200_eml_idx{row_idx}.csv", index=False)

##### Row idx 3

In [None]:
row_idx = 3
email = df.iloc[row_idx]
check_email_lang(email["file_name"], email["lang"])
print_email(email)

In [None]:
# update incorrect ne manually, if any
ne_list = eval(email["ne_list"])
ne_sent = eval(email["ne_sent"])

In [None]:
# update incorrect ne
item_idx = 1
update_ne_group(item_idx, org_group, org_pseudo, ne_list)

# remove incorrect indices from ne_list and ne_sent of email
ne_list, ne_sent = remove_incorrect_ne_indices(ne_list, ne_sent, incorrect_ne_indices[str(row_idx)]["r"])

# add new ne, if any

# continue removing, if any

In [None]:
# update email in df
df.at[row_idx, "ne_list"] = ne_list
df.at[row_idx, "ne_sent"] = ne_sent

# get new pseudo content
updated_pseudo_content = get_new_pseudo_content(df, row_idx)
updated_pseudo_content

In [None]:
# update pseudo content in df
df.at[row_idx, "pseudo_content"] = updated_pseudo_content

In [None]:
# other updates, if any

In [None]:
# double check df using data wrangler

In [None]:
# save updated df to csv
df.iloc[[row_idx]].to_csv(f"../../../data/eval_data_200_eml_idx{row_idx}.csv", index=False)

##### Row idx 4

In [None]:
row_idx = 4
email = df.iloc[row_idx]
check_email_lang(email["file_name"], email["lang"])
print_email(email)

In [None]:
# update incorrect ne manually, if any
ne_list = eval(email["ne_list"])
ne_sent = eval(email["ne_sent"])

In [None]:
# update incorrect ne
item_idx = 6
new_word = "SPEA"
update_ne_word(item_idx, new_word, ne_list)

item_idx = 8
update_ne_group(item_idx, misc_group, misc_pseudo, ne_list)

item_idx = 9
new_word = "SPEA2"
update_ne_word(item_idx, new_word, ne_list)
update_ne_group(item_idx, misc_group, misc_pseudo, ne_list)

# remove incorrect indices from ne_list and ne_sent of email
ne_list, ne_sent = remove_incorrect_ne_indices(ne_list, ne_sent, incorrect_ne_indices[str(row_idx)]["r"])

# add new ne, if any
insert_idx = 6
sent_idx = 4
n_word = "NSGAII"
n_pseudonym = misc_pseudo
n_entity_group = misc_group
occurrence = 1
add_new_ne(insert_idx, sent_idx, n_word, n_pseudonym, n_entity_group,
            eval(email["sentences"])[sent_idx], occurrence, ne_list, ne_sent)

insert_idx = 4
sent_idx = 2
n_word = "NSGAII"
n_pseudonym = misc_pseudo
n_entity_group = misc_group
occurrence = 1
add_new_ne(insert_idx, sent_idx, n_word, n_pseudonym, n_entity_group,
            eval(email["sentences"])[sent_idx], occurrence, ne_list, ne_sent)

insert_idx = 5
sent_idx = 2
n_word = "SPEA2"
n_pseudonym = misc_pseudo
n_entity_group = misc_group
occurrence = 1
add_new_ne(insert_idx, sent_idx, n_word, n_pseudonym, n_entity_group,
            eval(email["sentences"])[sent_idx], occurrence, ne_list, ne_sent)

# continue removing, if any

In [None]:
# update email in df
df.at[row_idx, "ne_list"] = ne_list
df.at[row_idx, "ne_sent"] = ne_sent

# get new pseudo content
updated_pseudo_content = get_new_pseudo_content(df, row_idx)
updated_pseudo_content

In [None]:
# update pseudo content in df
df.at[row_idx, "pseudo_content"] = updated_pseudo_content

In [None]:
# other updates, if any

In [None]:
# double check df using data wrangler

In [None]:
# save updated df to csv
df.iloc[[row_idx]].to_csv(f"../../../data/eval_data_200_eml_idx{row_idx}.csv", index=False)

##### Row idx 5

In [None]:
row_idx = 5
email = df.iloc[row_idx]
check_email_lang(email["file_name"], email["lang"])
print_email(email)

In [None]:
# update incorrect ne manually, if any
ne_list = eval(email["ne_list"])
ne_sent = eval(email["ne_sent"])

In [None]:
# update incorrect ne
item_idx = 9
new_word = "Gabinete Juridico"
update_ne_word(item_idx, new_word, ne_list)
update_ne_group(item_idx, loc_group, loc_pseudo, ne_list)

# remove incorrect indices from ne_list and ne_sent of email

# add new ne, if any

# continue removing, if any

In [None]:
# update email in df
df.at[row_idx, "ne_list"] = ne_list
df.at[row_idx, "ne_sent"] = ne_sent

# get new pseudo content
updated_pseudo_content = get_new_pseudo_content(df, row_idx)
updated_pseudo_content

In [None]:
# update pseudo content in df
df.at[row_idx, "pseudo_content"] = updated_pseudo_content

In [None]:
# other updates, if any

In [None]:
# double check df using data wrangler

In [None]:
# save updated df to csv
df.iloc[[row_idx]].to_csv(f"../../../data/eval_data_200_eml_idx{row_idx}.csv", index=False)

##### Row idx 6

In [None]:
row_idx = 6
email = df.iloc[row_idx]
check_email_lang(email["file_name"], email["lang"])
print_email(email)

In [None]:
# update incorrect ne manually, if any
ne_list = eval(email["ne_list"])
ne_sent = eval(email["ne_sent"])

In [None]:
# update incorrect ne
item_idx = 6
update_ne_group(item_idx, org_group, org_pseudo, ne_list)

# remove incorrect indices from ne_list and ne_sent of email
ne_list, ne_sent = remove_incorrect_ne_indices(ne_list, ne_sent, incorrect_ne_indices[str(row_idx)]["r"])

# add new ne, if any
insert_idx = 2
sent_idx = 1
n_word = "ees"
n_pseudonym = org_pseudo
n_entity_group = org_group
occurrence = 1
add_new_ne(insert_idx, sent_idx, n_word, n_pseudonym, n_entity_group,
            eval(email["sentences"])[sent_idx], occurrence, ne_list, ne_sent)

# continue removing, if any

In [None]:
# update email in df
df.at[row_idx, "ne_list"] = ne_list
df.at[row_idx, "ne_sent"] = ne_sent

# get new pseudo content
updated_pseudo_content = get_new_pseudo_content(df, row_idx)
updated_pseudo_content

In [None]:
# update pseudo content in df
df.at[row_idx, "pseudo_content"] = updated_pseudo_content

In [None]:
# other updates, if any

In [None]:
# double check df using data wrangler

In [None]:
# save updated df to csv
df.iloc[[row_idx]].to_csv(f"../../../data/eval_data_200_eml_idx{row_idx}.csv", index=False)

##### Row idx 7

In [None]:
row_idx = 7
email = df.iloc[row_idx]
check_email_lang(email["file_name"], email["lang"])
print_email(email)

In [None]:
# update incorrect ne manually, if any
ne_list = eval(email["ne_list"])
ne_sent = eval(email["ne_sent"])

In [None]:
# update incorrect ne

# add new ne, if any
insert_idx = 22
sent_idx = 11
n_word = "Estatuto"
n_pseudonym = misc_pseudo
n_entity_group = misc_group
occurrence = 1
add_new_ne(insert_idx, sent_idx, n_word, n_pseudonym, n_entity_group,
            eval(email["sentences"])[sent_idx], occurrence, ne_list, ne_sent)

# remove incorrect indices from ne_list and ne_sent of email
ne_list, ne_sent = remove_incorrect_ne_indices(ne_list, ne_sent, incorrect_ne_indices[str(row_idx)]["r"])

# continue removing, if any

In [None]:
# update email in df
df.at[row_idx, "ne_list"] = ne_list
df.at[row_idx, "ne_sent"] = ne_sent

# get new pseudo content
updated_pseudo_content = get_new_pseudo_content(df, row_idx)
updated_pseudo_content

In [None]:
# update pseudo content in df
df.at[row_idx, "pseudo_content"] = updated_pseudo_content

In [None]:
# other updates, if any
df.at[row_idx, "detected_datetime"] = ["07 de mayo de 2013 12:52"]

In [None]:
# double check df using data wrangler

In [None]:
# save updated df to csv
df.iloc[[row_idx]].to_csv(f"../../../data/eval_data_200_eml_idx{row_idx}.csv", index=False)

##### Row idx 8

In [None]:
row_idx = 8
email = df.iloc[row_idx]
check_email_lang(email["file_name"], email["lang"])
print_email(email)

In [None]:
# update incorrect ne manually, if any
ne_list = eval(email["ne_list"])
ne_sent = eval(email["ne_sent"])

In [None]:
# update incorrect ne
item_idx = 1
new_word = "Enxeñería Informática"
update_ne_word(item_idx, new_word, ne_list)

item_idx = 4
new_word = "Campus de Ourense"
update_ne_word(item_idx, new_word, ne_list)

item_idx = 5
new_word = "nasassocialmedia"
update_ne_word(item_idx, new_word, ne_list)

item_idx = 6
new_word = "Enxeñaría Informática"
update_ne_word(item_idx, new_word, ne_list)

item_idx = 7
new_word = "Enxeñaría Informática"
update_ne_word(item_idx, new_word, ne_list)
update_ne_group(item_idx, misc_group, misc_pseudo, ne_list)

item_idx = 9
new_word = "nasassocialmedia"
update_ne_word(item_idx, new_word, ne_list)

item_idx = 10
new_word = "nasassocialmedia"
update_ne_word(item_idx, new_word, ne_list)

item_idx = 15
new_word = "Campus de Ourense"
update_ne_word(item_idx, new_word, ne_list)

item_idx = 20
new_word = "nasassocialmedia"
update_ne_word(item_idx, new_word, ne_list)

item_idx = 22
new_word = "nasassocialmedia"
update_ne_word(item_idx, new_word, ne_list)

item_idx = 27
new_word = "Edificio Politécnico"
update_ne_word(item_idx, new_word, ne_list)

# remove incorrect indices from ne_list and ne_sent of email
ne_list, ne_sent = remove_incorrect_ne_indices(ne_list, ne_sent, incorrect_ne_indices[str(row_idx)]["r"])

# add new ne, if any
insert_idx = 8
sent_idx = 3
n_word = "LRU"
n_pseudonym = misc_pseudo
n_entity_group = misc_group
add_new_ne(insert_idx, sent_idx, n_word, n_pseudonym, n_entity_group, 
            eval(email["sentences"])[sent_idx], 1, ne_list, ne_sent)

# continue removing, if any
ne_list, ne_sent = remove_incorrect_ne_indices(ne_list, ne_sent, incorrect_ne_indices[str(row_idx)]["r-c"])

In [None]:
# update email in df
df.at[row_idx, "ne_list"] = ne_list
df.at[row_idx, "ne_sent"] = ne_sent

# get new pseudo content
updated_pseudo_content = get_new_pseudo_content(df, row_idx)
updated_pseudo_content

In [None]:
# update pseudo content in df
df.at[row_idx, "pseudo_content"] = updated_pseudo_content

In [None]:
# other updates, if any

In [None]:
# double check df using data wrangler

In [None]:
# save updated df to csv
df.iloc[[row_idx]].to_csv(f"../../../data/eval_data_200_eml_idx{row_idx}.csv", index=False)

##### Row idx 9

In [None]:
row_idx = 9
email = df.iloc[row_idx]
check_email_lang(email["file_name"], email["lang"])
print_email(email)

In [None]:
# update incorrect ne manually, if any
ne_list = eval(email["ne_list"])
ne_sent = eval(email["ne_sent"])

In [None]:
# update incorrect ne
item_idx = 17
update_ne_group(item_idx, org_group, org_pseudo, ne_list)

# add new ne, if any

# remove incorrect indices from ne_list and ne_sent of email

# continue removing, if any

In [None]:
# update email in df
df.at[row_idx, "ne_list"] = ne_list
df.at[row_idx, "ne_sent"] = ne_sent

# get new pseudo content
updated_pseudo_content = get_new_pseudo_content(df, row_idx)
updated_pseudo_content

In [None]:
# update pseudo content in df
df.at[row_idx, "pseudo_content"] = updated_pseudo_content

In [None]:
# other updates, if any

In [None]:
# double check df using data wrangler

In [None]:
# save updated df to csv
df.iloc[[row_idx]].to_csv(f"../../../data/eval_data_200_eml_idx{row_idx}.csv", index=False)

##### Row idx 10

In [None]:
row_idx = 10
email = df.iloc[row_idx]
check_email_lang(email["file_name"], email["lang"])
print_email(email)

In [None]:
# update incorrect ne manually, if any
ne_list = eval(email["ne_list"])
ne_sent = eval(email["ne_sent"])

In [None]:
# update incorrect ne
item_idx = 1
new_word = repeated_words[0]
update_ne_word(item_idx, new_word, ne_list)

item_idx = 19
new_word = "OX5 1GB"
update_ne_word(item_idx, new_word, ne_list)

# remove incorrect indices from ne_list and ne_sent of email
ne_list, ne_sent = remove_incorrect_ne_indices(ne_list, ne_sent, incorrect_ne_indices[str(row_idx)]["r"])

# add new ne, if any

# continue removing, if any

In [None]:
# update email in df
df.at[row_idx, "ne_list"] = ne_list
df.at[row_idx, "ne_sent"] = ne_sent

# get new pseudo content
updated_pseudo_content = get_new_pseudo_content(df, row_idx)
updated_pseudo_content

In [None]:
# update pseudo content in df
df.at[row_idx, "pseudo_content"] = updated_pseudo_content

In [None]:
# other updates, if any

In [None]:
# double check df using data wrangler

In [None]:
# save updated df to csv
df.iloc[[row_idx]].to_csv(f"../../../data/eval_data_200_eml_idx{row_idx}.csv", index=False)

##### Row idx 11

In [None]:
row_idx = 11
email = df.iloc[row_idx]
check_email_lang(email["file_name"], email["lang"])
print_email(email)

In [None]:
# update incorrect ne manually, if any
ne_list = eval(email["ne_list"])
ne_sent = eval(email["ne_sent"])

In [None]:
# update incorrect ne
item_idx = 6
new_word = "eswa"
update_ne_word(item_idx, new_word, ne_list)

item_idx = 9
update_ne_group(item_idx, misc_group, misc_pseudo, ne_list)

item_idx = 11
update_ne_group(item_idx, misc_group, misc_pseudo, ne_list)

item_idx = 13
update_ne_group(item_idx, misc_group, misc_pseudo, ne_list)

item_idx = 18
update_ne_group(item_idx, misc_group, misc_pseudo, ne_list)

item_idx = 20
update_ne_group(item_idx, misc_group, misc_pseudo, ne_list)

item_idx = 21
update_ne_group(item_idx, misc_group, misc_pseudo, ne_list)


# add new ne, if any
insert_idx = 25
sent_idx = 21
n_word = repeated_words[0]
n_pseudonym = org_pseudo
n_entity_group = org_group
occurrence = 1
add_new_ne(insert_idx, sent_idx, n_word, n_pseudonym, n_entity_group,
            eval(email["sentences"])[sent_idx], occurrence, ne_list, ne_sent)

# remove incorrect indices from ne_list and ne_sent of email
ne_list, ne_sent = remove_incorrect_ne_indices(ne_list, ne_sent, incorrect_ne_indices[str(row_idx)]["r"])

# add new ne, if any
insert_idx = 12
sent_idx = 12
n_word = "EES"
n_pseudonym = misc_pseudo
n_entity_group = misc_group
occurrence = 1
add_new_ne(insert_idx, sent_idx, n_word, n_pseudonym, n_entity_group,
            eval(email["sentences"])[sent_idx], occurrence, ne_list, ne_sent)

# add new ne, if any
insert_idx = 7
sent_idx = 6
n_word = "ees"
n_pseudonym = misc_pseudo
n_entity_group = misc_group
occurrence = 2
add_new_ne(insert_idx, sent_idx, n_word, n_pseudonym, n_entity_group,
            eval(email["sentences"])[sent_idx], occurrence, ne_list, ne_sent)

# remove incorrect indices from ne_list and ne_sent of email
ne_list, ne_sent = remove_incorrect_ne_indices(ne_list, ne_sent, incorrect_ne_indices[str(row_idx)]["r-c"])

# add new ne, if any
insert_idx = 4
sent_idx = 6
n_word = "ees"
n_pseudonym = misc_pseudo
n_entity_group = misc_group
occurrence = 1
add_new_ne(insert_idx, sent_idx, n_word, n_pseudonym, n_entity_group,
            eval(email["sentences"])[sent_idx], occurrence, ne_list, ne_sent)

insert_idx = 2
sent_idx = 3
n_word = repeated_words[0]
n_pseudonym = org_pseudo
n_entity_group = org_group
occurrence = 2
add_new_ne(insert_idx, sent_idx, n_word, n_pseudonym, n_entity_group,
            eval(email["sentences"])[sent_idx], occurrence, ne_list, ne_sent)

insert_idx = 1
sent_idx = 3
n_word = repeated_words[0]
n_pseudonym = org_pseudo
n_entity_group = org_group
occurrence = 1
add_new_ne(insert_idx, sent_idx, n_word, n_pseudonym, n_entity_group,
            eval(email["sentences"])[sent_idx], occurrence, ne_list, ne_sent)

In [None]:
# update email in df
df.at[row_idx, "ne_list"] = ne_list
df.at[row_idx, "ne_sent"] = ne_sent

# get new pseudo content
updated_pseudo_content = get_new_pseudo_content(df, row_idx)
updated_pseudo_content

In [None]:
# update pseudo content in df
df.at[row_idx, "pseudo_content"] = updated_pseudo_content

In [None]:
# other updates, if any

In [None]:
# double check df using data wrangler

In [None]:
# save updated df to csv
df.iloc[[row_idx]].to_csv(f"../../../data/eval_data_200_eml_idx{row_idx}.csv", index=False)

##### Row idx 12

In [None]:
row_idx = 12
email = df.iloc[row_idx]
check_email_lang(email["file_name"], email["lang"])
print_email(email)

In [None]:
# update incorrect ne manually, if any
ne_list = eval(email["ne_list"])
ne_sent = eval(email["ne_sent"])

In [None]:
# replace the word of existing ne_list entries
for item_idx, new_word in [
    (0, "Organización Académica, Profesorado e Titulacións"),
    (5, "Ingeniería Informática"),
]:
    update_ne_word(item_idx, new_word, ne_list)

# new entities added before the removal step; each tuple is
# (insert_idx, sent_idx, n_word, n_pseudonym, n_entity_group, occurrence)
for insert_idx, sent_idx, n_word, n_pseudonym, n_entity_group, occurrence in [
    (8, 1, "Vicerrectoría", misc_pseudo, misc_group, 1),
    (9, 1, "Organización Académica, Profesorado e Titulacións", org_pseudo, org_group, 1),
]:
    add_new_ne(insert_idx, sent_idx, n_word, n_pseudonym, n_entity_group,
               eval(email["sentences"])[sent_idx], occurrence, ne_list, ne_sent)

# drop the indices stored under "r" for this row
ne_list, ne_sent = remove_incorrect_ne_indices(
    ne_list, ne_sent, incorrect_ne_indices[str(row_idx)]["r"]
)

# new entities added after the removal step, same tuple layout as above
for insert_idx, sent_idx, n_word, n_pseudonym, n_entity_group, occurrence in [
    (1, 0, "Reitoría", org_pseudo, org_group, 1),
    (0, 0, "Vicerreitoría", misc_pseudo, misc_group, 1),
]:
    add_new_ne(insert_idx, sent_idx, n_word, n_pseudonym, n_entity_group,
               eval(email["sentences"])[sent_idx], occurrence, ne_list, ne_sent)

# continue removing, if any

In [None]:
# update email in df
# NOTE(review): df.at is label-based while `email` was read with positional
# df.iloc; the two agree only for a default 0..n-1 index - confirm.
df.at[row_idx, "ne_list"] = ne_list
df.at[row_idx, "ne_sent"] = ne_sent

# get new pseudo content; the bare name on the last line shows it as cell output
updated_pseudo_content = get_new_pseudo_content(df, row_idx)
updated_pseudo_content

In [None]:
# update pseudo content in df with the value computed in the previous cell
df.at[row_idx, "pseudo_content"] = updated_pseudo_content

In [None]:
# other updates, if any

In [None]:
# double check df using data wrangler

In [None]:
# save updated df to csv: [[row_idx]] selects a one-row DataFrame (not a Series),
# and index=False omits the index column from the file
df.iloc[[row_idx]].to_csv(f"../../../data/eval_data_200_eml_idx{row_idx}.csv", index=False)

##### Row idx 13

In [None]:
# Pick the email at position row_idx and show it for manual review.
row_idx = 13
email = df.iloc[row_idx]  # positional lookup; `email` is read again by the following cells
check_email_lang(email["file_name"], email["lang"])
print_email(email)

In [None]:
# Parse the stringified "ne_list" and "ne_sent" values of this email.
# NOTE(review): eval() runs arbitrary expressions; presumably fine because the
# CSV is produced by this notebook, but do not reuse on untrusted files.
ne_list = eval(email["ne_list"])
ne_sent = eval(email["ne_sent"])

In [None]:
# update incorrect ne

# add new ne, if any

# remove incorrect indices from ne_list and ne_sent of email
# (the indices are the "r" entry stored for this row in incorrect_ne_indices)
ne_list, ne_sent = remove_incorrect_ne_indices(ne_list, ne_sent, incorrect_ne_indices[str(row_idx)]["r"])

# continue removing, if any

In [None]:
# update email in df
# NOTE(review): df.at is label-based while `email` was read with positional
# df.iloc; the two agree only for a default 0..n-1 index - confirm.
df.at[row_idx, "ne_list"] = ne_list
df.at[row_idx, "ne_sent"] = ne_sent

# get new pseudo content; the bare name on the last line shows it as cell output
updated_pseudo_content = get_new_pseudo_content(df, row_idx)
updated_pseudo_content

In [None]:
# update pseudo content in df with the value computed in the previous cell
df.at[row_idx, "pseudo_content"] = updated_pseudo_content

In [None]:
# other updates, if any

In [None]:
# double check df using data wrangler

In [None]:
# save updated df to csv: [[row_idx]] selects a one-row DataFrame (not a Series),
# and index=False omits the index column from the file
df.iloc[[row_idx]].to_csv(f"../../../data/eval_data_200_eml_idx{row_idx}.csv", index=False)

##### Row idx 14

In [None]:
# Pick the email at position row_idx and show it for manual review.
row_idx = 14
email = df.iloc[row_idx]  # positional lookup; `email` is read again by the following cells
check_email_lang(email["file_name"], email["lang"])
print_email(email)

In [None]:
# Parse the stringified "ne_list" and "ne_sent" values of this email.
# NOTE(review): eval() runs arbitrary expressions; presumably fine because the
# CSV is produced by this notebook, but do not reuse on untrusted files.
ne_list = eval(email["ne_list"])
ne_sent = eval(email["ne_sent"])

In [None]:
# corrections to existing ne_list entries: group/pseudonym first, then a word
item_idx = 4
update_ne_group(item_idx, misc_group, misc_pseudo, ne_list)

item_idx, new_word = 10, "Igrexa de Santa Eufemia"
update_ne_word(item_idx, new_word, ne_list)

# drop the indices stored under "r" for this row
ne_list, ne_sent = remove_incorrect_ne_indices(
    ne_list, ne_sent, incorrect_ne_indices[str(row_idx)]["r"]
)

# new entities; each tuple is
# (insert_idx, sent_idx, n_word, n_pseudonym, n_entity_group, occurrence)
for insert_idx, sent_idx, n_word, n_pseudonym, n_entity_group, occurrence in [
    (2, 0, "Edificio de Facultades", loc_pseudo, loc_group, 1),
    (1, 0, "Vicerreitoría", misc_pseudo, misc_group, 1),
]:
    add_new_ne(insert_idx, sent_idx, n_word, n_pseudonym, n_entity_group,
               eval(email["sentences"])[sent_idx], occurrence, ne_list, ne_sent)

# continue removing, if any

In [None]:
# update email in df
# NOTE(review): df.at is label-based while `email` was read with positional
# df.iloc; the two agree only for a default 0..n-1 index - confirm.
df.at[row_idx, "ne_list"] = ne_list
df.at[row_idx, "ne_sent"] = ne_sent

# get new pseudo content; the bare name on the last line shows it as cell output
updated_pseudo_content = get_new_pseudo_content(df, row_idx)
updated_pseudo_content

In [None]:
# update pseudo content in df with the value computed in the previous cell
df.at[row_idx, "pseudo_content"] = updated_pseudo_content

In [None]:
# other updates, if any

In [None]:
# double check df using data wrangler

In [None]:
# save updated df to csv: [[row_idx]] selects a one-row DataFrame (not a Series),
# and index=False omits the index column from the file
df.iloc[[row_idx]].to_csv(f"../../../data/eval_data_200_eml_idx{row_idx}.csv", index=False)

##### Row idx 15

In [None]:
# Pick the email at position row_idx and show it for manual review.
row_idx = 15
email = df.iloc[row_idx]  # positional lookup; `email` is read again by the following cells
check_email_lang(email["file_name"], email["lang"])
print_email(email)

In [None]:
# Parse the stringified "ne_list" and "ne_sent" values of this email.
# NOTE(review): eval() runs arbitrary expressions; presumably fine because the
# CSV is produced by this notebook, but do not reuse on untrusted files.
ne_list = eval(email["ne_list"])
ne_sent = eval(email["ne_sent"])

In [None]:
# replace the word of existing ne_list entries
for item_idx, new_word in [
    (11, "Fundación Empresa  Universidad Gallega FEUGA"),
    (13, "Rúa Lope Gómez de Marzoa"),
]:
    update_ne_word(item_idx, new_word, ne_list)

# drop the indices stored under "r" for this row
ne_list, ne_sent = remove_incorrect_ne_indices(
    ne_list, ne_sent, incorrect_ne_indices[str(row_idx)]["r"]
)

# new entities; each tuple is
# (insert_idx, sent_idx, n_word, n_pseudonym, n_entity_group, occurrence)
for insert_idx, sent_idx, n_word, n_pseudonym, n_entity_group, occurrence in [
    (3, 3, "local A9", loc_pseudo, loc_group, 1),
    (3, 3, "Área Comercial", misc_pseudo, misc_group, 1),
]:
    add_new_ne(insert_idx, sent_idx, n_word, n_pseudonym, n_entity_group,
               eval(email["sentences"])[sent_idx], occurrence, ne_list, ne_sent)

# continue removing, if any

In [None]:
# update email in df
# NOTE(review): df.at is label-based while `email` was read with positional
# df.iloc; the two agree only for a default 0..n-1 index - confirm.
df.at[row_idx, "ne_list"] = ne_list
df.at[row_idx, "ne_sent"] = ne_sent

# get new pseudo content; the bare name on the last line shows it as cell output
updated_pseudo_content = get_new_pseudo_content(df, row_idx)
updated_pseudo_content

In [None]:
# update pseudo content in df with the value computed in the previous cell
df.at[row_idx, "pseudo_content"] = updated_pseudo_content

In [None]:
# other updates, if any

In [None]:
# double check df using data wrangler

In [None]:
# save updated df to csv: [[row_idx]] selects a one-row DataFrame (not a Series),
# and index=False omits the index column from the file
df.iloc[[row_idx]].to_csv(f"../../../data/eval_data_200_eml_idx{row_idx}.csv", index=False)

##### Row idx 16

In [None]:
# Pick the email at position row_idx and show it for manual review.
row_idx = 16
email = df.iloc[row_idx]  # positional lookup; `email` is read again by the following cells
check_email_lang(email["file_name"], email["lang"])
print_email(email)

In [None]:
# Parse the stringified "ne_list" and "ne_sent" values of this email.
# NOTE(review): eval() runs arbitrary expressions; presumably fine because the
# CSV is produced by this notebook, but do not reuse on untrusted files.
ne_list = eval(email["ne_list"])
ne_sent = eval(email["ne_sent"])

In [None]:
# corrections to existing ne_list entries, applied in the original review order
for item_idx, new_word in [
    (9, "International Office"),
    (16, "ERASMUS"),
    (28, "Alemán"),
    (40, "Testdaf"),
]:
    update_ne_word(item_idx, new_word, ne_list)

item_idx = 42
update_ne_group(item_idx, misc_group, misc_pseudo, ne_list)

item_idx, new_word = 44, "Francés"
update_ne_word(item_idx, new_word, ne_list)

item_idx = 45
update_ne_group(item_idx, misc_group, misc_pseudo, ne_list)

for item_idx in [51, 52]:
    update_ne_group(item_idx, org_group, org_pseudo, ne_list)

item_idx, new_word = 55, "FIRST Cambridge"
update_ne_word(item_idx, new_word, ne_list)

item_idx = 81
update_ne_group(item_idx, misc_group, misc_pseudo, ne_list)

# entries that need both a new word and the misc group/pseudonym
for item_idx, new_word in [(82, "bubela"), (84, "bubela")]:
    update_ne_word(item_idx, new_word, ne_list)
    update_ne_group(item_idx, misc_group, misc_pseudo, ne_list)

for item_idx, new_word in [(87, "uvigo"), (88, "uvigo")]:
    update_ne_word(item_idx, new_word, ne_list)

# drop the indices stored under "r" for this row
ne_list, ne_sent = remove_incorrect_ne_indices(
    ne_list, ne_sent, incorrect_ne_indices[str(row_idx)]["r"]
)

# new entities; each tuple is
# (insert_idx, sent_idx, n_word, n_pseudonym, n_entity_group, occurrence)
for insert_idx, sent_idx, n_word, n_pseudonym, n_entity_group, occurrence in [
    (88, 21, "facebook", misc_pseudo, misc_group, 1),
    (88, 21, "Facebook", misc_pseudo, misc_group, 1),
    (60, 17, "francés", misc_pseudo, misc_group, 1),
    (58, 17, "Bulats", misc_pseudo, misc_group, 2),
    (56, 17, "ISE II", misc_pseudo, misc_group, 1),
    (56, 17, "Trinity", org_pseudo, org_group, 1),
    (55, 17, "Bulats", misc_pseudo, misc_group, 1),
    (53, 16, "DELF", misc_pseudo, misc_group, 1),
    (51, 15, "TCF", misc_pseudo, misc_group, 1),
    (41, 13, "TDN", misc_pseudo, misc_group, 1),
    (41, 13, "Niveaustufe", misc_pseudo, misc_group, 1),
    (40, 13, "ISE I", misc_pseudo, misc_group, 1),
    (40, 13, "Trinity", org_pseudo, org_group, 1),
]:
    add_new_ne(insert_idx, sent_idx, n_word, n_pseudonym, n_entity_group,
               eval(email["sentences"])[sent_idx], occurrence, ne_list, ne_sent)

# second removal pass: indices stored under "r-c" for this row
ne_list, ne_sent = remove_incorrect_ne_indices(
    ne_list, ne_sent, incorrect_ne_indices[str(row_idx)]["r-c"]
)

# entities added after the second removal pass, same tuple layout as above
for insert_idx, sent_idx, n_word, n_pseudonym, n_entity_group, occurrence in [
    (10, 3, "Oficina de Relacións Internacionais", org_pseudo, org_group, 1),
    (4, 1, "alemana", misc_pseudo, misc_group, 1),
]:
    add_new_ne(insert_idx, sent_idx, n_word, n_pseudonym, n_entity_group,
               eval(email["sentences"])[sent_idx], occurrence, ne_list, ne_sent)

In [None]:
# update email in df
# NOTE(review): df.at is label-based while `email` was read with positional
# df.iloc; the two agree only for a default 0..n-1 index - confirm.
df.at[row_idx, "ne_list"] = ne_list
df.at[row_idx, "ne_sent"] = ne_sent

# get new pseudo content; the bare name on the last line shows it as cell output
updated_pseudo_content = get_new_pseudo_content(df, row_idx)
updated_pseudo_content

In [None]:
# update pseudo content in df with the value computed in the previous cell
df.at[row_idx, "pseudo_content"] = updated_pseudo_content

In [None]:
# other updates, if any
# replace the stored "detected_datetime" value of this row with a one-element list
df.at[row_idx, "detected_datetime"] = ["08 de enero de 2013 11:11"]

In [None]:
# double check df using data wrangler

In [None]:
# save updated df to csv: [[row_idx]] selects a one-row DataFrame (not a Series),
# and index=False omits the index column from the file
df.iloc[[row_idx]].to_csv(f"../../../data/eval_data_200_eml_idx{row_idx}.csv", index=False)

##### Row idx 17

In [None]:
# Pick the email at position row_idx and show it for manual review.
row_idx = 17
email = df.iloc[row_idx]  # positional lookup; `email` is read again by the following cells
check_email_lang(email["file_name"], email["lang"])
print_email(email)

In [None]:
# Parse the stringified "ne_list" and "ne_sent" values of this email.
# NOTE(review): eval() runs arbitrary expressions; presumably fine because the
# CSV is produced by this notebook, but do not reuse on untrusted files.
ne_list = eval(email["ne_list"])
ne_sent = eval(email["ne_sent"])

In [None]:
# corrections to existing ne_list entries, applied in the original review order
# entries that need both a new word and the misc group/pseudonym
for item_idx, new_word in [(6, "dcai"), (11, "Scientific Committee")]:
    update_ne_word(item_idx, new_word, ne_list)
    update_ne_group(item_idx, misc_group, misc_pseudo, ne_list)

item_idx, new_word = 26, "José M. Molina"
update_ne_word(item_idx, new_word, ne_list)

item_idx = 30
update_ne_group(item_idx, "PER", "Arlo", ne_list)

item_idx, new_word = 33, "Andre Ponce de Leon F. de Carvalho"
update_ne_word(item_idx, new_word, ne_list)
update_ne_group(item_idx, "PER", "Adri", ne_list)

for item_idx, new_word in [(35, "University of Sao Paulo"), (36, "Sao Carlos")]:
    update_ne_word(item_idx, new_word, ne_list)

item_idx = 39
update_ne_group(item_idx, "PER", "Marce", ne_list)

item_idx, new_word = 41, "dcai"
update_ne_word(item_idx, new_word, ne_list)
update_ne_group(item_idx, misc_group, misc_pseudo, ne_list)

# drop the indices stored under "r" for this row
ne_list, ne_sent = remove_incorrect_ne_indices(
    ne_list, ne_sent, incorrect_ne_indices[str(row_idx)]["r"]
)

# add new ne, if any

# continue removing, if any

In [None]:
# update email in df
# NOTE(review): df.at is label-based while `email` was read with positional
# df.iloc; the two agree only for a default 0..n-1 index - confirm.
df.at[row_idx, "ne_list"] = ne_list
df.at[row_idx, "ne_sent"] = ne_sent

# get new pseudo content; the bare name on the last line shows it as cell output
updated_pseudo_content = get_new_pseudo_content(df, row_idx)
updated_pseudo_content

In [None]:
# update pseudo content in df with the value computed in the previous cell
df.at[row_idx, "pseudo_content"] = updated_pseudo_content

In [None]:
# other updates, if any

In [None]:
# double check df using data wrangler

In [None]:
# save updated df to csv: [[row_idx]] selects a one-row DataFrame (not a Series),
# and index=False omits the index column from the file
df.iloc[[row_idx]].to_csv(f"../../../data/eval_data_200_eml_idx{row_idx}.csv", index=False)

##### Row idx 18

In [None]:
# Pick the email at position row_idx and show it for manual review.
row_idx = 18
email = df.iloc[row_idx]  # positional lookup; `email` is read again by the following cells
check_email_lang(email["file_name"], email["lang"])
print_email(email)

In [None]:
# Parse the stringified "ne_list" and "ne_sent" values of this email.
# NOTE(review): eval() runs arbitrary expressions; presumably fine because the
# CSV is produced by this notebook, but do not reuse on untrusted files.
ne_list = eval(email["ne_list"])
ne_sent = eval(email["ne_sent"])

In [None]:
# replace the word of an existing ne_list entry
item_idx, new_word = 9, "Scatter Search"
update_ne_word(item_idx, new_word, ne_list)

# no removal step for this row

# new entities; each tuple is
# (insert_idx, sent_idx, n_word, n_pseudonym, n_entity_group, occurrence)
for insert_idx, sent_idx, n_word, n_pseudonym, n_entity_group, occurrence in [
    (13, 12, "GDV", misc_pseudo, misc_group, 2),
    (13, 12, "GDV", misc_pseudo, misc_group, 1),
    (9, 11, "EMOA", misc_pseudo, misc_group, 1),
]:
    add_new_ne(insert_idx, sent_idx, n_word, n_pseudonym, n_entity_group,
               eval(email["sentences"])[sent_idx], occurrence, ne_list, ne_sent)

# continue removing, if any

In [None]:
# update email in df
# NOTE(review): df.at is label-based while `email` was read with positional
# df.iloc; the two agree only for a default 0..n-1 index - confirm.
df.at[row_idx, "ne_list"] = ne_list
df.at[row_idx, "ne_sent"] = ne_sent

# get new pseudo content; the bare name on the last line shows it as cell output
updated_pseudo_content = get_new_pseudo_content(df, row_idx)
updated_pseudo_content

In [None]:
# update pseudo content in df with the value computed in the previous cell
df.at[row_idx, "pseudo_content"] = updated_pseudo_content

In [None]:
# other updates, if any

In [None]:
# double check df using data wrangler

In [None]:
# save updated df to csv: [[row_idx]] selects a one-row DataFrame (not a Series),
# and index=False omits the index column from the file
df.iloc[[row_idx]].to_csv(f"../../../data/eval_data_200_eml_idx{row_idx}.csv", index=False)

##### Row idx 19

In [None]:
# Pick the email at position row_idx and show it for manual review.
row_idx = 19
email = df.iloc[row_idx]  # positional lookup; `email` is read again by the following cells
check_email_lang(email["file_name"], email["lang"])
print_email(email)

In [None]:
# Parse the stringified "ne_list" and "ne_sent" values of this email.
# NOTE(review): eval() runs arbitrary expressions; presumably fine because the
# CSV is produced by this notebook, but do not reuse on untrusted files.
ne_list = eval(email["ne_list"])
ne_sent = eval(email["ne_sent"])

In [None]:
# replace the word of an existing ne_list entry
item_idx, new_word = 3, "Escuela Superior de Ingeniería Informática"
update_ne_word(item_idx, new_word, ne_list)

# no removal step for this row

# add one person entity with an explicit pseudonym
insert_idx, sent_idx, occurrence = 3, 4, 1
n_word, n_pseudonym, n_entity_group = "Celso Campos", "José", "PER"
add_new_ne(insert_idx, sent_idx, n_word, n_pseudonym, n_entity_group,
           eval(email["sentences"])[sent_idx], occurrence, ne_list, ne_sent)

# continue removing, if any

In [None]:
# update email in df
# NOTE(review): df.at is label-based while `email` was read with positional
# df.iloc; the two agree only for a default 0..n-1 index - confirm.
df.at[row_idx, "ne_list"] = ne_list
df.at[row_idx, "ne_sent"] = ne_sent

# get new pseudo content; the bare name on the last line shows it as cell output
updated_pseudo_content = get_new_pseudo_content(df, row_idx)
updated_pseudo_content

In [None]:
# update pseudo content in df with the value computed in the previous cell
df.at[row_idx, "pseudo_content"] = updated_pseudo_content

In [None]:
# other updates, if any

In [None]:
# double check df using data wrangler

In [None]:
# save updated df to csv: [[row_idx]] selects a one-row DataFrame (not a Series),
# and index=False omits the index column from the file
df.iloc[[row_idx]].to_csv(f"../../../data/eval_data_200_eml_idx{row_idx}.csv", index=False)

##### Row idx 20

In [None]:
# Pick the email at position row_idx and show it for manual review.
row_idx = 20
email = df.iloc[row_idx]  # positional lookup; `email` is read again by the following cells
check_email_lang(email["file_name"], email["lang"])
print_email(email)

In [None]:
# Parse the stringified "ne_list" and "ne_sent" values of this email.
# NOTE(review): eval() runs arbitrary expressions; presumably fine because the
# CSV is produced by this notebook, but do not reuse on untrusted files.
ne_list = eval(email["ne_list"])
ne_sent = eval(email["ne_sent"])

In [None]:
# switch existing ne_list entries to the misc group/pseudonym
for item_idx in [1, 2]:
    update_ne_group(item_idx, misc_group, misc_pseudo, ne_list)

# no removal step for this row

# new entities; each tuple is
# (insert_idx, sent_idx, n_word, n_pseudonym, n_entity_group, occurrence)
for insert_idx, sent_idx, n_word, n_pseudonym, n_entity_group, occurrence in [
    (4, 6, "PDA", misc_pseudo, misc_group, 1),
    (1, 1, "POD", misc_pseudo, misc_group, 1),
    (1, 1, "PDA", misc_pseudo, misc_group, 1),
]:
    add_new_ne(insert_idx, sent_idx, n_word, n_pseudonym, n_entity_group,
               eval(email["sentences"])[sent_idx], occurrence, ne_list, ne_sent)

# continue removing, if any

In [None]:
# update email in df
# NOTE(review): df.at is label-based while `email` was read with positional
# df.iloc; the two agree only for a default 0..n-1 index - confirm.
df.at[row_idx, "ne_list"] = ne_list
df.at[row_idx, "ne_sent"] = ne_sent

# get new pseudo content; the bare name on the last line shows it as cell output
updated_pseudo_content = get_new_pseudo_content(df, row_idx)
updated_pseudo_content

In [None]:
# update pseudo content in df with the value computed in the previous cell
df.at[row_idx, "pseudo_content"] = updated_pseudo_content

In [None]:
# other updates, if any

In [None]:
# double check df using data wrangler

In [None]:
# save updated df to csv: [[row_idx]] selects a one-row DataFrame (not a Series),
# and index=False omits the index column from the file
df.iloc[[row_idx]].to_csv(f"../../../data/eval_data_200_eml_idx{row_idx}.csv", index=False)

##### Row idx 21

In [None]:
# Pick the email at position row_idx and show it for manual review.
row_idx = 21
email = df.iloc[row_idx]  # positional lookup; `email` is read again by the following cells
check_email_lang(email["file_name"], email["lang"])
print_email(email)

In [None]:
# Parse the stringified "ne_list" and "ne_sent" values of this email.
# NOTE(review): eval() runs arbitrary expressions; presumably fine because the
# CSV is produced by this notebook, but do not reuse on untrusted files.
ne_list = eval(email["ne_list"])
ne_sent = eval(email["ne_sent"])

In [None]:
# corrections to existing ne_list entries: one word, then one group/pseudonym
item_idx, new_word = 1, "Accessibles"
update_ne_word(item_idx, new_word, ne_list)

item_idx = 2
update_ne_group(item_idx, misc_group, misc_pseudo, ne_list)

# remove incorrect indices from ne_list and ne_sent of email

# add new ne, if any

# continue removing, if any

In [None]:
# update email in df
# NOTE(review): df.at is label-based while `email` was read with positional
# df.iloc; the two agree only for a default 0..n-1 index - confirm.
df.at[row_idx, "ne_list"] = ne_list
df.at[row_idx, "ne_sent"] = ne_sent

# get new pseudo content; the bare name on the last line shows it as cell output
updated_pseudo_content = get_new_pseudo_content(df, row_idx)
updated_pseudo_content

In [None]:
# update pseudo content in df with the value computed in the previous cell
df.at[row_idx, "pseudo_content"] = updated_pseudo_content

In [None]:
# other updates, if any

In [None]:
# double check df using data wrangler

In [None]:
# save updated df to csv: [[row_idx]] selects a one-row DataFrame (not a Series),
# and index=False omits the index column from the file
df.iloc[[row_idx]].to_csv(f"../../../data/eval_data_200_eml_idx{row_idx}.csv", index=False)

##### Row idx 22

In [None]:
# Pick the email at position row_idx and show it for manual review.
row_idx = 22
email = df.iloc[row_idx]  # positional lookup; `email` is read again by the following cells
check_email_lang(email["file_name"], email["lang"])
print_email(email)

In [None]:
# Parse the stringified "ne_list" and "ne_sent" values of this email.
# NOTE(review): eval() runs arbitrary expressions; presumably fine because the
# CSV is produced by this notebook, but do not reuse on untrusted files.
ne_list = eval(email["ne_list"])
ne_sent = eval(email["ne_sent"])

In [None]:
# corrections to existing ne_list entries, applied in the original review order
for item_idx in (4, 6):
    update_ne_group(item_idx, misc_group, misc_pseudo, ne_list)

for item_idx, new_word in [(9, "elsevier"), (10, "elsevier")]:
    update_ne_word(item_idx, new_word, ne_list)

# entries 24..28 all move to the org group/pseudonym
for item_idx in range(24, 29):
    update_ne_group(item_idx, org_group, org_pseudo, ne_list)

item_idx = 29
update_ne_group(item_idx, misc_group, misc_pseudo, ne_list)

item_idx, new_word = 31, "Information Sciences"
update_ne_word(item_idx, new_word, ne_list)

# drop the indices stored under "r" for this row
ne_list, ne_sent = remove_incorrect_ne_indices(
    ne_list, ne_sent, incorrect_ne_indices[str(row_idx)]["r"]
)

# add new ne, if any

# continue removing, if any

In [None]:
# update email in df
# NOTE(review): df.at is label-based while `email` was read with positional
# df.iloc; the two agree only for a default 0..n-1 index - confirm.
df.at[row_idx, "ne_list"] = ne_list
df.at[row_idx, "ne_sent"] = ne_sent

# get new pseudo content; the bare name on the last line shows it as cell output
updated_pseudo_content = get_new_pseudo_content(df, row_idx)
updated_pseudo_content

In [None]:
# update pseudo content in df with the value computed in the previous cell
df.at[row_idx, "pseudo_content"] = updated_pseudo_content

In [None]:
# other updates, if any

In [None]:
# double check df using data wrangler

In [None]:
# save updated df to csv: [[row_idx]] selects a one-row DataFrame (not a Series),
# and index=False omits the index column from the file
df.iloc[[row_idx]].to_csv(f"../../../data/eval_data_200_eml_idx{row_idx}.csv", index=False)

##### Row idx 23

In [None]:
# Pick the email at position row_idx and show it for manual review.
row_idx = 23
email = df.iloc[row_idx]  # positional lookup; `email` is read again by the following cells
check_email_lang(email["file_name"], email["lang"])
print_email(email)

In [None]:
# Parse the stringified "ne_list" and "ne_sent" values of this email.
# NOTE(review): eval() runs arbitrary expressions; presumably fine because the
# CSV is produced by this notebook, but do not reuse on untrusted files.
ne_list = eval(email["ne_list"])
ne_sent = eval(email["ne_sent"])

In [None]:
# corrections to existing ne_list entries, applied in the original review order
for item_idx, new_word in [
    (2, "José Ramón Méndez Reboredo"),
    (5, "Campus As Lagoas S/N"),
    (8, "OurenseOurenseEspaña Península"),
]:
    update_ne_word(item_idx, new_word, ne_list)

# this entry also gets an explicit person group and pseudonym
item_idx, new_word = 9, "José Ramón Méndez Reboredo"
update_ne_word(item_idx, new_word, ne_list)
update_ne_group(item_idx, "PER", "José", ne_list)

for item_idx, new_word in [
    (11, "Campus As Lagoas S/N"),
    (14, "OurenseOurenseEspaña Península"),
]:
    update_ne_word(item_idx, new_word, ne_list)

# drop the indices stored under "r" for this row
ne_list, ne_sent = remove_incorrect_ne_indices(
    ne_list, ne_sent, incorrect_ne_indices[str(row_idx)]["r"]
)

# add new ne, if any

# continue removing, if any

In [None]:
# update email in df
# NOTE(review): df.at is label-based while `email` was read with positional
# df.iloc; the two agree only for a default 0..n-1 index - confirm.
df.at[row_idx, "ne_list"] = ne_list
df.at[row_idx, "ne_sent"] = ne_sent

# get new pseudo content; the bare name on the last line shows it as cell output
updated_pseudo_content = get_new_pseudo_content(df, row_idx)
updated_pseudo_content

In [None]:
# update pseudo content in df with the value computed in the previous cell
df.at[row_idx, "pseudo_content"] = updated_pseudo_content

In [None]:
# other updates, if any

In [None]:
# double check df using data wrangler

In [None]:
# save updated df to csv: [[row_idx]] selects a one-row DataFrame (not a Series),
# and index=False omits the index column from the file
df.iloc[[row_idx]].to_csv(f"../../../data/eval_data_200_eml_idx{row_idx}.csv", index=False)

##### Row idx 24

In [None]:
# Pick the email at position row_idx and show it for manual review.
row_idx = 24
email = df.iloc[row_idx]  # positional lookup; `email` is read again by the following cells
check_email_lang(email["file_name"], email["lang"])
print_email(email)

In [None]:
# Parse the stringified "ne_list" and "ne_sent" values of this email.
# NOTE(review): eval() runs arbitrary expressions; presumably fine because the
# CSV is produced by this notebook, but do not reuse on untrusted files.
ne_list = eval(email["ne_list"])
ne_sent = eval(email["ne_sent"])

In [None]:
# corrections to existing ne_list entries, applied in the original review order
for item_idx, new_word in [(0, "elsevier"), (1, "elsevier")]:
    update_ne_word(item_idx, new_word, ne_list)

for item_idx in [3, 7]:
    update_ne_group(item_idx, misc_group, misc_pseudo, ne_list)

item_idx, new_word = 9, "Méndez, J.R."
update_ne_word(item_idx, new_word, ne_list)

# author names: new word plus an explicit person pseudonym
item_idx, new_word = 11, "Glez-Peña, D."
update_ne_word(item_idx, new_word, ne_list)
update_ne_group(item_idx, "PER", "Alex", ne_list)

item_idx, new_word = 13, "Fdez-Riverola, F."
update_ne_word(item_idx, new_word, ne_list)
update_ne_group(item_idx, "PER", "Ariel", ne_list)

item_idx, new_word = 15, "Díaz, F."
update_ne_word(item_idx, new_word, ne_list)
update_ne_group(item_idx, "PER", "Cruz", ne_list)

item_idx, new_word = 17, "Corchado, J.M."
update_ne_word(item_idx, new_word, ne_list)
update_ne_group(item_idx, "PER", "Fran", ne_list)

# word taken from the notebook-level repeated_words list
item_idx, new_word = 19, repeated_words[1]
update_ne_word(item_idx, new_word, ne_list)
update_ne_group(item_idx, misc_group, misc_pseudo, ne_list)

item_idx, new_word = 21, "elsevier"
update_ne_word(item_idx, new_word, ne_list)

item_idx = 22
update_ne_group(item_idx, "PER", "Angel", ne_list)

item_idx, new_word = 23, "Reboiro-Jato, M."
update_ne_word(item_idx, new_word, ne_list)
update_ne_group(item_idx, "PER", "Adri", ne_list)

item_idx, new_word = 25, "Díaz, F."
update_ne_word(item_idx, new_word, ne_list)
update_ne_group(item_idx, "PER", "Cruz", ne_list)

item_idx, new_word = 27, "Díaz, E."
update_ne_word(item_idx, new_word, ne_list)
update_ne_group(item_idx, "PER", "Mati", ne_list)

item_idx, new_word = 29, "Fdez-Riverola, F."
update_ne_word(item_idx, new_word, ne_list)
update_ne_group(item_idx, "PER", "Ariel", ne_list)

for item_idx, new_word in [
    (38, "Elsevier B.V."),
    (51, "elsevier"),
    (54, "elsevier"),
]:
    update_ne_word(item_idx, new_word, ne_list)

item_idx = 55
update_ne_group(item_idx, misc_group, misc_pseudo, ne_list)

for item_idx, new_word in [(57, "Scopus"), (59, "Elsevier B.V.")]:
    update_ne_word(item_idx, new_word, ne_list)

# drop the indices stored under "r" for this row
ne_list, ne_sent = remove_incorrect_ne_indices(
    ne_list, ne_sent, incorrect_ne_indices[str(row_idx)]["r"]
)

# add one entity between the two removal passes
insert_idx, sent_idx, occurrence = 35, 5, 1
n_word, n_pseudonym, n_entity_group = "CiteAlert", misc_pseudo, misc_group
add_new_ne(insert_idx, sent_idx, n_word, n_pseudonym, n_entity_group,
           eval(email["sentences"])[sent_idx], occurrence, ne_list, ne_sent)

# second removal pass: indices stored under "r-c" for this row
ne_list, ne_sent = remove_incorrect_ne_indices(
    ne_list, ne_sent, incorrect_ne_indices[str(row_idx)]["r-c"]
)

In [None]:
# update email in df
# NOTE(review): df.at is label-based while `email` was read with positional
# df.iloc; the two agree only for a default 0..n-1 index - confirm.
df.at[row_idx, "ne_list"] = ne_list
df.at[row_idx, "ne_sent"] = ne_sent

# get new pseudo content; the bare name on the last line shows it as cell output
updated_pseudo_content = get_new_pseudo_content(df, row_idx)
updated_pseudo_content

In [None]:
# update pseudo content in df with the value computed in the previous cell
df.at[row_idx, "pseudo_content"] = updated_pseudo_content

In [None]:
# other updates, if any

In [None]:
# double check df using data wrangler

In [None]:
# save updated df to csv: [[row_idx]] selects a one-row DataFrame (not a Series),
# and index=False omits the index column from the file
df.iloc[[row_idx]].to_csv(f"../../../data/eval_data_200_eml_idx{row_idx}.csv", index=False)

##### Row idx 25

In [None]:
# Pick the email at position row_idx and show it for manual review.
row_idx = 25
email = df.iloc[row_idx]  # positional lookup; `email` is read again by the following cells
check_email_lang(email["file_name"], email["lang"])
print_email(email)

In [None]:
# Parse the stringified "ne_list" and "ne_sent" values of this email.
# NOTE(review): eval() runs arbitrary expressions; presumably fine because the
# CSV is produced by this notebook, but do not reuse on untrusted files.
ne_list = eval(email["ne_list"])
ne_sent = eval(email["ne_sent"])

In [None]:
# corrections to existing ne_list entries, applied in the original review order
item_idx, new_word = 1, "SAI"
update_ne_word(item_idx, new_word, ne_list)
update_ne_group(item_idx, misc_group, misc_pseudo, ne_list)

item_idx = 2
update_ne_group(item_idx, misc_group, misc_pseudo, ne_list)

item_idx = 6
update_ne_group(item_idx, loc_group, loc_pseudo, ne_list)

# drop the indices stored under "r" for this row
ne_list, ne_sent = remove_incorrect_ne_indices(
    ne_list, ne_sent, incorrect_ne_indices[str(row_idx)]["r"]
)

# new entities; each tuple is
# (insert_idx, sent_idx, n_word, n_pseudonym, n_entity_group, occurrence)
for insert_idx, sent_idx, n_word, n_pseudonym, n_entity_group, occurrence in [
    (3, 1, "ciencias experimentais", loc_pseudo, loc_group, 1),
    (2, 0, "Ciencias del Mar y Bioloxía", org_pseudo, org_group, 1),
]:
    add_new_ne(insert_idx, sent_idx, n_word, n_pseudonym, n_entity_group,
               eval(email["sentences"])[sent_idx], occurrence, ne_list, ne_sent)


# continue removing, if any

In [None]:
# update email in df
# NOTE(review): df.at is label-based while `email` was read with positional
# df.iloc; the two agree only for a default 0..n-1 index - confirm.
df.at[row_idx, "ne_list"] = ne_list
df.at[row_idx, "ne_sent"] = ne_sent

# get new pseudo content; the bare name on the last line shows it as cell output
updated_pseudo_content = get_new_pseudo_content(df, row_idx)
updated_pseudo_content

In [None]:
# update pseudo content in df with the value computed in the previous cell
df.at[row_idx, "pseudo_content"] = updated_pseudo_content

In [None]:
# other updates, if any

In [None]:
# double check df using data wrangler

In [None]:
# save updated df to csv: [[row_idx]] selects a one-row DataFrame (not a Series),
# and index=False omits the index column from the file
df.iloc[[row_idx]].to_csv(f"../../../data/eval_data_200_eml_idx{row_idx}.csv", index=False)

##### Row idx 26

In [None]:
# Pick the email at position row_idx and show it for manual review.
row_idx = 26
email = df.iloc[row_idx]  # positional lookup; `email` is read again by the following cells
check_email_lang(email["file_name"], email["lang"])
print_email(email)

In [None]:
# Parse the stringified "ne_list" and "ne_sent" values of this email.
# NOTE(review): eval() runs arbitrary expressions; presumably fine because the
# CSV is produced by this notebook, but do not reuse on untrusted files.
ne_list = eval(email["ne_list"])
ne_sent = eval(email["ne_sent"])

In [None]:
# correct one existing ne_list entry: new word plus misc group/pseudonym
item_idx, new_word = 4, "Elsevier Editorial System"
update_ne_word(item_idx, new_word, ne_list)
update_ne_group(item_idx, misc_group, misc_pseudo, ne_list)

# first removal pass: indices stored under "r" for this row
ne_list, ne_sent = remove_incorrect_ne_indices(
    ne_list, ne_sent, incorrect_ne_indices[str(row_idx)]["r"]
)

# add one entity after the first removal pass
insert_idx, sent_idx, occurrence = 9, 15, 1
n_word, n_pseudonym, n_entity_group = "PDF", misc_pseudo, misc_group
add_new_ne(insert_idx, sent_idx, n_word, n_pseudonym, n_entity_group,
           eval(email["sentences"])[sent_idx], occurrence, ne_list, ne_sent)

# second removal pass: indices stored under "r-c"
ne_list, ne_sent = remove_incorrect_ne_indices(
    ne_list, ne_sent, incorrect_ne_indices[str(row_idx)]["r-c"]
)

# add one entity after the second removal pass
insert_idx, sent_idx, occurrence = 7, 6, 1
n_word, n_pseudonym, n_entity_group = "ees", misc_pseudo, misc_group
add_new_ne(insert_idx, sent_idx, n_word, n_pseudonym, n_entity_group,
           eval(email["sentences"])[sent_idx], occurrence, ne_list, ne_sent)

# third removal pass: indices stored under "r-c-c"
ne_list, ne_sent = remove_incorrect_ne_indices(
    ne_list, ne_sent, incorrect_ne_indices[str(row_idx)]["r-c-c"]
)

In [None]:
# update email in df
# NOTE(review): df.at is label-based while `email` was read with positional
# df.iloc; the two agree only for a default 0..n-1 index - confirm.
df.at[row_idx, "ne_list"] = ne_list
df.at[row_idx, "ne_sent"] = ne_sent

# get new pseudo content; the bare name on the last line shows it as cell output
updated_pseudo_content = get_new_pseudo_content(df, row_idx)
updated_pseudo_content

In [None]:
# update pseudo content in df with the value computed in the previous cell
df.at[row_idx, "pseudo_content"] = updated_pseudo_content

In [None]:
# other updates, if any

In [None]:
# double check df using data wrangler

In [None]:
# save updated df to csv: [[row_idx]] selects a one-row DataFrame (not a Series),
# and index=False omits the index column from the file
df.iloc[[row_idx]].to_csv(f"../../../data/eval_data_200_eml_idx{row_idx}.csv", index=False)

##### Row idx 27

In [None]:
# Pick the email at position row_idx and show it for manual review.
row_idx = 27
email = df.iloc[row_idx]  # positional lookup; `email` is read again by the following cells
check_email_lang(email["file_name"], email["lang"])
print_email(email)

In [None]:
# Parse the stringified "ne_list" and "ne_sent" values of this email.
# NOTE(review): eval() runs arbitrary expressions; presumably fine because the
# CSV is produced by this notebook, but do not reuse on untrusted files.
ne_list = eval(email["ne_list"])
ne_sent = eval(email["ne_sent"])

In [None]:
# update incorrect ne

# remove incorrect indices from ne_list and ne_sent of email

# add new ne, if any

# continue removing, if any

In [None]:
# update email in df
# NOTE(review): df.at is label-based while `email` was read with positional
# df.iloc; the two agree only for a default 0..n-1 index - confirm.
df.at[row_idx, "ne_list"] = ne_list
df.at[row_idx, "ne_sent"] = ne_sent

# get new pseudo content; the bare name on the last line shows it as cell output
updated_pseudo_content = get_new_pseudo_content(df, row_idx)
updated_pseudo_content

In [None]:
# update pseudo content in df with the value computed in the previous cell
df.at[row_idx, "pseudo_content"] = updated_pseudo_content

In [None]:
# other updates, if any

In [None]:
# double check df using data wrangler

In [None]:
# save updated df to csv: [[row_idx]] selects a one-row DataFrame (not a Series),
# and index=False omits the index column from the file
df.iloc[[row_idx]].to_csv(f"../../../data/eval_data_200_eml_idx{row_idx}.csv", index=False)

##### Row idx 28

In [None]:
# Pick the email at position row_idx and show it for manual review.
row_idx = 28
email = df.iloc[row_idx]  # positional lookup; `email` is read again by the following cells
check_email_lang(email["file_name"], email["lang"])
print_email(email)

In [None]:
# Parse the stringified "ne_list" and "ne_sent" values of this email.
# NOTE(review): eval() runs arbitrary expressions; presumably fine because the
# CSV is produced by this notebook, but do not reuse on untrusted files.
ne_list = eval(email["ne_list"])
ne_sent = eval(email["ne_sent"])

In [None]:
# corrections to existing ne_list entries, applied in the original review order
for item_idx, new_word in [
    (2, "Lei Orgánica"),
    (3, "Protección de datos de carácter persoal"),
    (8, "Axencia de Protección de Datos"),
]:
    update_ne_word(item_idx, new_word, ne_list)

item_idx = 11
update_ne_group(item_idx, loc_group, loc_pseudo, ne_list)

item_idx, new_word = 12, "Torre de Cristal de A Coruña"
update_ne_word(item_idx, new_word, ne_list)
update_ne_group(item_idx, loc_group, loc_pseudo, ne_list)

# drop the indices stored under "r" for this row
ne_list, ne_sent = remove_incorrect_ne_indices(
    ne_list, ne_sent, incorrect_ne_indices[str(row_idx)]["r"]
)

# add new ne, if any

# continue removing, if any

In [None]:
# update email in df
# NOTE(review): df.at is label-based while `email` was read with positional
# df.iloc; the two agree only for a default 0..n-1 index - confirm.
df.at[row_idx, "ne_list"] = ne_list
df.at[row_idx, "ne_sent"] = ne_sent

# get new pseudo content; the bare name on the last line shows it as cell output
updated_pseudo_content = get_new_pseudo_content(df, row_idx)
updated_pseudo_content

In [None]:
# update pseudo content in df with the value computed in the previous cell
df.at[row_idx, "pseudo_content"] = updated_pseudo_content

In [None]:
# other updates, if any

In [None]:
# double check df using data wrangler

In [None]:
# save updated df to csv: [[row_idx]] selects a one-row DataFrame (not a Series),
# and index=False omits the index column from the file
df.iloc[[row_idx]].to_csv(f"../../../data/eval_data_200_eml_idx{row_idx}.csv", index=False)

##### Row idx 29

In [None]:
# Pick the email at position row_idx and show it for manual review.
row_idx = 29
email = df.iloc[row_idx]  # positional lookup; `email` is read again by the following cells
check_email_lang(email["file_name"], email["lang"])
print_email(email)

In [None]:
# Parse the stringified "ne_list" and "ne_sent" values of this email.
# NOTE(review): eval() runs arbitrary expressions; presumably fine because the
# CSV is produced by this notebook, but do not reuse on untrusted files.
ne_list = eval(email["ne_list"])
ne_sent = eval(email["ne_sent"])

In [None]:
# corrections to existing ne_list entries, applied in the original review order
item_idx, new_word = 1, "Cristina Costas Varela"
update_ne_word(item_idx, new_word, ne_list)

item_idx, new_word = 3, "CACTI"
update_ne_word(item_idx, new_word, ne_list)
update_ne_group(item_idx, loc_group, loc_pseudo, ne_list)

# no removal step for this row

# add one organisation entity
insert_idx, sent_idx, occurrence = 4, 2, 1
n_word, n_pseudonym, n_entity_group = "Universidade de Vigo", org_pseudo, org_group
add_new_ne(insert_idx, sent_idx, n_word, n_pseudonym, n_entity_group,
           eval(email["sentences"])[sent_idx], occurrence, ne_list, ne_sent)

# continue removing, if any

In [None]:
# update email in df
# NOTE(review): df.at is label-based while `email` was read with positional
# df.iloc; the two agree only for a default 0..n-1 index - confirm.
df.at[row_idx, "ne_list"] = ne_list
df.at[row_idx, "ne_sent"] = ne_sent

# get new pseudo content; the bare name on the last line shows it as cell output
updated_pseudo_content = get_new_pseudo_content(df, row_idx)
updated_pseudo_content

In [None]:
# update pseudo content in df with the value computed in the previous cell
df.at[row_idx, "pseudo_content"] = updated_pseudo_content

In [None]:
# other updates, if any

In [None]:
# double check df using data wrangler

In [None]:
# save updated df to csv: [[row_idx]] selects a one-row DataFrame (not a Series),
# and index=False omits the index column from the file
df.iloc[[row_idx]].to_csv(f"../../../data/eval_data_200_eml_idx{row_idx}.csv", index=False)

##### Create final CSV file

In [None]:
from pathlib import Path
import pandas as pd

# Collect the per-email CSV files (one per index in chosen_files) and stack
# them, in index order, into a single checked evaluation file.
csv_folder = Path("../../../data/")
csv_files = [
    csv_folder / f"eval_data_200_eml_idx{i}.csv"
    for i in range(len(chosen_files))
]
df_list = list(map(pd.read_csv, csv_files))
checked_df = pd.concat(df_list, ignore_index=True)

# the combined file is written into the same folder as the per-email files
checked_df.to_csv(
    csv_folder / f"checked_eval_data_200_eml_{len(chosen_files)}_emails.csv",
    index=False,
)