In [1]:
import mailcom.inout
import mailcom.parse
import pandas as pd
import time
import datetime

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# create t0 timestamp
t0 = time.time()

In [3]:
# import files from csv file
email_list = pd.read_csv("../mailcom/test/data/mails_lb_sg.csv")
print(email_list)

# create pseudonymization object
ps = mailcom.parse.Pseudonymize()
ps.init_spacy("fr")
ps.init_transformers()
# time stamp after model loading
t_model_loaded = time.time()

     Unnamed: 0                                            message
0           242  Von meinem iPhone gesendet Anfang der weiterge...
1           243  Von meinem iPhone gesendet Anfang der weiterge...
2           244  Von meinem iPhone gesendet Anfang der weiterge...
3           245  Von meinem iPhone gesendet Anfang der weiterge...
4           246  Von meinem iPhone gesendet Anfang der weiterge...
..          ...                                                ...
98         1313  \nVon: Mélissa des Presses de l'Université Lav...
99         1314  Von: Librairie Classiques Garnier &amp;lt;libr...
100        1315  La langue s'enrichit #36 - FranceTerme\nProf. ...
101        1316  Activités de juin - Presses de l'Université La...
102        1317  Nouveautés de juin\nProf. Dr. Sybille Große I ...

[103 rows x 2 columns]


Some weights of the model checkpoint at xlm-roberta-large-finetuned-conll03-english were not used when initializing XLMRobertaForTokenClassification: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing XLMRobertaForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLMRobertaForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [4]:
# loop over mails and pseudonymize them
out_list = []
ts_list = []
for idx, row in email_list.iterrows():
    text = row["message"]
    email_dict = {"content": text}
    if not text:
        continue
    # Test functionality of Pseudonymize class
    output_text = ps.pseudonymize(text)
    email_dict["pseudo_content"] = output_text
    out_list.append(email_dict)

    # timestamp after this email
    ts_list.append(time.time())

In [5]:
# write output to pandas df
df = pd.DataFrame(out_list)
print(df)

                                               content  \
0    Von meinem iPhone gesendet Anfang der weiterge...   
1    Von meinem iPhone gesendet Anfang der weiterge...   
2    Von meinem iPhone gesendet Anfang der weiterge...   
3    Von meinem iPhone gesendet Anfang der weiterge...   
4    Von meinem iPhone gesendet Anfang der weiterge...   
..                                                 ...   
98   \nVon: Mélissa des Presses de l'Université Lav...   
99   Von: Librairie Classiques Garnier &amp;lt;libr...   
100  La langue s'enrichit #36 - FranceTerme\nProf. ...   
101  Activités de juin - Presses de l'Université La...   
102  Nouveautés de juin\nProf. Dr. Sybille Große I ...   

                                        pseudo_content  
0    Von meinem [misc] gesendet Anfang der weiterge...  
1    Von meinem [misc] gesendet Anfang der weiterge...  
2    Von meinem [misc] gesendet Anfang der weiterge...  
3    Von meinem [misc] gesendet Anfang der weiterge...  
4    Von meinem [m

In [13]:
# print timestamps
print("Time from start to model loaded:", (datetime.datetime.fromtimestamp(t_model_loaded - t0).strftime('%S')), "s")
# time differences between emails
ts_diffs = []
for i in range(0, len(ts_list)):
    if i == 0:
        ts_diff = (ts_list[i] - t_model_loaded)
    else:
        ts_diff = (ts_list[i] - ts_list[i-1])
    ts_diffs.append(ts_diff)
    print("Time needed for email", i, ":", (datetime.datetime.fromtimestamp(ts_diff).strftime('%S')), "s")
print("Total time:", (datetime.datetime.fromtimestamp(ts_list[len(ts_list)-1] - t_model_loaded).strftime('%M:%S')))

Time from start to model loaded: 08 s
Time needed for email 0 : 06 s
Time needed for email 1 : 08 s
Time needed for email 2 : 05 s
Time needed for email 3 : 09 s
Time needed for email 4 : 06 s
Time needed for email 5 : 13 s
Time needed for email 6 : 08 s
Time needed for email 7 : 06 s
Time needed for email 8 : 14 s
Time needed for email 9 : 08 s
Time needed for email 10 : 09 s
Time needed for email 11 : 09 s
Time needed for email 12 : 12 s
Time needed for email 13 : 11 s
Time needed for email 14 : 08 s
Time needed for email 15 : 03 s
Time needed for email 16 : 01 s
Time needed for email 17 : 18 s
Time needed for email 18 : 07 s
Time needed for email 19 : 03 s
Time needed for email 20 : 27 s
Time needed for email 21 : 05 s
Time needed for email 22 : 04 s
Time needed for email 23 : 07 s
Time needed for email 24 : 05 s
Time needed for email 25 : 05 s
Time needed for email 26 : 05 s
Time needed for email 27 : 01 s
Time needed for email 28 : 03 s
Time needed for email 29 : 04 s
Time needed 