# Build Email Corpus UNLABELED for AI Training

Author:     Andreas Barth, SF6-S-OG  
Version:    1.0, 3.9.2020  

Platform:   HSDAP  

Purpose:    Script to consolidate numerous Dataframes to a Dataframe Email-Corpus with unlabeled Email-Data.
            Preprocessing: Decoding, Blacklist-Filtering, Data-Cleansing/Anonymisation

Data:       Emails from BMW Bank Outlook Mailserver bmw.bank@bmw.de, persisted in pickled dataframes on HSDAP

Output:     Consolidated & cleansed Email-Corpus, persisted in pickled Dataframe for further model training
  
Steps:  

0. Imports & Functions  
1. Konsolidierten Dataframe erstellen
2. Enkodierung ermitteln und Dekodieren
3. Blacklisting-Filter
4. Data Cleaning / Anonymisierung


In [1]:
import os

# ==================== BMW-Bank Funktionen ============================
workDir = os.getcwd()
os.chdir('/home/q506010/0_Packages')
from BmwBankTools.downloadTools import * 
from BmwBankTools.cleanEmails import * 
os.chdir(workDir)

# ==================  Settings  =====================================
from IPython.core.interactiveshell import InteractiveShell
# Render matplotlib figures inline in the notebook.
%matplotlib inline
# NOTE(review): pd, np, plt, sys and spacy are not imported explicitly in this
# cell; presumably they arrive via the BmwBankTools wildcard imports above -- confirm.
pd.set_option('display.max_colwidth', 200)
# Seed NumPy's global random generator.
np.random.seed(4711)
# Display the value of every top-level expression of a cell, not only the last one.
InteractiveShell.ast_node_interactivity = "all"
plt.style.use('ggplot')
%colors LightBG
# Separator line consisting of 150 dashes.
LINIE = "-"*150
# ==================  Print to Console ==============================
print(sys.version, sys.getdefaultencoding())
print("Spacy Version:", spacy.__version__)
os.getcwd()

3.7.7 (default, May  7 2020, 21:25:33) 
[GCC 7.3.0] utf-8
Spacy Version: 2.3.1


'/home/q506010'

***
### Execute

### 1. Konsolidierten Dataframe erstellen

+ Alle Dataframes einzeln laden und mit df.append zusammenfassen
    + KW27: 5.7.2020; 5079 Records mit 16 Spalten // Keine validen Files-Informationen, keine SpecFiles Spalten
    + KW28: 12.07.2020; 4929 Records mit 18 Spalten
    + KW29: 16.07.2020; 603 Records mit 18 Spalten // Ab dieser Woche Umstellung auf tägl. Reset der Mailbox, daher "nur" 603 Datensätze für KW29 
    + Ab 20.7. tägliches Clearing des "Importiert Ordners" auf bmw.bank@bmw.de

In [None]:
# Consolidate all pickled mail containers of 2020 into a single dataframe
# and drop records whose MESSAGE_ID occurs more than once.
dataDir = '/home/q506010/1_ReadEmails'
os.chdir(dataDir)

# This container is loaded first so that its rows lead the consolidated frame.
BASE_FILE = 'Mailcontainer_05-07-2020.pkl'

# Remaining container files in sorted file-name order. endswith() keeps out
# files that merely contain ".pkl" somewhere in their name (e.g. "*.pkl.bak").
FL = sorted(f for f in os.listdir()
            if f.endswith(".pkl") and "Mailcontainer_" in f and "-2020" in f
            and f != BASE_FILE)

df = pd.read_pickle(BASE_FILE)
df.shape

# Collect the individual frames and concatenate them once at the end.
# DataFrame.append in a loop copies the growing frame on every iteration and
# is deprecated in newer pandas versions.
frames = [df]
total = df.shape[0]
for f in FL:
    tmp = pd.read_pickle(f)
    frames.append(tmp)
    total += tmp.shape[0]
    print(f"Adding {tmp.shape[0]} records from {f[-14:]}. Total = {total}")

# sort=False keeps the columns in order of first appearance (as append did).
df = pd.concat(frames, sort=False)

df.shape
doubles = df.duplicated("MESSAGE_ID").sum()
print(f"Dubletten: {doubles}")
df.drop_duplicates("MESSAGE_ID", inplace=True)
df.shape; df.info()

In [None]:
# Persist df as a pickle file (relative path, i.e. in the current working directory).
df.to_pickle("EmailCorpus_UL_2909.pkl")

***
### 2. MC_ALL PreProcessing

2.0 MC_ALL Raw-Format laden (.pkl) & redundante Spalten entfernen (MID, TYPE, TO, CC, NBR_FILES, FILES, UMID, TMSTMP, CTE)

2.1 Irrelevante Datensätze entfernen / BLACKLISTING FILTER

2.2 Email-Body DECODING
+ Encoding Detection für alle Emails mit ENCODING == "unknown"
+ Update der ENCODING-Spalte mit der Information aus der Encoding Detection
+ Decoding des Email-Bodys anhand der Encoding-Info aus der Spalte ENCODING => Neue Spalte BODY_DC

2.3 Email-Cleansing
+ Replace: Anrede und Gruß/Abschied ersetzen
+ Replace: .replace("\r\n", " ")
+ Replace: .replace("\t", " ")



#### 2.0 Email-Corpus im Raw-Format laden (.pkl) & redundante Spalten entfernen
Irrelevante Spalten: (MID, TYPE, TO, CC, NBR_FILES, FILES, UMID, TMSTMP, CTE)

In [None]:
# List files in the current working directory whose name contains both "EmailCorpus_" and "pkl".
[f for f in os.listdir() if "EmailCorpus_" in f and "pkl" in f]

In [None]:
# Load the raw corpus, move the old index into a column named "index" and
# remove that column together with the other columns not needed downstream.
corpus = "EmailCorpus_UL_2909.pkl"
DropCols = ["index","MID", "TYPE", "TO", "CC", "NBR_FILES", "FILES", "UMID", "TMSTMP", "CTE"]

df = (
    pd.read_pickle(corpus)
    .reset_index()
    .drop(columns=DropCols)
)
df.shape
df.head()

#### 2.1 Irrelevante Datensätze entfernen
+ Blacklist Filter
+ Sonstige Absender ausfiltern
+ Sonstige irrelevante Einträge ausfiltern (best. Betreffzeilen)


In [None]:
# 2.1 (a) BLACKLIST FILTER
# Flag every record whose FROM value is reported by checkBlacklist() and keep
# only the records that are not flagged.

df.shape
BLACKLIST = loadBlacklist()

filter_BL = df.FROM.apply(lambda x: checkBlacklist(x, BLACKLIST))
print(f"Blacklistcheck hat {filter_BL.sum()} kritische Absender ermittelt")

# Compared against False on purpose: the return type of checkBlacklist() is
# not visible in this notebook.
df = df[filter_BL == False]
print(f"{filter_BL.sum()} Kritische Absender entfernt, {filter_BL.mean():.1%}")
df.shape

In [None]:
# 2.1 (b) Filter out further senders by exact match of the FROM value
#
# meinfs@bmw.de             => online form
# Empfangbank@bmw.de        => scanned incoming fax as attachment
# PkwPartnerWebsite@bmw.de  => notification of a change of address
# werkstatt@11880.com       => website-generated request, body not readable
# SF6-RPA-PROD@bmw.de       => generated by the BMW Bank robot
# no-reply@bmw.com          => user request data from the BMW website
# rsv_vertrag@bmw.de        => RSV cancellations or revocations from Credit Life
# FSExtranet@bmw.de         => request for a settlement offer ("Ablöseangebot Fremdbestand")
# noreply-bmw.bank@bmw.de   => automatic reply from BMW Bank ("Ihre Email ist eingegangen, ...")
# kundenbetreuung@bmw.de

notRelevantSenders = {
    'meinfs@bmw.de',
    'NoReply Kundenportal Mein BMW Financial Services <meinfs@bmw.de>',
    'Empfangbank@bmw.de',
    'Empfangbank <Empfangbank@bmw.de>',
    '<bmw.bank@bmw.de>',
    'Mail Delivery System <noreply@ces.cisco.com>',
    'Microsoft Outlook\r\n\t<MicrosoftExchange329e71ec88ae4615bbc36ab6ce41109e@bmwmail.corp>',
    'PkwPartnerWebsite@bmw.de',
    '<PkwPartnerWebsite@bmw.de>',
    'werkstatt@11880.com',
    'SF6-RPA-PROD@bmw.de',
    'SF6-RPA-PROD <SF6-RPA-PROD@bmw.de>',
    'no-reply@bmw.de',
    'noreply-bmw.bank@bmw.de',
    '"noreply-bmw.bank@bmw.de" <noreply-bmw.bank@bmw.de>',
    'BMW Website <no-reply@bmw.com>',
    'rsv_vertrag@bmw.de',
    'rsv_vertrag <rsv_vertrag@bmw.de>',
    '"Tagesspiegel Morgenlage Politik" <politik@morgenlage.tagesspiegel.de>',
    'FSExtranet@bmw.de',
    'kundenbetreuung@bmw.de',
    'BMW Kundenbetreuung <kundenbetreuung@bmw.de>',
}

# isin() yields a boolean mask, so it can be negated directly.
dropRows = df.FROM.isin(notRelevantSenders)
dropRows.sum()
df = df.loc[~dropRows, :]
df.shape

In [None]:
# 2.1 (c) Filter out emails with irrelevant subject lines.
# Boolean masks are used instead of a set of index labels: a set is not a
# valid .loc indexer in newer pandas versions, has no defined order (so the
# row order was not guaranteed) and assumes unique index labels.
filter_1 = df.SUBJECT == "Kontaktanfrage Internet Auftritt Banking";    filter_1.sum()
filter_2 = df.SUBJECT == "Eingang einer Kontaktanfrage";                filter_2.sum()
in_scope = ~(filter_1 | filter_2)

df = df.loc[in_scope, :]
df.shape

In [None]:
# Keep an independent copy of the current df under the name dfSIK.
dfSIK = df.copy()

#### 2.2 Dekodierung
+ Encoding ermitteln
+ Dekodierung des Email-Body

In [None]:
# 2.2 (a) Encoding detection
# Runs detectEncodingDF() on the whole frame and keeps a copy of the result
# in dfSIK2.
print(f"Ermittle Encoding für {len(df)} Datensätze ...")
df = detectEncodingDF(df)
df.shape
dfSIK2 = df.copy()
print("... finished")

In [None]:
# Persist the copy taken after encoding detection (dfSIK2) as a pickle file.
filename = "EmailCorpus_UL_2909_ENC.pkl"
dfSIK2.to_pickle(filename)

In [None]:
# Re-entry point: change into the data directory and reload the pickle
# written after encoding detection.
dataDir = '/home/q506010/1_ReadEmails'
os.chdir(dataDir)
filename = "EmailCorpus_UL_2909_ENC.pkl"
df = pd.read_pickle(filename)

In [None]:
# The 20 most frequent FROM values with their counts.
df.FROM.value_counts().head(20)

In [None]:
# Frequency tables of the ENCODING and ENCPROPOSAL columns (NaN included) and
# the number of records where both agree, ignoring upper/lower case.
df.shape
df.ENCODING.value_counts(dropna=False)
df.ENCPROPOSAL.value_counts(dropna=False)
df.ENCPROPOSAL.str.upper().eq(df.ENCODING.str.upper()).sum()

In [None]:
# 2.2 (b) Decoding
# Pass the frame through decodeEmailCorpus() and show the resulting shape.
# df = pd.read_pickle("EmailCorpus_EncDetected.pkl")
df = decodeEmailCorpus(df)
df.shape

In [None]:
# Show five randomly chosen values of the BODY_DC column.
df.BODY_DC.sample(5)

In [None]:
# Copy the current df and persist the copy as a pickle file.
dfSIK2 = df.copy()
fn = "EmailCorpus_UL_2909_DECODED.pkl"
dfSIK2.to_pickle(fn)

### 3. Anonymisierung

In [2]:
# Reload the decoded corpus, keep eight columns and give them new names.
dataDir = '/home/q506010/1_ReadEmails'
os.chdir(dataDir)
fn = "EmailCorpus_UL_2909_DECODED.pkl"

df = pd.read_pickle(fn)
# NOTE(review): the columns are selected by position, which depends on the
# column order of the pickled frame. Position 12 (renamed to BODY) is
# presumably the decoded body column BODY_DC -- confirm before reuse.
df = df.iloc[:,[1, 2, 3, 4, 12, 6, 7, 8 ]].copy()
df.columns = ['DATE', 'TIME', 'FROM', 'SUBJECT', 'BODY', 'MESSAGE_ID', 'SPECFILES', 'NBR_SPECFILES']

# Optional sanity checks (missing values and dtypes), kept disabled:
# df.SUBJECT.isna().sum()
# df.SUBJECT.dtype
# df.BODY.isna().sum()
# df.BODY.dtype

In [None]:
# Summary statistics of the number of whitespace-separated tokens per BODY.
df.BODY.str.split().str.len().describe(percentiles=[.9, .95, .99])

In [3]:
# Cap every email body at max_BodyLength characters and show the resulting
# length distribution.
max_BodyLength = 10_000
# Vectorized slice: unlike apply(lambda txt: txt[:n]) it does not raise on
# missing (NaN) bodies, which simply stay NaN.
df["BODY"] = df.BODY.str.slice(stop=max_BodyLength)
df.BODY.str.len().describe(percentiles=[.9, .95, .99])

count    28997.000000
mean      2105.188088
std       2404.611110
min          0.000000
50%       1223.000000
90%       5663.000000
95%       7837.200000
99%      10000.000000
max      10000.000000
Name: BODY, dtype: float64

In [4]:
# Anonymisation pipeline; the elapsed wall-clock time is printed at the end.
start = dt.datetime.now()

df = CleanREPLACE(df)

# First the regex anonymizer, then the spaCy anonymizer; each is applied to
# the subject and afterwards to the cleaned body.
for cleaner in (CleanRGX, CleanNER):
    df["SUBJECT"] = df["SUBJECT"].apply(cleaner)
    df["BODY_CLEAN"] = df["BODY_CLEAN"].apply(cleaner)

print(dt.datetime.now()-start)



0:22:39.121914


In [5]:
# Persist the current df as a pickle file.
fn = "EmailCorpus_UL_2909_ANONYMIZED.pkl"
df.to_pickle(fn)

In [None]:
# Summary statistics of the BODY length in characters.
# NOTE(review): df2 is only created in the slicing cell further down; when the
# notebook is run top to bottom this cell raises a NameError -- confirm
# whether df was meant here.
df2.BODY.str.len().describe(percentiles=[.8, .9, .95, .99])

### Work on Sliced Dataframe for memory efficiency

In [None]:
# Load the temporary corpus, keep only the first slice1 rows as df2 and
# release the full frame again.
fn = "EmailCorpus_UL_2909_TEMP1.pkl"
dataDir = '/home/q506010/1_ReadEmails'
os.chdir(dataDir)

slice1, slice2 = 10_000, 20_000

df = pd.read_pickle(fn)
df2 = df.head(slice1).copy()
df2.info()

del df

In [None]:

# Run the spaCy anonymizer on the cleaned bodies of the sliced frame and
# measure how long it takes.
# The slicing cell before this one deletes df and keeps only df2, so this
# cell works on df2; start is set here so that duration covers this run only.
start = dt.datetime.now()

df2["BODY_CLEAN"] = df2.BODY_CLEAN.apply(CleanNER)

duration = dt.datetime.now()-start
df2.info()

In [None]:
# Number of records whose SUBJECT equals the marker "ANONYMIZATION FAILED".
# NOTE(review): the slicing cell above deletes df and keeps df2 -- confirm
# which frame is meant here.
df.loc[df.SUBJECT=="ANONYMIZATION FAILED", :].shape[0]

# Parkplatz

In [None]:
# Average lengths of subject and body. .str.len() is used for the word counts
# as well: len() applied via apply() raises a TypeError on missing (NaN)
# values, whereas .str.len() yields NaN, which mean() skips.
df.SUBJECT.str.len().mean()               # average number of characters in the email subject
df.SUBJECT.str.split().str.len().mean()   # average number of words in the email subject

df.BODY.str.len().mean()                  # average number of characters in the email body
df.BODY.str.split().str.len().mean()      # average number of words in the email body

In [None]:
# Show five random records whose FROM value contains the given sender string.
searchstring = 'Microsoft Outlook\r\n\t<MicrosoftExchange329e71ec88ae4615bbc36ab6ce41109e@bmwmail.corp>'
# regex=False: the search string is a literal, its "." must not act as a wildcard.
# na=False:    missing FROM values count as "no match" instead of producing
#              NaN entries, which cannot be used as a boolean mask.
df.loc[df.FROM.str.contains(searchstring, regex=False, na=False),["FROM","SUBJECT","BODY"]].sample(5)

In [None]:
# Limit displayed column width to 200 characters and show 50 random BODY_DC values.
pd.set_option('display.max_colwidth', 200)
df.BODY_DC.sample(50)

In [None]:
# NOTE(review): MC_ALL is not created anywhere in the visible notebook;
# presumably it stems from an earlier session -- confirm before running.
saveMAILCONTAINER_2disk(MC_ALL, filename="MC_ALL_INCL_KW29_" )

In [None]:
# Show three random BODY values of MC1 (MC1 is not defined in the visible notebook).
MC1.BODY.sample(3)

In [None]:
# Store the result of detectEncodingTXT() for every body in the column ENCTMP.
MC1["ENCTMP"] = MC1.BODY.apply(detectEncodingTXT)

In [None]:
# Run the encoding detection on a copy so that MC itself is not passed in.
MC2 = detectEncodingDF(MC.copy())

In [None]:
# Flag senders by mail domain (case-insensitive substring match on FROM).
# A comma was missing between "telekom.de" and "arcor.de"; Python joined the
# two literals into "telekom.dearcor.de", so neither domain was ever matched.
privDom = ["eplus.de", "kabelmail.de", "aol.com", "aol.de", "freenet.de", "t-online.de", "telekom.de", "arcor.de", "kabel.de", "ionos.de", "gmail.de", "gmail.com",
           "icloud.de", "icloud.com", "gmx.de", "gmx.com", "web.de", "yahoo.de", "yahoo.com", "outlook.de", "outlook.com", "mail.de"]

filter_privDom = MC.FROM.apply(lambda sender: any(dom in str.lower(sender) for dom in privDom))

bmwDom = ["bmw.de", "partner.bmw.de"]
filter_bmwDom = MC.FROM.apply(lambda sender: any(dom in str.lower(sender) for dom in bmwDom))

# Number of records per domain group, then the 20 most frequent BMW senders.
MC[filter_privDom].shape
MC[filter_bmwDom].shape

MC.FROM[filter_bmwDom].value_counts()[:20]

In [None]:
# Count the records whose FROM value contains "meinfs@bmw.de".
MC.FROM.apply(lambda sender: "meinfs@bmw.de" in sender).sum()

In [None]:
# Shape of the FROM entries that contain "meinfs@bmw.de".
# The original line was syntactically incomplete (unfinished any(...) call and
# lambda); it is completed as a plain substring test.
MC.FROM[MC.FROM.apply(lambda sender: "meinfs@bmw.de" in sender)].shape

In [None]:
# Display PUNCT from spaCy's German punctuation module (LIST_PUNCT is imported but not used here).
from spacy.lang.de.punctuation import PUNCT, LIST_PUNCT
punct = PUNCT
punct