# READ KOFAX EXPORT DATA (DREAM)

Author:   Andreas Barth, SF6-S-OG  
Version:  4, 12.10.2020  

Script to read the email information (doctypes, body, subject, batchID, pagecount, etc.) out of the Export Scripts (file structure) provided by the KOFAX export.

HSDAP ONLY! ... (Raw Data is not anonymized yet)


Steps:
  
0. Imports & Functions
1. Read RawData into Dataframe
2. Transform RawData: Blacklist Check, Cleaning, Anonymization & Preprocessing
3. Save Cleaned Data to Disk (Pickle)


### 0. Imports & Functions

In [None]:
import pandas as pd
import numpy as np
import datetime as dt
import os, re, string, sys
from IPython.core.interactiveshell import InteractiveShell
import seaborn as sns
import matplotlib
from matplotlib import pyplot as plt
from tqdm import tqdm

# ==================== BMW-Bank Funktionen ============================
workDir = os.getcwd()
os.chdir('/mnt/hsdapnas01/shared/sf6/0_Tools')
from BmwBankTools.cleanEmails import * 
os.chdir(workDir)

# ==========================================================
%matplotlib inline
pd.set_option('display.max_colwidth', 500)
np.random.seed(4711)
InteractiveShell.ast_node_interactivity = "all"
plt.style.use('ggplot')
#===========================================================
print(sys.version, sys.getdefaultencoding())
os.getcwd()

***
### 1. Read KOFAX exported data from file structure on disk (HSDAP)
#### 1.1 Create Listing with filepath of each file to be read into system

In [None]:
# exportDir = '/mnt/hsdapnas01/shared/sf6/1_TrainingData/'
# Root folder holding the raw KOFAX exports.
dataDir = '/mnt/hsdapnas01/shared/sf6/1_RawData/'

# Candidate export folders: name contains "Learning" and is not a zip archive.
DirectoryList = [
    entry
    for entry in os.listdir(dataDir)
    if "Learning" in entry
    if "zip" not in entry
]
print(DirectoryList)

In [None]:
# Pick the raw-data directory to read (first entry of the listing) and make it absolute.
# NOTE(review): os.listdir returns entries in arbitrary order, so check the printed
# listing before relying on index 0.
targetDir = os.path.join(dataDir, DirectoryList[0])

# Recursively collect the full path of every file whose name ends with the search string.
searchstring = ".txt"
fileList = [
    os.path.join(folder, name)
    for folder, _subfolders, names in os.walk(targetDir)
    for name in names
    if name.endswith(searchstring)
]
print("# Files: ", len(fileList))
fileList[:6]

#### 1.2 Read RawData into Pandas Dataframe

In [None]:
# PLEASE USE FOR TESTING ONLY ... OTHERWISE REMOVE OR SKIP THIS CELL
# Restricts the run to the first 10,000 files of the listing.
fileList = fileList[:10_000]   

In [None]:
# Read every listed file and sort its content into one of two record sets:
#   * index files (file name contains "_index") -> FILE_INDEX / INDEXSTRING
#   * all other text files (email bodies)       -> FILE_BODY  / RAWBODY
# Both sets are keyed by the record id taken from the file name, so that the index
# information and the body text of one email end up in the same row.
#
# The records are collected in plain dicts and converted to data frames once at the
# end: enlarging a DataFrame row by row via `.loc` copies the frame on every insert,
# which is quadratic in the number of files. A later file with the same id overwrites
# the earlier one, exactly as the row-wise `.loc` assignment did.
index_records = {}
body_records = {}

# helpers
ERRORS = []                       # paths of files that could not be read
START_TIME = dt.datetime.now()
COUNT = 0

# Reading files from listing
for filepath in tqdm(fileList, desc="Collecting Documents"):
    COUNT += 1
    basename = os.path.basename(filepath)
    fn = basename.split(".")[0]

    # An unreadable or wrongly encoded file is recorded and skipped instead of
    # aborting the whole run.
    try:
        with open(filepath, "r", encoding="utf-8") as f:
            content = f.read()
    except (OSError, UnicodeDecodeError):
        ERRORS.append(filepath)
        continue

    if "_index" in fn:
        # Index file: use the same id as the matching body file (email text)
        idx = fn.split("_")[0]
        index_records[idx] = (basename, content)
    else:
        # Body file: the file name stem is the id
        idx = fn
        body_records[idx] = (basename, content)

# DF for index files
cols = "file_index indexString".upper().split()
df_INDEX = pd.DataFrame(list(index_records.values()), index=list(index_records.keys()),
                        columns=cols, dtype=object)
# DF for text files with bodies
cols = "file_body rawBody".upper().split()
df_BODY = pd.DataFrame(list(body_records.values()), index=list(body_records.keys()),
                       columns=cols, dtype=object)

# Concatenate the 2 dataframes to one (aligned on the record id) and keep only
# records for which both the index string and the body are available
df = pd.concat([df_INDEX, df_BODY], axis=1)
df.dropna(axis=0, subset=["INDEXSTRING","RAWBODY"], inplace=True)

# Remove byte-order marks from the index strings
df.INDEXSTRING = df.INDEXSTRING.str.replace("\ufeff", "")

DURATION = dt.datetime.now() - START_TIME
print(f"{len(ERRORS)} files created errors and were not read into dataframe")
print(f"Processing of {COUNT} files took {DURATION.seconds} seconds")

df.shape; df.head(3)
dfSIK = df.copy()                 # backup copy of the raw data frame
del df_INDEX, df_BODY
# Never ask for more samples than there are rows (sample(5) raises on smaller frames)
df.RAWBODY.sample(min(5, len(df)))

#### 1.3 Filter for records that contain an AutoClassification tag in the index string

In [None]:
# Flag the records whose index string carries the AutoClassification tag
has_tag = df.INDEXSTRING.str.contains("AutoClassificationConfidence")

# Report absolute number and share of tagged records
print("# of Records: ", has_tag.sum())
print("% of Records with AutoClass Info: ", has_tag.mean().round(3))

# Keep only the tagged records (independent copy of the selection)
df = df.loc[has_tag == True, :].copy()
print("Filtered Dataframe, Shape: ", df.shape)

#### 1.4 Read all necessary information from INDEXSTRING

In [None]:
# Start from the backup of the raw data so that this cell can be re-run, but re-apply
# the AutoClassification filter of step 1.3: a plain `dfSIK.copy()` would silently
# discard that filter again.
has_autoclass_tag = dfSIK.INDEXSTRING.str.contains("AutoClassificationConfidence")
df = dfSIK.loc[has_autoclass_tag == True, :].copy()

# Position of each value inside the comma-separated index string.
# The insertion order defines the column order of the data frame and must not change.
# NOTE(review): a plain split on "," assumes that no field value contains a comma - confirm.
INDEX_FIELDS = {
    "BATCHKLASSE":  0,
    "BATCHCONTENT": 1,
    "BATCHID":      3,
    "DOCID":        5,
    "DOCTYPE":      7,
    "CONFIDENCE":   9,
    "AUTOCLASS":    19,
    "PAGECOUNT":    11,
    "DOCCOUNT":     13,
    "INPUTCHANNEL": 15,
    "SOURCESYSTEM": 17,
}

# Split every index string once and pick the fields by position
# (an index string with too few fields raises an IndexError, as before).
tokens = df.INDEXSTRING.str.split(",")
for column, position in INDEX_FIELDS.items():
    values = tokens.apply(lambda parts, i=position: parts[i])
    if column in ("PAGECOUNT", "DOCCOUNT"):
        df[column] = values.str.strip('"')                      # only surrounding quotes
    else:
        df[column] = values.str.replace('"', '', regex=False)   # all quotes

df["BATCHID"] = df["BATCHID"].apply(int)
df["DOCID"]   = df["DOCID"].apply(int)

# Convert dtypes to numeric (int) and boolean (T/F)
# Missing counts are filled with the median BEFORE casting to the nullable integer
# dtype: the median can be fractional (e.g. 1.5), which "Int64" cannot store, so the
# filled values are rounded first.
for column in ("PAGECOUNT", "DOCCOUNT"):
    numeric = pd.to_numeric(df[column], errors='coerce')
    df[column] = numeric.fillna(numeric.median()).round().astype("Int64")

df["AUTOCLASS"] = pd.to_numeric(df["AUTOCLASS"])
print("NA Values auf AUTOCLASS: ", df.AUTOCLASS.isna().sum())
df.dropna(axis=0, subset=["AUTOCLASS"], inplace=True)
df["AUTOCLASS"] = df["AUTOCLASS"].astype("bool")

# Split information contained in CONFIDENCE by the | separator into a list
# and count the entries per record
df["CONFIDENCE"] = df["CONFIDENCE"].str.split("|")
df["NBR_DOCTYPES"] = df["CONFIDENCE"].apply(len)

# Drop NA records
print("NA Values auf DOCTYPE: ", df.DOCTYPE.isna().sum())
print("NA Values auf BODY: ", df.RAWBODY.isna().sum())
df = df.dropna(axis=0, subset=["DOCTYPE", "RAWBODY"])
print("Dataframe, Shape: ", df.shape)

dfSIK2 = df.copy()                # backup copy of the parsed data frame

#### 1.5 Optional Save RawData before further Processing

In [None]:
# Persist the parsed raw data as a pickle inside the raw-data folder.
# "FILENAME.pkl" is a placeholder - set the real target name before running.
filename = "FILENAME.pkl"
rawdata_pickle_path = os.path.join(dataDir, filename)
df.to_pickle(rawdata_pickle_path)


#### 1.6 Optional: Explorative Data Analysis

In [None]:
# Quick overview of the parsed data; every expression below is displayed by the notebook.
df.shape                                               # (number of records, number of columns)
df.PAGECOUNT.sum()                                     # number of total pages
df.DOCTYPE.nunique()                                   # number of distinct document types
df.AUTOCLASS.value_counts(normalize=True).round(3)     # share of records per AUTOCLASS value
df.AUTOCLASS.value_counts(normalize=False)             # absolute number of records per AUTOCLASS value

***
### 2. Clean KOFAX Data for processing on DLP platform

+ Filter for AUTOCLASSIFICATION == FALSE
+ Blacklist-Filtering of data
+ Cleaning & Anonymization of data
+ Limit length of each document to a maximum length value (shortens very long documents)
+ Select columns necessary for DLP (exclude all raw data columns!!!)
+ Write to disk with "\*.pkl" format



#### 2.1 Filter for records with tag "AUTOCLASSIFICATION" set to FALSE

In [None]:
# Keep only the records whose "AUTOCLASSIFICATION" tag is set to FALSE
not_autoclassified = df.AUTOCLASS.eq(False)
dfM = df.loc[not_autoclassified, :].copy()

# Report number of documents and pages that are not classified automatically
print(f"Anz. Dokumente die nicht autom. klassifiziert werden: # {dfM.shape[0]}")
print(f"Anz. Seiten, die nicht autom. klassifiziert werden: # {dfM.PAGECOUNT.sum()}")

# Displayed overview figures of the filtered data
dfM.shape
dfM.DOCTYPE.nunique()
dfM.PAGECOUNT.sum()
dfM.AUTOCLASS.value_counts()

#### 2.2 Blacklist Filter

In [None]:
# Work on a copy of the manually classified records
dfC = dfM.copy()

# Enable the progress bar for pandas apply calls
tqdm.pandas(desc="Matching with Blacklist")

t0 = dt.datetime.now()

# Load the blacklist from the shared tools folder
blackListFile = '/mnt/hsdapnas01/shared/sf6/0_Tools/BlacklistEmail.pkl' 
BLACKLIST = loadBlacklist(blackListFile)
print(f"Blacklist with {len(BLACKLIST)} records loaded")
print("Checking against Blacklist ...")

# Check column: result of the blacklist check for every raw body
# (variant without progress bar: dfC.RAWBODY.apply(...))
dfC["BL"] = dfC.RAWBODY.progress_apply(
    lambda raw_body: checkBlacklist(raw_body, BLACKLIST)
)

dur = dt.datetime.now() - t0
print(f"Blacklist Filter took: {dur.seconds} seconds, identified {dfC.BL.sum()} records overlapping with Blacklist")

# Keep only the records that did not match the blacklist, then back up the result
dfC = dfC.loc[dfC.BL.eq(False), :]
dfC.shape
dfCSIK = dfC.copy()

#### 2.2b Optional Save Blacklist-Checked Data before further Processing

In [None]:
# Persist the blacklist-checked data as a pickle inside the raw-data folder.
# "FILENAME.pkl" is a placeholder - set the real target name before running.
filename = "FILENAME.pkl"
checked_pickle_path = os.path.join(dataDir, filename)
dfC.to_pickle(checked_pickle_path)

In [None]:
# Overview of the blacklist-checked data; every expression below is displayed by the notebook.
dfC.DOCTYPE.nunique()       # number of distinct document types
dfC.AUTOCLASS.nunique()     # number of distinct AUTOCLASS values
dfC.PAGECOUNT.sum()         # number of total pages
dfC.shape                   # (number of records, number of columns)

#### 2.3 Cleaning & Anonymization

In [None]:
# Cleaning and anonymization pipeline on the blacklist-checked records.
# NOTE(review): CleanREPLACE_KOFAX_Export, CleanRGX and CleanNER are not defined in this
# notebook - presumably they come from the star import of BmwBankTools.cleanEmails; confirm.

# Cleaning stage 1
# NOTE(review): the returned frame is expected to contain a BODY_CLEAN column, which all
# following steps read and overwrite - confirm against the helper.
time0 = dt.datetime.now()
dfC = CleanREPLACE_KOFAX_Export(dfC)

# Cleaning stage 2 (regex rules)
tqdm.pandas(desc="Run Regex Rules for Anonymization & Cleaning")
time1 = dt.datetime.now()        # taken but not used in the timing below
dfC.BODY_CLEAN = dfC.BODY_CLEAN.progress_apply(CleanRGX)

# Trim every document to a maximum text length before the NER step is applied
textsize = 10_000
tqdm.pandas(desc=f"Trimming all Documents to max length of {textsize} characters")
dfC.BODY_CLEAN = dfC.BODY_CLEAN.progress_apply(lambda txt: txt[:textsize])

# NER detection & anonymization
time2 = dt.datetime.now()
tqdm.pandas(desc="NER Detection & Anonymization")
dfC.BODY_CLEAN = dfC.BODY_CLEAN.progress_apply(CleanNER)

# Timing: d1 = NER step only, d2 = whole cleaning cell (both displayed in seconds)
time3 = dt.datetime.now()
d1 = time3-time2
d2 = time3-time0
d1.seconds; d2.seconds

# Displayed overview of the cleaned data and a random sample of cleaned bodies
dfC.shape
dfC.DOCTYPE.nunique()
dfC.BODY_CLEAN.sample(5)

#### 2.4 Save Data to Disk

In [None]:
# Column overview (names, non-null counts, dtypes) of the cleaned data before the export
dfC.info()

In [None]:
# Write the cleaned data to the training-data folder.
filename = "LXX_v0.pkl"
exportDir = '/mnt/hsdapnas01/shared/sf6/1_TrainingData/' 

# Exclude every record whose cleaned body contains the marker "ANONYMIZATION FAILED".
# Records for which the check yields NA (e.g. a missing body) compare unequal to False
# and are therefore excluded as well.
filter_  = dfC.BODY_CLEAN.str.contains("ANONYMIZATION FAILED")
exportDF = dfC.loc[filter_==False].copy()
# Positional column selection: keeps the columns at positions 4-15 and 17.
# NOTE(review): presumably this drops the raw-data columns (positions 0-3) and the
# blacklist flag (position 16) and keeps BODY_CLEAN at position 17 - the selection
# silently changes if the column order changes upstream; verify with dfC.info() before exporting.
exportDF = exportDF.iloc[:,[4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 17]]

exportDF.to_pickle(exportDir+filename)
# Displayed overview of the exported data
exportDF.shape
exportDF.DOCTYPE.nunique()
exportDF.PAGECOUNT.sum()

In [None]:
# Final check of the exported columns (names, non-null counts, dtypes)
exportDF.info()

## Parkplatz