In [47]:
# importing libraries

import pandas as pd

import numpy as np

import os

import re

In [48]:
# reading the workbook that sits in the current working directory

cwd, file_name = os.getcwd(), 'data.xlsx'

path = os.path.join(cwd, file_name)

df = pd.read_excel(path)

# previewing the first rows of the raw sheet
df.head()

Unnamed: 0,Field1
0,ID JANEAUD EXT.410 ITAUDJANEAUD ...
1,CANCEL/SUSPEND EXPIRE(02/13/21)
2,PRIVILEGES AUDIT GEN
3,ACCESS ACC-CNT(5) ACC-DATE(02/02/...
4,PASSWORD PSWD-DAT(02/01/21) PSWD-TO...


In [49]:
# keeping fields that contain id, password, and stat info

# Compiled once; `match` only looks at the start of the string, so a row is
# kept only when it begins with one of the three keywords.
keep_pattern = re.compile(r'^ID.+|^PASSWORD.+|^STATISTICS.+')

# Blank spreadsheet cells are loaded as NaN (a float), which `re` cannot search
# and would raise TypeError on. Non-string cells are therefore mapped to None,
# the same value a non-matching string produces.
df['keep'] = df['Field1'].apply(lambda x: keep_pattern.match(x) if isinstance(x, str) else None)

df.head()

Unnamed: 0,Field1,keep
0,ID JANEAUD EXT.410 ITAUDJANEAUD ...,"<re.Match object; span=(0, 79), match='ID JANE..."
1,CANCEL/SUSPEND EXPIRE(02/13/21),
2,PRIVILEGES AUDIT GEN,
3,ACCESS ACC-CNT(5) ACC-DATE(02/02/...,
4,PASSWORD PSWD-DAT(02/01/21) PSWD-TO...,"<re.Match object; span=(0, 107), match='PASSWO..."


In [50]:
# filtering out none from 'keep'

# One .loc selection (rows whose `keep` holds a match, only the raw text column)
# followed by an explicit copy. Chaining df[mask][[...]] produces a frame derived
# from a slice, and adding columns to it in later cells triggers pandas'
# SettingWithCopyWarning; the copy makes df_filtered an independent frame.
df_filtered = df.loc[df['keep'].notna(), ['Field1']].copy()

df_filtered.head()


Unnamed: 0,Field1
0,ID JANEAUD EXT.410 ITAUDJANEAUD ...
4,PASSWORD PSWD-DAT(02/01/21) PSWD-TO...
6,STATISTICS CRE-TOD(02/01/21-08:19) SE...
8,ID JEFFSAL EXT.411 SALESEXTJEFFS...
12,PASSWORD PSWD-DAT(01/01/23) PSWD-TO...


In [51]:
# extracting the name that follows the "ID " keyword in field 1

id_pattern = re.compile(r'ID (.+?)\s')

# findall gives one list per row (empty when the line has no "ID <name> " part)
df_filtered['id'] = df_filtered['Field1'].apply(id_pattern.findall)

df_filtered.head()

Unnamed: 0,Field1,id
0,ID JANEAUD EXT.410 ITAUDJANEAUD ...,[JANEAUD]
4,PASSWORD PSWD-DAT(02/01/21) PSWD-TO...,[]
6,STATISTICS CRE-TOD(02/01/21-08:19) SE...,[]
8,ID JEFFSAL EXT.411 SALESEXTJEFFS...,[JEFFSAL]
12,PASSWORD PSWD-DAT(01/01/23) PSWD-TO...,[]


In [52]:
# helper that converts a list into a single value

def list_converter(col):
    """Return the first item of ``col`` if that item is truthy, otherwise ``None``.

    Only the first item is ever inspected; an empty ``col`` also yields ``None``.
    """
    first = next(iter(col), None)
    return first if first else None

In [53]:
# unwrapping each list in 'id' into a plain value (None when the list is empty)

id_lists = df_filtered['id']

df_filtered['id'] = id_lists.apply(list_converter)

df_filtered.head()

Unnamed: 0,Field1,id
0,ID JANEAUD EXT.410 ITAUDJANEAUD ...,JANEAUD
4,PASSWORD PSWD-DAT(02/01/21) PSWD-TO...,
6,STATISTICS CRE-TOD(02/01/21-08:19) SE...,
8,ID JEFFSAL EXT.411 SALESEXTJEFFS...,JEFFSAL
12,PASSWORD PSWD-DAT(01/01/23) PSWD-TO...,


In [54]:
# breaking each line into whitespace-separated tokens, then one token per row

tokens = df_filtered['Field1'].str.split(r'[\s]+')

df_filtered['Field1_tokenized'] = tokens

# explode repeats the other columns (and the index label) for every token
df_explode = df_filtered.explode('Field1_tokenized')

df_explode.head()

Unnamed: 0,Field1,id,Field1_tokenized
0,ID JANEAUD EXT.410 ITAUDJANEAUD ...,JANEAUD,ID
0,ID JANEAUD EXT.410 ITAUDJANEAUD ...,JANEAUD,JANEAUD
0,ID JANEAUD EXT.410 ITAUDJANEAUD ...,JANEAUD,EXT.410
0,ID JANEAUD EXT.410 ITAUDJANEAUD ...,JANEAUD,ITAUDJANEAUD
0,ID JANEAUD EXT.410 ITAUDJANEAUD ...,JANEAUD,DEPT(IT)


In [55]:
# previewing the first ten token rows; `id` is empty on rows that come from non-ID lines
df_explode.head(10) # nulls in id field

Unnamed: 0,Field1,id,Field1_tokenized
0,ID JANEAUD EXT.410 ITAUDJANEAUD ...,JANEAUD,ID
0,ID JANEAUD EXT.410 ITAUDJANEAUD ...,JANEAUD,JANEAUD
0,ID JANEAUD EXT.410 ITAUDJANEAUD ...,JANEAUD,EXT.410
0,ID JANEAUD EXT.410 ITAUDJANEAUD ...,JANEAUD,ITAUDJANEAUD
0,ID JANEAUD EXT.410 ITAUDJANEAUD ...,JANEAUD,DEPT(IT)
0,ID JANEAUD EXT.410 ITAUDJANEAUD ...,JANEAUD,FUNCTION(AUD)
0,ID JANEAUD EXT.410 ITAUDJANEAUD ...,JANEAUD,JANEAUD
4,PASSWORD PSWD-DAT(02/01/21) PSWD-TO...,,PASSWORD
4,PASSWORD PSWD-DAT(02/01/21) PSWD-TO...,,PSWD-DAT(02/01/21)
4,PASSWORD PSWD-DAT(02/01/21) PSWD-TO...,,PSWD-TOD(02/01/21-13:23)


In [56]:
# populating nulls in id field

# Rows without an id inherit the id of the closest preceding row that has one.
# Series.ffill does this in one vectorised step instead of a row-by-row iterrows
# loop, and it does not raise when the very first row has no id (indexing
# id_new[-1] on an empty list would); such leading rows simply stay missing.
# The result is kept as a plain list so it can be attached by position to
# df_explode, whose index contains repeated labels after explode.
id_new = df_explode['id'].ffill().tolist()

In [57]:
# attaching new id field to df

# id_new is a plain list with one entry per row of df_explode, so it is assigned by position
df_explode['id_new'] = id_new

df_explode.head(10)

Unnamed: 0,Field1,id,Field1_tokenized,id_new
0,ID JANEAUD EXT.410 ITAUDJANEAUD ...,JANEAUD,ID,JANEAUD
0,ID JANEAUD EXT.410 ITAUDJANEAUD ...,JANEAUD,JANEAUD,JANEAUD
0,ID JANEAUD EXT.410 ITAUDJANEAUD ...,JANEAUD,EXT.410,JANEAUD
0,ID JANEAUD EXT.410 ITAUDJANEAUD ...,JANEAUD,ITAUDJANEAUD,JANEAUD
0,ID JANEAUD EXT.410 ITAUDJANEAUD ...,JANEAUD,DEPT(IT),JANEAUD
0,ID JANEAUD EXT.410 ITAUDJANEAUD ...,JANEAUD,FUNCTION(AUD),JANEAUD
0,ID JANEAUD EXT.410 ITAUDJANEAUD ...,JANEAUD,JANEAUD,JANEAUD
4,PASSWORD PSWD-DAT(02/01/21) PSWD-TO...,,PASSWORD,JANEAUD
4,PASSWORD PSWD-DAT(02/01/21) PSWD-TO...,,PSWD-DAT(02/01/21),JANEAUD
4,PASSWORD PSWD-DAT(02/01/21) PSWD-TO...,,PSWD-TOD(02/01/21-13:23),JANEAUD


In [58]:
# parsing password violations from field1

# \d+ rather than a single \d: a count with two or more digits, e.g. PSWD-VIO(12),
# would otherwise fail to match at all and end up counted as 0 downstream.
df_explode['password_violations'] = df_explode['Field1_tokenized'].apply(lambda x: re.findall(r'PSWD-VIO\((\d+)\)', x))

df_explode.head()

Unnamed: 0,Field1,id,Field1_tokenized,id_new,password_violations
0,ID JANEAUD EXT.410 ITAUDJANEAUD ...,JANEAUD,ID,JANEAUD,[]
0,ID JANEAUD EXT.410 ITAUDJANEAUD ...,JANEAUD,JANEAUD,JANEAUD,[]
0,ID JANEAUD EXT.410 ITAUDJANEAUD ...,JANEAUD,EXT.410,JANEAUD,[]
0,ID JANEAUD EXT.410 ITAUDJANEAUD ...,JANEAUD,ITAUDJANEAUD,JANEAUD,[]
0,ID JANEAUD EXT.410 ITAUDJANEAUD ...,JANEAUD,DEPT(IT),JANEAUD,[]


In [59]:
# parsing security violations

# \d+ rather than a single \d: a count with two or more digits, e.g. SEC-VIO(10),
# would otherwise fail to match at all and end up counted as 0 downstream.
df_explode['security_violations'] = df_explode['Field1_tokenized'].apply(lambda x: re.findall(r'SEC-VIO\((\d+)\)', x))

In [60]:
# previewing the frame with both violation columns (still lists at this point)
df_explode.head()

Unnamed: 0,Field1,id,Field1_tokenized,id_new,password_violations,security_violations
0,ID JANEAUD EXT.410 ITAUDJANEAUD ...,JANEAUD,ID,JANEAUD,[],[]
0,ID JANEAUD EXT.410 ITAUDJANEAUD ...,JANEAUD,JANEAUD,JANEAUD,[],[]
0,ID JANEAUD EXT.410 ITAUDJANEAUD ...,JANEAUD,EXT.410,JANEAUD,[],[]
0,ID JANEAUD EXT.410 ITAUDJANEAUD ...,JANEAUD,ITAUDJANEAUD,JANEAUD,[],[]
0,ID JANEAUD EXT.410 ITAUDJANEAUD ...,JANEAUD,DEPT(IT),JANEAUD,[],[]


In [61]:
# unwrapping the findall lists in both violation columns

for violation_col in ('password_violations', 'security_violations'):
    df_explode[violation_col] = df_explode[violation_col].apply(list_converter)

In [62]:
# previewing the violation columns after the lists were unwrapped
df_explode.head() 

Unnamed: 0,Field1,id,Field1_tokenized,id_new,password_violations,security_violations
0,ID JANEAUD EXT.410 ITAUDJANEAUD ...,JANEAUD,ID,JANEAUD,,
0,ID JANEAUD EXT.410 ITAUDJANEAUD ...,JANEAUD,JANEAUD,JANEAUD,,
0,ID JANEAUD EXT.410 ITAUDJANEAUD ...,JANEAUD,EXT.410,JANEAUD,,
0,ID JANEAUD EXT.410 ITAUDJANEAUD ...,JANEAUD,ITAUDJANEAUD,JANEAUD,,
0,ID JANEAUD EXT.410 ITAUDJANEAUD ...,JANEAUD,DEPT(IT),JANEAUD,,


In [63]:
# creating a function to convert cols to numeric

def multi_field(df, col1, col2):
    """Convert two columns of ``df`` to integers, counting missing/empty values as 0.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the columns. It is modified in place.
    col1, col2 : str
        Names of the columns to convert.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object that was passed in, with both columns replaced.

    Raises
    ------
    ValueError
        If a present, non-empty value cannot be parsed by ``int()``.
    """

    def _as_int(value):
        # None / NaN / pd.NA all mean "nothing was parsed". NaN is truthy and
        # pd.NA has no truth value, so a bare truthiness test would either pass
        # NaN on to int() (ValueError) or raise on pd.NA (TypeError).
        if pd.api.types.is_scalar(value) and pd.isna(value):
            return 0
        # Remaining falsy values (e.g. '' or 0) also count as zero.
        return int(value) if value else 0

    for name in (col1, col2):
        df[name] = [_as_int(value) for value in df[name]]

    return df

In [66]:
# converting both violation columns to integers (the frame is updated and handed back)

df_final = multi_field(
    df=df_explode,
    col1='password_violations',
    col2='security_violations',
)

In [67]:
# previewing a bounded slice rather than displaying the entire frame
df_final.head(10)

Unnamed: 0,Field1,id,Field1_tokenized,id_new,password_violations,security_violations
0,ID JANEAUD EXT.410 ITAUDJANEAUD ...,JANEAUD,ID,JANEAUD,0,0
0,ID JANEAUD EXT.410 ITAUDJANEAUD ...,JANEAUD,JANEAUD,JANEAUD,0,0
0,ID JANEAUD EXT.410 ITAUDJANEAUD ...,JANEAUD,EXT.410,JANEAUD,0,0
0,ID JANEAUD EXT.410 ITAUDJANEAUD ...,JANEAUD,ITAUDJANEAUD,JANEAUD,0,0
0,ID JANEAUD EXT.410 ITAUDJANEAUD ...,JANEAUD,DEPT(IT),JANEAUD,0,0
0,ID JANEAUD EXT.410 ITAUDJANEAUD ...,JANEAUD,FUNCTION(AUD),JANEAUD,0,0
0,ID JANEAUD EXT.410 ITAUDJANEAUD ...,JANEAUD,JANEAUD,JANEAUD,0,0
4,PASSWORD PSWD-DAT(02/01/21) PSWD-TO...,,PASSWORD,JANEAUD,0,0
4,PASSWORD PSWD-DAT(02/01/21) PSWD-TO...,,PSWD-DAT(02/01/21),JANEAUD,0,0
4,PASSWORD PSWD-DAT(02/01/21) PSWD-TO...,,PSWD-TOD(02/01/21-13:23),JANEAUD,0,0


In [70]:
# totalling password and security violations per id

violation_cols = ['password_violations', 'security_violations']

df_final.groupby('id_new')[violation_cols].sum()

Unnamed: 0_level_0,password_violations,security_violations
id_new,Unnamed: 1_level_1,Unnamed: 2_level_1
JANEAUD,1,1
JEFFSAL,3,1
NICOPM,0,0
