In [2]:
import pandas as pd
import numpy as np 
import json
import xml.etree.ElementTree as ET

# Set the notebook to display all columns of a dataframe
pd.set_option('display.max_columns', None)

# Import the event data
files = ['Persist_EventRawResultItem_1.parquet', 'Persist_EventRawResultItem_2.parquet']

# Read every parquet file and keep only the Windows Update client events.
# The filtered frames are collected in a list and concatenated once, instead
# of re-concatenating (and re-copying) a growing frame on every iteration.
frames = []
for file in files:

    # Read in parquet file
    in_df = pd.read_parquet('assets/' + file)

    # Filter for update data
    in_df = in_df[in_df['ProviderName'] == 'Microsoft-Windows-WindowsUpdateClient']

    frames.append(in_df)

# ignore_index=True gives the combined frame a fresh, unique 0..n-1 index.
# Without it the same index labels can occur once per input file, and the
# index-on-index merge performed later in this notebook would then match
# rows many-to-many.
df = pd.concat(frames, axis=0, ignore_index=True)

In [16]:
def clean_xml(in_str):
    """Parse an event XML string and return its named EventData values.

    Parameters
    ----------
    in_str : str or null
        XML text of a single event. Null values (None / NaN / pd.NA) are
        accepted.

    Returns
    -------
    dict or None
        ``{Name attribute: element text}`` for every child of the first
        ``EventData`` element below the root. An empty dict when the event
        has no ``EventData`` element, and ``None`` when ``in_str`` is null.

    Raises
    ------
    xml.etree.ElementTree.ParseError
        If the cleaned string is not well-formed XML.
    KeyError
        If a child of ``EventData`` has no ``Name`` attribute.
    """
    # A missing payload has nothing to parse; return a null so callers can
    # filter these rows out with notnull().
    if pd.isna(in_str):
        return None

    # Clean the XML string:
    #  * drop the default event namespace so elements can be looked up by
    #    their plain tag name below,
    #  * turn escaped angle brackets back into literal ones and strip CDATA
    #    markers (presumably because nested markup is stored escaped in the
    #    payload -- verify against the raw data).
    xml_str2 = (
        in_str
        .replace(' xmlns="http://schemas.microsoft.com/win/2004/08/events/event"', '')
        .replace('&lt;', '<')
        .replace('&gt;', '>')
        .replace('<![CDATA[', '')
        .replace(']]>', '')
    )

    # Parse the xml
    root = ET.fromstring(xml_str2)

    # Get the update data; find() returns None when the element is absent,
    # which must not be iterated.
    event_data = root.find('.//EventData')
    if event_data is None:
        return {}

    return {val.attrib["Name"]: val.text for val in event_data}

def expand_dict_values(in_df):
    """Expand the dicts stored in ``dict_EventData`` into separate columns.

    Parameters
    ----------
    in_df : pandas.DataFrame
        Frame with a ``dict_EventData`` column whose entries are dicts or
        null. The frame is not modified.

    Returns
    -------
    pandas.DataFrame
        A new frame holding every column of ``in_df`` plus one column per key
        found in the dicts. Rows whose dict is null, or lacks a key, get a
        null in the corresponding new column. Row count, row order and index
        labels are the same as in ``in_df``.
    """
    # Work on a positional 0..n-1 index so the join below is strictly
    # one-to-one. Joining on the caller's labels would duplicate rows (and
    # attach values to the wrong row) whenever those labels repeat.
    positional = in_df.reset_index(drop=True)

    # only get fields that are non-null
    dicts = positional['dict_EventData']
    dicts = dicts[dicts.notnull()]

    # expand attributes: build the frame straight from the list of dicts
    # rather than creating one Series per row with apply(pd.Series)
    expanded_fields = pd.DataFrame(dicts.tolist(), index=dicts.index)

    # Join with remaining data
    result = pd.merge(positional, expanded_fields,
                      how='left', left_index=True, right_index=True)

    # Hand back the caller's original index labels
    result.index = in_df.index

    return result

In [17]:
# Parse each event's XML payload into a dict of its named EventData values
df['dict_EventData'] = df['EventDataXML'].apply(clean_xml)

# Spread those dicts out into one column per EventData field
out_df = expand_dict_values(df)

In [None]:
def anonymize_field(in_df, in_field, out_dir='assets'):
    """Replace the values of one column with integer ids and save the mapping.

    Each distinct non-null value of ``in_df[in_field]`` is assigned an id
    starting at 1, in order of first appearance. Null entries stay null and
    are left out of the mapping.

    Parameters
    ----------
    in_df : pandas.DataFrame
        Frame holding the column to anonymize. It is modified IN PLACE: the
        column ``in_field`` is overwritten with the ids.
    in_field : str
        Name of the column to anonymize; also used as the JSON file name.
    out_dir : str, optional
        Directory that receives ``<in_field>.json``. Defaults to ``'assets'``.

    Returns
    -------
    pandas.DataFrame
        The same object as ``in_df`` (not a copy).

    Notes
    -----
    The JSON file has the shape ``{value: {"id": n}}``.
    """
    import os  # local import keeps the function self-contained

    # Get unique values for the field. Nulls are dropped first so that a
    # missing value is never given an id or written to the mapping file.
    out_values = list(in_df[in_field].dropna().unique())

    # Build the value -> {"id": n} mapping directly; the ids are plain Python
    # ints, so the mapping can be dumped to JSON as is.
    out_dict = {value: {'id': idx} for idx, value in enumerate(out_values, start=1)}

    # Apply the mapping to the data to anonymize softwares; values without an
    # entry (i.e. nulls) come out as NaN.
    id_by_value = {value: entry['id'] for value, entry in out_dict.items()}
    in_df[in_field] = in_df[in_field].map(id_by_value)

    # Save attribute-id mapping to json
    with open(os.path.join(out_dir, in_field + '.json'), 'w') as f:
        json.dump(out_dict, f)

    return in_df

# anonymize_field overwrites the column on the frame it is given and returns
# that same frame, so `a` and `out_df` refer to the same anonymized data.
a = anonymize_field(in_df=out_df, in_field='updateTitle')

In [32]:
# Drop the raw XML and the intermediate parsed-dict column before export;
# errors='ignore' skips any of these names that are not present.
unneeded = ['EventDataXML', 'dict_EventData']
out_df = out_df.drop(columns=unneeded, errors='ignore')

In [35]:
# Write the flattened update events to parquet; the frame's index is not
# stored in the file.
out_df.to_parquet(
    'assets/update_events.parquet',
    index=False,
)