## Feature Engineering: Addins, Programs, and Updates
This notebook engineers features from `OFFICE_ADDIN_DATA`, `Add_Remove_Programs`, and the update events in `EventRawResultItem`.

### Engineer update event features
Here I will create 3 features:
1.	Number of updates installed 
2.	Number of Windows 10 updates installed
3.	Number of Office updates installed

In [7]:
import pandas as pd
import numpy as np
import dtale

# Show every column when a dataframe is rendered (no column truncation)
pd.options.display.max_columns = None

def handle_none_values(in_val):
    """Convert missing-value placeholders to ``np.nan``.

    Parameters
    ----------
    in_val : object
        A single cell value.

    Returns
    -------
    object
        ``np.nan`` when ``in_val`` is ``None`` or the literal string
        ``'None'``; otherwise ``in_val`` itself, unchanged.
    """
    # Identity check for a genuine None
    if in_val is None:
        return np.nan

    # Only strings are compared against the 'None' placeholder: using ``==``
    # on an array-like cell is element-wise, and the result cannot be used as
    # a single truth value.
    if isinstance(in_val, str) and in_val == 'None':
        return np.nan

    return in_val
    
def read_json_fill_attr(jsonfile, in_df, attr):
    """Replace the id codes in ``in_df[attr]`` with the labels from a JSON file.

    The JSON is read with ``orient='index'``: every top-level key is a label
    and its record must carry an ``id`` field. Each value of ``in_df[attr]``
    is looked up by that ``id`` and replaced with the matching label.

    Parameters
    ----------
    jsonfile : path or file-like
        Source handed to ``pd.read_json``.
    in_df : pd.DataFrame
        Frame whose ``attr`` column holds the ids; modified in place.
    attr : str
        Name of the column to decode.

    Returns
    -------
    pd.DataFrame
        The same ``in_df`` object, with ``attr`` overwritten.

    Raises
    ------
    KeyError
        If a value in ``in_df[attr]`` has no matching ``id`` in the JSON.
    """
    # After reset_index the JSON keys (the labels) sit in a column named 'index'
    lookup_frame = pd.read_json(jsonfile, orient='index').reset_index()
    records_by_id = lookup_frame.set_index('id').to_dict(orient='index')

    def _label_for(code):
        # Plain dict lookup, so an unknown code raises KeyError
        return records_by_id[code]['index']

    in_df[attr] = in_df[attr].apply(_label_for)
    return in_df

def read_update_data(infile):
    """Load the update-event parquet file and decode its ``updateTitle`` column.

    Parameters
    ----------
    infile : str
        Path of the parquet file to read.

    Returns
    -------
    pd.DataFrame
        The loaded frame with None / 'None' placeholders replaced through
        ``handle_none_values`` and ``updateTitle`` ids replaced by the labels
        stored in ``assets/updateTitle.json``.
    """
    update_events = pd.read_parquet(infile)

    # Normalise the placeholder values one column at a time
    for column in update_events.columns:
        update_events[column] = update_events[column].apply(handle_none_values)

    # Swap the updateTitle ids for their labels
    return read_json_fill_attr('assets/updateTitle.json', update_events, 'updateTitle')

# Load the update events (placeholders cleaned, updateTitle ids decoded to labels)
df = read_update_data('assets/update_events.parquet')

In [11]:
def get_update_counts(in_dat):
    """Summarise one group of update events.

    Parameters
    ----------
    in_dat : pd.DataFrame
        Rows of a single group; must contain the ``win_64_os_update`` and
        ``office_update`` flag columns.

    Returns
    -------
    pd.Series
        ``num_updates`` (row count), ``num_windows_64_os_updates`` and
        ``num_office_updates`` (sums of the respective flag columns).
    """
    labels = ['num_updates', 'num_windows_64_os_updates', 'num_office_updates']
    values = [
        len(in_dat),                        # every row is one update
        in_dat['win_64_os_update'].sum(),   # flagged Windows OS updates
        in_dat['office_update'].sum(),      # flagged office updates
    ]

    return pd.Series(dict(zip(labels, values)), index=labels)
 
def get_update_features(in_df):
    """Flag Windows / office updates and count them per client and day.

    Parameters
    ----------
    in_df : pd.DataFrame
        Update events with ``ClientItemKey``, ``updateTitle`` and
        ``TimeCreatedSystemTime`` columns. Modified in place: ``updateTitle``
        is lower-cased, ``TimeCreatedSystemTime`` is converted to datetime,
        and ``created_date``, ``win_64_os_update`` and ``office_update`` are
        added.

    Returns
    -------
    tuple[pd.DataFrame, pd.DataFrame]
        ``in_df`` itself, and one row per (``ClientItemKey``,
        ``created_date``) with ``num_updates``,
        ``num_windows_64_os_updates`` and ``num_office_updates``. These are
        the same three counts ``get_update_counts`` produces per group.
    """
    # Lower-case the titles so the keyword checks below are case-insensitive
    in_df['updateTitle'] = in_df['updateTitle'].str.lower()

    # Parse the event timestamp and derive the calendar day used for grouping
    in_df['TimeCreatedSystemTime'] = pd.to_datetime(in_df['TimeCreatedSystemTime'])
    in_df['created_date'] = in_df['TimeCreatedSystemTime'].dt.strftime('%Y-%m-%d')

    titles = in_df['updateTitle']

    # Literal substring checks; na=False makes a missing title count as
    # "no match" instead of raising
    is_windows_update = (
        titles.str.contains('cumulative update for', regex=False, na=False)
        | titles.str.contains('windows 10', regex=False, na=False)
    )
    in_df['win_64_os_update'] = is_windows_update.astype('int64')
    in_df['office_update'] = titles.str.contains('office', regex=False, na=False).astype('int64')

    # Named aggregation instead of a per-group Python callback
    out_gb = (
        in_df.groupby(['ClientItemKey', 'created_date'])
        .agg(
            num_updates=('win_64_os_update', 'size'),
            num_windows_64_os_updates=('win_64_os_update', 'sum'),
            num_office_updates=('office_update', 'sum'),
        )
        .reset_index()
    )

    return in_df, out_gb

# Build the per-row flags (first result) and the per-client, per-day counts (second result)
processd_df, grouped_df = get_update_features(df)

# Confirm results are expected
# dtale.show(grouped_df).open_browser()

# Persist the per-client, per-day summary to parquet
grouped_df.to_parquet('assets/update_summary_features.parquet')

### Engineer Addin Features 

In [14]:
import os 
import pyarrow.parquet as pq


def read_addin_data(addin_data_directory, chunk_size):
    """Lazily yield decoded add-in data, one chunk at a time.

    Every ``.parquet`` file in ``addin_data_directory`` is read in batches,
    and the ``FriendlyName00``, ``CompanyName00`` and ``ProductName00`` ids
    of each batch are replaced with the labels from the ``FriendlyName.json``,
    ``CompanyName.json`` and ``ProductName.json`` files in the same directory.

    Parameters
    ----------
    addin_data_directory : str
        Directory holding the parquet files and the three JSON lookups.
    chunk_size : int
        Passed to ``ParquetFile.iter_batches`` as the batch size.

    Yields
    ------
    pd.DataFrame
        One decoded batch per iteration, files visited in sorted name order.
    """
    # The lookup paths depend only on the directory, so build them once
    friendlyname = os.path.join(addin_data_directory, 'FriendlyName.json')
    companyname = os.path.join(addin_data_directory, 'CompanyName.json')
    productname = os.path.join(addin_data_directory, 'ProductName.json')

    # os.listdir gives no ordering guarantee; sorting makes the chunk
    # sequence (and therefore the first chunk) reproducible between runs
    parquet_files = sorted(
        name for name in os.listdir(addin_data_directory) if name.endswith('.parquet')
    )

    for file in parquet_files:
        filepath = os.path.join(addin_data_directory, file)

        # Open the file without loading it fully, then stream it in batches
        parquet_data = pq.ParquetFile(filepath)

        for batch in parquet_data.iter_batches(chunk_size):
            chunk = batch.to_pandas()

            # NOTE: read_json_fill_attr re-reads its JSON file on every call,
            # i.e. three file reads per chunk
            chunk = read_json_fill_attr(friendlyname, chunk, 'FriendlyName00')
            chunk = read_json_fill_attr(companyname, chunk, 'CompanyName00')
            chunk = read_json_fill_attr(productname, chunk, 'ProductName00')

            yield chunk


# Create the lazy chunk generator; 1000000 is the chunk_size handed to read_addin_data
dfs = read_addin_data('assets/office_addin_data', 1000000)

# Pull only the first chunk; any further chunks stay unread in the generator
addin_df = next(dfs)

In [16]:
# New frame holding the first chunk without its 'rowversion' column
test_df = addin_df.drop('rowversion', axis=1)
# Inspect the frame interactively with D-Tale, opened in the browser
dtale.show(test_df).open_browser()

In [21]:
import re 

def identify_addin(in_dat, attr_name, re_pat):
    """Add a 0/1 column flagging rows whose ``FriendlyName00`` matches a regex.

    Parameters
    ----------
    in_dat : pd.DataFrame
        Frame with a ``FriendlyName00`` column; modified in place.
    attr_name : str
        Name of the flag column to create or overwrite.
    re_pat : str
        Regular expression searched for anywhere in the friendly name.

    Returns
    -------
    pd.DataFrame
        The same ``in_dat`` object with ``attr_name`` set to 1 where the
        pattern is found and 0 otherwise. Missing names are flagged 0.
    """
    # Vectorised regex search; na=False turns a missing name into "no match"
    # rather than an error
    matches = in_dat['FriendlyName00'].str.contains(re_pat, regex=True, na=False)
    in_dat[attr_name] = matches.astype('int64')

    return in_dat

def create_addin_features(in_df):
    """Flag known add-ins per row and count distinct add-ins per machine and day.

    Parameters
    ----------
    in_df : pd.DataFrame
        Add-in rows with ``RWB_EFFECTIVE_DATE``, ``FriendlyName00``,
        ``CompanyName00``, ``MachineID``, ``Architecture00`` and ``Id00``
        columns. Modified in place: ``RWB_EFFECTIVE_DATE`` becomes a
        ``YYYY-MM-DD`` string and the ``has_cap_iq``, ``has_factset``,
        ``has_bluematrix``, ``has_bloomberg`` and ``has_acrobat`` 0/1
        columns are added.

    Returns
    -------
    tuple[pd.DataFrame, pd.DataFrame]
        ``in_df`` itself, and a frame indexed by (``MachineID``,
        ``RWB_EFFECTIVE_DATE``, ``Architecture00``) with a single
        ``num_addins`` column holding the number of distinct ``Id00`` values.
    """
    # Reduce the effective date to a calendar-day string
    in_df['RWB_EFFECTIVE_DATE'] = pd.to_datetime(in_df['RWB_EFFECTIVE_DATE']).dt.strftime('%Y-%m-%d')

    # Friendly-name based flags
    cap_iq_pat = 'Cap IQ|Capital IQ|cap iq|capital iq'
    in_df = identify_addin(in_df, 'has_cap_iq', cap_iq_pat)

    factset_pat = 'FactSet|factset'
    in_df = identify_addin(in_df, 'has_factset', factset_pat)

    # Company-name based flags: exact match on the vendor string
    company_flags = {
        'has_bluematrix': 'BlueMatrix I LLC',
        'has_bloomberg': 'Bloomberg LP',
        'has_acrobat': 'Adobe Systems Incorporated',
    }
    for flag_name, company in company_flags.items():
        in_df[flag_name] = in_df['CompanyName00'].eq(company).astype('int64')

    # Distinct add-in ids per group; dropna=False keeps a missing id counted
    # as one distinct value, matching len(unique()).
    # NOTE(review): the has_* flags are only added to in_df; out_gb carries
    # num_addins alone. Confirm whether they should be aggregated as well.
    out_gb = (
        in_df.groupby(['MachineID', 'RWB_EFFECTIVE_DATE', 'Architecture00'])['Id00']
        .nunique(dropna=False)
        .to_frame('num_addins')
    )

    return in_df, out_gb

# Build the add-in flags and per-group counts from the first chunk.
# Note: this rebinds grouped_df, which previously held the update summary.
processed_df, grouped_df = create_addin_features(test_df)

In [22]:
# Toggle to view in dtale.
# dtale.show(processed_df).open_browser()