In [None]:
import pandas as pd
import numpy as np
from functools import reduce
import missingno as msno
import altair as alt

In [None]:
# Event datasets uploaded by Jacob to blob storage; one parquet file per event source.
df_app, df_win, df_hang = (
    pd.read_parquet(parquet_path)
    for parquet_path in (
        '../do_not_commit/Datasets/AppErrorEvents.parquet',
        '../do_not_commit/Datasets/WindowsErrorEvents.parquet',
        '../do_not_commit/Datasets/AppHangEvents.parquet',
    )
)

In [None]:
# Row counts of the three raw event frames, one per line (app, windows, hang).
print(len(df_app), len(df_win), len(df_hang), sep='\n')

In [None]:
# Tag every row with its source so the frames can be told apart after concatenation.
for source_frame, source_label in (
    (df_app, "Application"),
    (df_win, 'Windows'),
    (df_hang, 'Hang'),
):
    source_frame['error_type'] = source_label


Do this in one of two ways:
1. Use `suffixes=` in `pd.merge()` to explicitly assign each column/variable to its original error type.
2. Combine values for common columns into the same column, add a new column to each of the 4 datasets for its error type (e.g. "Hang"), and do not assign column suffixes (use `pd.concat([dataframes])`).

Given that, in the original XML exercise, columns with the same name in different sets represented the same variable, the preferred approach is #2.


In [None]:
# Bar-chart helper used below to visualize code counts per EventName in df_win.
def get_compare_chart(in_data, in_col, in_title):
    """Build a bar chart of row counts per distinct value of ``in_col``.

    Parameters
    ----------
    in_data : DataFrame holding the rows to count.
    in_col : name of the column whose values are counted.
    in_title : title placed on the chart.

    Returns
    -------
    An Altair chart with one bar per value, ordered by descending count.
    """
    counts = (
        in_data.groupby(in_col)
        .size()
        .reset_index(name='count')
        .sort_values('count', ascending=False)
    )
    # Hand Altair the explicit category order so the y-axis follows the count ranking.
    ranked_values = list(counts[in_col].unique())
    bars = alt.Chart(counts).mark_bar()
    encoded = bars.encode(
        x=alt.X('count:Q'),
        y=alt.Y(in_col + ':N', sort=ranked_values),
        tooltip='count:Q',
    )
    return encoded.properties(title=in_title)


def show_bar_for_win_event(eventname_list, in_df, visualize_attr):
    """Lazily yield one count chart per event name.

    For each name in ``eventname_list`` the rows of ``in_df`` whose
    'EventName' equals that name are selected and passed to
    ``get_compare_chart`` together with ``visualize_attr``. Nothing is
    computed until the generator is advanced.
    """
    for name in eventname_list:
        matching_rows = in_df.loc[in_df['EventName'] == name]
        chart_title = f'Counts of {visualize_attr} types for {name} events'
        yield get_compare_chart(matching_rows, visualize_attr, chart_title)
# Get the event names
eventnames = df_win['EventName'].unique()
# `charts` is a lazy generator: no chart is built until next(charts) is called.
# NOTE(review): it keeps a reference to df_win and reads 'ProblemSignatureP3', which a later
# cell drops in place; advance the generator before that cell runs.
charts = show_bar_for_win_event(eventnames, df_win, 'ProblemSignatureP3')

In [None]:
# Uncomment to step through the per-event charts one at a time (each run advances `charts`).
# current_chart = next(charts)
# current_chart

In [None]:
# Distinct EventName values found in df_win.
eventnames

In [None]:
# How many ProblemSignatureP3 values contain the literal text '.dll', versus all rows.
# regex=False: as a regular expression the '.' would match any character (e.g. 'xdll').
# na=False: missing values count as "no match".
print(df_win['ProblemSignatureP3'].str.contains('.dll', regex=False, na=False).sum())
print(len(df_win))


-  crashpad_exp has only .dll EventItems
-  APPCRASH has 1 .dll and many alphanumeric codes
-  AppHangB1 has only hexadecimal codes
-  MoAppHang has IP-style codes
-  BEX64 has only alphanumeric codes
-  MoAppCrash has IP-style codes
-  MoBEX has IP-style codes
-  CLR20r3 has hexadecimal codes
-  BEX has hexadecimal codes
-  AppHangXProcB1 has hexadecimal codes
-  LiveKernelEvent has single numbers, single letters, and (many) alphanumeric codes
-  MoAppHangXProc has 2 IP-style codes
-  POFContextAppCrash has IP-style codes
-  ServiceHang has IP-style codes
-  CbsPackageServicingFailure2 has IP-style codes
-  CriticalProcessFault2 has 1 alphanumeric code and 1 00000000 code
-  crashpad_jserror has webui-pdf code
-  WUDFVerifierFailure has 'Driver 'code


In [None]:
# Distinct ProblemSignatureP3 values recorded for APPCRASH events.
df_win.loc[df_win['EventName'] == 'APPCRASH', 'ProblemSignatureP3'].unique()

In [None]:
# Count, per signature column, how many values contain a literal '.dll'.
# The pattern is a raw string: '\.' in a normal string literal is an invalid escape sequence.
# na=False treats missing values as "no match".
for signature_col in (
    'ProblemSignatureP3',
    'ProblemSignatureP6',
    'ProblemSignatureP8',
    'ProblemSignatureP9',
    'ProblemSignatureP10',
):
    dll_count = df_win[signature_col].str.contains(r'\.dll', na=False).sum()
    print(f'{signature_col}: {dll_count}')

In [None]:
# Rows whose ProblemSignatureP6 contains a literal '.dll'.
# na=False drops rows with a missing P6 in the same pass, so no separate dropna() is needed;
# the raw string avoids the invalid '\.' escape of a normal string literal.
df_win[df_win['ProblemSignatureP6'].str.contains(r'\.dll', na=False)]

In [None]:
# COALESCE() in order: https://www.statology.org/pandas-coalesce/
# Blank out every value that does not contain '.dll' *before* coalescing. Coalescing first and
# filtering afterwards discards a .dll in ProblemSignatureP6 whenever ProblemSignatureP3 holds
# some other (non-.dll) code.
dll_candidates = df_win[['ProblemSignatureP3', 'ProblemSignatureP6']]
looks_like_dll = dll_candidates.apply(
    lambda col: col.astype(str).str.contains('.dll', regex=False, na=False)
)
# First non-missing candidate per row, preferring P3 over P6; NaN when neither names a .dll.
df_win['Combined_dll'] = dll_candidates.where(looks_like_dll).bfill(axis=1).iloc[:, 0]
# The source columns are removed in place, so this cell can only be run once per load of df_win.
df_win.drop(columns=['ProblemSignatureP3', 'ProblemSignatureP6'], inplace=True)

In [None]:
# Map application-error column names onto the shared "Combined_*" / "*ID" naming scheme.
app_column_map = {
    'FaultingApplicationName': 'Combined_Application',
    'ProgramId': 'ProgramID',
    'FileId': 'FileID',
    'AppVersion': 'Combined_Version',
    'ExceptionCode': 'Combined_Exception',
    'FaultingProcessId': 'Combined_ProcessID',
    'ReportId': 'Combined_ReportID',
    'FaultingApplicationStartTime': 'Combined_StartTime',
    'FaultingModuleName': 'Combined_dll',
}
df_app.rename(columns=app_column_map, inplace=True)

# Map Windows-error column names onto the shared "Combined_*" / "*ID" naming scheme.
win_column_map = {
    'ProblemSignatureP1_Application': 'Combined_Application',
    'ProblemSignatureP2_AppVersion': 'Combined_Version',
    'ProblemSignatureP7_ExceptionCode': 'Combined_Exception',
    'ReportID': 'Combined_ReportID',
    'CabGuid': 'CabGUID',
}
df_win.rename(columns=win_column_map, inplace=True)

# Map application-hang column names onto the shared "Combined_*" / "*ID" naming scheme.
hang_column_map = {
    'ProgramId': 'ProgramID',
    'FileId': 'FileID',
    'Program': 'Combined_Application',
    'ProgramVersion': 'Combined_Version',
    'ProcessID': 'Combined_ProcessID',
    'ReportID': 'Combined_ReportID',
    'StartTime': 'Combined_StartTime',
}
df_hang.rename(columns=hang_column_map, inplace=True)

In [None]:
# Stack the three event frames row-wise; identically named columns line up, the rest become NaN.
events = pd.concat((df_app, df_win, df_hang), axis=0)
events.info()

In [None]:
# Remove columns that hold no values at all.
events = events.dropna(axis=1, how='all')

In [None]:
# Missing-value matrix for the combined events; only columns from position 20 onward are plotted.
msno.matrix(events.iloc[:, 20:], labels=True, fontsize=8)

In [None]:
# Investigate individual columns' data to see if some can be combined.
# Shows 10 random non-missing values; the sample is unseeded, so it changes on every run.
events['Combined_Application'].dropna().sample(10)


# Join Machine data with Events data.


In [None]:
# Load the machine-level dataset that is joined to the event frames below.
machines = pd.read_parquet('../do_not_commit/Datasets/Persist_System_DISC.pq')

In [None]:
# Missing-value matrix for the machine dataset.
msno.matrix(machines, labels=True, fontsize=8)

In [None]:
# ServiceNow incident export, from blob storage.
# low_memory=False makes pandas read the file in one pass instead of in chunks, which avoids
# mixed-dtype inference between chunks.
incs = pd.read_csv('../do_not_commit/Datasets/ServiceNow_Incident.csv', low_memory=False)

In [None]:
# Missing-value matrix for the incident dataset.
msno.matrix(incs, labels=True, fontsize=8)

In [None]:
# Expose ItemKey under the name ClientItemKey, the key name used when merging with the event frames.
machines['ClientItemKey'] = machines['ItemKey'].copy()

In [None]:
# Attach the machine name (Name0) to every event. This is an inner join, so events without a
# matching machine record for the same effective date are dropped.
machine_join_keys = ['RWB_EFFECTIVE_DATE', 'ClientItemKey']
machines_events = pd.merge(
    events,
    machines[machine_join_keys + ['Name0']],
    on=machine_join_keys,
    how='inner',
)

In [None]:
# Spot-check 5 random rows of the joined frame (unseeded, so the rows differ per run).
machines_events.sample(5)

In [None]:
# Missing-value matrix for the events + machine join.
msno.matrix(machines_events, labels=True, fontsize=8)

In [None]:
# Histogram of rows per error_type after the machine join.
# The frame is sorted by error_type first, presumably so the categories appear in a stable order on the axis.
machines_events.sort_values(by=['error_type'])['error_type'].hist()


# Join Machines data with Boot data.


In [None]:
# Load the boot events and report how many rows were read.
df_boot = pd.read_parquet('../do_not_commit/Datasets/BootEvents.parquet')
print(len(df_boot))

In [None]:
# Map boot-event column names onto the shared naming scheme used by the other event frames.
df_boot.rename(
    columns={
        'BootId':'BootID',
        'ProgramId':'ProgramID',
        'FileId':'FileID',
        'AppVersion':'Combined_Version',
        'ExceptionCode':'Combined_Exception',
        'ReportId':'ReportID'
    },
    inplace=True
)
# Tag boot rows with their source, as is done for the app/windows/hang frames; without this the
# later `error_type == 'Boot'` filters on the combined frame have nothing to match.
df_boot['error_type'] = 'Boot'

In [None]:
# Missing-value matrix for the boot events.
msno.matrix(df_boot, labels=True, fontsize=8)

In [None]:
# Spot-check machine names. `Name0` comes from `machines` and only exists on the joined frame,
# so sample it from `machines_events` rather than from the raw `events` frame.
machines_events['Name0'].sample(10)

In [None]:
# Column names present in all four event frames (as they stand after the renames above).
# np.intersect1d is folded pairwise across the frames and returns the sorted, unique common names.
common_columns = reduce(np.intersect1d, (df_app.columns, df_boot.columns, df_hang.columns, df_win.columns))
common_columns

In [None]:
# Reduce BootID to a 'YYYY-MM-DD' string so it can be matched against incs['opened_at_formatted'].
# pd.to_datetime keeps the cell re-runnable: on a second run the already formatted strings are
# parsed again instead of failing on the .dt accessor.
df_boot['BootID'] = pd.to_datetime(df_boot['BootID']).dt.strftime('%Y-%m-%d')

In [None]:
# Attach the machine name (Name0) to every boot event (inner join on effective date + item key).
boot_join_keys = ['RWB_EFFECTIVE_DATE', 'ClientItemKey']
machines_boot = pd.merge(
    df_boot,
    machines[boot_join_keys + ['Name0']],
    on=boot_join_keys,
    how='inner',
)

In [None]:
# Display the boot + machine join.
machines_boot


#### Join INCs to machine + event.


In [None]:
# TimeCreatedSystemTime is all NaT.
# Open question: for 'Boot' errors, use 'GeneratedTime' or 'BootId' instead of 'TimeCreatedSystemTime'?
is_app_or_hang = machines_events['error_type'].isin(['Application', 'Hang'])
machines_events.loc[is_app_or_hang].sample(5)

In [None]:
# Build day-level ('YYYY-MM-DD') string keys on both sides of the incident join.
day_format = '%Y-%m-%d'
incs['opened_at'] = pd.to_datetime(incs['opened_at'])
incs['opened_at_formatted'] = incs['opened_at'].dt.strftime(day_format)
event_timestamps = machines_events['TimeCreatedSystemTime']
machines_events['TimeCreatedSystemTimeFormatted'] = event_timestamps.dt.strftime(day_format)

In [None]:
# Left-join incidents onto events by machine name and calendar day; events without an incident
# are kept, with the incident columns left missing.
incs_merged = pd.merge(
    machines_events,
    incs,
    how='left',
    left_on=['Name0', 'TimeCreatedSystemTimeFormatted'],
    right_on=['configuration_item', 'opened_at_formatted'],
)

In [None]:
# Row count after the left join; it exceeds len(machines_events) when an event matches several incidents.
print(len(incs_merged))

In [None]:
# Remove columns that hold no values at all.
incs_merged = incs_merged.dropna(axis=1, how='all')

In [None]:
# Missing-value matrix for the events + machine + incident join.
msno.matrix(incs_merged, labels=True, fontsize=7)

In [None]:
# Issues:
# The join key is only machine name + calendar day, so an error recorded on the same day as an
# unrelated INC is still associated with that INC (see .loc[210, :]).
# Possible fix: reduce the time window from "same day" to "within x hours," e.g. +/-2hrs.
# Below: 10 random rows (unseeded) restricted to the columns useful for judging the match.
incs_merged.sample(10)[[
    'error_type',
    # 'OSVersion',
    'Combined_Application',
    'Combined_dll',
    # 'FaultingPackageFullName',
    'ProductName',
    'category',
    'subcategory',
    'short_description_NER'
]]

In [None]:
# Distinct values of 'number' in the joined frame (includes NaN for events with no matched incident).
incs_merged['number'].unique()

In [None]:
# An INC with multiple associated events may have an event attributed to it
# even though the event occurred *after* the INC was already filed.
# Example: compare TimeCreatedSystemTime with opened_at for the rows of one incident.
incs_merged[incs_merged['number'] == 'INC0515837'][[
    'TimeCreatedSystemTimeFormatted',
    'TimeCreatedSystemTime',
    'opened_at',
    'opened_at_formatted',
    'Name0',
    'configuration_item',
    'number',
    # 'FaultingApplicationName',
    # 'FaultingModuleName',
    'short_description_NER'
]]


#### Join INCs to machine + boot.


In [None]:
# Left-join incidents onto boot events by machine name and day (BootID holds the day string).
incs_boot = pd.merge(
    machines_boot,
    incs,
    how='left',
    left_on=['Name0', 'BootID'],
    right_on=['configuration_item', 'opened_at_formatted'],
)

In [None]:
# Display the boot + machine + incident join.
incs_boot

In [None]:
# Remove columns that hold no values at all.
incs_boot = incs_boot.dropna(axis=1, how='all')


#### How many INCs were filed on the same day as another INC for the same machine?


In [None]:
# Number of incidents per (day, machine) pair.
grouped_counts = incs.groupby(['opened_at_formatted', 'configuration_item']).size()
# Keep only the pairs with more than one incident; print the total incidents involved, then show the pairs.
repeated_pairs = grouped_counts.loc[grouped_counts.gt(1)]
print(repeated_pairs.sum())
repeated_pairs


# Bring both DataFrames together.


In [None]:
# Stack the event-based and boot-based incident joins. reset_index() renumbers the rows and
# keeps the previous index values in an 'index' column.
df = pd.concat([incs_merged, incs_boot]).reset_index()

In [None]:
# Remove columns that hold no values at all.
df = df.dropna(axis=1, how='all')

In [None]:
# (rows, columns) of the combined frame.
df.shape

In [None]:
# Missing-value matrix for the combined frame.
msno.matrix(df, labels=True, fontsize=7)

In [None]:
# Inspect a single row by index label.
# NOTE(review): 400000 is hard-coded; this raises KeyError if the combined frame has fewer rows.
df.loc[400000, :]

In [None]:
# Histogram of rows per error_type in the combined frame.
df['error_type'].hist()

In [None]:
# Rows of the combined frame whose error_type is 'Boot'.
df[df['error_type'] == 'Boot']


#### How many INCs contain direct reference to BSOD?


In [None]:
# Number of distinct incidents whose description mentions a blue screen.
# na=False: rows without a matched incident (left joins) have a missing description; without it
# the mask contains NaN and boolean indexing raises.
df.loc[
    df['short_description_NER'].str.contains("(?i)Blue Screen|BSOD|[Dd]eath", na=False),
    'number',
].nunique()

In [None]:
# Error-type histogram for blue-screen incidents, one row per incident (first occurrence kept).
# na=False: a missing description counts as "no match" instead of putting NaN into the mask.
df[df['short_description_NER'].str.contains("(?i)Blue Screen|BSOD|[Dd]eath", na=False)].drop_duplicates('number', keep='first')['error_type'].hist()

In [None]:
# Application / module / exception code behind each blue-screen incident (first row per incident).
# na=False: a missing description counts as "no match" instead of putting NaN into the mask.
# The Combined_* names are what FaultingApplicationName, FaultingModuleName and ExceptionCode
# were renamed to before the frames were combined.
df[df['short_description_NER'].str.contains("(?i)Blue Screen|BSOD|[Dd]eath", na=False)].drop_duplicates('number', keep='first')[['error_type', 'Combined_Application', 'Combined_dll', 'Combined_Exception']]

In [None]:
# Faulting application for blue-screen incidents matched to Application errors (first row per incident).
# na=False: a missing description counts as "no match" instead of putting NaN into the mask.
# 'Combined_Application' is the name FaultingApplicationName was given in the app-error rename.
df[(df['short_description_NER'].str.contains("(?i)Blue Screen|BSOD|[Dd]eath", na=False)) & (df['error_type'] == 'Application')].drop_duplicates('number', keep='first')['Combined_Application']

In [None]:
# Descriptions of blue-screen incidents matched to Boot events (first row per incident).
# na=False: a missing description counts as "no match" instead of putting NaN into the mask.
df[(df['short_description_NER'].str.contains("(?i)Blue Screen|BSOD|[Dd]eath", na=False)) & (df['error_type'] == 'Boot')].drop_duplicates('number', keep='first')['short_description_NER']


#### What is the distribution of FaultingApplicationName among INCs?


In [None]:
# Restrict the combined frame to Application errors for the distribution checks below.
df_check = df.loc[df['error_type'].eq('Application')]

In [None]:
# Application errors store the faulting application under 'Combined_Application'
# (FaultingApplicationName was renamed before the frames were combined).
df_check['Combined_Application'].value_counts()


#### What is the distribution of modules among INCs?


In [None]:
# Application errors store the faulting module under 'Combined_dll'
# (FaultingModuleName was renamed before the frames were combined).
df_check['Combined_dll'].value_counts()


#### Which exception codes are most common among INCs?


In [None]:
# Most common (application, exception code) pairs among Application errors, using the renamed
# columns (FaultingApplicationName -> Combined_Application, ExceptionCode -> Combined_Exception).
df_check[['Combined_Application', 'Combined_Exception']].value_counts()