In [None]:
import pandas as pd
import numpy as np
from functools import reduce
import pytz
import altair as alt

In [None]:
# These are the datasets uploaded by Jacob to blob storage:
# one parquet extract per event source (application errors, Windows errors, application hangs).
df_app = pd.read_parquet('../do_not_commit/Datasets/AppErrorEvents.parquet')
df_win = pd.read_parquet('../do_not_commit/Datasets/WindowsErrorEvents.parquet')
df_hang = pd.read_parquet('../do_not_commit/Datasets/AppHangEvents.parquet')

In [None]:
# Row counts per source, in load order.
print(len(df_app))
print(len(df_win))
print(len(df_hang))

In [None]:
# Tag every row with its source so the rows can still be told apart after the frames are concatenated.
df_app['error_type'] = "Application"
df_win['error_type'] = 'Windows'
df_hang['error_type'] = 'Hang'

In [None]:
def get_compare_chart(in_data, in_col, in_title):
    """
    Build a horizontal bar chart of row counts per distinct value of one column.

    :param in_data: DataFrame to summarise.
    :param in_col: Name of the column whose values are counted (shown on the y axis).
    :param in_title: Title placed on the chart.
    :return: Altair bar chart with the bars ordered from most to least frequent.
    """
    # One row per distinct value with its frequency, most frequent first.
    value_counts = (
        in_data.groupby(in_col)
        .size()
        .reset_index()
        .rename(columns={0: 'count'})
    )
    value_counts = value_counts.sort_values('count', ascending=False)

    # Altair would otherwise sort the nominal axis alphabetically.
    bar_order = list(value_counts[in_col].unique())

    bars = alt.Chart(value_counts).mark_bar()
    encoded = bars.encode(
        x=alt.X('count:Q'),
        y=alt.Y(in_col + ':N', sort=bar_order),
        tooltip='count:Q',
    )
    return encoded.properties(title=in_title)


def show_bar_for_win_event(eventname_list, in_df, visualize_attr):
    """
    Lazily yield one count chart per event name.

    :param eventname_list: Iterable of values to match against the 'EventName' column.
    :param in_df: DataFrame holding an 'EventName' column and the ``visualize_attr`` column.
    :param visualize_attr: Column whose value counts are charted for each event name.
    :return: Generator of charts produced by ``get_compare_chart``; each chart is only
        built when the generator is advanced.
    """
    for name in eventname_list:
        rows_for_event = in_df.loc[in_df['EventName'] == name]
        chart_title = f'Counts of {visualize_attr} types for {name} events'
        yield get_compare_chart(rows_for_event, visualize_attr, chart_title)

In [None]:
# Get the event names
eventnames = df_win['EventName'].unique()
# `charts` is a lazy generator: no chart is built until next(charts) is called.
# It keeps a reference to df_win, and a later cell drops 'ProblemSignatureP3' from df_win
# in place, so cycle through the charts before running that cell.
charts = show_bar_for_win_event(eventnames, df_win, 'ProblemSignatureP3')

In [None]:
# Uncomment below to cycle through charts:
# current_chart = next(charts)
# current_chart

In [None]:
# How many ProblemSignatureP3 values name a DLL, out of all Windows error rows?
# regex=False makes this a literal substring test; with the default regex=True the '.'
# matches any character, so values such as 'xdll' would be counted as well.
print(df_win['ProblemSignatureP3'].str.contains('.dll', regex=False).sum())
print(len(df_win))

In [None]:
# COALESCE() in order: https://www.statology.org/pandas-coalesce/
# Take ProblemSignatureP3 when it is non-null, otherwise fall back to ProblemSignatureP6.
df_win['Combined_dll'] = df_win[['ProblemSignatureP3', 'ProblemSignatureP6']].bfill(axis=1).iloc[:, 0]
# Keep the coalesced value only when it contains '.dll'; anything else becomes NaN.
# NOTE(review): a non-null P3 without '.dll' wins the coalesce, so a '.dll' value held in P6
# on the same row is discarded — confirm this is intended.
df_win['Combined_dll'] = df_win['Combined_dll'].apply(lambda x: x if '.dll' in str(x) else np.nan)
# The source columns are dropped in place, so this cell cannot be re-run without reloading df_win.
df_win.drop(columns=['ProblemSignatureP3', 'ProblemSignatureP6'], inplace=True)

In [None]:
# Rename source-specific columns to shared names so the three event frames line up
# column-for-column when they are concatenated. All three renames modify the frames in place.

# Application error events.
df_app.rename(
    columns={
        'FaultingApplicationName':'Combined_Application',
        'ProgramId':'ProgramID',
        'FileId':'FileID',
        'AppVersion':'Combined_Version',
        'ExceptionCode':'Combined_Exception',
        'FaultingProcessId':'Combined_ProcessID',
        'ReportId':'Combined_ReportID',
        'FaultingApplicationStartTime':'Combined_StartTime',
        'FaultingModuleName':'Combined_dll'
    },
    inplace=True
)

# Windows error events ('Combined_dll' was already derived from the problem signature columns).
df_win.rename(
    columns={
        'ProblemSignatureP1_Application':'Combined_Application',
        'ProblemSignatureP2_AppVersion':'Combined_Version',
        'ProblemSignatureP7_ExceptionCode':'Combined_Exception',
        'ReportID':'Combined_ReportID',
        'CabGuid':'CabGUID'
    },
    inplace=True
)

# Application hang events (no column is mapped to 'Combined_Exception' or 'Combined_dll' here).
df_hang.rename(
    columns={
        'ProgramId':'ProgramID',
        'FileId':'FileID',
        'Program':'Combined_Application',
        'ProgramVersion':'Combined_Version',
        'ProcessID':'Combined_ProcessID',
        'ReportID':'Combined_ReportID',
        'StartTime':'Combined_StartTime'
    },
    inplace=True
)

In [None]:
# Stack the three event sources into one frame; columns missing from a source are filled with NaN.
events = pd.concat([df_app, df_win, df_hang])
# reset_index() without drop=True keeps each source frame's original row labels in a new 'index' column.
events.reset_index(inplace=True)
events.dropna(axis=1, how='all', inplace=True) # Dropping columns with nothing in them.
# msno.matrix(events.iloc[:, 20:], labels=True, fontsize=8)

In [None]:
# Total number of event rows across the three sources.
len(events)


<a id="machines_events"></a>
# Join events with machines.


In [None]:
# Machine inventory snapshot; supplies the machine name ('Name0') for each ItemKey and date.
machines = pd.read_parquet('../do_not_commit/Datasets/Persist_System_DISC.pq')

In [None]:
# Expose ItemKey under the name used as the join key below.
machines['ClientItemKey'] = machines['ItemKey'].copy()

In [None]:
# Join machine name ('Name0') to events by internal date and ClientItemKey (a different, but similar ID for a machine).
# Because events is left, every row will have a TimeCreatedSystemTime.
# NOTE(review): if machines holds more than one row per (RWB_EFFECTIVE_DATE, ClientItemKey),
# events are duplicated by this join — confirm the key is unique.
machines_events = events.merge(machines[['RWB_EFFECTIVE_DATE', 'ClientItemKey', 'Name0']], on=['RWB_EFFECTIVE_DATE', 'ClientItemKey'], how='left')

In [None]:
# Format TimeCreatedSystemTime as a 'YYYY-MM-DD' string.
# This runs before the Central-time conversion in a later cell, so the day is taken from the
# unconverted timestamp (which convert_to_cst treats as UTC).
machines_events['TimeCreatedSystemTimeFormatted'] = machines_events['TimeCreatedSystemTime'].dt.strftime('%Y-%m-%d')

In [None]:
# Convert UTC timestamps to US Central time.
def convert_to_cst(df:pd.DataFrame, utc_column_name:str, cst_column_name:str) -> pd.DataFrame:
    """
    Add a US Central ('America/Chicago') copy of a UTC timestamp column.

    This is necessary because RWB_EFFECTIVE_DATE is in CST while TimeCreatedSystemTime is UTC.

    The frame is modified in place: ``utc_column_name`` is overwritten with a tz-aware UTC
    datetime column, and ``cst_column_name`` is added (or overwritten) with the same instants
    expressed in Central time. Calling the function again on its own output is safe.

    :param df: DataFrame holding the timestamp column.
    :param utc_column_name: Column with UTC timestamps (strings, tz-naive or tz-aware datetimes).
    :param cst_column_name: Name of the Central-time column to create.
    :return: The same ``df`` object, for convenience.
    """
    # utc=True treats tz-naive values as UTC and converts tz-aware values to UTC, so the
    # notebook cell can be re-run (a plain tz_localize raises on data that is already tz-aware).
    df[utc_column_name] = pd.to_datetime(df[utc_column_name], utc=True)

    # 'America/Chicago' follows daylight saving: offsets are -06:00 (CST) or -05:00 (CDT).
    df[cst_column_name] = df[utc_column_name].dt.tz_convert('America/Chicago')

    return df

In [None]:
# Overwrites TimeCreatedSystemTime with a tz-aware UTC column and adds CreatedSystemTime_CST.
machines_events = convert_to_cst(machines_events, utc_column_name='TimeCreatedSystemTime', cst_column_name='CreatedSystemTime_CST')
machines_events[['TimeCreatedSystemTime', 'CreatedSystemTime_CST']].sample(5) # Check all times are -05:00 or -06:00 from UTC (Daylight Savings Time).

In [None]:
# Format the Central-time timestamp as a 'YYYY-MM-DD' string; this day key is what events
# are later joined to INCs on.
machines_events['CreatedSystemTime_CST_formatted'] = machines_events['CreatedSystemTime_CST'].dt.strftime('%Y-%m-%d')
machines_events['CreatedSystemTime_CST_formatted'][0]

In [None]:
# Scratch check of the type returned by .date() on a parsed timestamp.
type(pd.to_datetime('2023-07-23 00:01:00').date())


# Join Boot events with machines separately.


In [None]:
# Boot events are loaded and joined separately from the other three event sources.
df_boot = pd.read_parquet('../do_not_commit/Datasets/BootEvents.parquet')
print(len(df_boot))

In [None]:
# Same source tag as the other event frames.
df_boot['error_type'] = 'Boot'

In [None]:
# What columns do all four event types have in common?
# Computed after df_app/df_win/df_hang were renamed but before df_boot is renamed below;
# the result is not used again in the visible part of the notebook.
common_columns = reduce(np.intersect1d, (df_app.columns, df_boot.columns, df_hang.columns, df_win.columns))

In [None]:
# Align boot column names with the spellings used by the other event frames (in place).
df_boot.rename(
    columns={
        'BootId':'BootID',
        'ProgramId':'ProgramID',
        'FileId':'FileID',
        'AppVersion':'Combined_Version',
        'ExceptionCode':'Combined_Exception',
        'ReportId':'ReportID'
    },
    inplace=True
)

In [None]:
# Needs to be a str to join later w/ 'opened_at_formatted'.
# BootID is synonymous with 'TimeCreatedSystemTime'
# since it is the time on the machine when the boot event occurred.
# NOTE(review): unlike the regular events, BootID is not passed through convert_to_cst
# before the day string is taken — confirm which timezone BootID is recorded in.
df_boot['BootID_formatted'] = df_boot['BootID'].dt.strftime('%Y-%m-%d')

In [None]:
# Join machine name to boot events, similar to other section.
# NOTE(review): no how= is given, so this is pandas' default inner join and boot events
# without a matching machines row are dropped, whereas the regular-events join uses
# how='left' — confirm this difference is intended.
machines_boot = df_boot.merge(machines[['RWB_EFFECTIVE_DATE', 'ClientItemKey', 'Name0']], on=['RWB_EFFECTIVE_DATE', 'ClientItemKey'])


# Join machines and events with INCs.


In [None]:
# From blob storage:
# incs = pd.read_csv('../do_not_commit/Datasets/ServiceNow_Incident.csv', low_memory=False)
incs = pd.read_csv('../do_not_commit/Datasets/ServiceNow_INC_20230730.csv', low_memory=False)

In [None]:
# DateTime formatting.
# opened_at is parsed as-is (no timezone is attached here); opened_at_formatted is its
# 'YYYY-MM-DD' day string, used as a join key.
incs['opened_at'] = pd.to_datetime(incs['opened_at'])
incs['opened_at_formatted'] = incs['opened_at'].dt.strftime('%Y-%m-%d')

In [None]:
# Join INCs to events on:
# 1. Machine name, and
# 2. The event and the INC both occurred on the same day, using %Y-%m-%d format (YYYY-MM-DD).
# how='left' keeps every event row; a machine with several INCs on one day yields one
# output row per (event, INC) pair.
incs_merged = machines_events.merge(incs, left_on=['Name0', 'CreatedSystemTime_CST_formatted'], right_on=['configuration_item', 'opened_at_formatted'], how='left')
print(len(incs_merged))

In [None]:
# Drop columns with nothing in them.
incs_merged.dropna(axis=1, how='all', inplace=True)

In [None]:
# Should be many more. Why only still 4,000?
# The second figure counts merged event rows that carry an INC number, not distinct INCs.
print(len(incs), "INC rows originally")
print("Now", incs_merged['number'].notnull().sum())


#### Remove INCs associated with events that occurred *after* the INC was already filed.


In [None]:
# Attach the US/Central timezone to opened_at without shifting the clock time, so it can be
# compared with the tz-aware CreatedSystemTime_CST column.
# NOTE(review): a later comment in this notebook describes 'opened_at' as UTC. If that is
# correct, this should localize to UTC and then convert to Central — confirm the source timezone.
incs_merged['opened_at_cst_not_utc'] = incs_merged['opened_at'].dt.tz_localize('US/Central')

In [None]:
# Spot check of one merged row (label 74): event time, INC time, and their difference.
print("CreatedSystemTime_CST:", incs_merged.loc[74, 'CreatedSystemTime_CST'])
print("INC file time CST:    ", incs_merged.loc[74, 'opened_at_cst_not_utc'])
print("CreatedCST-opened_at: ", incs_merged.loc[74, 'CreatedSystemTime_CST'] - incs_merged.loc[74, 'opened_at_cst_not_utc'])
# This event happened before the INC was filed, at 13:28 UTC vs 16:20 UTC (2h 52m):

In [None]:
# Spot check of merged row 75: event time, INC time, and their difference.
print("CreatedSystemTime_CST:", incs_merged.loc[75, 'CreatedSystemTime_CST'])
print("INC file time CST:    ", incs_merged.loc[75, 'opened_at_cst_not_utc'])
# The difference uses row 75's own INC time so that it matches the two values printed above.
print("CreatedCST-opened_at: ", incs_merged.loc[75, 'CreatedSystemTime_CST'] - incs_merged.loc[75, 'opened_at_cst_not_utc'])
# This event happened after the INC was filed, at 20:23 UTC vs 16:20 UTC (4h 3m):

In [None]:
# Delete the INC data from rows where the event occurred at or after the time the INC was
# opened on the same day. Rows without an INC compare as False and are left untouched.
mask = incs_merged['CreatedSystemTime_CST'] >= incs_merged['opened_at_cst_not_utc']
# Clear only the INC columns that are still present in incs_merged: all-NaN columns were
# dropped after the join, and assigning through .loc with labels that no longer exist can
# re-create them as new columns.
columns_to_set_none = incs_merged.columns.intersection(incs.columns)
incs_merged.loc[mask, columns_to_set_none] = None

# Looks like no 'CreatedSystemTime_CST' value is > 'opened_at'.
incs_merged[incs_merged['number'].notnull()][['CreatedSystemTime_CST', 'opened_at', 'number']]

In [None]:
# Check to make sure. Positive = delete. Negative = keep.
# 'opened_at_cst_not_utc' is not one of the INC columns cleared above, so differences for
# rows whose INC data was wiped still appear in this output.
(incs_merged['CreatedSystemTime_CST'] - incs_merged['opened_at_cst_not_utc']).dropna()

In [None]:
# Timedelta of -1 days (24h) + 16:04 = 8h 04m difference in true time.
# Spot check of the two timestamps behind one such value (row label 502).
incs_merged.loc[502, ['CreatedSystemTime_CST', 'opened_at_cst_not_utc']]


# Join machines and boot events with INCs separately.


In [None]:
# Join INCs to boot events on machine name and same calendar day (BootID day string vs INC opened day).
# NOTE(review): no step in the visible notebook removes INCs from boot rows that occurred
# after the ticket was opened, unlike the regular events — confirm whether that filter is needed here.
incs_boot = machines_boot.merge(incs, left_on=['Name0', 'BootID_formatted'], right_on=['configuration_item', 'opened_at_formatted'], how='left')

In [None]:
# Number of boot-event rows that matched an INC (rows, not distinct INCs).
print(incs_boot['number'].notnull().sum(), "Boot INCs")

In [None]:
# Dropping columns with nothing in them.
incs_boot.dropna(axis=1, how='all', inplace=True)

In [None]:
# How many multiple INCs were filed on the same day for a given machine?
# The printed number is the total count of INCs that share a (day, machine) pair with at least one other INC.
grouped_counts = incs.groupby(['opened_at_formatted', 'configuration_item']).size()
print(grouped_counts[grouped_counts > 1].sum())
grouped_counts[grouped_counts > 1]

In [None]:
# Boot-event rows that matched an INC.
incs_boot[incs_boot['number'].notnull()]


# Bring machines, events, and INCs together for both regular events and Boot events.


In [None]:
# Stack regular events and boot events. ignore_index=True gives a fresh 0..n-1 index
# directly, without keeping the old (duplicated) row labels as an extra column.
df = pd.concat([incs_merged, incs_boot], ignore_index=True)

# Dropping columns with nothing in them.
df = df.dropna(axis=1, how='all')

# Drop all data for May 20th, 2023.
# NOTE(review): assumes RWB_EFFECTIVE_DATE is datetime-like or holds 'YYYY-MM-DD' strings;
# if it holds datetime.date objects this comparison never matches and nothing is dropped — confirm.
df = df[df['RWB_EFFECTIVE_DATE'] != '2023-05-20']

In [None]:
# Number of non-boot rows, selected by listing the three regular event types.
len(df[df['error_type'].isin(['Application', 'Hang', 'Windows'])])

In [None]:
# Same count expressed as "everything that is not Boot"; the two agree when error_type
# only takes the four tagged values and is never missing.
len(df[df['error_type'] != 'Boot'])

In [None]:
# Row count per event type.
df['error_type'].hist()

In [None]:
# Shape and column dtypes of the combined frame.
print(df.shape)
df.dtypes

In [None]:
# How many boot events occurred after an INC was filed?
# (Left unanswered: the calculation below is commented out.)
# df[df['error_type'] == 'Boot']['BootID'] - df[df['error_type'] == 'Boot']['opened_at_cst_not_utc']


# Create 'num_events' feature.


In [None]:
# Sanity check that the day-key column survived the concat and column drop.
'CreatedSystemTime_CST_formatted' in list(df.columns)

In [None]:
# Because 'TimeCreatedSystemTime' never exists in 'df_boot',
# 'CreatedSystemTime_CST' (and the 'CreatedSystemTime_CST_formatted' day string derived
# from it) is always missing for Boot events; both are unique to the regular events.
df[df['error_type'] == 'Boot']['CreatedSystemTime_CST'].unique()

In [None]:
# Create the events feature by grouping by ClientItemKey and the TimeCreatedSystemTime CST date column,
# and get the count of events for each machine each day.
# Boot events are filtered out explicitly, and would also be excluded by .groupby() because
# their day key is missing.
# NOTE(review): size() counts rows of df, and the INC join can emit several rows for one
# event when a machine has more than one INC on that day, so such events are counted more
# than once — confirm whether de-duplication is needed.

# num_events = df.groupby(['RWB_EFFECTIVE_DATE', 'ClientItemKey']).agg('size').reset_index().rename(columns={0:'events'})
num_events = df[df['error_type'] != 'Boot'].groupby(['ClientItemKey', 'CreatedSystemTime_CST_formatted']).agg('size').reset_index().rename(columns={0:'events'})
num_events

In [None]:
# Distribution of events per machine per day.
num_events['events'].hist(bins=30)

In [None]:
# Compare against the previously exported feature file before it is overwritten below.
# On a first run the file does not exist yet, which must not stop a full Run All.
try:
    check_this_df = pd.read_parquet('../do_not_commit/FeatureDatasets/num_events.pq')
except FileNotFoundError:
    check_this_df = None
    print("No previous num_events export found; nothing to compare against.")
else:
    print(check_this_df['events'].sum())
check_this_df

In [None]:
# Export the result to parquet and save to blob storage.
num_events.to_parquet('../do_not_commit/FeatureDatasets/num_events.pq')


# Create 'num_events_incs' feature.


In [None]:
# 'opened_at_formatted' was wiped for all events (rows) after an INC was filed on same day
# in the join machines + events + incs section. If no value exists for any events
# after an INC was filed, subsetting with .notnull() should give us what we need.
# Result: per machine and day, the number of non-boot event rows that still carry an INC.

num_events_inc = df[
    (df['error_type'] != 'Boot') &
    (df['opened_at_formatted'].notnull())
].groupby(['ClientItemKey', 'CreatedSystemTime_CST_formatted']).agg('size').reset_index().rename(columns={0:'events'})
num_events_inc

In [None]:
# 'CreatedSystemTime_CST' is in CST, 'opened_at' is in UTC, a difference of 6 hours.
# NOTE(review): elsewhere in this notebook 'opened_at' is localized as US/Central rather than
# UTC; the two statements cannot both be right — confirm the timezone of 'opened_at'.
df[(df['ClientItemKey'] == 16790461) & (df['CreatedSystemTime_CST_formatted'] == '2023-02-22')][['CreatedSystemTime_CST', 'opened_at']]

In [None]:
# Distribution of INC-linked events per machine per day.
num_events_inc['events'].hist(bins=30)

In [None]:
# Compare against the previously exported feature file before it is overwritten below.
# On a first run the file does not exist yet, which must not stop a full Run All.
try:
    check_this_df = pd.read_parquet('../do_not_commit/FeatureDatasets/num_events_inc.pq')
except FileNotFoundError:
    check_this_df = None
    print("No previous num_events_inc export found; nothing to compare against.")
else:
    print(check_this_df['events'].sum())
check_this_df

In [None]:
# Export the result to parquet and save to blob storage.
num_events_inc.to_parquet('../do_not_commit/FeatureDatasets/num_events_inc.pq')


#### How many INCs contain direct reference to BSOD?


In [None]:
df['short_description_NER'].unique()

In [None]:
# How many short descriptions talk about BSODs?
len(df.dropna(subset='short_description_NER')[df.dropna(subset='short_description_NER')['short_description_NER'].str.contains("(?i)Blue Screen|BSOD|[Dd]eath")]['number'].unique())

In [None]:
# What is the frequency of each error type for BSOD tickets?
df.dropna(subset='short_description_NER')[df.dropna(subset='short_description_NER')['short_description_NER'].str.contains("(?i)Blue Screen|BSOD|[Dd]eath")].drop_duplicates('number', keep='first')['error_type'].hist()

In [None]:
df.dropna(subset='short_description_NER')[(df.dropna(subset='short_description_NER')['short_description_NER'].str.contains("(?i)Blue Screen|BSOD|[Dd]eath"))].drop_duplicates('number', keep='first')[['error_type', 'Combined_Application', 'Combined_dll', 'Combined_Exception']]

In [None]:
df.dropna(subset='short_description_NER')[(df.dropna(subset='short_description_NER')['short_description_NER'].str.contains("(?i)Blue Screen|BSOD|[Dd]eath")) & (df['error_type'] == 'Application')].drop_duplicates('number', keep='first')['Combined_Application']

In [None]:
df.dropna(subset='short_description_NER')[(df.dropna(subset='short_description_NER')['short_description_NER'].str.contains("(?i)Blue Screen|BSOD|[Dd]eath")) & (df['error_type'] == 'Boot')].drop_duplicates('number', keep='first')['short_description_NER']


#### What is the distribution of FaultApplicationName among INCs?


In [None]:
# All Application-type rows of the combined frame.
# NOTE(review): the section headings say "among INCs", but this filter does not require an
# INC number, so rows without a ticket are included in the counts below — confirm whether
# a df['number'].notnull() condition is intended.
df_check = df[df['error_type'] == 'Application']

In [None]:
# Row count per faulting application.
df_check['Combined_Application'].value_counts()


#### What is the distribution of modules among INCs?


In [None]:
# Row count per faulting module.
df_check['Combined_dll'].value_counts()


#### Which exception codes are most common among INCs?


In [None]:
# Row count per (application, exception code) pair.
df_check[['Combined_Application', 'Combined_Exception']].value_counts()

### Create Incident category and faulting applications figure for report

In [None]:

# NOTE: from here on `df` is rebound to the INC-linked rows of incs_merged (regular events
# only), replacing the combined events + boot frame used by the cells above.
# Keep only the merged rows that carry an incident number.
df = incs_merged[incs_merged['number'].str.contains('INC', na=False, case=False, regex=False)]

# INC-linked events that name a faulting application.
s2 = df[df['Combined_Application'].notnull()]

def get_compare_chart(in_data, in_col, x_axis_title, y_axis_title, in_title, top_n=10):
    """
    Bar chart of the ``top_n`` most frequent values of one column.

    Note: this redefines ``get_compare_chart`` with a different signature from the
    three-argument version defined earlier in the notebook. Once this cell has run,
    ``show_bar_for_win_event`` (which calls it with three arguments) no longer works.

    :param in_data: DataFrame to summarise.
    :param in_col: Name of the column whose values are counted (shown on the y axis).
    :param x_axis_title: Title of the count axis.
    :param y_axis_title: Title of the category axis.
    :param in_title: Chart title.
    :param top_n: Number of most frequent values to keep; ``None`` keeps all of them.
    :return: Altair bar chart with the bars ordered from most to least frequent.
    """
    count_df2 = (
        in_data.groupby(in_col)
        .size()
        .reset_index()
        .rename(columns={0: 'count'})
        .sort_values('count', ascending=False)
        .iloc[:top_n]
    )
    # Explicit order for the chart; Altair would otherwise sort the nominal axis alphabetically.
    sort_order = count_df2[in_col].tolist()
    out_chart = alt.Chart(count_df2).mark_bar().encode(
        x=alt.X('count:Q', axis=alt.Axis(title=x_axis_title)),
        y=alt.Y(in_col + ':N', axis=alt.Axis(title=y_axis_title), sort=sort_order),
        tooltip='count:Q'
    ).properties(title=in_title)
    return out_chart

get_compare_chart(s2, 'Combined_Application', 'No. of Events', 'Faulting Application Name', '')

In [None]:
### CATEGORY AND SUBCATEGORY Visuals ###

def get_compare_chart(in_data, in_col, x_axis_title, y_axis_title, in_title,
                      top_n=10, title_font_size=15, label_font_size=11):
    """
    Bar chart of the ``top_n`` most frequent values of one column, with enlarged axis text.

    Note: this is another redefinition of ``get_compare_chart``; it replaces any version
    defined earlier in the notebook, including the three-argument one that
    ``show_bar_for_win_event`` calls.

    :param in_data: DataFrame to summarise.
    :param in_col: Name of the column whose values are counted (shown on the y axis).
    :param x_axis_title: Title of the count axis.
    :param y_axis_title: Title of the category axis.
    :param in_title: Chart title.
    :param top_n: Number of most frequent values to keep; ``None`` keeps all of them.
    :param title_font_size: Font size of both axis titles.
    :param label_font_size: Font size of both axes' tick labels.
    :return: Altair bar chart with the bars ordered from most to least frequent.
    """
    count_df2 = (
        in_data.groupby(in_col)
        .size()
        .reset_index()
        .rename(columns={0: 'count'})
        .sort_values('count', ascending=False)
        .iloc[:top_n]
    )
    # Explicit order for the chart; Altair would otherwise sort the nominal axis alphabetically.
    sort_order = count_df2[in_col].tolist()
    out_chart = alt.Chart(count_df2).mark_bar().encode(
        x=alt.X('count:Q', axis=alt.Axis(title=x_axis_title,
                                         titleFontSize=title_font_size,
                                         labelFontSize=label_font_size)),
        y=alt.Y(in_col + ':N', axis=alt.Axis(title=y_axis_title,
                                            titleFontSize=title_font_size,
                                            labelFontSize=label_font_size),
                sort=sort_order),
        tooltip='count:Q'
    ).properties(title=in_title)
    return out_chart


# `s` was not defined anywhere in the visible notebook, so this cell failed on a fresh kernel.
# NOTE(review): it is assumed here to be one row per incident (the axis reads
# "No. of Incidents"): INC-linked rows of incs_merged, de-duplicated on the INC number —
# confirm this matches how the original figure was produced.
s = (
    incs_merged[incs_merged['number'].str.contains('INC', na=False, case=False, regex=False)]
    .drop_duplicates(subset='number')
)

# Category and subcategory charts side by side.
out = get_compare_chart(s, 'category', 'No. of Incidents', 'Incident Category', '') | get_compare_chart(s, 'subcategory', 'No. of Incidents', 'Incident Subcategory', '')

out