# Analysis of kernels leading to SDC-critical
The results produced in this notebook are used to plot SDC-critical counts per kernel.

In [1]:
# Move into the directory that holds the HDF5 result files; every './...' and
# '../...' path used later in this notebook is resolved relative to it.
# The path is relative, so re-running this cell without restarting the kernel
# tries to descend into the directory a second time.
# Alternative targets kept for reference:
#%cd kernels_sdc_critical_counts_dataframes
#%cd kernels_sdc_critical_counts_dataframes/counts_dataframes/
#%cd kernels_sdc_critical_counts_dataframes/complete_dataframes/
%cd kernels_sdc_critical_counts_dataframes/

/media/sergiu/D8989FA0989F7C26/Users/sergi/Documents/thesis_logs/kernels_sdc_critical_counts_dataframes


In [2]:
# Run selection: these three values are interpolated into the HDF5 and Excel
# file names used by the cells below.
MACHINE = 'workstation' # alternative: 'laptop'
DATASET = 'salinas' # alternatives: 'pavia_uni', 'indianPines'
# NOTE(review): INSTR_GROUP is disabled here, yet a later cell still builds a
# file name from it — confirm whether that cell is still needed.
#INSTR_GROUP = 'G_GP' #'G_FP32'
PCA = 'PCA10_hardened' # alternatives: 'PCA10', 'PCA7', 'PCA50'

## Load dataframes

In [5]:
import pandas as pd


def _open_store(folder, instr_group, suffix):
    """Open the HDF5 store of the configured run for one instruction group.

    The file name is built from MACHINE, PCA and DATASET plus the given
    instruction group and suffix; the store is opened with pandas' default mode.
    """
    return pd.HDFStore(f'./{folder}/{MACHINE}_{PCA}_{DATASET}_{instr_group}_{suffix}.h5')


# Stores under complete_dataframes/ (keys such as df_error, df_masked, df_sdc_critical).
dataframes_fp32 = _open_store('complete_dataframes', 'G_FP32', 'dataframes')
dataframes_gp = _open_store('complete_dataframes', 'G_GP', 'dataframes')

# Stores under counts_dataframes/ (kernel / opcode / register outcome counts).
counts_fp32 = _open_store('counts_dataframes', 'G_FP32', 'counts')
counts_gp = _open_store('counts_dataframes', 'G_GP', 'counts')

In [6]:
# List the tables (keys) available in the G_FP32 counts store.
counts_fp32.keys()

['/kernel_error_counts',
 '/kernel_masked_counts',
 '/kernel_sdc_critical_counts',
 '/kernel_sdc_safe_counts',
 '/opcode_masked_counts',
 '/opcode_sdc_critical_counts',
 '/opcode_sdc_safe_counts',
 '/register_masked_counts',
 '/register_sdc_critical_counts',
 '/register_sdc_safe_counts']

# Convert DUE counts to excel sheets

In [10]:
# Export the 'kernel_error_counts' (DUE) table of every machine / dataset /
# instruction-group combination to its own Excel workbook.
#
# The store is bound to a loop-local name inside a `with` block so that:
#   * the notebook-level `counts_fp32` / `counts_gp` handles, which belong to the
#     run selected by MACHINE / PCA / DATASET, are not silently replaced by the
#     last combination visited here;
#   * every file opened by the loop is closed again after its export.
# The store is opened with pandas' default mode, as before.
#for pca in ['PCA7', 'PCA10', 'PCA50']:
for pca in ['PCA10_hardened']:
    for machine in ['workstation', 'laptop']:
        for dataset in ['indianPines', 'pavia_uni', 'salinas']:
            for instr_group in ['G_FP32', 'G_GP']:
                run_tag = f'{machine}_{pca}_{dataset}_{instr_group}'

                with pd.HDFStore(f'./counts_dataframes/{run_tag}_counts.h5') as counts_store:
                    # Only stores that contain the table produce a workbook.
                    if '/kernel_error_counts' in counts_store.keys():
                        counts_store['kernel_error_counts'].to_excel(f'../due_spreadsheets/{run_tag}.xlsx')

In [4]:
# List the tables (keys) available in the G_FP32 complete-dataframes store.
#counts_fp32.keys()
dataframes_fp32.keys()

['/df_error',
 '/df_masked',
 '/df_sdc_critical',
 '/df_sdc_critical_enhanced',
 '/df_sdc_safe',
 '/df_sdc_safe_enhanced']

# Total number of Masked, SDC-safe, SDC-critical and DUE outcomes

G_GP stats

In [None]:
# TOTAL is the figure the DUE count is derived from by subtraction:
# 200 for laptop runs on salinas / pavia_uni, 1000 for every other run.
TOTAL = 200 if (MACHINE == 'laptop' and DATASET in ['salinas', 'pavia_uni']) else 1000

# Sum each per-kernel outcome table of the G_GP store once.
gp_masked = counts_gp['kernel_masked_counts'].sum()
gp_sdc_safe = counts_gp['kernel_sdc_safe_counts'].sum()
gp_sdc_critical = counts_gp['kernel_sdc_critical_counts'].sum()

# DUE is whatever remains of TOTAL after the three counted outcomes.
gp_due = TOTAL - gp_masked - gp_sdc_safe - gp_sdc_critical

print(f"Nr. DUE: {gp_due}")
print(f"Nr. Masked: {gp_masked}")
print(f"Nr. SDC-safe: {gp_sdc_safe}")
print(f"Nr. SDC-critical: {gp_sdc_critical}")

# Optional cross-check: counts_gp['kernel_error_counts'].sum() plus the three
# sums above gives the total observed in the store.

G_FP32 stats

In [None]:
# TOTAL is the figure the DUE count is derived from by subtraction:
# 200 for laptop runs on salinas / pavia_uni, 1000 for every other run.
TOTAL = 200 if (MACHINE == 'laptop' and DATASET in ['salinas', 'pavia_uni']) else 1000

# Sum each per-kernel outcome table of the G_FP32 store once.
fp32_masked = counts_fp32['kernel_masked_counts'].sum()
fp32_sdc_safe = counts_fp32['kernel_sdc_safe_counts'].sum()
fp32_sdc_critical = counts_fp32['kernel_sdc_critical_counts'].sum()

# DUE is whatever remains of TOTAL after the three counted outcomes.
fp32_due = TOTAL - fp32_masked - fp32_sdc_safe - fp32_sdc_critical

print(f"Nr. DUE: {fp32_due}")
print(f"Nr. Masked: {fp32_masked}")
print(f"Nr. SDC-safe: {fp32_sdc_safe}")
print(f"Nr. SDC-critical: {fp32_sdc_critical}")

# Optional cross-check: counts_fp32['kernel_error_counts'].sum() plus the three
# sums above gives the total observed in the store.

In [5]:
# Read the same table from the store of each instruction group.
_ENHANCED_KEY = 'df_sdc_critical_enhanced'

df_sdc_critical_enhanced_fp32 = dataframes_fp32.get(_ENHANCED_KEY)
df_sdc_critical_enhanced_gp = dataframes_gp.get(_ENHANCED_KEY)

In [9]:
# Write one workbook per instruction group for the configured run (FP32 first).
for _group, _frame in (
    ('G_FP32', df_sdc_critical_enhanced_fp32),
    ('G_GP', df_sdc_critical_enhanced_gp),
):
    _frame.to_excel(f'../spreadsheets/df_sdc_critical/{MACHINE}_{PCA}_{DATASET}_{_group}.xlsx')

In [None]:
# Display four selected columns of the G_FP32 enhanced SDC-critical dataframe.
df_sdc_critical_enhanced_fp32[['icount', 'jaccard_similarity', 'logits_degradation', 'accuracy_drop']]

In [None]:
# Display four selected columns of the G_GP enhanced SDC-critical dataframe.
df_sdc_critical_enhanced_gp[['icount', 'jaccard_similarity', 'logits_degradation', 'accuracy_drop']]

# Convert complete SDC-critical pandas dataframes into excel sheets

In [None]:
# Export the 'df_sdc_critical_enhanced' table of every machine / dataset /
# instruction-group combination to its own Excel workbook.
#
# The store is bound to a loop-local name inside a `with` block so that:
#   * the notebook-level `dataframes_fp32` / `dataframes_gp` handles, which
#     belong to the run selected by MACHINE / PCA / DATASET, are not silently
#     replaced by the last combination visited here;
#   * every file opened by the loop is closed again after its export.
# The store is opened with pandas' default mode, as before.
for pca in ['PCA10_hardened']:  # other campaigns: ['PCA7', 'PCA10', 'PCA50']
    for machine in ['workstation', 'laptop']:
        for dataset in ['indianPines', 'pavia_uni', 'salinas']:
            for instr_group in ['G_FP32', 'G_GP']:
                run_tag = f'{machine}_{pca}_{dataset}_{instr_group}'

                with pd.HDFStore(f'./complete_dataframes/{run_tag}_dataframes.h5') as frames_store:
                    frames_store['df_sdc_critical_enhanced'].to_excel(f'../spreadsheets/df_sdc_critical/{run_tag}.xlsx')
            

# SDC-critical counts per kernel and instruction group

In [4]:
# Build a kernel x instruction-group table of SDC-critical counts.
#
# Each Series is given its final column name BEFORE concatenation. Both source
# Series are named 'count', and a rename mapping written as a dict literal with
# two 'count' keys keeps only the last entry, which labelled both columns
# 'SDC_critical (G_GP)'.
kernel_sdc_critical_table = (
    pd.concat(
        [
            counts_fp32['kernel_sdc_critical_counts'].rename('SDC_critical (G_FP32)'),
            counts_gp['kernel_sdc_critical_counts'].rename('SDC_critical (G_GP)'),
        ],
        axis=1,
    )
    .rename_axis(None, axis=0)  # drop the index name
    .fillna(0)                  # kernels missing from one group count as 0
)

Convert dataframe to excel sheet

In [5]:
# Workbook for the configured run, named after MACHINE / PCA / DATASET.
summary_path = f'../spreadsheets/{MACHINE}_{PCA}_{DATASET}.xlsx'
kernel_sdc_critical_table.to_excel(summary_path)

# Other details

In [9]:
# Read the per-outcome dataframes of the G_FP32 run.
# `store_fp32` is not defined anywhere in the visible notebook; the keys read
# here are the ones listed for `dataframes_fp32`, so that store is used.
df_error = dataframes_fp32['df_error']
df_masked = dataframes_fp32['df_masked']
df_sdc_critical = dataframes_fp32['df_sdc_critical']
df_sdc_safe = dataframes_fp32['df_sdc_safe']

In [5]:
# Read the per-kernel outcome counts of both instruction groups.
# `store_fp32` / `store_gp` are not defined anywhere in the visible notebook;
# the 'kernel_*_counts' keys read here live in the counts stores, so
# `counts_fp32` / `counts_gp` are used.

# G_FP32
df_sdc_critical_fp32 = counts_fp32['kernel_sdc_critical_counts']
df_sdc_safe_fp32 = counts_fp32['kernel_sdc_safe_counts']
df_masked_fp32 = counts_fp32['kernel_masked_counts']
df_error_counts_fp32 = counts_fp32['kernel_error_counts']

# G_GP
df_sdc_critical_gp = counts_gp['kernel_sdc_critical_counts']
df_sdc_safe_gp = counts_gp['kernel_sdc_safe_counts']
df_masked_gp = counts_gp['kernel_masked_counts']
df_error_counts_gp = counts_gp['kernel_error_counts']

In [None]:
# Per-kernel sum of the SDC-critical, SDC-safe and masked counts for G_GP.
# fill_value=0 treats a kernel that is absent from one series as a zero there.
# (Swap the *_gp series for their *_fp32 counterparts to get the G_FP32 sums.)
kernel_targeted_counts = (
    df_sdc_critical_gp
    .add(df_sdc_safe_gp, fill_value=0)
    .add(df_masked_gp, fill_value=0)
)
kernel_targeted_counts

In [13]:
# Display the per-kernel SDC-critical counts read for the G_GP group.
df_sdc_critical_gp

inspecting
cupy_concatenate                             1
enable_if                                  145
nchwToNhwcKernel                             1
nhwckrsc_nhwc_tilesize128x128x16_stage4      2
voidaxpy_kernel_val                          5
voidgemv2N_kernel                            2
voidgemv2N_kernel_VER3                       2
voidger_kernel                              12
voidimplicit_convolveNd_sgemm                3
voidnrm2_kernel                              1
Name: count, dtype: int64

In [15]:
# Display the per-kernel SDC-safe counts read for the G_GP group.
# Other series that can be inspected the same way:
#df_sdc_critical_fp32
#df_masked_fp32
df_sdc_safe_gp

inspecting
ampere_sgemm_32x32_sliced1x4_tn              3
cupy_concatenate                             9
elementwise_kernel                           2
elementwise_kernel_VER2                     14
enable_if                                   29
nchwToNhwcKernel                            14
nhwcToNchwKernel                             2
nhwckrsc_nhwc_tilesize128x128x16_stage4     14
vectorized_elementwise_kernel                2
voidgemv2N_kernel_VER3                       1
voidimplicit_convolveNd_sgemm              135
voidnrm2_kernel                              1
Name: count, dtype: int64

In [16]:
# Display the per-kernel SDC-critical counts for the G_GP group again.
# NOTE(review): the saved output below differs from the earlier display of this
# variable — presumably it comes from a run with a different configuration; confirm.
df_sdc_critical_gp

inspecting
enable_if                                   61
nchwToNhwcKernel                             1
nhwckrsc_nhwc_tilesize128x128x16_stage4      1
voidaxpy_kernel_val                          7
voidgemv2N_kernel                            2
voidgemvNSP_kernel                         133
voidger_kernel                              12
voidimplicit_convolveNd_sgemm                1
Name: count, dtype: int64

In [7]:
# Read every per-kernel / per-opcode / per-register outcome table of a single
# counts store into notebook-level variables.
# NOTE(review): INSTR_GROUP appears only as a commented-out assignment in the
# configuration cell, so this cell raises NameError on a fresh kernel — confirm
# whether the cell is still needed.
# NOTE(review): unlike the other loaders in this notebook, the path has no PCA
# component and no counts_dataframes/ prefix — presumably an older file layout; verify.
# NOTE(review): the key listing shown earlier in this notebook contains no
# opcode_error_counts / register_error_counts entries; for such a file those
# two reads would raise KeyError — confirm.
import pandas as pd

store = pd.HDFStore(f'./{MACHINE}_{DATASET}_{INSTR_GROUP}_counts.h5')

# Outcome counts per kernel
kernel_sdc_critical_counts = store['kernel_sdc_critical_counts']
kernel_sdc_safe_counts = store['kernel_sdc_safe_counts']
kernel_masked_counts = store['kernel_masked_counts']
kernel_error_counts = store['kernel_error_counts']

# Outcome counts per opcode
opcode_sdc_critical_counts = store['opcode_sdc_critical_counts']
opcode_sdc_safe_counts = store['opcode_sdc_safe_counts']
opcode_masked_counts = store['opcode_masked_counts']
opcode_error_counts = store['opcode_error_counts']

# Outcome counts per register
register_sdc_critical_counts = store['register_sdc_critical_counts']
register_sdc_safe_counts = store['register_sdc_safe_counts']
register_masked_counts = store['register_masked_counts']
register_error_counts = store['register_error_counts']

## SDC-critical kernels

In [9]:
# Display the SDC-critical counts per kernel read from `store`.
kernel_sdc_critical_counts

inspecting
elementwise_kernel_VER1                              1
maxwell_scudnn_128x32_3dconv_fprop_medium_nn_v0      2
maxwell_scudnn_128x32_3dconv_fprop_small_nn_v0       1
voidaxpy_kernel_val                                  2
voidgemv2N_kernel_VER1                             120
voidgemv2N_kernel_VER3                               4
voidgemv2T_kernel_val_VER1                          90
voidger_kernel                                       5
voidscal_kernel_val                                  2
Name: count, dtype: int64

In [8]:
# Display the SDC-critical counts per opcode read from `store`.
opcode_sdc_critical_counts

opcode
FADD        1
FFMA       45
FMUL        3
IADD       45
IADD32I    13
IMNMX       3
ISCADD     13
LDG        39
LDS        22
LEA        13
MOV         8
S2R         3
SHL         5
VMNMX       1
XMAD       13
Name: count, dtype: int64

In [None]:
# Scratch cell: uncomment exactly one name to display that table.
# Per-kernel tables
#kernel_sdc_critical_counts
#kernel_sdc_safe_counts
#kernel_masked_counts
#kernel_error_counts

# Per-opcode tables (the SDC-critical one is currently selected)
opcode_sdc_critical_counts
#opcode_sdc_safe_counts
#opcode_masked_counts
#opcode_error_counts

# Per-register tables
#register_sdc_critical_counts
#register_sdc_safe_counts
#register_masked_counts
#register_error_counts