In [1]:
import sys
import os
import datetime
import re
from pathlib import Path
from argparse import ArgumentParser
# modin does not work here due to certain df functions
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
from tinyhtml import html, h, frag, raw

In [2]:
def memstr_to_mebibyte(memstr: str) -> float:
    """Convert a memory string such as '256m' or '4G' to a number of MiB.

    The last character selects the unit, case-insensitively: 'm' means the
    number is already in MiB, 'g' means it is multiplied by 1024.

    Args:
        memstr: Number immediately followed by a one-letter unit suffix.

    Returns:
        The amount in MiB as a float. 0.0 is returned for an empty string
        and for any string whose last character is not m/M/g/G (this
        includes plain numbers without a suffix).

    Raises:
        ValueError: If the suffix is recognised but the part in front of it
            is not a valid float.
    """
    # MiB per one unit of each recognised suffix
    mib_per_unit = {'m': 1., 'g': 1024.}

    # an empty value (e.g. from "h_vmem=") has no suffix to look at
    if not memstr:
        return 0.

    factor = mib_per_unit.get(memstr[-1].lower())
    if factor is None:
        return 0.

    return float(memstr[:-1]) * factor

In [3]:
# Compiled once instead of on every call. Because of the leading greedy '.*',
# the capture group binds to the LAST "-l <resources>" occurrence in the string.
_RESOURCE_PAT = re.compile(r'.*-l\ (\S*).*')

# GPU types (complexes): A40, A100, P100, V100
_GPU_TYPES = frozenset(['a40', 'a100', 'p100', 'v100'])

# we ignore the soft limits s_vmem, s_rt
_MEM_TYPES = frozenset(['h_stack', 'h_vmem', 'tmpfree'])

# stand-in for "no run-time limit": the largest signed 64-bit integer
_H_RT_UNLIMITED = np.iinfo(np.int64).max


def _parse_h_rt(val):
    """Convert an h_rt value to seconds; 'INFINITY' maps to the int64 maximum."""
    if val.lower() == 'infinity':
        return _H_RT_UNLIMITED
    if ':' in val:
        # "hours:minutes:seconds" form; an omitted field counts as zero
        # NOTE(review): format taken from the Grid Engine time_specifier
        # definition - confirm it can actually occur in category strings
        fields = val.split(':')
        if len(fields) != 3:
            raise ValueError(f'malformed h_rt value: {val!r}')
        hours, minutes, seconds = (int(field) if field else 0 for field in fields)
        return np.int64(hours * 3600 + minutes * 60 + seconds)
    return np.int64(val)


def parse_categories(cat, debug_p: bool = False):
    """Turn the "-l ..." part of a categories string into a dict of columns.

    Only the comma-separated resource list following "-l " is looked at.
    Memory values are converted to MiB via memstr_to_mebibyte(), e.g.

        "-U non-deadlineusers -u zhanq -l A40=TRUE,gpu=1,gpu_A40=TRUE,h_stack=256m,h_vmem=4g,tmpfree=4g -pe threaded 32-48"
          -> {'gpu_type': 'a40', 'gpu': 1, 'h_stack': 256.0, 'h_vmem': 4096.0,
              'tmpfree': 4096.0, 'h_rt': <int64 max>,
              'requested_hostnames': None, 'sgx': False}

    Args:
        cat: The categories string of one accounting record.
        debug_p: Print DEBUG lines while parsing.

    Returns:
        None if cat is not a string or contains no "-l " request. Otherwise a
        dict that always has the keys 'h_rt', 'gpu', 'gpu_type', 'h_stack',
        'h_vmem', 'tmpfree', 'requested_hostnames' and 'sgx'. Missing values
        are None, except 'h_rt' (int64 maximum), 'gpu' (0) and 'sgx' (False).
        'gpu_type' is forced to None whenever no non-zero 'gpu' was requested.

    Raises:
        ValueError: If a 'gpu', 'h_rt' or memory value cannot be parsed as a
            number.
    """
    if debug_p:
        print(f'DEBUG: parse_categories: type(cat) = {type(cat)}')
        print(f'DEBUG: parse_categories: cat = {cat}')

    # missing categories (None / NaN) carry no resource request
    if not isinstance(cat, str):
        return None

    found = _RESOURCE_PAT.match(cat)
    if found is None:
        return None

    resources = found.group(1)
    if debug_p:
        print(f'DEBUG: parse_categories: resources = {resources}')

    # dict to be returned; keys are inserted in token order, defaults afterwards
    retdict = {}

    for tok in resources.split(','):
        # partition() tolerates tokens without '=' (bare complex names, empty
        # tokens) and values that themselves contain '='
        key, sep, val = tok.partition('=')
        key = key.lower()
        is_bare = not sep

        if key == 'gpu':
            # at some point, this became a boolean, then switched back to int
            # NOTE(review): a bare "gpu" is treated like "gpu=TRUE" - confirm
            flag = 'TRUE' if is_bare else val.upper()
            if flag == 'TRUE':
                gpu_count = 1
            elif flag == 'FALSE':
                gpu_count = 0
            else:
                gpu_count = int(val)
            # a zero request is the same as the default filled in below
            if gpu_count:
                retdict['gpu'] = gpu_count
        elif key in _GPU_TYPES:
            # the complex name itself is the GPU type; its value is not inspected
            retdict['gpu_type'] = key
        elif key in _MEM_TYPES:
            if val:
                mebibytes = memstr_to_mebibyte(val)
                # 0 means "zero or not understood": leave the column missing
                if mebibytes:
                    retdict[key] = mebibytes
        elif key == 'hostname':
            # the accounting data already has a column "hostname" which is
            # the host allocated to the job
            if val:
                retdict['requested_hostnames'] = val
        elif key == 'h_rt':
            # we have no time limit, so h_rt can be the string "INFINITY";
            # an explicit 0 is kept as 0 and not mistaken for "missing"
            if val:
                retdict['h_rt'] = _parse_h_rt(val)
        elif key == 'sgx':
            # NOTE(review): a bare "sgx" is treated like "sgx=TRUE" - confirm
            if is_bare or val.lower() == 'true':
                retdict['sgx'] = True
        # every other complex is ignored

    # deal with missing values
    if 'h_rt' not in retdict:
        retdict['h_rt'] = _H_RT_UNLIMITED

    if 'gpu' not in retdict:
        retdict['gpu'] = 0
        retdict['gpu_type'] = None
    else:
        # gpu in retdict but gpu_type not specified
        retdict.setdefault('gpu_type', None)

    for missing_key in ('h_stack', 'h_vmem', 'tmpfree', 'requested_hostnames'):
        retdict.setdefault(missing_key, None)

    retdict.setdefault('sgx', False)

    if debug_p:
        print(f'DEBUG: parse_categories: retdict = {retdict}')

    return retdict

In [4]:
def prep_accounting(sgeacct_df, debug_p: bool = False):
    """Expand an accounting frame with columns parsed from its 'category' column.

    Every 'category' string is run through parse_categories(); the resulting
    dicts become new columns that are appended to the right of the input.

    Args:
        sgeacct_df: DataFrame with a 'category' column.
        debug_p: Print INFO lines describing the intermediate frames.

    Returns:
        A new DataFrame made of the input columns followed by the parsed
        resource columns, aligned on the input index. Rows for which
        parse_categories() returns None get missing values in every resource
        column.
    """
    # expand the df with stuff from the "-l resources_list"
    resources = sgeacct_df['category'].apply(parse_categories)

    if debug_p:
        print(f'INFO: prep_accounting: type(resources) = {type(resources)}')

    # parse_categories() returns None when there is no "-l" request; use an
    # empty record for those rows so that the frame can be built from a
    # homogeneous list of dicts
    records = [rec if rec is not None else {} for rec in resources]
    resources_df = pd.DataFrame(records, index=resources.index)

    if debug_p:
        print(f'INFO: prep_accounting: sgeacct_df.describe() = \n{sgeacct_df.describe()}')
        print(f'INFO: prep_accounting: sgeacct_df.columns = \n{sgeacct_df.columns}')
        print(f'INFO: prep_accounting: sgeacct_df.head() = \n{sgeacct_df.head()}')

        print(f'INFO: prep_accounting: resources_df.describe() = \n{resources_df.describe()}')
        print(f'INFO: prep_accounting: resources_df.columns = \n{resources_df.columns}')
        print(f'INFO: prep_accounting: resources_df.head() = \n{resources_df.head()}')

    ret_df = pd.concat([sgeacct_df, resources_df], axis=1)

    if debug_p:
        print( 'INFO: prep_accounting: after pd.concat()')
        print(f'INFO: prep_accounting: ret_df.columns = \n{ret_df.columns}')
        print(f'INFO: prep_accounting: ret_df.head() = \n{ret_df.head()}')

    return ret_df

In [42]:
# Load the post-processed accounting table from a Feather file in the
# current working directory.
# NOTE(review): presumably this file already contains the resource columns
# added by prep_accounting() - confirm how it was produced
acctpostproc = Path('accounting_postprocessed.feather')
sgeacct_df = pd.read_feather(acctpostproc)

In [43]:
# don't need the 'category' column - only useful for checking the accounting_postprocessed file
# Rebind instead of dropping in place, and tolerate an already-missing column,
# so that re-running this cell does not raise a KeyError.
sgeacct_df = sgeacct_df.drop(columns='category', errors='ignore')

In [44]:
# summary statistics of all jobs, before the failed ones are filtered out
sgeacct_df.describe()

Unnamed: 0,job_number,submission_time,start_time,end_time,failed,exit_status,ru_wallclock,ru_utime,slots,cpu,mem,io,iow,maxvmem,wait_time,h_stack,h_vmem,tmpfree,h_rt,gpu
count,14708660.0,14708657,14708657,14708657,14708660.0,14708660.0,14708660.0,14708660.0,14708660.0,14708660.0,14708660.0,14708660.0,14708657.0,14708660.0,14708657,14708660.0,14708660.0,14635500.0,14708660.0,14708660.0
mean,5072637.0,2023-10-02 11:24:32.969117184,2023-10-02 13:55:10.390292480,2023-10-02 14:32:58.086723584,1.143062,4.623122,2250.353,2143.146,1.177828,2795.095,38558.93,37.5834,0.0,3409469000.0,0 days 02:30:37.421171627,256.547,21799.49,7833.603,9.014113e+18,0.004254093
min,1.0,2023-01-01 06:34:50,2023-01-01 06:45:40,2023-01-01 06:54:11,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0 days 00:00:01,8.0,800.0,10.0,1.0,0.0
25%,3423273.0,2023-07-17 06:10:40,2023-07-17 07:09:46,2023-07-17 07:15:15,0.0,0.0,26.0,15.88462,1.0,21.09288,1.075428,0.143857,0.0,278757400.0,0 days 00:00:37,256.0,8192.0,4096.0,9.223372e+18,0.0
50%,5060861.0,2023-11-23 08:11:31,2023-11-23 08:11:48,2023-11-23 08:15:45,0.0,0.0,182.0,149.6613,1.0,171.0985,76.40747,2.3964,0.0,2408378000.0,0 days 00:10:44,256.0,13312.0,4096.0,9.223372e+18,0.0
75%,6892601.0,2023-12-22 09:06:02,2023-12-22 12:37:25,2023-12-22 12:48:01,0.0,0.0,486.0,330.5236,1.0,366.1962,281.6458,10.51077,0.0,3221565000.0,0 days 01:20:30,256.0,28672.0,4096.0,9.223372e+18,0.0
max,9999999.0,2024-03-13 16:15:44,2024-03-13 16:16:00,2024-03-13 16:28:06,100.0,255.0,3309973.0,16636230.0,72.0,16691820.0,17184800000.0,959725.8,0.0,2525975000000.0,43 days 18:55:38,12288.0,3072000.0,307200.0,9.223372e+18,2.0
std,2481260.0,,,,9.50908,26.7107,23462.31,29188.36,1.33756,36506.63,18525160.0,1043.641,0.0,11355260000.0,0 days 12:24:50.209143438,106.7048,25251.67,22918.34,1.373419e+18,0.0674534


In [45]:
# retain only the successful jobs, i.e. the rows whose 'failed' field is zero
sgeacct_df = sgeacct_df.loc[sgeacct_df['failed'].eq(0)]

In [46]:
# summary statistics again, now restricted to rows with failed == 0
sgeacct_df.describe()

Unnamed: 0,job_number,submission_time,start_time,end_time,failed,exit_status,ru_wallclock,ru_utime,slots,cpu,mem,io,iow,maxvmem,wait_time,h_stack,h_vmem,tmpfree,h_rt,gpu
count,14445910.0,14445913,14445913,14445913,14445913.0,14445910.0,14445910.0,14445910.0,14445910.0,14445910.0,14445910.0,14445910.0,14445913.0,14445910.0,14445913,14445910.0,14445910.0,14372990.0,14445910.0,14445910.0
mean,5050893.0,2023-10-03 02:17:26.059517696,2023-10-03 04:47:17.462299392,2023-10-03 05:20:55.099415808,0.0,2.280444,1999.979,2119.222,1.173605,2521.92,36304.17,32.75761,0.0,3347206000.0,0 days 02:29:51.402784095,256.5746,21716.41,7788.233,9.021101e+18,0.0037088
min,1.0,2023-01-01 06:34:50,2023-01-01 06:45:40,2023-01-01 06:54:11,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0 days 00:00:01,8.0,800.0,10.0,60.0,0.0
25%,3408201.0,2023-07-17 07:50:18,2023-07-17 10:01:47,2023-07-17 10:07:03,0.0,0.0,25.0,16.43051,1.0,20.6906,1.029109,0.143629,0.0,270348300.0,0 days 00:00:36,256.0,8192.0,4096.0,9.223372e+18,0.0
50%,5052308.0,2023-11-23 13:20:26,2023-11-23 13:20:48,2023-11-23 13:24:43,0.0,0.0,182.0,149.6135,1.0,170.2664,82.30061,2.303815,0.0,2408186000.0,0 days 00:10:48,256.0,12288.0,4096.0,9.223372e+18,0.0
75%,6863502.0,2023-12-22 12:15:31,2023-12-22 16:13:06,2023-12-22 16:21:22,0.0,0.0,455.0,325.1468,1.0,352.4831,279.2719,9.871756,0.0,3218256000.0,0 days 01:22:39,256.0,28672.0,4096.0,9.223372e+18,0.0
max,9999999.0,2024-03-13 16:15:44,2024-03-13 16:16:00,2024-03-13 16:28:06,0.0,255.0,2615081.0,16636230.0,72.0,16691820.0,17184800000.0,959725.8,0.0,2525975000000.0,42 days 05:58:33,12288.0,3072000.0,307200.0,9.223372e+18,2.0
std,2475713.0,,,,0.0,20.36583,21620.91,29179.78,1.308033,32995.64,18674540.0,957.4569,0.0,9819477000.0,0 days 12:20:37.136821304,107.6503,24859.88,22684.0,1.350816e+18,0.06296619


In [47]:
# peek at the first five remaining rows
sgeacct_df.head(5)

Unnamed: 0,qname,hostname,owner,job_number,submission_time,start_time,end_time,failed,exit_status,ru_wallclock,...,maxvmem,wait_time,h_stack,h_vmem,tmpfree,h_rt,gpu,gpu_type,requested_hostnames,sgx
0,all.q,2118ffn004.bicic.local,buckovab,3084282,2023-01-01 06:43:03,2023-01-01 06:53:50,2023-01-01 06:54:11,0,0,21,...,639201280.0,0 days 00:10:47,256.0,204800.0,4096.0,9223372036854775807,0,,,False
1,all.q,2119fmn015.bicic.local,hwangg,3084293,2023-01-01 07:23:09,2023-01-01 07:33:31,2023-01-01 07:33:49,0,0,18,...,343056384.0,0 days 00:10:22,256.0,8192.0,10.0,172800,0,,,False
2,all.q,2119fmn021.bicic.local,hwangg,3084290,2023-01-01 07:23:04,2023-01-01 07:33:31,2023-01-01 07:33:50,0,0,19,...,342974464.0,0 days 00:10:27,256.0,8192.0,10.0,172800,0,,,False
3,all.q,2118ffn010.bicic.local,hwangg,3084292,2023-01-01 07:23:07,2023-01-01 07:33:31,2023-01-01 07:33:50,0,0,19,...,343171072.0,0 days 00:10:24,256.0,8192.0,10.0,172800,0,,,False
4,all.q,2118ffn009.bicic.local,hwangg,3084294,2023-01-01 07:23:10,2023-01-01 07:33:31,2023-01-01 07:33:50,0,0,19,...,343048192.0,0 days 00:10:21,256.0,8192.0,10.0,172800,0,,,False


In [48]:
# Interpret the two usage columns as numbers of seconds and replace them,
# in place, with Timedelta values so that describe() reports durations.
sgeacct_df['ru_wallclock'] = pd.to_timedelta(sgeacct_df['ru_wallclock'], unit='s')
sgeacct_df['ru_utime'] = pd.to_timedelta(sgeacct_df['ru_utime'], unit='s')

In [49]:
# subset: jobs whose parsed 'gpu_type' is 'a100'
gpu_a100_jobs_df = sgeacct_df.loc[sgeacct_df['gpu_type'].eq('a100')]

In [50]:
# summary statistics of the a100 subset
gpu_a100_jobs_df.describe()

Unnamed: 0,job_number,submission_time,start_time,end_time,failed,exit_status,ru_wallclock,ru_utime,slots,cpu,mem,io,iow,maxvmem,wait_time,h_stack,h_vmem,tmpfree,h_rt,gpu
count,1688.0,1688,1688,1688,1688.0,1688.0,1688,1688,1688.0,1688.0,1688.0,1688.0,1688.0,1688.0,1688,1688.0,1688.0,1688.0,1688.0,1688.0
mean,7590149.0,2023-10-22 04:51:52.483412480,2023-10-22 21:56:17.998222848,2023-10-23 02:10:25.626184960,0.0,3.332938,0 days 04:13:59.781990521,0 days 04:48:39.385040943,3.352488,21302.51,127160.6,1612.7101,0.0,11213880000.0,0 days 17:04:25.514810426,256.0,68071.127962,4096.0,4.835713e+18,1.280213
min,203708.0,2023-04-10 18:39:29,2023-04-10 18:43:35,2023-04-10 18:45:06,0.0,0.0,0 days 00:00:00,0 days 00:00:00,1.0,0.0,0.0,0.0,0.0,0.0,0 days 00:00:02,256.0,1024.0,4096.0,180.0,1.0
25%,5871737.0,2023-09-22 19:50:55,2023-09-25 05:24:12,2023-09-25 06:04:32.750000128,0.0,0.0,0 days 00:01:27.750000,0 days 00:00:15.396341,1.0,24.31618,22.93798,0.044772,0.0,3967293000.0,0 days 00:00:23,256.0,21504.0,4096.0,600.0,1.0
50%,8817318.0,2023-10-23 15:13:05,2023-10-24 02:39:47,2023-10-24 02:45:23,0.0,0.0,0 days 00:05:35,0 days 00:00:16.169686,1.0,26.65602,37.44443,0.178834,0.0,4995402000.0,0 days 00:04:40.500000,256.0,98304.0,4096.0,9.223372e+18,1.0
75%,9316132.0,2023-11-21 16:13:06,2023-11-21 16:13:27.500000,2023-11-21 16:14:21.750000128,0.0,1.0,0 days 00:06:13.250000,0 days 00:02:38.048570750,8.0,206.2471,1034.759,7.471066,0.0,11444810000.0,0 days 09:17:15,256.0,98304.0,4096.0,9.223372e+18,2.0
max,9914222.0,2024-03-12 18:35:40,2024-03-12 18:39:00,2024-03-12 20:15:09,0.0,255.0,19 days 09:36:20,136 days 00:03:58.397622,64.0,12143000.0,14441530.0,959725.799626,0.0,312318600000.0,36 days 06:01:35,256.0,425984.0,4096.0,9.223372e+18,2.0
std,2543896.0,,,,0.0,20.633863,0 days 23:05:22.441957029,3 days 09:23:09.393485792,3.570218,309426.3,835473.2,32045.479934,0.0,19719720000.0,2 days 04:56:01.967428716,0.0,57029.543795,0.0,4.607606e+18,0.449236


In [51]:
# first five rows of the a100 subset
gpu_a100_jobs_df.head(5)

Unnamed: 0,qname,hostname,owner,job_number,submission_time,start_time,end_time,failed,exit_status,ru_wallclock,...,maxvmem,wait_time,h_stack,h_vmem,tmpfree,h_rt,gpu,gpu_type,requested_hostnames,sgx
1637971,all.q,2119ga002.bicic.local,bergman,4401088,2023-04-10 18:39:29,2023-04-10 18:43:35,2023-04-10 18:45:06,0,0,0 days 00:01:31,...,47964160.0,0 days 00:04:06,256.0,4096.0,4096.0,420,1,a100,,False
1637983,all.q,2119ga002.bicic.local,bergman,4401089,2023-04-10 18:39:30,2023-04-10 18:45:35,2023-04-10 18:47:25,0,0,0 days 00:01:50,...,63643648.0,0 days 00:06:05,256.0,4096.0,4096.0,420,1,a100,,False
1637990,all.q,2119ga002.bicic.local,bergman,4401090,2023-04-10 18:39:30,2023-04-10 18:47:35,2023-04-10 18:48:25,0,0,0 days 00:00:50,...,53370880.0,0 days 00:08:05,256.0,4096.0,4096.0,420,1,a100,,False
1638001,all.q,2119ga002.bicic.local,bergman,4401091,2023-04-10 18:39:30,2023-04-10 18:48:34,2023-04-10 18:50:05,0,0,0 days 00:01:31,...,63639552.0,0 days 00:09:04,256.0,4096.0,4096.0,420,1,a100,,False
1638008,all.q,2119ga002.bicic.local,bergman,4401092,2023-04-10 18:39:30,2023-04-10 18:50:34,2023-04-10 18:51:45,0,0,0 days 00:01:11,...,62283776.0,0 days 00:11:04,256.0,4096.0,4096.0,420,1,a100,,False


In [52]:
# distribution of 'ru_wallclock' (converted to Timedelta above) for the a100 subset
gpu_a100_jobs_df['ru_wallclock'].describe()

count                         1688
mean     0 days 04:13:59.781990521
std      0 days 23:05:22.441957029
min                0 days 00:00:00
25%         0 days 00:01:27.750000
50%                0 days 00:05:35
75%         0 days 00:06:13.250000
max               19 days 09:36:20
Name: ru_wallclock, dtype: object

In [53]:
# subset: jobs whose parsed 'gpu_type' is 'a40'
gpu_a40_jobs_df = sgeacct_df.loc[sgeacct_df['gpu_type'].eq('a40')]

In [54]:
# distribution of 'ru_wallclock' (converted to Timedelta above) for the a40 subset
gpu_a40_jobs_df['ru_wallclock'].describe()

count                         3474
mean     0 days 09:24:35.046344271
std      1 days 06:23:12.234830303
min                0 days 00:00:00
25%         0 days 00:01:15.250000
50%                0 days 00:08:19
75%         0 days 13:36:00.750000
max               25 days 01:15:02
Name: ru_wallclock, dtype: object