In [1]:
import pandas as pd
from pathlib import Path
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from multiprocessing import Pool
import time
from tqdm.notebook import tqdm
from utils import format_node_names

In [2]:
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
pd.options.display.precision = 4 # show 4 digits precision
folder_path_system_states = Path('./system_analytics_2024/slurm_data/system_states')
folder_path_slurm_data = Path('./system_analytics_2024/slurm_data')
folder_path_saving_results = Path('./results')
files = sorted(list(folder_path_system_states.glob("*.txt")))[0:-1]
files

[PosixPath('system_analytics_2024/slurm_data/system_states/system_states_1.txt'),
 PosixPath('system_analytics_2024/slurm_data/system_states/system_states_2.txt'),
 PosixPath('system_analytics_2024/slurm_data/system_states/system_states_3.txt'),
 PosixPath('system_analytics_2024/slurm_data/system_states/system_states_4.txt'),
 PosixPath('system_analytics_2024/slurm_data/system_states/system_states_5.txt'),
 PosixPath('system_analytics_2024/slurm_data/system_states/system_states_6.txt')]

In [3]:

# for file_path in files:
def get_date_for_a_day(file_path):
    dict_list = []
    
    with file_path.open(mode='r') as f:
        f_text = f.read()
    sinfo_day = f_text.split('############################')
    for sample_number in range(1, len(sinfo_day)-1):
        sinfo_sample = sinfo_day[sample_number].split('\n')
        time = sinfo_sample[1]
        for sample_row in range(4, len(sinfo_sample)-1):
            sample_row = np.random.randint(4, len(sinfo_sample)-1)
            node_number = int(sinfo_sample[sample_row].split()[7])
            state = sinfo_sample[sample_row].split()[8]
            node_names = sinfo_sample[sample_row].split()[-1]
            # print(format_node_names(node_names), node_number, state, time)
            d1 = {'node':format_node_names(node_names).split(','), 'time': [time] * node_number, 'state': [state]*node_number}
            dict_list.append(d1)
    df_list = [pd.DataFrame(d1) for d1 in dict_list]
    df = pd.concat(df_list, axis=0)
    df['time'] = pd.to_datetime(df['time'])
    df.sort_values(['node', 'time'], inplace=True)
    return df
 
 


In [4]:
start_time = time.time()
with Pool(10) as pool:  # Initialize the pool with 4 processes
    parallel_results = pool.map(get_date_for_a_day, files)  # Submit tasks
parallel_duration = time.time() - start_time

In [5]:
df = pd.concat(parallel_results, axis=0)
df.sort_values(['node', 'time'], inplace=True)
s = f"Number of rows we expected to have: {len(files) * 24 * 60 * 2 * 1548}"

display(df.head(), len(df), s)
# why is there duplication? It could be the way I am getting data? or sinfo gives copy of the nodes in any case some of the 
# node and time are the same. I think sinfo gives the states for some nodes twice in different partitions.

Unnamed: 0,node,time,state
0,fcn1,2024-11-04 17:03:13,mixed
0,fcn1,2024-11-04 17:03:43,mixed
0,fcn1,2024-11-04 17:03:43,mixed
0,fcn1,2024-11-04 17:04:13,mixed
0,fcn1,2024-11-04 17:04:13,mixed


39630664

'Number of rows we expected to have: 26749440'

In [6]:
df.drop_duplicates(subset=['node', 'time', 'state'], inplace=True)
print(f"Numebr of duplicated rows based on node and time: {df.duplicated(['node', 'time']).sum()}")
display(df.describe(include='all'), df['state'].value_counts())

Numebr of duplicated rows based on node and time: 0


Unnamed: 0,node,time,state
count,19816906,19816906,19816906
unique,1548,,20
top,gcn48,,allocated
freq,16543,,10044035
mean,,2024-11-07 17:34:31.240479744,
min,,2024-11-04 17:03:13,
25%,,2024-11-06 05:24:51,
50%,,2024-11-07 17:34:56,
75%,,2024-11-09 05:48:34,
max,,2024-11-10 17:54:36,


state
allocated      10044035
idle            6366069
mixed           2875194
reserved         385796
drained           76392
planned           32005
completing        19889
draining          10752
drained*           3413
down*              1291
inval              1056
unknown             491
idle*               294
mixed*               92
allocated*           54
reboot^              33
down                 17
draining*            15
completing*           9
mixed-                9
Name: count, dtype: int64

In [7]:
""" 
Are we measuring the data regularly? 
For the time differences that that multiplication of 30 seconds, it is difficult to say that this delay is because of
the absence of the node in the sinfo command or because of the delay in measurement.
For the time differences that are not exact multiplication of 30 seconds then it is likely that this error is due to
data measuremeant.

-- The frequencies that are exact multiplication of 30 seconds also explain why there is a difference between
the number of rows that we excpect and what we get. Some nodes are simply absent from the sinfo for some measurement(it could be all of them).
"""

df.groupby('node')['time'].diff().value_counts()

time
0 days 00:00:30    12347093
0 days 00:00:31     2685778
0 days 00:01:00     2327416
0 days 00:01:01      827238
0 days 00:01:30      571694
0 days 00:01:31      248289
0 days 00:01:02      186819
0 days 00:02:00      179731
0 days 00:01:32      108477
0 days 00:02:01       81318
0 days 00:02:30       51775
0 days 00:02:31       37463
0 days 00:02:02       24225
0 days 00:02:03       24119
0 days 00:03:00       19388
0 days 00:01:33       14074
0 days 00:03:01       12076
0 days 00:00:32        9871
0 days 00:02:34        7503
0 days 00:03:30        5555
0 days 00:02:33        4897
0 days 00:03:31        4325
0 days 00:02:32        3899
0 days 00:04:01        2725
0 days 00:02:04        2654
0 days 00:03:02        2443
0 days 00:00:33        2362
0 days 00:04:00        2012
0 days 00:03:04        1694
0 days 00:03:32        1604
0 days 00:00:35        1482
0 days 00:03:03        1391
0 days 00:01:03        1085
0 days 00:00:56        1074
0 days 00:04:02         823
0 days 00:04:31

In [None]:
# add node_type and removing not workers node
df['node_type'] = df['node'].str[0:3]
df = df[df['node_type'].isin(['fcn', 'gcn', 'tcn', 'hcn'])].copy()
# put the time in an intervale
# df['time_5min_rounded'] = (df['time'] - pd.Timedelta('2.5min')).dt.round(freq='5min')
df['time_30min_interval'] = (df['time'] - pd.Timedelta('15min')).dt.round(freq='30min')
df['time_1hour_interval'] = (df['time'] - pd.Timedelta('30min')).dt.round(freq='h')
df['time_3hour_interval'] = (df['time'] - pd.Timedelta('1.5h')).dt.round(freq='3h')
df['time_6hour_interval'] = (df['time'] - pd.Timedelta('3h')).dt.round(freq='6h')
df['time_day_interval'] = (df['time'] - pd.Timedelta('12h')).dt.round(freq='d')
# get a sample
display(df.sample(n=10))
# save the data
# df.to_parquet(folder_path_slurm_data/'sinfo_cleaned.parquet.gzip', compression='gzip')