In [1]:
import pandas as pd
from pathlib import Path
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from multiprocessing import Pool
import time
from tqdm.notebook import tqdm
from utils import format_node_names

In [2]:
# rewrite the path sections to include all the folders that we have
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
pd.options.display.precision = 4 # show 4 digits precision
folder_path_slurm_data = Path('./system_analytics_2024/slurm_data')
folder_path_saving_results = Path('./results')
folder_paths_system_states = [folder_path_slurm_data / path 
                              for path in ['system_states', 'system_states_int4', 'system_states_int5']]

_ = [sorted(list(path.glob("*.txt")))
                            for path in folder_paths_system_states]
all_files = [file for folder in _ for file in folder ]
all_files

[PosixPath('system_analytics_2024/slurm_data/system_states/system_states_1.txt'),
 PosixPath('system_analytics_2024/slurm_data/system_states/system_states_2.txt'),
 PosixPath('system_analytics_2024/slurm_data/system_states/system_states_3.txt'),
 PosixPath('system_analytics_2024/slurm_data/system_states/system_states_4.txt'),
 PosixPath('system_analytics_2024/slurm_data/system_states/system_states_5.txt'),
 PosixPath('system_analytics_2024/slurm_data/system_states/system_states_6.txt'),
 PosixPath('system_analytics_2024/slurm_data/system_states/system_states_7.txt'),
 PosixPath('system_analytics_2024/slurm_data/system_states_int4/system_states_1.txt'),
 PosixPath('system_analytics_2024/slurm_data/system_states_int4/system_states_2.txt'),
 PosixPath('system_analytics_2024/slurm_data/system_states_int5/system_states_1.txt'),
 PosixPath('system_analytics_2024/slurm_data/system_states_int5/system_states_2.txt')]

In [3]:

# for file_path in files:
def get_date_for_a_day(file_path):
    dict_list = []
    
    with file_path.open(mode='r') as f:
        f_text = f.read()
    sinfo_day = f_text.split('############################')
    for sample_number in range(1, len(sinfo_day)-1):
        sinfo_sample = sinfo_day[sample_number].split('\n')
        time = sinfo_sample[1]
        for sample_row in range(4, len(sinfo_sample)-1):
            sample_row = np.random.randint(4, len(sinfo_sample)-1)
            node_number = int(sinfo_sample[sample_row].split()[7])
            state = sinfo_sample[sample_row].split()[8]
            node_names = sinfo_sample[sample_row].split()[-1]
            # print(format_node_names(node_names), node_number, state, time)
            d1 = {'node':format_node_names(node_names).split(','), 'time': [time] * node_number, 'state': [state]*node_number}
            dict_list.append(d1)
    df_list = [pd.DataFrame(d1) for d1 in dict_list]
    df = pd.concat(df_list, axis=0)
    df['time'] = pd.to_datetime(df['time'])
    df.sort_values(['node', 'time'], inplace=True)
    return df
 
 


In [4]:
start_time = time.time()
with Pool(10) as pool:  # Initialize the pool with 4 processes
    parallel_results = pool.map(get_date_for_a_day, all_files)  # Submit tasks
parallel_duration = time.time() - start_time

In [5]:
df = pd.concat(parallel_results, axis=0)
df.sort_values(['node', 'time'], inplace=True)
# s = f"Number of rows we expected to have: {len(all_files) * 24 * 60 * 2 * 1548}"

display(df.head(), len(df))
# why is there duplication? It could be the way I am getting data? or sinfo gives copy of the nodes in any case some of the 
# node and time are the same. I think sinfo gives the states for some nodes twice in different partitions.

Unnamed: 0,node,time,state
0,fcn1,2024-11-04 17:03:13,mixed
0,fcn1,2024-11-04 17:03:13,mixed
0,fcn1,2024-11-04 17:03:43,mixed
0,fcn1,2024-11-04 17:03:43,mixed
0,fcn1,2024-11-04 17:03:43,mixed


51542890

In [6]:
df.drop_duplicates(subset=['node', 'time', 'state'], inplace=True)
print(f"Numebr of duplicated rows based on node and time: {df.duplicated(['node', 'time']).sum()}")
df.drop_duplicates(subset=['node', 'time'], inplace=True)
display(df.describe(include='all'), df['state'].value_counts())

Numebr of duplicated rows based on node and time: 2


Unnamed: 0,node,time,state
count,25857720,25857720,25857720
unique,1548,,22
top,tcn11,,allocated
freq,21606,,12719845
mean,,2024-11-08 18:03:34.935267840,
min,,2024-11-04 17:03:13,
25%,,2024-11-06 16:35:28,
50%,,2024-11-08 15:27:42,
75%,,2024-11-10 14:51:42,
max,,2024-11-13 11:33:03,


state
allocated      12719845
idle            8376600
mixed           3945609
reserved         490699
drained          138474
planned          122871
completing        33251
draining          18225
drained*           3674
down*              2547
down               1887
idle*              1841
inval              1100
unknown             626
mixed*              161
draining@           154
allocated*           60
reboot^              54
completing*          17
draining*            16
mixed-                8
reboot                1
Name: count, dtype: int64

In [7]:
""" 
Are we measuring the data regularly? No
"""

df.groupby('node')['time'].diff().value_counts()

time
0 days 00:00:30    13632983
0 days 00:01:00     4268406
0 days 00:00:31     2800408
0 days 00:01:01     1027793
0 days 00:01:30      648708
0 days 00:02:00      487898
0 days 00:01:31      262930
0 days 00:01:02      190222
0 days 00:02:01      154110
0 days 00:01:32      110413
0 days 00:03:00       86904
0 days 00:02:30       62013
0 days 00:00:43       55455
0 days 00:00:17       53497
0 days 00:00:44       52073
0 days 00:00:16       51158
0 days 00:00:40       46647
0 days 00:00:38       44248
0 days 00:00:23       44034
0 days 00:00:37       43809
0 days 00:00:42       42749
0 days 00:00:18       42222
0 days 00:00:20       40575
0 days 00:02:31       39308
0 days 00:03:01       38119
0 days 00:00:22       36210
0 days 00:00:46       32898
0 days 00:00:21       32685
0 days 00:00:15       32211
0 days 00:00:19       31744
0 days 00:00:48       31367
0 days 00:00:41       31253
0 days 00:00:14       30954
0 days 00:00:45       30902
0 days 00:00:47       30450
0 days 00:00:39

In [8]:
# save the data for the EDA part
df.to_parquet(folder_path_slurm_data/'sinfo_cleaned.parquet.gzip', compression='gzip')