In [2]:
import pandas as pd
from pathlib import Path
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from multiprocessing import Pool
import time
from tqdm.notebook import tqdm
from utils import format_node_names

In [3]:
# Display configuration: never truncate columns or rows, and show floats with
# 4 digits after the decimal point.
# NOTE: with max_rows=None pandas renders every row of whatever is displayed,
# so only display slices / summaries of the large tables in this notebook.
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
pd.options.display.precision = 4 # show 4 digits precision

# Root folder of the collected slurm data and folder for derived results.
folder_path_slurm_data = Path('/projects/2/prjs1098/system_analytics_2024/slurm_data')
folder_path_saving_results = Path('./results')

# Sub-folders of the slurm data folder that are scanned for sinfo dumps.
folder_paths_system_states = [folder_path_slurm_data / path
                              for path in ['system_states', 'system_states_int4', 'system_states_int5']]

# Flat list of every *.txt dump: folders in the order given above, files sorted
# by path inside each folder (plain string order, so "_10" comes before "_2").
# A throwaway `_` is deliberately not used for the intermediate result: in
# IPython `_` is the "previous output" variable and rebinding it disables that.
all_files = [file
             for folder in folder_paths_system_states
             for file in sorted(folder.glob("*.txt"))]
all_files

[PosixPath('/projects/2/prjs1098/system_analytics_2024/slurm_data/system_states/system_states_1.txt'),
 PosixPath('/projects/2/prjs1098/system_analytics_2024/slurm_data/system_states/system_states_10.txt'),
 PosixPath('/projects/2/prjs1098/system_analytics_2024/slurm_data/system_states/system_states_11.txt'),
 PosixPath('/projects/2/prjs1098/system_analytics_2024/slurm_data/system_states/system_states_12.txt'),
 PosixPath('/projects/2/prjs1098/system_analytics_2024/slurm_data/system_states/system_states_13.txt'),
 PosixPath('/projects/2/prjs1098/system_analytics_2024/slurm_data/system_states/system_states_14.txt'),
 PosixPath('/projects/2/prjs1098/system_analytics_2024/slurm_data/system_states/system_states_15.txt'),
 PosixPath('/projects/2/prjs1098/system_analytics_2024/slurm_data/system_states/system_states_16.txt'),
 PosixPath('/projects/2/prjs1098/system_analytics_2024/slurm_data/system_states/system_states_17.txt'),
 PosixPath('/projects/2/prjs1098/system_analytics_2024/slurm_data

In [4]:

# for file_path in files:
def get_date_for_a_day(file_path):
    """Parse one sinfo dump file into a long (node, time, state) table.

    The file is read as text and split on the separator
    ``############################``; everything before the first separator is
    ignored. Inside each sample, line 1 is taken as the timestamp, line 2 is
    skipped (presumably the sinfo column header -- TODO confirm) and every
    following line except the last one is parsed as a data row. In a data row,
    whitespace-separated field 7 is the node count, field 8 is the state and
    the last field is the compressed node-name expression, which is expanded
    with ``format_node_names`` and split on ``,``.

    Parameters
    ----------
    file_path : pathlib.Path
        Path of the dump file (anything offering ``.open(mode='r')``).

    Returns
    -------
    pandas.DataFrame
        Columns ``node`` (str), ``time`` (datetime64) and ``state`` (str), one
        row per node per data row, sorted by ``node`` then ``time``. The index
        holds the position of the node inside its data row and is therefore
        not unique. A file without any sample yields an empty frame with the
        same three columns.

    Raises
    ------
    ValueError
        If the node count of a row differs from the number of expanded node
        names, or if the count field is not an integer.
    """
    # Flat column buffers: one DataFrame is built at the end instead of one
    # tiny DataFrame per sinfo line followed by a huge concat.
    node_col, time_col, state_col, row_index = [], [], [], []

    # open the text file
    with file_path.open(mode='r') as f:
        f_text = f.read()

    # each element after the first holds the text of one sinfo sample
    sinfo_day = f_text.split('############################')
    for sinfo_sample in sinfo_day[1:]:
        sample_lines = sinfo_sample.split('\n')
        if len(sample_lines) < 2:
            # truncated sample (separator with nothing after it): no timestamp
            continue
        # the first line after the separator gives us the time
        # (named sample_time so the imported `time` module is not shadowed)
        sample_time = sample_lines[1]
        # remaining lines, minus the trailing element left by the final newline
        for sample_row in sample_lines[3:-1]:
            fields = sample_row.split()
            if not fields:
                # blank line inside a sample carries no information
                continue
            node_number = int(fields[7])
            state = fields[8]
            # the last column is the compressed node list; expand it to names
            nodes = format_node_names(fields[-1]).split(',')
            if len(nodes) != node_number:
                raise ValueError(
                    f"{file_path}: sinfo reports {node_number} nodes but "
                    f"{len(nodes)} names were parsed from row {sample_row!r}"
                )
            node_col.extend(nodes)
            time_col.extend([sample_time] * node_number)
            state_col.extend([state] * node_number)
            # same index values the per-row frames used to carry (0..k-1)
            row_index.extend(range(node_number))

    df = pd.DataFrame(
        {'node': node_col, 'time': time_col, 'state': state_col},
        index=row_index,
    )
    # turn the time into pandas date time
    df['time'] = pd.to_datetime(df['time'])
    df.sort_values(['node', 'time'], inplace=True)
    return df
 
 


In [5]:
# Parse all dump files in parallel, one task per file.
N_WORKERS = 30  # number of worker processes in the pool

# perf_counter is a monotonic clock, so the measured duration cannot be skewed
# by system clock adjustments the way a time.time() difference can.
start_time = time.perf_counter()
with Pool(N_WORKERS) as pool:
    # map blocks until every file is parsed; results keep the order of all_files
    parallel_results = pool.map(get_date_for_a_day, all_files)  # Submit tasks
parallel_duration = time.perf_counter() - start_time
print(f"Parsed {len(all_files)} files in {parallel_duration:.1f} s using {N_WORKERS} workers")

In [6]:
# Stack the per-file tables into one frame and order it by node, then by time.
# The per-file index values are kept, so the index is not unique.
df = pd.concat(parallel_results, axis=0).sort_values(['node', 'time'])

# Preview the first rows together with the total number of rows.
display(df.head(), len(df))

Unnamed: 0,node,time,state
0,fcn1,2024-11-04 17:03:13,mixed
0,fcn1,2024-11-04 17:03:13,mixed
0,fcn1,2024-11-04 17:03:43,mixed
0,fcn1,2024-11-04 17:03:43,mixed
0,fcn1,2024-11-04 17:04:13,mixed


326785921

In [7]:
# Rows that agree on node, time AND state carry no extra information.
# Open question from the original analysis: why do they occur at all? It may
# be an artefact of how the data is collected; the working hypothesis is that
# sinfo lists some nodes once per partition they belong to (not verified).
exact_duplicate_keys = ['node', 'time', 'state']
n_exact_duplicates = df.duplicated(exact_duplicate_keys).sum()
print(f"Numebr of duplicated rows based on node and time and state: {n_exact_duplicates}")

# Keep the first occurrence of every (node, time, state) triple.
df.drop_duplicates(subset=exact_duplicate_keys, inplace=True)

Numebr of duplicated rows based on node and time and state: 112646306


In [8]:
# Check for rows that share node and time but differ in state, i.e. a node
# that is reported with two states at the same timestamp. This is expected to
# be extremely rare. Per the original author's note it can happen because the
# states are sampled with 1 second precision and through 3 login nodes.
# Is there any?
node_time_keys = ['node', 'time']

# Select the conflicting rows ONCE. The full table has hundreds of millions of
# rows; the print and both displays below only need this small subset.
conflicting_rows = df[df.duplicated(node_time_keys, keep=False)]

# Counting "all but the first occurrence" inside the subset gives the same
# number as on the full table: every row with a repeated key is in the subset
# and the row order is unchanged.
print(f"Numebr of duplicated rows based on node and time: {conflicting_rows.duplicated(node_time_keys).sum()}")

# show a sample
display(conflicting_rows.head(n=10))
# number of conflicting rows per node
display(conflicting_rows['node'].value_counts())

Numebr of duplicated rows based on node and time: 231


Unnamed: 0,node,time,state
1,fcn67,2025-01-05 03:03:39,completing
37,fcn67,2025-01-05 03:03:39,idle
0,fcn75,2024-12-23 16:21:05,completing
1,fcn75,2024-12-23 16:21:05,mixed
0,gcn103,2025-01-15 13:16:48,draining
2,gcn103,2025-01-15 13:16:48,completing
1,gcn117,2024-11-15 02:59:39,completing
0,gcn117,2024-11-15 02:59:39,draining
0,gcn117,2024-12-18 13:51:02,completing
12,gcn117,2024-12-18 13:51:02,mixed


node
gcn2       12
gcn58      10
gcn3        8
tcn1182     8
tcn289      6
srv9        4
tcn332      4
tcn1109     4
tcn311      4
gcn46       4
gcn4        4
gcn34       4
tcn292      4
gcn117      4
gcn26       4
gcn20       4
tcn190      4
tcn1143     4
tcn1178     4
tcn1177     4
tcn1137     4
tcn1176     4
tcn1104     4
tcn242      4
tcn356      2
tcn431      2
tcn426      2
tcn359      2
tcn365      2
tcn383      2
tcn409      2
tcn389      2
tcn390      2
tcn398      2
tcn403      2
tcn399      2
tcn402      2
tcn342      2
tcn26       2
tcn260      2
tcn264      2
tcn269      2
tcn270      2
tcn273      2
tcn277      2
tcn278      2
tcn279      2
tcn286      2
tcn314      2
tcn294      2
tcn295      2
tcn306      2
tcn309      2
tcn436      2
tcn324      2
tcn328      2
tcn330      2
tcn335      2
tcn340      2
tcn291      2
tcn695      2
tcn7        2
tcn708      2
tcn709      2
tcn713      2
tcn728      2
tcn732      2
tcn737      2
gcn128      2
gcn129      2
tcn533      2
g

In [9]:
# It is not clear which of the conflicting states is the right one, and there
# are only a few such rows, so the first occurrence of every (node, time)
# pair is kept and the others are deleted.
df.drop_duplicates(subset=['node', 'time'], keep='first', inplace=True)

# Describe the collected data: number of samples, states and nodes, minimum
# and maximum time, ... followed by the number of samples per state.
summary_table = df.describe(include='all')
samples_per_state = df['state'].value_counts()
display(summary_table, samples_per_state)

Unnamed: 0,node,time,state
count,214139384,214139384,214139384
unique,1548,,29
top,tcn999,,idle
freq,138346,,105290067
mean,,2024-12-11 18:39:55.518839552,
min,,2024-11-04 17:03:13,
25%,,2024-11-20 11:07:58,
50%,,2024-12-11 09:05:14,
75%,,2025-01-01 03:04:36,
max,,2025-01-22 14:01:22,


state
idle           105290067
allocated       69963707
mixed           29520851
reserved         4316291
draining@        2052621
drained          1253294
planned           962486
completing        284489
drained*          226494
draining          109564
down*              89002
reboot^            27779
idle*              15608
down$              14049
unknown             5079
down                2676
inval               1728
maint               1499
mixed*               639
reboot               431
allocated*           352
maint*               316
completing*          225
mixed-                58
draining*             41
reboot*               18
mixed$                15
allocated+             3
unknown*               2
Name: count, dtype: int64

In [10]:
# Are we measuring the data regularly? No.
# Here we compute, per node, the time difference between consecutive
# measurements:
#
#     Delta t = time(node=node1, sample k+1) - time(node=node1, sample k)
#
# This relies on df being sorted by node and then time (done when df is built).
#
# What are the max and min of Delta t?
# Is this acceptable, or does it make the analysis difficult?

display(df.groupby(['node'], as_index=False)[['time']].diff().describe(percentiles=[0.25, 0.5, 0.75, 0.9996]))

# Interpretation (figures taken from the output recorded with this notebook).
# Written as comments on purpose: a bare string as the last expression of a
# cell is echoed as the cell result.
#
# - A large portion of the measurements are close together: the median gap is
#   42 s and 99.96 % of the gaps are at most 2 min 04 s.
# - The maximum gap is 6 days 01:30:30, so at some point in the measurement
#   process no state was recorded for some nodes for a long stretch.
# - This could be a fault in our measurement, or those specific nodes did not
#   appear in the sinfo output during that period.
# - Open question: which nodes are affected? Are they srv nodes? ==> unknown.

Unnamed: 0,time
count,214137836
mean,0 days 00:00:49.263309880
std,0 days 00:01:00.739861026
min,0 days 00:00:01
25%,0 days 00:00:30
50%,0 days 00:00:42
75%,0 days 00:01:09
99.96%,0 days 00:02:04
max,6 days 01:30:30


' \nThe statistic for Delta t shows that a large portion of the measurements have time differnce less than 3 minutes.\nBut it seems that at some point in our measurment process we did not record states for some nodes for around 2 hours.\nThis could our measurement faults or even the case that a specific nodes did not appear in the sinfo.\nWhat are those nodes? are they srv nodes?==> No idea\n'

In [11]:
# How many times do we have these long pauses in the measurements?
# Count how often each per-node gap length (time between consecutive samples
# of the same node) occurs.
df_delta_counts = (
    df.groupby(['node'], as_index=False)[['time']]
    .diff()
    .value_counts()
)

# Order by gap length and show the 20 longest gaps with their counts.
longest_gap_counts = df_delta_counts.sort_index().tail(n=20)
longest_gap_counts

time           
0 days 00:04:50    1548
0 days 00:05:02    1548
0 days 00:05:03    1548
0 days 00:05:05    1548
0 days 00:05:41    1548
0 days 00:05:57    1548
0 days 00:06:04    1548
0 days 00:06:29    1548
0 days 00:06:52    1548
0 days 00:06:54    1548
0 days 00:07:00    1548
0 days 00:07:03    1548
0 days 00:08:35    1548
0 days 00:08:49    1548
0 days 00:09:53    1548
0 days 00:12:23    1548
0 days 00:12:52    1548
0 days 00:14:09    1548
0 days 01:16:38    1548
6 days 01:30:30       2
Name: count, dtype: int64

In [12]:
# Save the cleaned data, then continue in the EDA notebook for further analysis.
# The file name carries the current UTC date (YYYY-MM-DD).
time_formated = time.strftime('%Y-%m-%d', time.gmtime())
cleaned_data_path = folder_path_slurm_data / f"sinfo_cleaned_{time_formated}.parquet.gzip"
df.to_parquet(cleaned_data_path, compression='gzip')