In [6]:
import pandas as pd
from pathlib import Path
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from multiprocessing import Pool
import time
from tqdm.notebook import tqdm
from utils import format_node_names

In [7]:
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
pd.options.display.precision = 4 # show 4 digits precision
folder_path_slurm_data = Path('/projects/2/prjs1098/system_analytics_2024/slurm_data')


folder_paths_system_states = [folder_path_slurm_data / path 
                              for path in ['system_states', 'system_states_int4', 'system_states_int5', '']]

_ = [sorted(list(path.glob("*.txt")))
                            for path in folder_paths_system_states]
all_files = [file for folder in _ for file in folder ]
all_files

[PosixPath('/projects/2/prjs1098/system_analytics_2024/slurm_data/system_states_90.txt')]

In [8]:

# for file_path in files:
def get_date_for_a_day(file_path):
    dict_list = []
    # open the text file
    with file_path.open(mode='r') as f:
        f_text = f.read()
    # get the data in the text file in the form of a list, each element is a text that has the sinfo result in it
    sinfo_day = f_text.split('############################')
    for sinfo_sample in sinfo_day[1:]:
        # split the sinfo result based on new lines
        sinfo_sample = sinfo_sample.split('\n')
        # the fisrt line gives us the time
        time = sinfo_sample[1]
        # go through other lines and find the information that you need
        for sample_row in sinfo_sample[3:-1]:
            # the 7th column in a line is node number, we need this to repaeat the state for the node
            node_number = int(sample_row.split()[7])
            # the state is the 8th column
            state = sample_row.split()[8]
            # the last column is the node names
            node_names = sample_row.split()[-1]
            # we need to process node names, because they are in a starnge shape, we do so and put all the data in a dictionary
            d1 = {'node':format_node_names(node_names).split(','), 'time': [time] * node_number, 'state': [state]*node_number}
            # append it and go back and do this for the next line, if this was the last line then do it for the next element in the list.
            dict_list.append(d1)
            
    # turn this each dictionary to a data frame
    df_list = [pd.DataFrame(d1) for d1 in dict_list]
    # concatenate them
    df = pd.concat(df_list, axis=0)
    # turn the time into pandas date time
    df['time'] = pd.to_datetime(df['time'], errors='coerce')
    df.sort_values(['node', 'time'], inplace=True)
    return df
 
 


In [9]:
start_time = time.time()
with Pool(30) as pool:  
    parallel_results = pool.map(get_date_for_a_day, all_files)  # Submit tasks
parallel_duration = time.time() - start_time

In [10]:
df = pd.concat(parallel_results, axis=0)
df.sort_values(['node', 'time'], inplace=True)
display(df.head(), len(df))

Unnamed: 0,node,time,state
0,fcn1,2025-02-07 05:25:48,idle
0,fcn1,2025-02-07 05:25:48,idle
0,fcn1,2025-02-07 05:28:49,idle
0,fcn1,2025-02-07 05:28:49,idle
0,fcn1,2025-02-07 05:31:49,idle


1117032

In [11]:
print(f"Numebr of duplicated rows based on node and time and state: {df.duplicated(['node', 'time', 'state']).sum()}")
# why is there duplication? It could be the way I am getting data?
# or sinfo gives copy of the nodes in any case some of the 
# node and time are the same. 
# I think sinfo gives the states for some nodes twice in different partitions.

# we drop the duplicated rows:
df.drop_duplicates(subset=['node', 'time', 'state'], inplace=True)

Numebr of duplicated rows based on node and time and state: 362096


In [12]:
"""  
Here we check for duplicaton in node and time. This is extremely rare and it means that
a node at a specific time can have two states! This can happen because we are measuring the states with 1 second precision
and through 3 login nodes. 
Is there any?
"""
print(f"Numebr of duplicated rows based on node and time: {df.duplicated(['node', 'time']).sum()}")

# show a sample
display(df[df.duplicated(['node', 'time'], keep=False)].head(n=10))
display(df[df.duplicated(['node', 'time'], keep=False)]['node'].value_counts())

Numebr of duplicated rows based on node and time: 0


Unnamed: 0,node,time,state


Series([], Name: count, dtype: int64)

In [13]:
""" 
It is not clear what to do with these samples. There are only a few instances.
We keep the first occurence and delete others.
"""
df.drop_duplicates(subset=['node', 'time'], inplace=True)

# give a description of the collected data. How many samples, how many states, nodes, minimum time maximum time ... 
display(df.describe(include='all'), df['state'].value_counts())

Unnamed: 0,node,time,state
count,754936,754936,754936
unique,1547,,14
top,tcn999,,idle
freq,488,,354383
mean,,2025-02-07 17:36:45.024590592,
min,,2025-02-07 05:25:48,
25%,,2025-02-07 11:31:15,
50%,,2025-02-07 17:36:48,
75%,,2025-02-07 23:42:14,
max,,2025-02-08 05:47:38,


state
idle          354383
allocated     259158
mixed         120482
reserved        7232
drained         5893
planned         3316
completing      1597
down*           1184
draining         846
drained*         835
mixed-             4
idle*              3
draining*          2
mixed*             1
Name: count, dtype: int64

In [14]:
""" 
Are we measuring the data regularly? No
Here we compute the time difference for our measeruments node specific. 

state(node=node1, time=t2) - state(node=node1, time=t1) = Delta t 


what is the max and min for Delta t?
Is this acceptable? or no it makes the analysis difficult?
"""

display(df.groupby(['node'], as_index=False)[['time']].diff().describe(percentiles=[0.25, 0.5, 0.75, 0.9996]))

""" 
The statistic for Delta t shows that a large portion of the measurements have time differnce less than 3 minutes.
But it seems that at some point in our measurment process we did not record states for some nodes for around 2 hours.
This could our measurement faults or even the case that a specific nodes did not appear in the sinfo.
What are those nodes? are they srv nodes?==> No idea
"""

Unnamed: 0,time
count,753389
mean,0 days 00:03:00.102669404
std,0 days 00:00:00.303526803
min,0 days 00:03:00
25%,0 days 00:03:00
50%,0 days 00:03:00
75%,0 days 00:03:00
99.96%,0 days 00:03:01
max,0 days 00:03:01


' \nThe statistic for Delta t shows that a large portion of the measurements have time differnce less than 3 minutes.\nBut it seems that at some point in our measurment process we did not record states for some nodes for around 2 hours.\nThis could our measurement faults or even the case that a specific nodes did not appear in the sinfo.\nWhat are those nodes? are they srv nodes?==> No idea\n'

In [15]:
"""  
How many times do we have this long pauses in the measurements?
"""
df_delta_counts = df.groupby(['node'], as_index=False)[['time']].diff().value_counts()
df_delta_counts.sort_index().tail(n=20)

time           
0 days 00:03:00    676039
0 days 00:03:01     77350
Name: count, dtype: int64

In [16]:
# save the data a and go to EDA notebook for further analysis
time_formated = pd.Timestamp(time.time(), unit='s').strftime('%Y-%m-%d')
# df.to_parquet(folder_path_slurm_data/f"sinfo_cleaned_{time_formated}.parquet.gzip", compression='gzip')