In [None]:
import pandas as pd
from pathlib import Path
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from multiprocessing import Pool
import time
from tqdm.notebook import tqdm
from utils import format_node_names

In [None]:
# Notebook-wide pandas display settings: never truncate columns or rows when
# rendering a frame, and show floats with 4 digits of precision.
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
pd.set_option('display.precision', 4)

# Input folder with the slurm exports, and output folder for results.
folder_path_slurm_data = Path('./system_analytics_2024/slurm_data')
folder_path_saving_results = Path('./results')

# To refresh the input, re-run the sinfo cleaning file (named `clenaing_sinfo`
# in the original note) on the latest data from the system.
# Rows are ordered by node and then by time.
df = (
    pd.read_parquet(folder_path_slurm_data / 'sinfo_cleaned.parquet.gzip')
    .sort_values(['node', 'time'])
)
df.sample(n=10)

Unnamed: 0,node,time,state,node_type,time_30min_interval,time_1hour_interval,time_3hour_interval,time_6hour_interval,time_day_interval
41,fcn61,2024-11-05 15:03:36,idle,fcn,2024-11-05 15:00:00,2024-11-05 15:00:00,2024-11-05 15:00:00,2024-11-05 12:00:00,2024-11-05
12,gcn28,2024-11-05 19:57:32,allocated,gcn,2024-11-05 19:30:00,2024-11-05 19:00:00,2024-11-05 18:00:00,2024-11-05 18:00:00,2024-11-05
322,tcn1077,2024-11-10 09:04:22,idle,tcn,2024-11-10 09:00:00,2024-11-10 09:00:00,2024-11-10 09:00:00,2024-11-10 06:00:00,2024-11-10
21,tcn74,2024-11-07 00:15:29,mixed,tcn,2024-11-07 00:00:00,2024-11-07 00:00:00,2024-11-07 00:00:00,2024-11-07 00:00:00,2024-11-07
115,tcn870,2024-11-07 18:32:09,idle,tcn,2024-11-07 18:30:00,2024-11-07 18:00:00,2024-11-07 18:00:00,2024-11-07 18:00:00,2024-11-07
0,gcn7,2024-11-09 21:54:33,mixed,gcn,2024-11-09 21:30:00,2024-11-09 21:00:00,2024-11-09 21:00:00,2024-11-09 18:00:00,2024-11-09
32,tcn50,2024-11-07 18:29:08,allocated,tcn,2024-11-07 18:00:00,2024-11-07 18:00:00,2024-11-07 18:00:00,2024-11-07 18:00:00,2024-11-07
396,tcn493,2024-11-07 04:06:33,allocated,tcn,2024-11-07 04:00:00,2024-11-07 04:00:00,2024-11-07 03:00:00,2024-11-07 00:00:00,2024-11-07
280,tcn367,2024-11-08 00:02:37,allocated,tcn,2024-11-08 00:00:00,2024-11-08 00:00:00,2024-11-08 00:00:00,2024-11-08 00:00:00,2024-11-08
29,tcn46,2024-11-07 11:25:03,allocated,tcn,2024-11-07 11:00:00,2024-11-07 11:00:00,2024-11-07 09:00:00,2024-11-07 06:00:00,2024-11-07


In [None]:

"""  
See the EDA_sinfo for this block
"""

# Interval column used to bucket the raw sinfo samples; every (node, bucket)
# pair becomes one row of df_stat.
time_col = 'time_1hour_interval'
# Length of one bucket for each interval column, used later to shift a bucket
# label by exactly one interval. All five *_interval columns of df are listed so
# that changing `time_col` above does not end in a KeyError in a later cell.
map_time_col = {'time_30min_interval': pd.Timedelta('30min'),
                'time_1hour_interval': pd.Timedelta('1h'),
                'time_3hour_interval': pd.Timedelta('3h'),
                'time_6hour_interval': pd.Timedelta('6h'),
                # assumes time_day_interval is datetime-like -- TODO confirm
                'time_day_interval': pd.Timedelta(days=1)}

# Share of each state among the samples of every (node, bucket) group.
df_temp = (
    df.groupby(['node', time_col], as_index=False)['state']
    .value_counts(normalize=True)
    .sort_values(['node', time_col])
)

# Idle share per bucket (only buckets that contain at least one idle sample).
df_idle = (
    df_temp.loc[df_temp['state'] == 'idle']
    .drop(columns='state')
    .rename(columns={'proportion': 'idle'})
)
# Every other state is lumped together: their shares are summed into `not_idle`.
df_not_idle = (
    df_temp.loc[df_temp['state'] != 'idle']
    .groupby(['node', time_col], as_index=False)['proportion']
    .sum()
    .rename(columns={'proportion': 'not_idle'})
)

# The outer join keeps buckets that appear on only one side; the share missing
# from the other side is then set to 0.
df_stat = (
    pd.merge(df_idle, df_not_idle, how='outer', on=['node', time_col])
    .fillna(value=0)
    .sort_values(['node', time_col])
    # `not_idle` was only needed to keep the buckets without idle samples
    .drop(columns=['not_idle'])
    .rename(columns={'idle': 'idle_value'})
)
df_stat.head()





Unnamed: 0,node,time_1hour_interval,idle_value
0,fcn1,2024-11-04 17:00:00,0.0
1,fcn1,2024-11-04 18:00:00,0.0
2,fcn1,2024-11-04 19:00:00,0.0
3,fcn1,2024-11-04 20:00:00,0.0
4,fcn1,2024-11-04 21:00:00,0.0


In [None]:
# get the last state of the node before the interval
""" 
note the trick here!
"""

# Keep only the final row of every (node, interval) group, i.e. the last sample
# of that interval in df's current row order (df is expected to be ordered by
# time within each node).
df_last = (
    df.groupby(['node', time_col], as_index=False)[['node', time_col, 'state', 'time']]
    .tail(1)
    .copy()
)

# The trick: relabel each of those rows with the start of the *following*
# interval. Joining on (node, interval) then attaches the last sample seen in
# the interval immediately before, and leaves a gap when that interval has no
# data, instead of assuming that neighbouring rows are consecutive intervals.
df_last[time_col] = df_last[time_col] + map_time_col[time_col]
df_last.head()

Unnamed: 0,node,time_1hour_interval,state,time
0,fcn1,2024-11-04 18:00:00,mixed,2024-11-04 17:59:51
0,fcn1,2024-11-04 19:00:00,mixed,2024-11-04 18:59:01
0,fcn1,2024-11-04 20:00:00,mixed,2024-11-04 19:59:37
0,fcn1,2024-11-04 21:00:00,mixed,2024-11-04 20:59:41
0,fcn1,2024-11-04 22:00:00,mixed,2024-11-04 21:59:31


In [None]:

# df_last['last_state'] = df_last.groupby('node')['state'].shift(1).copy()
# df_last['time_for_last_state'] = df_last.groupby('node')['time'].shift(1).copy()


# Attach to every (node, interval) row of df_stat the state and timestamp carried
# by df_last for the same key. Left join: rows without a match keep missing values.
#
# df_stat is overwritten by this cell, so the cell must be safe to re-run: the
# columns it adds are dropped first (no-op on the first run) and df_last is
# renamed *before* the merge, so a second execution cannot produce duplicate
# `last_state` / `time_for_last_state` columns.
last_state_cols = {'state': 'last_state', 'time': 'time_for_last_state'}
df_stat = pd.merge(
    df_stat.drop(columns=list(last_state_cols.values()), errors='ignore'),
    df_last[['node', time_col, 'state', 'time']].rename(columns=last_state_cols),
    how='left',
    on=['node', time_col],
)

In [None]:
# Spot-check the merged table on two nodes (first five matching rows).
df_stat.loc[df_stat['node'].isin(['fcn113', 'fcn114'])].head(n=5)


Unnamed: 0,node,time_1hour_interval,idle_value,last_state,time_for_last_state
2320,fcn113,2024-11-04 17:00:00,1.0,,NaT
2321,fcn113,2024-11-04 18:00:00,1.0,idle,2024-11-04 17:59:21
2322,fcn113,2024-11-04 19:00:00,1.0,idle,2024-11-04 18:59:01
2323,fcn113,2024-11-04 20:00:00,1.0,idle,2024-11-04 19:59:37
2324,fcn113,2024-11-04 21:00:00,1.0,idle,2024-11-04 20:59:41


In [None]:

# df_stat.sample(n=10)


In [None]:
# # minus one second to ensure that there is no leak in the data
# df_stat ['end_inteval'] = (df_stat[time_col] + map_time_col[time_col]) - pd.Timedelta('1s')



# def get_data_df_stat_df(interval):
#     df_list = []
#     for i in range(interval[0], interval[1]):
#         # try chaning this
#         node_name = df_stat.iloc[i, 0]
#         start_time = df_stat.iloc[i, 1]
#         end_time = df_stat.iloc[i, 3]

#         idx = pd.IndexSlice
#         try:
#             x = df.loc[idx[node_name, start_time:end_time], :].iloc[[-1]].copy()
#             if len(x)!=0:
#                 # df_stat.iloc[i, 3:5] = [x['state'], x.name[1]]
#                 df_list.append(x)
#             else:
#                 raise KeyError("No data found for the specified time range.")
#         except KeyError as e:
#             print(f"No data found for node {node_name} in the specified time range.")
#         except IndexError as e:
#             print(f"Index error encountered for node {node_name} with start {start_time} and end {end_time}.")
#         except Exception as e:
#             print(f"An unexpected error occurred for node {node_name}: {e}")
    
        
#     print('done')
#     return pd.concat(df_list, axis=0).reset_index()


# chunk_size = 20 # 1000 takes 15 minutes
# num_worker = 20 # 192
# len_df_stat = len(df_stat)
# ranges = np.linspace(start=0, stop=len_df_stat, endpoint=True, 
#                      num=chunk_size, dtype=int).tolist()
# chunked_interval = list(zip(ranges, ranges[1:]))


# with Pool(num_worker) as pool:  
#     parallel_results = pool.map(get_data_df_stat_df, chunked_interval) 


In [None]:


# df_temp = pd.concat(parallel_results, axis=0)
# df_stat = pd.merge(df_stat, df_temp[['node', time_col, 'time', 'state']], how='left', on=['node', time_col]).copy()


# df_stat.iloc[1000:1010, :]


In [None]:
# Using the same idea as above, add the Prometheus data; you can repeat the last two steps.
# Find the relevant parquet file, which would ... (TODO: this sentence was left unfinished)

# Then we are ready to define a mini ML problem; it can be either a classification
# or a regression task.
 
 
 

In [None]:
# import multiprocessing

# # Get the number of CPUs available
# num_cpus = 

# print(f"Number of CPUs available: {num_cpus}")
