In [25]:
import pandas as pd
from pathlib import Path
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from multiprocessing import Pool
import time
from tqdm.notebook import tqdm
import missingno as msno
from utils import get_idle_proportion



from darts import TimeSeries
from darts.models import (
    NaiveSeasonal,
    NaiveMean,
    NaiveDrift,
    ExponentialSmoothing,
    AutoARIMA,
    ARIMA,
    Theta,
    FFT
)

from darts.metrics import mase, mse, mae, ope

In [26]:
"""" 
                                        ***ML BASED ON PURE SINFO DATA: Baseline models***


We would like to set up a ML problem. 

In this problem we would like to predict the proportion of being idle for the next interval for a given node.
This makes the problem a time series problem with regression falvour as we predict a continues value
between 0 and 1.  


Another setup would be to turn this into a classification problem by cutting the interval [0-1] into multiple classes

class low idle: [0-0.3]
class middle idle: [0.3-0.9]
class high idle: [0.9-1]

Then we predict the next step belongs to which class. This is more realistic because differenticating between 
0.91 and 0.92 does not have much value(right?)
"""

# do a simple point prediction just based on the past signals and the timing.
# add the last state as well and see how things improve.
# then get the number of jobs running and the last moment jobs


# Do the same thing with the mind set that we only predict if it will be 100 idle or not.
# this becomes a binary classficiation, but still it is a time series as you have
# respet the temoral of the signals.

'" \n                                        ***ML BASED ON PURE SINFO DATA: Baseline models***\n\n\nWe would like to set up a ML problem. \n\nIn this problem we would like to predict the proportion of being idle for the next interval for a given node.\nThis makes the problem a time series problem with regression falvour as we predict a continues value\nbetween 0 and 1.  \n\n\nAnother setup would be to turn this into a classification problem by cutting the interval [0-1] into multiple classes\n\nclass low idle: [0-0.3]\nclass middle idle: [0.3-0.9]\nclass high idle: [0.9-1]\n\nThen we predict the next step belongs to which class. This is more realistic because differenticating between \n0.91 and 0.92 does not have much value(right?)\n'

In [27]:
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
pd.options.display.precision = 4 # show 4 digits precision
folder_path_slurm_data = Path('/projects/2/prjs1098/system_analytics_2024/slurm_data')
folder_path_prom_data = Path('/projects/2/prjs1098/system_analytics_2024/prom_data')
folder_path_EAR_data = Path('/projects/2/prjs1098/system_analytics_2024/ear_data')

folder_path_saving_results = Path('./results')

all_prom_file_paths = list(folder_path_prom_data.glob("*.gzip"))
# for getting the up-to-date data run the clenaing_sinfo file
# with the latest data from the system.
df = pd.read_parquet(folder_path_slurm_data / 'sinfo_EDA_cleaned.parquet.gzip')
df.sample(n=5)

Unnamed: 0,node,time,state,node_type,time_30min_interval,time_1hour_interval,time_2hour_interval,time_3hour_interval,time_4hour_interval,time_6hour_interval,time_12hour_interval,time_day_interval
263,tcn982,2024-11-18 12:15:24,idle,tcn,2024-11-18 12:00:00,2024-11-18 12:00:00,2024-11-18 12:00:00,2024-11-18 12:00:00,2024-11-18 12:00:00,2024-11-18 12:00:00,2024-11-18 12:00:00,2024-11-18
204,tcn1179,2024-11-18 18:28:44,allocated,tcn,2024-11-18 18:00:00,2024-11-18 18:00:00,2024-11-18 18:00:00,2024-11-18 18:00:00,2024-11-18 16:00:00,2024-11-18 18:00:00,2024-11-18 12:00:00,2024-11-18
202,tcn823,2024-11-25 17:05:29,allocated,tcn,2024-11-25 17:00:00,2024-11-25 17:00:00,2024-11-25 16:00:00,2024-11-25 15:00:00,2024-11-25 16:00:00,2024-11-25 12:00:00,2024-11-25 12:00:00,2024-11-25
414,tcn1078,2024-11-17 22:04:04,idle,tcn,2024-11-17 22:00:00,2024-11-17 22:00:00,2024-11-17 22:00:00,2024-11-17 21:00:00,2024-11-17 20:00:00,2024-11-17 18:00:00,2024-11-17 12:00:00,2024-11-17
2,tcn6,2024-11-07 16:46:35,allocated,tcn,2024-11-07 16:30:00,2024-11-07 16:00:00,2024-11-07 16:00:00,2024-11-07 15:00:00,2024-11-07 16:00:00,2024-11-07 12:00:00,2024-11-07 12:00:00,2024-11-07


In [28]:
# WHICH TIME INTEVAL MAKES SENSE FOR ML?
time_col = 'time_4hour_interval'
df_stat, df_idle, df_total = get_idle_proportion(df, time_col)
# show a smaple
initial_data_size= len(df_stat)
display(df_stat.sample(n=5))
print(initial_data_size)
print(f"Is there duplication in node and time: {df_stat[['node', time_col]].duplicated().any()}")

# note the trick here! this happens again in the future for Promethues data
df_last = df.groupby(['node', time_col], as_index=False)[['node', time_col, 'state', 'time']].tail(1).copy()
# merge it with the main
df_stat = pd.merge(df_stat, df_last[['node', time_col, 'state', 'time']],
                   how='left', on=['node', time_col]).copy()
# rename it
df_stat.rename(columns={'time':'time_for_last_state', 'state': 'last_state'}, inplace=True)
# show a sample
display(df_stat.sample(n=10))
print(f"Is there duplication in node and time: {df_stat[['node', time_col]].duplicated().any()}")

Unnamed: 0,node,time_4hour_interval,idle_duration,all_state_durations_in_interval,idle_proportion
178370,tcn548,2024-11-05 16:00:00,0 days 00:00:00,0 days 03:59:11,0.0
162185,tcn461,2024-12-01 12:00:00,0 days 00:00:00,0 days 03:59:07,0.0
175936,tcn534,2024-11-30 00:00:00,0 days 01:27:43,0 days 03:59:49,0.3658
51354,tcn1016,2024-11-21 00:00:00,0 days 00:00:00,0 days 03:58:46,0.0
4461,fcn13,2024-12-01 12:00:00,0 days 03:59:07,0 days 03:59:07,1.0


264536
Is there duplication in node and time: False


Unnamed: 0,node,time_4hour_interval,idle_duration,all_state_durations_in_interval,idle_proportion,last_state,time_for_last_state
10003,fcn43,2024-11-09 04:00:00,0 days 00:00:00,0 days 03:59:20,0.0,allocated,2024-11-09 07:59:31
139684,tcn344,2024-11-08 00:00:00,0 days 00:00:00,0 days 03:59:40,0.0,allocated,2024-11-08 03:59:46
38472,gcn5,2024-11-24 00:00:00,0 days 00:00:00,0 days 03:58:37,0.0,reserved,2024-11-24 03:59:46
249549,tcn92,2024-11-29 12:00:00,0 days 01:28:09,0 days 03:58:23,0.3698,allocated,2024-11-29 15:59:29
2532,fcn111,2024-11-25 08:00:00,0 days 03:59:11,0 days 03:59:11,1.0,idle,2024-11-25 11:59:37
40931,gcn63,2024-12-02 12:00:00,0 days 00:00:00,0 days 03:58:18,0.0,mixed,2024-12-02 15:58:37
256205,tcn955,2024-11-20 20:00:00,0 days 03:59:04,0 days 03:59:04,1.0,idle,2024-11-20 23:59:53
94096,tcn1240,2024-11-06 16:00:00,0 days 00:00:00,0 days 03:59:14,0.0,mixed,2024-11-06 19:59:36
169622,tcn500,2024-11-09 16:00:00,0 days 00:00:00,0 days 03:59:12,0.0,allocated,2024-11-09 19:59:39
210308,tcn714,2024-11-25 08:00:00,0 days 00:58:41,0 days 03:59:11,0.2453,allocated,2024-11-25 11:59:37


Is there duplication in node and time: False


In [29]:
# drop some columns
df_stat.drop(['idle_duration', 'all_state_durations_in_interval', 'time_for_last_state', 'last_state'],
             axis=1, inplace=True)
df_stat.head()




Unnamed: 0,node,time_4hour_interval,idle_proportion
0,fcn1,2024-11-04 16:00:00,0.0
1,fcn1,2024-11-04 20:00:00,0.0
2,fcn1,2024-11-05 00:00:00,0.0
3,fcn1,2024-11-05 04:00:00,0.0
4,fcn1,2024-11-05 08:00:00,0.0


# Train Test Valildation Split


It could be better to ro the train and test split later on


In [30]:
# put the mask here
test_mask = (df_stat[time_col].dt.month==11) & ((df_stat[time_col].dt.day==29)|(df_stat[time_col].dt.day==30))
val_mask = (df_stat[time_col].dt.month==11) & (df_stat[time_col].dt.day==28)


train = df_stat[~(val_mask|test_mask)]
val = df_stat[val_mask]
test = df_stat[test_mask]
print(f"# of Training samples: {len(train)} | # of Validation samples: {len(val)} | # of Test samples: {len(test)}")
print(f"Max Date in Train: {train[time_col].max()} | Min Date in Validation: {val[time_col].min()} | Min Date in Test: {test[time_col].min()}")

# of Training samples: 236852 | # of Validation samples: 9228 | # of Test samples: 18456
Max Date in Train: 2024-12-03 04:00:00 | Min Date in Validation: 2024-11-28 00:00:00 | Min Date in Test: 2024-11-29 00:00:00
