In [1]:
import os
import dask.dataframe as dd
import pandas as pd

In [2]:
# Root directory of the machine-metric dataset (placeholder: point this at the
# real location). Its immediate sub-directories are collected in `folders`.
root_dataset = "path to machine metric dataset"
if not os.path.isdir(root_dataset):
    # os.walk() yields nothing for a missing path, which would otherwise surface
    # as an opaque StopIteration from next() below.
    raise FileNotFoundError(f"Dataset root directory not found: {root_dataset!r}")
# Sorted so the folders are processed in the same order on every run/platform.
folders = sorted(next(os.walk(root_dataset))[1])

In [3]:
# Compute generic data to be processed/printed later
all_columns = set()  # union of the column labels of every metric that was read
max_number_of_samples_per_metric = 0
total_number_metric_samples = 0
total_number_of_valid_metric_samples = 0  # Metric samples minus NaNs in the data
skipped_metric_folders = []  # (folder, reason) for every metric that could not be processed
for folder in folders:
    if "nvidia" in folder:
        # NVIDIA metrics are excluded from these totals (original note: their
        # multi-index dataframes could not be handled here).
        continue

    # Each metric folder is read in full with pandas (pyarrow engine).
    try:
        df = pd.read_parquet(os.path.join(root_dataset, folder), engine="pyarrow")
        total_metrics_in_dataset = df.shape[0] * df.shape[1]  # rows * columns
        valid_metrics_in_dataset = total_metrics_in_dataset - int(df.isnull().sum().sum())
    except Exception as exc:
        # Still best-effort (an unreadable metric must not abort the scan), but
        # the failure is recorded instead of silently dropped. Catching
        # Exception rather than using a bare `except` keeps Ctrl-C working.
        skipped_metric_folders.append((folder, f"{type(exc).__name__}: {exc}"))
        continue

    # Only fold a metric into the totals once all of its numbers were computed.
    all_columns.update(df.columns)
    max_number_of_samples_per_metric = max(max_number_of_samples_per_metric, len(df))
    total_number_metric_samples += total_metrics_in_dataset
    total_number_of_valid_metric_samples += valid_metrics_in_dataset

# Make skipped metrics visible so the totals above can be interpreted correctly.
for skipped_folder, skip_reason in skipped_metric_folders:
    print(f"Skipped metric folder {skipped_folder!r}: {skip_reason}")

In [5]:
# Summary of the statistics gathered above. The rack id is taken to be the part
# of a column name before the first 'n' (e.g. "r10" from "r10n1" -- assumed
# naming scheme, confirm against the dataset).
# Note: the metric count is len(folders), i.e. it includes every folder found,
# also those skipped while computing the sample totals.
print(f"Number of metrics in the dataset: {len(folders)}")
print(f"Number of nodes in the dataset: {len(all_columns)}")
print(f"Number of racks in the dataset: {len({s.split('n')[0] for s in all_columns})}")
# The ',' format spec inserts thousands separators directly in the f-string.
print(f"Maximum number of samples per metric: {max_number_of_samples_per_metric:,}")
print(f"Total samples in the dataset: {total_number_metric_samples:,}")
print(f"Total valid samples in the dataset: {total_number_of_valid_metric_samples:,}")

Number of metrics in the dataset: 327
Number of nodes in the dataset: 341
Number of racks in the dataset: 20
Maximum number of samples per metric: 1,258,646
Total samples in the dataset: 66,541,895,243
Total valid samples in the dataset: 63,978,689,791


In [4]:
# Distinct prefixes of the column names, i.e. everything before the first 'n'.
{column_name.split('n')[0] for column_name in all_columns}

{'r10',
 'r11',
 'r12',
 'r13',
 'r14',
 'r15',
 'r23',
 'r25',
 'r26',
 'r27',
 'r28',
 'r29',
 'r30',
 'r31',
 'r32',
 'r33',
 'r34',
 'r35',
 'r36',
 'r38'}

In [None]:
# Derive the overall time span of the dataset from the parquet file names,
# which are expected to look like "<start>_<end>.parquet" with integer
# timestamps. Both bounds stay None when no such file is found.
min_date_dataset = None
max_date_dataset = None
for folder in folders:
    for file in next(os.walk(os.path.join(root_dataset, folder)))[2]:
        stem, extension = os.path.splitext(file)
        if extension != ".parquet":
            continue  # e.g. hidden files or metadata side-cars next to the data
        try:
            start_time, end_time = stem.split("_")
            start_timestamp, end_timestamp = int(start_time), int(end_time)
        except ValueError:
            # Name is not of the form "<int>_<int>": it carries no usable
            # time range, so it is ignored instead of aborting the scan.
            continue
        min_date_dataset = start_timestamp if min_date_dataset is None else min(start_timestamp, min_date_dataset)
        max_date_dataset = end_timestamp if max_date_dataset is None else max(end_timestamp, max_date_dataset)

In [None]:
# Show the earliest start and latest end found in the file names as datetimes;
# the stored integers are interpreted as a number of seconds (unit='s').
print(f"Min timestamp: {pd.to_datetime(min_date_dataset, unit='s')}, and max: {pd.to_datetime(max_date_dataset, unit='s')}")

In [None]:
# Load one NVIDIA metric; only its column labels are used below. Each label is
# unpacked as a (node, gpu) pair, so there is one column per GPU.
df = pd.read_parquet(os.path.join(root_dataset, "nvidia_gpu_memory_used_bytes/"))

print(df.columns)
print(f"Number of GPUs: {len(df.columns)}")

# Add up an assumed memory size per GPU, chosen from the model text in the
# label: 11 for "1080", 24 for "TITAN RTX", and 12 for anything else.
GPU_mem_in_GB = 0
for node, gpu in df.columns:
    GPU_mem_in_GB += 11 if "1080" in gpu else (24 if "TITAN RTX" in gpu else 12)

print(f"Total GPU memory: {GPU_mem_in_GB}")