In [1]:
import os
import pandas as pd
import pytz
import matplotlib
matplotlib.use("Agg")
import matplotlib.pyplot as plt
import pyarrow.parquet as pq
from plotnine import *
import calendar
import numpy as np
import databricks.koalas as ks 
import seaborn as sns
from datetime import datetime

In [3]:
# Prefix that the metric folder names below are appended to when reading data.
data_root = "path to machine metric dataset"

# One (label, dataset folder, processed parquet file name) triple per load metric.
loads = [
    (f'load{suffix}', f'node_load{suffix}', f'node_load{suffix}_diurnal.parquet')
    for suffix in (1, 5, 15)
]

In [3]:
# Read the load1 metric and stack its columns into the index, so every value is
# addressed by a (row label, column label) pair.
# os.path.join inserts the path separator when data_root has no trailing one;
# plain string concatenation would silently build a wrong path in that case.
df = pd.read_parquet(os.path.join(data_root, 'node_load1'))
df = df.stack()

In [38]:
# Label the two index levels created by stacking, then turn the series into a
# single-column frame named "load1" and display it.
df.index = df.index.set_names(['time', 'node'])
df = df.to_frame(name="load1")
df

Unnamed: 0_level_0,Unnamed: 1_level_0,load1
time,node,Unnamed: 2_level_1
1577833200,r1899n7,3.44
1577833200,r1899n1899,16.00
1577833200,r1899n1898,16.00
1577833200,r1899n1897,15.82
1577833200,r1899n1896,16.00
...,...,...
1585864785,r1379n7,0.00
1585864785,r1379n4,0.00
1585864785,r1379n5,0.00
1585864785,r1379n2,0.00


In [5]:
# Diurnal pattern: mean value of every load metric per hour of day, drawn as
# grouped bars with an upward-only standard-deviation error bar.
color = ['lightcoral', 'steelblue', 'yellowgreen']
marker = ['o', '^', 's']
hatch = ['', '/', '\\']

fig, ax = plt.subplots(figsize=(11,5))

barWidth = 0.25
# Horizontal shift per load metric so the bars of one hour sit side by side.
offset = [-barWidth, 0, barWidth]
df = None

# The cache files are written below; create the folder up front so that a cache
# miss does not fail with FileNotFoundError after the expensive aggregation.
cache_dir = "./cache"
os.makedirs(cache_dir, exist_ok=True)

for index, (load, folder_name, processed_data_path) in enumerate(loads):

    cach_file = os.path.join(cache_dir, f"loads_diurnal_hourly_cache_{load}.npy")

    if not os.path.isfile(cach_file):
        # os.path.join adds the separator when data_root has no trailing one.
        df = pd.read_parquet(os.path.join(data_root, folder_name))

        # Pivot all columns so that it becomes a multi-index of (time, node).
        df = df.stack()
        # Set the names of the multi-index
        df.index.names = ['time', 'node']
        # Change the series name to the load name and then make it a dataframe
        df = df.rename(load).to_frame()

        # Drop all rows that do not feature at least one value >= 0
        df = df.loc[(df >= 0).any(axis=1)]

        df = df.reset_index()
        df["dt"] = pd.to_datetime(df['time'], utc=True, unit="s")
        # Convert everything into localized Amsterdam time and then drop the timezone info again
        # dropping it is required to save the parquet file.
        df["dt"] = df["dt"].dt.tz_convert(pytz.timezone('Europe/Amsterdam')).dt.tz_localize(None)
        # Hour of day is the grouping key of this plot
        df["hour_of_day"] = df["dt"].dt.hour

        # Aggregate only the load column: the frame also carries the string
        # 'node' column, which a frame-wide mean() cannot average.
        load_by_hour = df.groupby("hour_of_day")[load]
        yerr_vals = load_by_hour.std().to_numpy()
        y_vals = load_by_hour.mean().to_numpy()
        x_vals = np.arange(len(y_vals)) + offset[index]

        # Three arrays are written back to back; they must be read in this order.
        with open(cach_file, 'wb') as cache_file:
            np.save(cache_file, x_vals)
            np.save(cache_file, y_vals)
            np.save(cache_file, yerr_vals)
    else:
        with open(cach_file, 'rb') as cache_file:
            x_vals = np.load(cache_file)
            y_vals = np.load(cache_file)
            yerr_vals = np.load(cache_file)

    negative_direction_values = np.zeros(len(yerr_vals))  # We create a 2d array to make sure matplotlib does not create downwards errorbars
    ax.bar(x_vals, y_vals, yerr=[negative_direction_values, yerr_vals], edgecolor='black', color=color[index], hatch=hatch[index], label=load, width=barWidth, capsize=3)

ax.set_xlim(left=-1)
ax.set_ylim(bottom=0, top=100)
ax.set_xlabel("Hour of Day", fontsize=20)
ax.set_ylabel("Load", fontsize=20)
ax.tick_params(axis='both', which='major', labelsize=18)
ax.tick_params(axis='both', which='minor', labelsize=16)
ax.legend(ncol=len(color), prop={"size": 14}, bbox_to_anchor=(0.5, 1.15), loc=9)
fig.tight_layout()

date_time = datetime.now().strftime("%Y-%m-%d-%H-%M-%S")

fig.savefig(f"loads_diurnal_hourly_{date_time}.pdf")

# Deleting the names alone keeps the figure alive inside pyplot; close it too.
plt.close(fig)
del fig
del ax
# `if df:` raises ValueError for a DataFrame (ambiguous truth value), so the
# check has to be made against None explicitly.
if df is not None: del df

In [6]:
# Weekly pattern: mean value of every load metric per day of week, drawn as
# grouped bars with an upward-only standard-deviation error bar.
color = ['lightcoral', 'steelblue', 'yellowgreen']
marker = ['o', '^', 's']
hatch = ['', '/', '\\']

fig, ax = plt.subplots(figsize=(11,5))

# Defined here so the cell no longer depends on the hourly cell having run
# first; 0.25 is the value that cell assigns.
barWidth = 0.25
offset = [-0.3, 0, 0.3]
df = None

# The cache files are written below; create the folder up front so that a cache
# miss does not fail with FileNotFoundError after the expensive aggregation.
cache_dir = "./cache"
os.makedirs(cache_dir, exist_ok=True)

for index, (load, folder_name, processed_data_path) in enumerate(loads):

    cach_file = os.path.join(cache_dir, f"loads_daily_cache_{load}.npy")

    if not os.path.isfile(cach_file):
        # os.path.join adds the separator when data_root has no trailing one.
        df = pd.read_parquet(os.path.join(data_root, folder_name))

        # Pivot all columns so that it becomes a multi-index of (time, node).
        df = df.stack()
        # Set the names of the multi-index
        df.index.names = ['time', 'node']
        # Change the series name to the load name and then make it a dataframe
        df = df.rename(load).to_frame()

        # Drop all rows that do not feature at least one value >= 0
        df = df[(df >= 0).any(axis=1)]

        df = df.reset_index()
        df["dt"] = pd.to_datetime(df['time'], utc=True, unit="s")
        # Convert everything into localized Amsterdam time and then drop the timezone info again
        # dropping it is required to save the parquet file.
        df["dt"] = df["dt"].dt.tz_convert(pytz.timezone('Europe/Amsterdam')).dt.tz_localize(None)
        # Day of week (0 = Monday ... 6 = Sunday) is the grouping key of this plot;
        # the .dt accessor is the vectorized form of calling weekday() per row.
        df["day"] = df["dt"].dt.weekday

        # Aggregate only the load column: the frame also carries the string
        # 'node' column, which a frame-wide mean() cannot average.
        load_by_day = df.groupby("day")[load]
        yerr_vals = load_by_day.std().to_numpy()
        y_vals = load_by_day.mean().to_numpy()
        x_vals = np.arange(len(y_vals)) + offset[index]

        # Three arrays are written back to back; they must be read in this order.
        with open(cach_file, 'wb') as cache_file:
            np.save(cache_file, x_vals)
            np.save(cache_file, y_vals)
            np.save(cache_file, yerr_vals)
    else:
        with open(cach_file, 'rb') as cache_file:
            x_vals = np.load(cache_file)
            y_vals = np.load(cache_file)
            yerr_vals = np.load(cache_file)

    negative_direction_values = np.zeros(len(yerr_vals))  # We create a 2d array to make sure matplotlib does not create downwards errorbars
    ax.bar(x_vals, y_vals, yerr=[negative_direction_values, yerr_vals], edgecolor='black', color=color[index], hatch=hatch[index], label=load, width=barWidth, capsize=3)

ax.set_xlim(left=-1)
ax.set_ylim(bottom=0, top=100)
ax.set_xticks(list(np.arange(7)))
ax.set_xticklabels(['Mon', 'Tue', 'Wed', 'Thu', 'Fri', 'Sat', 'Sun'])
ax.set_xlabel("Day of Week", fontsize=20)
ax.set_ylabel("Load", fontsize=20)
ax.tick_params(axis='both', which='major', labelsize=18)
ax.tick_params(axis='both', which='minor', labelsize=16)
ax.legend(ncol=len(color), prop={"size": 14}, bbox_to_anchor=(0.5, 1.15), loc=9)
fig.tight_layout()

date_time = datetime.now().strftime("%Y-%m-%d-%H-%M-%S")
fig.savefig(f"loads_diurnal_daily_{date_time}.pdf")

# Deleting the names alone keeps the figure alive inside pyplot; close it too.
plt.close(fig)
del fig
del ax
# `if df:` raises ValueError for a DataFrame (ambiguous truth value), so the
# check has to be made against None explicitly.
if df is not None: del df

In [3]:
# Sanity check: first and last timestamp, then the row and column counts.
dt_column = df["dt"]
print(dt_column.min(), dt_column.max())
n_rows, n_columns = df.shape
print(n_rows, n_columns)

2020-01-01 00:00:00 2020-04-02 23:59:45
524160 344 524160


In [None]:
# Average every column per hour of day first, then average across the columns
# (axis=1) so that a single load value per hour remains.
load1_per_node_per_hour = df.groupby("hour_of_day").mean()
load1_per_hour = (
    load1_per_node_per_hour
    .mean(axis=1)
    .rename("load")
    .reset_index()
)
print(load1_per_hour.head())

In [None]:
# This cell plots the average across all nodes.
# The plot is bound to its own name: assigning it to `plt` would shadow the
# matplotlib.pyplot module imported at the top of the notebook and break every
# later `plt.subplots(...)` call.
load1_per_hour_plot = (
    ggplot(load1_per_hour)
    + theme_light(base_size=16)
    + theme(
        legend_title=element_text(size=0, alpha=0),
        legend_box_spacing=0.1,
        legend_box_margin=0,
        legend_margin=0,
        legend_position=(0.51, 0.7),
        legend_direction="horizontal",
        legend_key=element_blank(),
        legend_background=element_rect(fill=(0,0,0,0)),
    )
    + guides(color=guide_legend(ncol=3))
    + geom_line(aes(x="hour_of_day", y="load"))
    + geom_point(aes(x="hour_of_day", y="load"), size=3)
    + ylim(0,None)
    + xlab("Hour of day")
    + ylab("Avg. load1 across all nodes")
)

load1_per_hour_plot.save("load1_per_hour.pdf")
load1_per_hour_plot

In [None]:
# Average every column per value of the "day" column first, then average across
# the columns (axis=1) so that a single load value per day remains.
load1_per_node_per_day = df.groupby("day").mean()
load1_per_day = (
    load1_per_node_per_day
    .mean(axis=1)
    .rename("load")
    .reset_index()
)

In [None]:
# Line plot of the average load per day of week.
# The plot is bound to its own name: assigning it to `plt` would shadow the
# matplotlib.pyplot module imported at the top of the notebook and break every
# later `plt.subplots(...)` call.
load1_per_day_plot = (
    ggplot(load1_per_day)
    + theme_light(base_size=16)
    + theme(
        legend_title=element_text(size=0, alpha=0),
        legend_box_spacing=0.1,
        legend_box_margin=0,
        legend_margin=0,
        legend_position=(0.51, 0.7),
        legend_direction="horizontal",
        legend_key=element_blank(),
        legend_background=element_rect(fill=(0,0,0,0)),
    )
    + guides(color=guide_legend(ncol=3))
    + geom_line(aes(x="day", y="load"))
    + geom_point(aes(x="day", y="load"), size=3)
    + ylim(0,None)
    + xlab("Day in Week (0=Monday, 6=Sunday)")
    + ylab("Avg. load1 across all nodes")
)

load1_per_day_plot.save("load1_per_day.pdf")
load1_per_day_plot


In [None]:
# # Make bins of 15 minutes using resample and then create a sliding window so that for every 15 minutes we get the mean load.
# bin_df = df.copy()
# bin_df.index = pd.to_datetime(bin_df.index, unit="s")
# # Bin per 15 minute and create a sliding window of 1 hour.
# # We take the right timestamp of the bin as this is the current time when measuring the mean.
# bin_df = bin_df.resample("15min", label='right').mean().rolling('1h').mean()
# bin_df = bin_df.dropna(how="all")  # Remove all rows with only NaN values

# # IMPORTANT: as we took all right labels of each bin, the hour_of_day and day themselves are now incorrect,
# # as all timestamps effectively shifted by 15 minutes. We need to recompute them.
# bin_df["dt"] = pd.to_datetime(bin_df.index, unit="s")  # No need to convert time timezones again, this was already done!
# bin_df["hour_of_day"] = bin_df["dt"].dt.hour
# bin_df["day"] = bin_df["dt"].apply(lambda x : x.weekday())

# bin_df

In [None]:
# load1_per_node_per_hour = bin_df.groupby("hour_of_day").mean()
# load1_per_hour = load1_per_node_per_hour.mean(axis=1).reset_index()
# load1_per_hour.columns = ["hour_of_day", "load"]

In [None]:
# This cell plots the average across all nodes per 15m using a rolling window of 1 hour
# plt = ggplot(load1_per_hour) +\
#     theme_light(base_size=16) +\
#     theme(legend_title=element_text(size=0, alpha=0),
#                        legend_box_spacing=0.1,
#                        legend_box_margin=0,
#                        legend_margin=0,
#           legend_position=(0.51, 0.7),
#           legend_direction="horizontal",
#           legend_key=element_blank(),
#           legend_background=element_rect(fill=(0,0,0,0))) +\
#     guides(color=guide_legend(ncol=3)) +\
#     geom_line(aes(x="hour_of_day", y="load")) +\
#     geom_point(aes(x="hour_of_day", y="load"), size=3) +\
#     ylim(0,None) +\
#     xlab("Hour of day") +\
#     ylab("Avg. load1 across all nodes")

# Bar chart of the average load per hour of day.
# The plot is bound to its own name: assigning it to `plt` would shadow the
# matplotlib.pyplot module imported at the top of the notebook and break every
# later `plt.subplots(...)` call.
# NOTE(review): the output file name mentions 15 min bins with a 1 h window, but
# the cell that derives load1_per_hour from the binned data is commented out in
# this notebook - confirm which load1_per_hour is meant to be plotted here.
load1_per_hour_bar_plot = (
    ggplot(load1_per_hour, aes(x="hour_of_day", y="load"))
    + theme_light(base_size=16)
    + theme(axis_text_x = element_text(angle = 45))
    + geom_bar(stat = "identity")
    + ylim(0,None)
    + xlab("Hour of day")
    + ylab("Avg. load1 across all nodes")
)

load1_per_hour_bar_plot.save("load1_per_hour_of_day_diurnal_15min_bin_1h_window.pdf")
load1_per_hour_bar_plot

In [None]:
# Average every column of the binned frame per value of the "day" column first,
# then average across the columns (axis=1) so one load value per day remains.
# NOTE(review): bin_df is only created in a commented-out cell of this
# notebook - confirm it exists in the kernel before running this cell.
load1_per_node_per_day = bin_df.groupby("day").mean()
load1_per_day = (
    load1_per_node_per_day
    .mean(axis=1)
    .rename("load")
    .reset_index()
)

In [None]:
# Bar chart of the average load per day of week, with the day names on the x axis.
# The plot is bound to its own name: assigning it to `plt` would shadow the
# matplotlib.pyplot module imported at the top of the notebook and break every
# later `plt.subplots(...)` call.
load1_per_day_bar_plot = (
    ggplot(load1_per_day, aes(x="day", y="load"))
    + theme_light(base_size=16)
    + theme(axis_text_x = element_text(angle = 45))
    + geom_bar(stat = "identity")
    + ylim(0,None)
    + xlab("Day in Week")
    + ylab("Avg. load1 across all nodes")
    + scale_x_continuous(breaks=list(range(0,7)), labels=list(calendar.day_name))
)

load1_per_day_bar_plot.save("load1_per_day_of_week_diurnal_15min_bin_1h_window.pdf")
load1_per_day_bar_plot

In [None]:
# Create a series of hour of day -> all values to plot in a violin/boxplot.
def get_values(rows):
    """Return all non-negative values of one group as a flat array.

    Parameters
    ----------
    rows : pandas.DataFrame
        Rows of a single group. A 'hour_of_day' column, when present, is left
        out so that the grouping key does not end up among the values.

    Returns
    -------
    numpy.ndarray
        1-D array holding every entry of the remaining columns that is >= 0.
    """
    # drop() without inplace works on a copy, so the frame handed in by
    # groupby.apply is left untouched. errors='ignore' keeps the function
    # usable when the grouping column is not part of the frame it receives.
    values = rows.drop(columns='hour_of_day', errors='ignore').to_numpy()
    # Boolean-mask indexing already yields a flat array; ravel() keeps the
    # 1-D result explicit.
    return values[values >= 0].ravel()

# Run get_values on every hour_of_day group and display the result.
hourly_groups = df.groupby('hour_of_day')
ndf = hourly_groups.apply(get_values)
ndf

In [None]:
# Create per hour a violin plot wrapping a boxplot.
# The plot is bound to its own name: assigning it to `plt` would shadow the
# matplotlib.pyplot module imported at the top of the notebook and break every
# later `plt.subplots(...)` call.
# NOTE(review): hour_of_day_df is not defined in the visible part of this
# notebook - confirm where it is built before running this cell.
hour_violin_plot = (
    ggplot(hour_of_day_df, aes(x="hour_of_day", y="values"))
    + geom_violin(width=0.2)
    + geom_boxplot(width=0.1, color="grey", alpha=0.2)
    + scale_fill_cmap(discrete = True)
    + theme_light()
    + theme(
        legend_position="none",
        plot_title = element_text(size=11)
    )
    + ggtitle("A Violin wrapping a boxplot")
    + xlab("")
)

hour_violin_plot