# Collectl Log Analysis

## Functionalities
- Plot CPU utilization graphs.
- Plot memory utilization graphs.
- Plot disk I/O utilization graphs.

## Input
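Log files are read from the experiment directory `../data/[EXPERIMENT_DIRNAME]`, where `EXPERIMENT_DIRNAME` is set in the notebook configuration below. This directory is assumed to have the following structure: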
```
logs/
  [node-1]/
    collectl.tar.gz
  ...
  [node-n]/
    collectl.tar.gz
```
Each `collectl.tar.gz` tarball contains gzip-compressed log files. The log file extension identifies the type of resource monitored:
- `.cpu.gz`: CPU monitoring log file.
- `.numa.gz`: memory monitoring log file.
- `.dsk.gz`: disk I/O monitoring log file.

## Notebook Configuration

In [None]:
########## GENERAL
# Experiment to analyze, given as the name of its directory inside `../data`.
EXPERIMENT_DIRNAME = "BuzzBlogBenchmark_2021-10-10-18-35-22"

########## CPU
# CPU metric shown in the CPU plots. Must be one of:
#   "user", "nice", "system", "wait", "irq", "soft", "steal", "idle", "total", "guest", "guest_n", "intrpt"
COLLECTL_CPU_METRIC = "total"

########## MEMORY
# Memory metric shown in the memory plots. Must be one of:
#   "used", "free", "slab", "mapped", "anon", "anonh", "inactive", "hits"
COLLECTL_MEM_METRIC = "free"

########## DISK I/O
# Disk metric shown in the disk I/O plots. Must be one of:
#   "reads", "rmerge", "rkbytes", "waitr", "writes", "wmerge", "wkbytes", "waitw", "request", "quelen", "wait",
#   "svctim", "util"
COLLECTL_DSK_METRIC = "quelen"

## Notebook Setup

In [None]:
# Import libraries.
%matplotlib inline
import datetime
import gzip
import matplotlib.pyplot as plt
import os
import pandas as pd
import tarfile
import warnings
warnings.filterwarnings("ignore")

# Constants
# Metric names for each type of log file. The log parser maps the position of a name in its list to the position of
# the corresponding value within a log line, so the order of these lists must not be changed.
CPU_METRICS = [
    "user", "nice", "system", "wait", "irq", "soft", "steal", "idle", "total", "guest", "guest_n", "intrpt",
]
MEM_METRICS = [
    "used", "free", "slab", "mapped", "anon", "anonh", "inactive", "hits",
]
DSK_METRICS = [
    "reads", "rmerge", "rkbytes", "waitr", "writes", "wmerge", "wkbytes", "waitw", "request", "quelen", "wait",
    "svctim", "util",
]

## Log Parsing

In [None]:
# Parse logs
def _parse_collectl_log(log_file, node_name, table, unit_key, metrics, origin, skipped_fields=0):
    """Append the samples of one collectl log file to `table`.

    Every line that is neither a comment nor blank is split on whitespace. Its first two fields are parsed as the
    date and time of the sample. The remaining fields are read as consecutive groups, one group per monitored unit
    (core, NUMA node or disk). Each group holds `skipped_fields` ignored leading fields followed by one value per
    entry of `metrics`, in that order.

    Args:
        log_file: Iterable of text lines of the decompressed log file.
        node_name: Value stored in the "node_name" column for every appended row.
        table: Dict of column lists with keys "node_name", `unit_key`, "timestamp", "metric" and "value". One row
            is appended per (sample, unit, metric).
        unit_key: Column of `table` that receives the 0-based index of the unit within the line.
        metrics: Metric names, ordered as their values appear within a group.
        origin: `datetime` that sample times are made relative to, or None to use the first sample of this file.
        skipped_fields: Number of leading fields of each group that carry no metric value.

    Returns:
        The origin in effect after parsing: `origin` itself if it was given, else the time of the first sample of
        this file, or None if the file had no sample.

    Raises:
        ValueError: If a sample time or a metric value cannot be parsed.
    """
    group_size = len(metrics) + skipped_fields
    for log in log_file:
        if log.startswith("#"):
            # Skip comments.
            continue
        log_entry = log.split()
        if not log_entry:
            # Skip blank lines; they carry no timestamp to parse.
            continue
        timestamp = datetime.datetime.strptime(" ".join(log_entry[:2]), "%Y%m%d %H:%M:%S.%f")
        if origin is None:
            origin = timestamp
        elapsed_seconds = (timestamp - origin).total_seconds()
        for unit_no in range((len(log_entry) - 2) // group_size):
            # Index of the first metric value of this unit: skip date, time, previous groups and ignored fields.
            first_value = 2 + skipped_fields + unit_no * group_size
            for (i, metric) in enumerate(metrics):
                table["node_name"].append(node_name)
                table[unit_key].append(unit_no)
                table["timestamp"].append(elapsed_seconds)
                table["metric"].append(metric)
                table["value"].append(float(log_entry[first_value + i]))
    return origin


cpu = {"node_name": [], "core_no": [], "timestamp": [], "metric": [], "value": []}
mem = {"node_name": [], "numa_node": [], "timestamp": [], "metric": [], "value": []}
dsk = {"node_name": [], "dsk_no": [], "timestamp": [], "metric": [], "value": []}
# Log file suffix -> (destination table, unit column, metric names, ignored leading fields per unit).
# Disk logs carry one extra leading field per disk (presumably the device name) that is not a metric.
log_kinds = {
    ".cpu.gz": (cpu, "core_no", CPU_METRICS, 0),
    ".numa.gz": (mem, "numa_node", MEM_METRICS, 0),
    ".dsk.gz": (dsk, "dsk_no", DSK_METRICS, 1),
}
logs_dirpath = os.path.join(os.pardir, "data", EXPERIMENT_DIRNAME, "logs")
# `os.listdir` returns entries in arbitrary order and may include stray files: keep directories only and sort them
# so that the plots are laid out in the same order on every run.
node_names = sorted(
    entry for entry in os.listdir(logs_dirpath) if os.path.isdir(os.path.join(logs_dirpath, entry)))
for node_name in node_names:
    # Time origin of the node: the first sample found in the first log file read from its tarball. It is shared by
    # the CPU, memory and disk logs so that their time axes are aligned.
    node_min_timestamp = None
    tarball_path = os.path.join(logs_dirpath, node_name, "collectl.tar.gz")
    with tarfile.open(tarball_path, "r:gz") as tar:
        for filename in tar.getnames():
            for (suffix, (table, unit_key, metrics, skipped_fields)) in log_kinds.items():
                if not filename.endswith(suffix):
                    continue
                # Close the stream extracted from the tarball as well as the gzip reader wrapped around it.
                with tar.extractfile(filename) as raw_log_file, gzip.open(raw_log_file, "rt") as log_file:
                    node_min_timestamp = _parse_collectl_log(log_file, node_name, table, unit_key, metrics,
                        node_min_timestamp, skipped_fields)
                # The suffixes are mutually exclusive.
                break

In [None]:
# Build data frames
# Replace each dict of column lists filled by the log parser with the equivalent data frame.
cpu, mem, dsk = [pd.DataFrame.from_dict(columns) for columns in (cpu, mem, dsk)]

## CPU Monitoring

In [None]:
# Plot CPU utilization
# One subplot per node, stacked vertically, with one line per core.
node_count = len(node_names)
fig = plt.figure(figsize=(24, node_count * 12))
for (position, node_name) in enumerate(node_names, start=1):
    # Keep only the samples of this node for the configured CPU metric.
    is_selected = (cpu["node_name"] == node_name) & (cpu["metric"] == COLLECTL_CPU_METRIC)
    # Reshape to one row per timestamp and one column per core.
    per_core = cpu[is_selected].groupby(["timestamp", "core_no"])["value"].mean().unstack()
    ax = fig.add_subplot(node_count, 1, position)
    ax.set_xlim(per_core.index.min(), per_core.index.max())
    ax.set_ylim(0, 100)
    ax.grid(alpha=0.75)
    per_core.plot(
        ax=ax,
        kind="line",
        title=f"{node_name} - CPU Utilization",
        xlabel="Time (seconds)",
        ylabel=f"{COLLECTL_CPU_METRIC} (%)",
        grid=True,
        legend=False,
        yticks=range(0, 101, 10),
    )

## Memory Monitoring

In [None]:
# Plot memory utilization
# One subplot per node, stacked vertically, with one line per NUMA node.
node_count = len(node_names)
fig = plt.figure(figsize=(24, node_count * 12))
for (position, node_name) in enumerate(node_names, start=1):
    # Keep only the samples of this node for the configured memory metric.
    is_selected = (mem["node_name"] == node_name) & (mem["metric"] == COLLECTL_MEM_METRIC)
    # Reshape to one row per timestamp and one column per NUMA node.
    per_numa_node = mem[is_selected].groupby(["timestamp", "numa_node"])["value"].mean().unstack()
    ax = fig.add_subplot(node_count, 1, position)
    ax.set_xlim(per_numa_node.index.min(), per_numa_node.index.max())
    # The y axis spans from zero to the largest plotted value.
    ax.set_ylim(0, per_numa_node.values.max())
    ax.grid(alpha=0.75)
    per_numa_node.plot(
        ax=ax,
        kind="line",
        title=f"{node_name} - Mem Utilization",
        xlabel="Time (seconds)",
        ylabel=f"{COLLECTL_MEM_METRIC}",
        grid=True,
    )

## Disk Monitoring

In [None]:
# Plot disk I/O utilization
# One subplot per node, stacked vertically, with one line per disk.
node_count = len(node_names)
fig = plt.figure(figsize=(24, node_count * 12))
for (position, node_name) in enumerate(node_names, start=1):
    # Keep only the samples of this node for the configured disk metric.
    is_selected = (dsk["node_name"] == node_name) & (dsk["metric"] == COLLECTL_DSK_METRIC)
    # Reshape to one row per timestamp and one column per disk.
    per_disk = dsk[is_selected].groupby(["timestamp", "dsk_no"])["value"].mean().unstack()
    ax = fig.add_subplot(node_count, 1, position)
    ax.set_xlim(per_disk.index.min(), per_disk.index.max())
    # The y axis spans from zero to the largest plotted value.
    ax.set_ylim(0, per_disk.values.max())
    ax.grid(alpha=0.75)
    per_disk.plot(
        ax=ax,
        kind="line",
        title=f"{node_name} - Disk I/O Utilization",
        xlabel="Time (seconds)",
        ylabel=f"{COLLECTL_DSK_METRIC}",
        grid=True,
    )