In [None]:
# Collected job data from LISA-GPU using the following command:
# sacct -a --starttime 2020-01-01 --format=jobid,gid,uid,partition,submit,start,end,elapsedraw,cputimeraw,ncpus,nnodes,nodelist,exitcode,state,timelimit > $HOME/jobdata.csv

# Required packages for processing
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import time, datetime, pytz
from matplotlib.ticker import MultipleLocator, FixedLocator, LogLocator, NullFormatter
from datetime import date, datetime, time

%matplotlib inline

In [None]:
# Required preprocessing/parsing of the job data
def preprocess_jobdata_to_df(name):
    """Parse a whitespace-aligned job accounting dump into a typed DataFrame.

    The raw text is first copied to a sibling file named
    ``processed_<basename>`` in which every ``None assigned`` is rewritten to
    ``NoneAssigned`` so that the value no longer contains a blank. That copy
    is then parsed with ``pd.read_fwf``.

    Parameters
    ----------
    name : str or path-like
        Path of the raw dump. It may contain a directory component; the
        processed copy is written next to the input file.

    Returns
    -------
    pandas.DataFrame
        The records of the file with columns named after its first line.
        The first two lines of the file are not part of the data. The columns
        ``ElapsedRaw``, ``CPUTimeRAW`` and ``NCPUS`` are cast to ``int``; all
        other columns are left as parsed (text columns are whitespace-stripped).

    Raises
    ------
    OSError
        If the input cannot be read or the processed copy cannot be written.
    KeyError
        If one of the three numeric columns is missing from the header line.
    ValueError
        If a value in one of the three numeric columns is not an integer.
    """
    import os  # local import: only needed to build the output path

    # Prefix the *file name*, not the whole path, so that inputs such as
    # "data/jobdata.csv" produce "data/processed_jobdata.csv".
    name = os.fspath(name)
    directory, basename = os.path.split(name)
    processed_name = os.path.join(directory, 'processed_' + basename)

    with open(name, 'r') as source:
        filedata = source.read().replace('None assigned', 'NoneAssigned')
    with open(processed_name, 'w') as target:
        target.write(filedata)

    # NOTE(review): pandas documents read_fwf's `delimiter` as a set of filler
    # characters rather than a regular expression, so r"\s+" may not mean what
    # it looks like. It is left untouched here to keep the parsing identical.
    jobdata = pd.read_fwf(processed_name, delimiter=r"\s+", header=None)

    # Strip padding from every text column. Besides the classic object dtype
    # this also covers pandas' dedicated string dtype, which newer pandas
    # versions may infer for text columns.
    jobdata = jobdata.apply(
        lambda col: col.str.strip()
        if col.dtype == "object" or isinstance(col.dtype, pd.StringDtype)
        else col
    )

    # Row 0 carries the column names. Row 1 is skipped as well (presumably the
    # dashed separator line printed under the header - confirm against the dump).
    header = jobdata.iloc[0]
    jobdata = jobdata.rename(columns=header).iloc[2:]

    jobdata = jobdata.astype({"ElapsedRaw": int, "CPUTimeRAW": int, "NCPUS": int})
    return jobdata

In [None]:
# Import job data
# Parses 'jobdata.csv' from the working directory with the helper defined in the
# previous cell and binds the result to the notebook-wide name `jobdata`, which
# the filtering and per-day cells further down read from.
jobdata = preprocess_jobdata_to_df('jobdata.csv')
# Plain-text dump of the parsed frame as a quick sanity check.
print(jobdata)

In [None]:
# Show the dtype of every parsed column; as the last expression of the cell it
# is rendered as the cell output. ElapsedRaw, CPUTimeRAW and NCPUS were cast to
# int by the preprocessing helper.
jobdata.dtypes

In [None]:
# Some extra preprocessing required for the job data, e.g. filtering out irrelevant login nodes
# Step 1: keep jobs whose Start value lies inside the study window. The column
# is still text at this point, so the bounds are compared lexicographically.
filtered_jobs = jobdata[
    (jobdata["Start"] >= '2020-01-01 00:00:00')
    & (jobdata["Start"] <= '2020-08-05 00:00:00')
]
# Step 2: drop every job whose NodeList mentions "None", "software" or "login".
# The explicit .copy() gives an independent frame, so the column assignment
# below does not write into a slice of `jobdata` (SettingWithCopyWarning).
filtered_jobs = filtered_jobs[
    ~filtered_jobs["NodeList"].str.contains("None|software|login")
].copy()
# Step 3: convert the elapsed time to hours. Vectorised, and kept as two
# divisions by 60 so the resulting floats match the previous per-row lambda.
filtered_jobs["ElapsedRaw"] = filtered_jobs["ElapsedRaw"] / 60 / 60
print(filtered_jobs)

In [None]:
# Generate array containing counts of job length
# `counts` is the 2-tuple returned by np.unique: counts[0] holds the distinct
# ElapsedRaw values of `filtered_jobs` (sorted), counts[1] the number of jobs
# that have each of those values. The plotting cells below index into it.
counts = np.unique(filtered_jobs["ElapsedRaw"].values, return_counts=True)
print(counts)

In [None]:
# Visualize results
# Bar chart of how many jobs share each distinct duration, on a log y-axis.
fig = plt.figure()
ax = fig.gca()
ax.set_yscale("log")
ax.locator_params(axis='y', numticks=12)
# counts is (distinct durations, jobs per duration); each bar is one unit wide.
ax.bar(*counts, width=1)
ax.set_xlabel("Job duration (hours)")
ax.set_ylabel("Jobs")
fig.tight_layout()
fig.savefig("job_duration_count.pdf")

In [None]:
# Generate PDF/CDF
# pdf: share of all jobs per distinct duration; cdf: running total of that share.
# Both names are reused by the combined PDF/CDF cell that follows.
pdf = counts[1] / counts[1].sum()
cdf = pdf.cumsum()

fig = plt.figure()
ax = fig.gca()
ax.set_ylim(0, 1.05)
ax.plot(counts[0], cdf, drawstyle='steps')
ax.set_xscale("log")
ax.set_xlabel("Job duration (hours)")
ax.set_ylabel("Fraction of total jobs")
fig.tight_layout()
fig.savefig("job_duration_cdf.pdf")

In [None]:
# Combined figure: PDF on the left y-axis, CDF on a twin right y-axis, both
# over the distinct job durations on a shared logarithmic x-axis.
fig, ax1 = plt.subplots()

# Left axis (black): PDF.
ax1.set_xlabel('Job duration (hours)')
ax1.set_ylabel('PDF')
ax1.set_ylim(-0.05, 1.05)
ax1.set_xscale("log")
ax1.plot(counts[0], pdf, color='black', drawstyle='steps')
ax1.tick_params(axis='y')

# Right axis (blue): CDF, drawn on a second Axes that shares the x-axis.
ax2 = ax1.twinx()
color = 'tab:blue'
ax2.set_ylabel('CDF - Fraction of total jobs', color=color)
ax2.plot(counts[0], cdf, color=color, drawstyle='steps')
ax2.tick_params(axis='y', labelcolor=color)
ax2.set_ylim(-0.05, 1.05)

fig.tight_layout()
plt.show()
fig.savefig("job_duration_cdf_pdf.pdf")

In [None]:
# Generate array containing counts of cpu core allocation for jobs
# `counts_cpus` is the 2-tuple returned by np.unique: counts_cpus[0] holds the
# distinct NCPUS values of `filtered_jobs` (sorted), counts_cpus[1] the number
# of jobs with each of those values.
counts_cpus = np.unique(filtered_jobs["NCPUS"].values, return_counts=True)
print(counts_cpus)

In [None]:
# Visualize results
# Main panel: number of jobs per distinct core count, log y-axis.
fig, ax = plt.subplots()

ax.set_yscale("log")
ax.set_ylim(bottom=1)
ax.set_ylim(top=10**6.5)
ax.bar(counts_cpus[0], counts_cpus[1], width=1)
ax.set_xlabel("CPU cores in job")

# Major ticks every 100 cores, minor ticks every 10 starting just below the
# smallest observed core count.
majors = [0, 100, 200, 300, 400, 500, 600]
ax.xaxis.set_major_locator(FixedLocator(majors))
ax.xaxis.set_minor_locator(FixedLocator(np.arange(counts_cpus[0].min() - 1, 600, 10)))

ax.set_ylabel("Jobs")
fig.tight_layout()

# Inset panel (placed in figure coordinates) showing only the first 23
# distinct core counts with thinner bars.
a = fig.add_axes([.495, .55, .45, .35], facecolor='lightgrey')
a.bar(counts_cpus[0][:23], counts_cpus[1][:23], width=0.3)
a.set_title('Zoomed-in view')
a.set_yscale("log")
majors_2 = [0, 10, 20, 30, 40, 50]
a.xaxis.set_major_locator(FixedLocator(majors_2))
a.xaxis.set_minor_locator(FixedLocator(np.arange(counts_cpus[0].min() - 1, 50, 1)))

plt.show()
fig.savefig("job_cpus_count_zoom.pdf")

In [None]:
# Plot PDF/CDF
# Rebinds `pdf` and `cdf` to the core-count distribution: share of all jobs per
# distinct core count, and the running total of that share.
pdf = counts_cpus[1] / counts_cpus[1].sum()
cdf = pdf.cumsum()

fig = plt.figure()
ax = fig.gca()
ax.set_ylim(0, 1.05)
ax.plot(counts_cpus[0], cdf, drawstyle='steps')
ax.set_xscale("symlog")
ax.set_xlabel("CPU cores in job")
ax.set_ylabel("Fraction of total jobs")
fig.tight_layout()
fig.savefig("job_cpus_cdf.pdf")

In [None]:
# Note there seems to be some issue with the plotting library, which does not seem to show every first
# minor tick after a major tick on log/symlog scales, even when this is hard-coded like below
#
# Combined figure: PDF on the left y-axis, CDF on a twin right y-axis, both
# over the distinct core counts on a shared symlog x-axis.
fig, ax1 = plt.subplots()

ax1.set_xlabel('Number of cores in job')
ax1.set_ylabel('PDF')
ax1.set_ylim(-0.05, 1.05)
# The minor-tick positions are passed as `subs`: the axis-specific spelling
# `subsx` was deprecated in matplotlib 3.3 and later removed, so it raises on
# current releases.
ax1.set_xscale("symlog", subs=[1, 2, 3, 4, 5, 6, 7, 8, 9])
ax1.plot(counts_cpus[0], pdf, color='black', drawstyle='steps')

# Right axis (blue): CDF, drawn on a second Axes that shares the x-axis.
ax2 = ax1.twinx()

color = 'tab:blue'
ax2.set_ylabel('CDF - Fraction of total jobs', color=color)
ax2.plot(counts_cpus[0], cdf, color=color, drawstyle='steps')
ax2.tick_params(axis='y', labelcolor=color)
ax2.set_ylim(-0.05, 1.05)

fig.tight_layout()
plt.show()
fig.savefig("job_cpus_cdf_pdf.pdf")

In [None]:
# Generate list of dates in dataset, again excluding any login nodes from the data
# Step 1: keep jobs whose Submit value lies inside the study window. The column
# is still text here, so the bounds are compared lexicographically.
group_date = jobdata[
    (jobdata["Submit"] >= '2020-01-01 00:00:00')
    & (jobdata["Submit"] <= '2020-08-05 00:00:00')
]
# Step 2: drop jobs whose NodeList mentions "None", "software" or "login", then
# parse Submit into UTC timestamps and index by it. `.assign` builds a new
# frame, so nothing is written into a slice of `jobdata`
# (avoids SettingWithCopyWarning).
group_date = (
    group_date[~group_date["NodeList"].str.contains("None|software|login")]
    .assign(Submit=lambda jobs: pd.to_datetime(jobs["Submit"], utc=True))
    .set_index("Submit")
)
# Step 3: count rows per calendar day. Only days that have at least one
# submission appear in the result.
group_date = group_date.groupby(group_date.index.date).count()
datelist = list(group_date.index.values)
submissionlist = group_date["JobID"].tolist()

In [None]:
# Per-day total of ElapsedRaw * NCPUS over all jobs submitted on that day.
# Step 1: keep jobs whose Submit value (still text) lies inside the study window.
group_date = jobdata[
    (jobdata["Submit"] >= '2020-01-01 00:00:00')
    & (jobdata["Submit"] <= '2020-08-05 00:00:00')
]
# Step 2: drop jobs whose NodeList mentions "None", "software" or "login", add
# the per-job product and index by the parsed UTC submit time. `.assign`
# builds a new frame, so nothing is written into a slice of `jobdata`
# (avoids SettingWithCopyWarning).
group_date = (
    group_date[~group_date["NodeList"].str.contains("None|software|login")]
    .assign(
        squashed_area=lambda jobs: jobs["ElapsedRaw"] * jobs["NCPUS"],
        Submit=lambda jobs: pd.to_datetime(jobs["Submit"], utc=True),
    )
    .set_index("Submit")
)
# resample('D') yields one entry for EVERY day between the first and the last
# submission, including days without any job (sum 0).
daily_squashed_area = group_date['squashed_area'].resample('D').sum()

# NOTE(review): ElapsedRaw in `jobdata` is still the raw value (the division by
# 3600 was applied to `filtered_jobs` only), so these totals look like
# core-seconds although the axis label says CPU-Hours. The y-limits below are
# tuned to the current values - confirm the intended unit before changing either.

# Plot CPU-Hours per day
fig = plt.figure()
plt.yscale("log")
plt.ylim(bottom=1)
plt.ylim(top=10**9.5)
plt.locator_params(axis='y', numticks=12)
# Take the x positions from the resampled series itself. The list of dates from
# the per-day submission count skips days without submissions, so its length
# can differ from the resampled series and make bar() fail.
plt.bar(daily_squashed_area.index.date, daily_squashed_area, width=1)
plt.xlabel("Date")
plt.ylabel("CPU-Hours")
plt.tight_layout()
fig.savefig("job_squashed_area.pdf")

In [None]:
# Generate the same plot with a broken/interrupted axis
f, (ax, ax2) = plt.subplots(2, 1, sharex=True)

# Take the x positions from the resampled series itself: it has one entry per
# calendar day, whereas the date list built from the per-day submission count
# skips days without submissions and can therefore have a different length.
days = daily_squashed_area.index.date

# plot the same data on both axes
ax.bar(days, daily_squashed_area, width=1)
ax2.bar(days, daily_squashed_area, width=1)

ax.set_yscale("log")
ax2.set_yscale("log")

# zoom-in / limit the view to different portions of the data
ax.set_ylim(10**7, 10**9.25)
ax2.set_ylim(10**0, 10**3)

# hide the spines between ax and ax2
ax.spines['bottom'].set_visible(False)
ax2.spines['top'].set_visible(False)
ax2.xaxis.tick_bottom()

d = .015  # how big to make the diagonal lines in axes coordinates
kwargs = dict(transform=ax.transAxes, color='k', clip_on=False)
ax.plot((-d, +d), (-d, +d), **kwargs)        # top-left diagonal
ax.plot((1 - d, 1 + d), (-d, +d), **kwargs)  # top-right diagonal

kwargs.update(transform=ax2.transAxes)  # switch to the bottom axes
ax2.plot((-d, +d), (1 - d, 1 + d), **kwargs)  # bottom-left diagonal
ax2.plot((1 - d, 1 + d), (1 - d, 1 + d), **kwargs)  # bottom-right diagonal

# NOTE(review): the plotted totals are ElapsedRaw * NCPUS with ElapsedRaw taken
# from `jobdata` unconverted, so they look like core-seconds although the label
# says CPU-Hours - confirm the intended unit (the y-limits above are tuned to
# the current values).
# Labels go on the bottom panel, which is the current axes after plt.subplots.
ax2.set_xlabel("Date")
ax2.set_ylabel("CPU-Hours")
f.tight_layout()
f.savefig("job_squashed_area_split.pdf")

In [None]:
# Plot daily number of job submissions
# One bar per day that has at least one submission (`datelist`), with the
# number of submissions on that day (`submissionlist`) as its height.
fig = plt.figure()
# The minor-tick positions are passed as `subs`: the axis-specific spelling
# `subsy` was deprecated in matplotlib 3.3 and later removed, so it raises on
# current releases.
plt.yscale("symlog", subs=[1, 2, 3, 4, 5, 6, 7, 8, 9])
plt.ylim(bottom=1)
plt.ylim(top=10**5.5)
plt.locator_params(axis='y', numticks=12)
plt.bar(datelist, submissionlist, width=1)
plt.xlabel("Date")
plt.ylabel("Number of job submissions")
plt.tight_layout()
fig.savefig("job_submission_count.pdf")