In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import re
from io import StringIO

#### Data Import and Cleaning!

In [None]:
# Re-delimit the raw CSV before parsing: only the first 12 commas on each line are
# swapped for '|', so any commas after the 12th (inside the last field) are left alone.
# Append [1000000:1005000] to the read_csv call below to work on a small subset.
for_pd = StringIO()
with open('../data/accre-jobs-2020.csv') as accre:
    for raw_line in accre:
        for_pd.write(raw_line.rstrip().replace(',', '|', 12) + '\n')
for_pd.seek(0)
accre_df = pd.read_csv(for_pd, sep='|')

In [None]:
# NODES and CPUS are cast to plain integers, one column at a time
for count_col in ('NODES', 'CPUS'):
    accre_df[count_col] = accre_df[count_col].astype(int)

##### Converting all times to seconds

In [None]:
## REQTIME / USEDTIME are handled here as "[D-]HH:MM:SS" strings (assumed format —
## TODO confirm against the raw data): the clock part is the trailing 8 characters and
## the optional day count is whatever precedes the first '-'.
accre_df['hours_min_sec_req'] = accre_df['REQTIME'].str[-8:]
accre_df['hours_min_sec_used'] = accre_df['USEDTIME'].str[-8:]

## Day count: the text before the first '-'; NaN when the value contains no '-'
accre_df['day_req'] = accre_df['REQTIME'].str.extract('(.*?)-', expand=False)
accre_df['day_used'] = accre_df['USEDTIME'].str.extract('(.*?)-', expand=False)

## A missing day prefix means zero days; integers are needed for the seconds arithmetic later
accre_df['day_req'] = accre_df['day_req'].fillna(0).astype(int)
accre_df['day_used'] = accre_df['day_used'].fillna(0).astype(int)

# Parse the clock strings into timedeltas and express them as seconds.
# `unit` is deliberately not passed: pandas rejects it when the input contains strings,
# and "HH:MM:SS" text is parsed correctly without it.
accre_df['hours_min_sec_req'] = pd.to_timedelta(accre_df['hours_min_sec_req']).dt.total_seconds()
accre_df['hours_min_sec_used'] = pd.to_timedelta(accre_df['hours_min_sec_used']).dt.total_seconds()

In [None]:
# Total duration in seconds = whole days (86,400 seconds each) plus the clock part
accre_df['total_sec_req'] = accre_df['day_req'].mul(86400).add(accre_df['hours_min_sec_req'])
accre_df['total_sec_used'] = accre_df['day_used'].mul(86400).add(accre_df['hours_min_sec_used'])

# Question 1:

While we have systematic checks in place to ensure the general system health of each compute node, we would like to use long-term data to see if there are any clusters of job failures on specific nodes. Do any of the production partition nodes show an unusual number of failed jobs relative to the others? Ignore the debug partition for this question.

In [None]:
# Keep only jobs whose STATE is FAILED and that ran on the production partition.
# reset_index() renumbers the rows and keeps the old row labels in an 'index' column.
is_failed = accre_df['STATE'].eq('FAILED')
in_production = accre_df['PARTITION'].eq('production')

accre_df_failures = accre_df[is_failed & in_production].reset_index()

In [None]:
# Let's see how much data we have now: info() reports the row count
# plus the dtype and non-null count of every column
accre_df_failures.info()

We started with 3,816,290 jobs in our dataset and are now down to 500 after narrowing it to failed jobs in the production partition. That is roughly 0.01% of all jobs.

In [None]:
# Count failed jobs per NODELIST value, then turn the counts into a two-column
# frame (NODELIST, FAILURES) that is convenient to plot
node_failure_counts = accre_df_failures['NODELIST'].value_counts()
failures_by_nodelist = node_failure_counts.rename_axis('NODELIST').reset_index(name='FAILURES')

In [None]:
# Distribution of the per-node failure counts
failures_by_nodelist.plot.hist(title='Failure Histogram', figsize=(10, 5));

In [None]:
# Box plot of the per-node failure counts to make outlying nodes visible
failures_by_nodelist.plot.box(color='red', title='Failures by Node');

In [None]:
# Summary statistics (count, mean, quartiles, max) for the per-node failure counts
failures_by_nodelist.describe()

In [None]:
# Node(s) with more than 3 failures are treated as outliers
# NOTE(review): the earlier wording said "3 or more"; the filter is strictly greater than 3 — confirm the intended cutoff
failures_by_nodelist.loc[failures_by_nodelist['FAILURES'].gt(3)]

It may be worth double checking these nodes!

In [None]:
# Bar chart of the first 16 rows of failures_by_nodelist
# (value_counts already ordered them from most to fewest failures)
top_failing_nodes = failures_by_nodelist.iloc[:16]
top_failing_nodes.plot.bar(
    x='NODELIST',
    y='FAILURES',
    title='Top 16 Failures by Node (Outliers)',
    color='green',
    figsize=(15, 5),
    rot=25,
    fontsize=12.5,
);

#### After investigating the reason for failure by Exit Code, it was found that every failure was due to user error. It seems the systematic checks are performing as expected. That said, we want to see which accounts have many failures from user error as a potential opportunity for further training

In [None]:
# Number of failed production jobs per account, as a bar chart
failures_per_account = accre_df_failures['ACCOUNT'].value_counts()
failures_per_account.plot.bar(title='Failure by Account', figsize=(10, 5), rot=75);

In [None]:
# Failure counts per account, drawn as a horizontal box plot to identify outliers
accre_df_failures_by_account = accre_df_failures['ACCOUNT'].value_counts()

red_square = dict(markerfacecolor='r', marker='s')
arrow_style = dict(facecolor='black', shrink=0.1, width=1)

# (label, arrow tip, label position). These coordinates are hand-placed for the
# current data and must be revisited if the per-account counts change.
outlier_labels = [
    ('cep', (127, 1.05), (126, 1.25)),
    ('plantain', (88, .95), (81, .75)),
    ('tips', (65, 1.05), (65, 1.25)),
]

plt.figure(figsize=(10, 5))
plt.title('Outliers from Failures By Account', fontsize=16)
plt.xlabel('Failures')
for label, arrow_tip, label_pos in outlier_labels:
    # The label is passed positionally: the old `s=` keyword was renamed to `text`
    # and later removed, so the positional form works on every Matplotlib version.
    plt.annotate(label, xy=arrow_tip, xytext=label_pos, fontsize=12,
                 arrowprops=dict(arrow_style))
plt.boxplot(accre_df_failures_by_account,
            vert=False,
            flierprops=red_square);

The accounts cep, plantain, and tips are failing the most due to user error. I'd suggest reaching out to them to see if they need additional help when running jobs!

# Question 2
The CMS collaboration has an automated job submission system that runs jobs as "cmslocal" and "cmspilot". For these two users, jobs have internal system tests that will terminate their jobs early after approximately 30 minutes. Do any of their jobs that ended in under an hour also cluster on specific compute nodes, suggesting possibly unreliable systems? Check both “production” and “nogpfs” partitions. Look for commonly failing nodes and compare with other failed jobs.

In [None]:
# Jobs of the cms account, outside the debug partition, submitted by the two
# automated users 'cmspilot' and 'cmslocal'
# NOTE(review): this keeps every non-debug partition, not just 'production' and 'nogpfs' — confirm that is intended
from_cms_account = accre_df['ACCOUNT'].eq('cms')
outside_debug = accre_df['PARTITION'].ne('debug')
automated_user = accre_df['USER'].isin(['cmspilot', 'cmslocal'])
cms_df = accre_df[from_cms_account & outside_debug & automated_user]

# how many CMS jobs there are before the runtime filter (includes jobs over an hour)
print(cms_df.shape)

# keep only jobs that used at most an hour (3600 seconds)
cms_df = cms_df[cms_df['total_sec_used'].le(3600)]

# split on STATE rather than on exit code: the early termination is built into
# the CMS jobs, so it might not be reflected in the exit code
finished_ok = cms_df['STATE'].eq('COMPLETED')
cms_failed = cms_df[~finished_ok]
cms_completed = cms_df[finished_ok]

#### EDA and Analysis

In [None]:
# Compare STATE with EXITCODE: each row (exit code) shows the share of its jobs in every state
exit_state_counts = pd.crosstab(cms_df['EXITCODE'], cms_df['STATE'])
exit_state_counts.div(exit_state_counts.sum(axis=1), axis=0)

In [None]:
# Plot the distribution of runtimes (in seconds) to find the spike
# where jobs are being cancelled at around 30 minutes
cms_df.hist(column = 'total_sec_used');

In [None]:
# Jobs in the spike around 30 minutes: runtime strictly between 1000 and 1500 seconds
cms_jobs_ended_around_30_min = cms_df[(cms_df['total_sec_used'] < 1500) & (cms_df['total_sec_used'] > 1000)]

# cms_df has already been narrowed to jobs of at most an hour, so the full
# cmspilot/cmslocal population is rebuilt from accre_df instead of hard-coding its shape
cms_all_jobs = accre_df[
    (accre_df['ACCOUNT'] == 'cms')
    & (accre_df['PARTITION'] != 'debug')
    & accre_df['USER'].isin(['cmspilot', 'cmslocal'])
]

print("Number of CMS Jobs Total:")
print(cms_all_jobs.shape)
print("Number of CMS Jobs Under Hour:")
print(cms_df.shape)
print("Number of CMS Jobs Ended Around 30 Minutes:")
print(cms_jobs_ended_around_30_min.shape)

In [None]:
# What percent of jobs end around 30 minutes? The counts are taken from the data
# instead of being typed in, so the figures stay correct when the input changes.
def _percent(part, whole):
    """Return part/whole as a percentage rounded to 2 decimals (NaN when whole is 0)."""
    return round((part / whole) * 100, 2) if whole else float('nan')

n_around_30_min = len(cms_jobs_ended_around_30_min)
n_under_hour = len(cms_df)  # cms_df only holds jobs that used at most an hour
# all non-debug jobs of the cms account run by cmspilot/cmslocal, whatever their runtime
n_cms_total = int((
    (accre_df['ACCOUNT'] == 'cms')
    & (accre_df['PARTITION'] != 'debug')
    & accre_df['USER'].isin(['cmspilot', 'cmslocal'])
).sum())

print(str(_percent(n_around_30_min, n_cms_total)) + "% are ending around 30 minutes\nout of total CMS jobs (cmspilot/cmslocal users)")
print("\n")
print(str(_percent(n_around_30_min, n_under_hour)) + "% are ending around 30 minutes\nout of jobs ending in less than an hour")

#### Most failed nodes for CMS

In [None]:
# The 20 nodes with the most failed CMS jobs. The counts come from cms_failed
# (STATE != 'COMPLETED'); counting cms_df would tally every sub-hour job,
# completed ones included, which is usage rather than failures.
most_failed_nodes = cms_failed['NODELIST'].value_counts().head(20)
ax = most_failed_nodes.plot(kind = 'bar', figsize = (8,6))
ax.set_title('CMS Account: Nodes that Failed the Most',weight='bold', size='large')
ax.set_xlabel('Node')
ax.set_ylabel('Number of Times Failed');

In [None]:
# Number of failed (STATE != 'COMPLETED') jobs per node, five most frequent nodes shown;
# these counts feed the percent-failed table
cms_failed['NODELIST'].value_counts().head()

In [None]:
# Number of COMPLETED jobs per node, five most frequent nodes shown;
# these counts feed the percent-failed table
cms_completed['NODELIST'].value_counts().head()

In [None]:
# Table of the top nodes for the percent-failed comparison.
# One record per node: (node name, times failed, total times used).
# NOTE(review): the figures are typed in by hand (presumably from the value_counts
# outputs) and will not update if the data changes.
node_tallies = [
    ('ng518', 16, 19352),
    ('cn1314', 12, 12),
    ('cn394', 11, 347),
    ('ng734', 10, 4138),
    ('cn475', 9, 364),
    ('cn1094', 9, 397),
    ('ng1112', 9, 6171),
    ('cn449', 9, 400),
    ('cn1121', 9, 465),
    ('cn304', 9, 298),
    ('cn1394', 8, 408),
    ('cn408', 8, 446),
    ('cn1387', 8, 249),
    ('cn399', 8, 415),
    ('cn363', 8, 339),
    ('cn429', 8, 306),
    ('cn1398', 8, 550),
    ('cn312', 8, 407),
]
node_names, failed_counts, usage_counts = (list(column) for column in zip(*node_tallies))
CMS_nodes = {
    'NODE': node_names,
    'TIMES_FAILED': failed_counts,
    'TOTAL_TIMES_USED': usage_counts,
}
CMS_nodes_df = pd.DataFrame(CMS_nodes, columns=list(CMS_nodes))

In [None]:
# Share of each node's jobs that failed, as a percentage
CMS_nodes_df['PERCENT_FAILED'] = CMS_nodes_df['TIMES_FAILED'].div(CMS_nodes_df['TOTAL_TIMES_USED']).mul(100)

In [None]:
# Percent failed per node, lowest to highest. One node dominates the scale,
# so removing it may make the others easier to compare.
nodes_by_percent = CMS_nodes_df.sort_values('PERCENT_FAILED')
ax = nodes_by_percent.plot(kind='bar', x='NODE', y='PERCENT_FAILED', figsize=(8, 6))
ax.set_title('Percent Node Failed for CMS Account', weight='bold', size='large')
ax.set(xlabel='Node', ylabel='Percent of Times Failed');

In [None]:
# Same chart with node cn1314 left out so the remaining nodes are easier to compare
CMS_nodes_without_cn1314 = CMS_nodes_df.loc[CMS_nodes_df['NODE'].ne('cn1314')]
remaining_by_percent = CMS_nodes_without_cn1314.sort_values('PERCENT_FAILED')
ax = remaining_by_percent.plot(kind='bar', x='NODE', y='PERCENT_FAILED', figsize=(8, 6))
ax.set_title('Percent Node Failed for CMS Account (without Outlier CN1314)', weight='bold', size='large')
ax.set(xlabel='Node', ylabel='Percent of Times Failed');

In [None]:
# The five nodes with the highest percentage of failed jobs
CMS_nodes_df.sort_values('PERCENT_FAILED', ascending = False).head()

# Question 3
What groups are best optimizing their memory usage in terms of percent of actual memory used of the memory requested for a job? What is the average percent for each group?

In [None]:
# Memory analysis only looks at jobs with exit code 0:0 on the production partition
clean_exit = accre_df['EXITCODE'].eq('0:0')
on_production = accre_df['PARTITION'].eq('production')
accre_df_mem = accre_df[clean_exit & on_production]

In [None]:
# Remove the columns the memory analysis does not use
unused_columns = [
    'JOBID', 'USER', 'REQTIME', 'USEDTIME',
    'hours_min_sec_req', 'hours_min_sec_used',
    'day_req', 'day_used', 'NODELIST',
]
accre_df_mem = accre_df_mem.drop(columns=unused_columns)

In [None]:
# Pull the numeric text out of the memory strings. Rows that do not match a pattern get NaN.
#   RMPN - requested memory per node  (REQMEM ending in 'Mn')
#   RMPC - requested memory per core  (REQMEM ending in 'Mc')
#   UM   - used memory                (USEDMEM ending in 'M')
memory_patterns = {
    'RMPN': ('REQMEM', '(.*)Mn$'),
    'RMPC': ('REQMEM', '(.*)Mc$'),
    'UM': ('USEDMEM', '(.*)M$'),
}
for new_col, (source_col, pattern) in memory_patterns.items():
    accre_df_mem[new_col] = accre_df_mem[source_col].str.extract(pattern)

In [None]:
# Rows that did not match a pattern hold NaN; replace those with the string '0'
for mem_col in ('RMPC', 'RMPN', 'UM'):
    accre_df_mem[mem_col] = accre_df_mem[mem_col].fillna('0')

In [None]:
# The extracted memory values are text; convert them to floats for arithmetic
for mem_col in ('RMPC', 'RMPN', 'UM'):
    accre_df_mem[mem_col] = accre_df_mem[mem_col].astype(str).astype(float)

In [None]:
# Put the per-node request on a per-core basis:
# divide it by the number of cores per node (CPUS / NODES)
cores_per_node = accre_df_mem['CPUS'] / accre_df_mem['NODES']
accre_df_mem['RMPN'] = accre_df_mem['RMPN'] / cores_per_node

In [None]:
# Fold the converted per-node request into RMPC so one column holds the per-core
# request for every row (the 'Mn' and 'Mc' patterns cannot both match a value,
# so at most one of the two terms comes from a real request)
accre_df_mem['RMPC'] = accre_df_mem['RMPC'].add(accre_df_mem['RMPN'])

# RMPN is no longer needed once merged
accre_df_mem = accre_df_mem.drop(columns='RMPN')

In [None]:
# Scale used memory down by the CPU count and then by the node count
# NOTE(review): the request was divided by cores per node (CPUS / NODES), whereas this
# divides by CPUS *and* NODES — confirm the units of USEDMEM and that both columns
# end up on the same per-core basis
accre_df_mem['UM'] = accre_df_mem['UM'].div(accre_df_mem['CPUS']).div(accre_df_mem['NODES'])

In [None]:
# PRU: used memory as a percentage of requested memory
accre_df_mem['PRU'] = accre_df_mem['UM'].div(accre_df_mem['RMPC']).mul(100)

In [None]:
# Quick look at the first rows to sanity-check the new memory columns
accre_df_mem.head()

In [None]:
# Average percent of requested memory actually used, per account, best first
mean_pru_by_account = accre_df_mem.groupby('ACCOUNT')['PRU'].mean()
mean_pru_by_account.sort_values(ascending=False)

# Question 4
Optimizing memory is more important for longer-running jobs than for shorter-running jobs, as the resources are tied up for longer. If jobs are weighted by runtime, what is the average percent of memory used of the requested memory for each group?

In [None]:
def wavg(group, avg_name, weight_name):
    """Return the weighted average of one column, weighted by another.

    Adapted from
    http://stackoverflow.com/questions/10951341/pandas-dataframe-aggregate-function-using-multiple-columns

    Parameters
    ----------
    group : DataFrame
        Frame (or groupby chunk) holding both columns.
    avg_name : str
        Name of the column to average.
    weight_name : str
        Name of the column holding the weights.

    Returns
    -------
    float
        sum(value * weight) / sum(weight) over the rows where both the value and
        the weight are present. When the weights sum to zero (including the case
        of no usable rows) the plain mean of the value column is returned instead.
    """
    values = group[avg_name]
    weights = group[weight_name]

    # Only rows with both a value and a weight may contribute; otherwise a missing
    # value is skipped in the numerator while its weight still inflates the denominator.
    usable = values.notna() & weights.notna()
    values = values[usable]
    weights = weights[usable]

    total_weight = weights.sum()
    # Explicit check: dividing numeric pandas/NumPy sums by zero yields nan/inf
    # instead of raising ZeroDivisionError, so an except clause never fires.
    if total_weight == 0:
        return group[avg_name].mean()
    return (values * weights).sum() / total_weight

In [None]:
# Question 4 asks for the runtime-weighted average for each group, so wavg is
# applied per ACCOUNT (the same grouping used for the unweighted averages)
# rather than once over the whole frame.
(
    accre_df_mem
    .groupby('ACCOUNT')[['PRU', 'total_sec_used']]
    .apply(wavg, 'PRU', 'total_sec_used')
    .sort_values(ascending=False)
)