In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import re
from io import StringIO

In [None]:
# Allow up to 500 rows to be rendered before pandas truncates the display.
pd.options.display.max_rows = 500

In [None]:
# Rewrite the CSV in memory so that fields are separated by '|' instead of ','.
# Only the first 12 commas of each line are replaced (presumably because the
# final field can itself contain commas -- TODO confirm); the result is then
# parsed as a '|'-delimited file.
for_pd = StringIO()
with open('../data/accre-jobs-2020.csv') as accre:
    for raw_line in accre:
        for_pd.write(raw_line.rstrip().replace(',', '|', 12) + '\n')
for_pd.seek(0)  # rewind so read_csv starts at the first line
accre_df = pd.read_csv(for_pd, sep='|')
# To work on a subset, append e.g. [1000000:1005000] to the read_csv call above.

In [None]:
# Preview the first rows to check that the columns were split as intended.
accre_df.head()

In [None]:
# Preview the last rows as well, to check the end of the file parsed cleanly.
accre_df.tail()

In [None]:
# Column dtypes and non-null counts straight after loading.
accre_df.info()

#### Need to change NODES and CPUS to integers

In [None]:
# Cast the node and CPU counts to plain integers.
for count_col in ('NODES', 'CPUS'):
    accre_df[count_col] = accre_df[count_col].astype(int)

In [None]:
# Re-check the dtypes to confirm NODES and CPUS are now integers.
accre_df.info()

#### We also need to convert the times to total seconds 

In [None]:
## Clock part: the last eight characters of each __TIME string
## (the hours, minutes and seconds).
for clock_col, time_col in (('hours_min_sec_req', 'REQTIME'),
                            ('hours_min_sec_used', 'USEDTIME')):
    accre_df[clock_col] = accre_df[time_col].str.slice(-8)

## Day part: whatever precedes the first '-' in the __TIME string.
## Rows without a '-' yield a null, which is replaced by 0 days.
for day_col, time_col in (('day_req', 'REQTIME'),
                          ('day_used', 'USEDTIME')):
    accre_df[day_col] = accre_df[time_col].str.extract('(.*?)-').fillna(0)

In [None]:
# The day counts must be integers before they can be scaled to seconds.
for day_col in ('day_req', 'day_used'):
    accre_df[day_col] = accre_df[day_col].astype(int)

In [None]:
# Convert the hours/minutes/seconds strings to a number of seconds (float) by
# parsing them as timedeltas and taking dt.total_seconds().
#
# `unit` is deliberately NOT passed to pd.to_timedelta: the strings already
# carry their own units, and current pandas raises
# "unit must not be specified if the input contains a str" when it is given.
for hms_col in ('hours_min_sec_req', 'hours_min_sec_used'):
    # Skip columns that are already numeric so re-running this cell does not
    # reinterpret the stored seconds as nanoseconds (the default numeric unit).
    if pd.api.types.is_numeric_dtype(accre_df[hms_col]):
        continue
    accre_df[hms_col] = pd.to_timedelta(accre_df[hms_col]).dt.total_seconds()

In [None]:
# Check the converted clock columns on the first rows.
accre_df.head()

In [None]:
# Total duration in seconds = whole days * 86400 + the clock part in seconds.
for total_col, day_col, clock_col in (
    ('total_sec_req', 'day_req', 'hours_min_sec_req'),
    ('total_sec_used', 'day_used', 'hours_min_sec_used'),
):
    accre_df[total_col] = accre_df[day_col] * 86400 + accre_df[clock_col]

In [None]:
# Check the new total_sec_* columns on the first rows.
accre_df.head()

In [None]:
# Number of rows per ACCOUNT value, most frequent first.
accre_df['ACCOUNT'].value_counts()

In [None]:
# Number of rows per STATE value, most frequent first.
accre_df['STATE'].value_counts()

In [None]:
# Number of rows per PARTITION value, most frequent first.
accre_df['PARTITION'].value_counts()

#### Do any of the production partition nodes show an unusual number of failed jobs relative to the others? (Ignore Debug Partition)

Let's look at failures by node in the production partition.

In [None]:
# Keep only the rows whose STATE is exactly 'FAILED' and whose PARTITION is
# 'production'. reset_index() renumbers the rows and keeps the previous index
# as an 'index' column.
is_failed = accre_df['STATE'] == 'FAILED'
in_production = accre_df['PARTITION'] == 'production'
accre_df_failures = accre_df.loc[is_failed & in_production].reset_index()

accre_df_failures.head(10)

In [None]:
# Summary of the filtered frame; its row count is the number of failed production jobs.
accre_df_failures.info()

We started with 3,816,290 jobs in our dataset and are now down to 395 after keeping only failed jobs in the production partition. That is a failure rate of about 0.01% of all jobs in the dataset.

In [None]:
# The 20 most frequent NODELIST values among the failed production jobs.
accre_df_failures['NODELIST'].value_counts().head(20)

In [None]:
# Turn the per-NODELIST failure counts into a two-column frame:
# the index becomes the NODELIST column and the counts become COUNTS.
nodelist_counts = accre_df_failures['NODELIST'].value_counts()
failures_by_nodelist = (
    nodelist_counts
    .rename_axis('NODELIST')
    .reset_index(name='COUNTS')
)
failures_by_nodelist.head()

In [None]:
# Average number of failures per distinct NODELIST value.
failures_by_nodelist['COUNTS'].mean()

The average number of failures per node in this list is 1.47.

In [None]:
# Histogram of the failure counts; the trailing ';' hides the Axes repr.
failures_by_nodelist.plot.hist(title='Failure Histogram', figsize=(10, 5));

As you can see from the histogram, we have outliers off to the right with some others that are failing more regularly

In [None]:
# Box plot of the failure counts, to make the outliers visible.
failures_by_nodelist.plot.box();

Nodes that fail more than 3 times are considered outliers!

In [None]:
# Rows whose failure count is strictly greater than 3.
failures_by_nodelist.loc[failures_by_nodelist['COUNTS'].gt(3)]

In [None]:
# Bar chart of the first 16 rows of failures_by_nodelist
# (value_counts() returned them sorted by count, highest first).
top_failing_nodes = failures_by_nodelist.head(16)
top_failing_nodes.plot.bar(
    x='NODELIST',
    y='COUNTS',
    title='Top 16 Failures by Node (Outliers)',
    color='green',
    figsize=(15, 5),
    rot=25,
    fontsize=12.5,
);

Let's look at cn1273 specifically

In [None]:
# Failed production jobs whose NODELIST is exactly 'cn1273'.
on_cn1273 = accre_df_failures['NODELIST'].eq('cn1273')
cn1273_df = accre_df_failures.loc[on_cn1273]
cn1273_df

According to the Exit Codes, all of the jobs on cn1273 failed due to user error! Perhaps we should only look into failures that occurred due to the job or node (ex: Exit Code 0:1).

In [None]:
# Keep only the failures whose EXITCODE string begins with '0:'.
accre_df_failures[accre_df_failures['EXITCODE'].str.startswith('0:')]

Looks like every failure in the production partition is due to user error. Let's see which accounts need the most help to avoid future failures.

In [None]:
# Bar chart of the number of failed production jobs per account.
accre_df_failures['ACCOUNT'].value_counts().plot.bar(figsize=(10, 5), rot=75);

In [None]:
# Failure count per account, kept as a Series so it can be plotted again below.
accre_df_failures_by_account = accre_df_failures['ACCOUNT'].value_counts()
accre_df_failures_by_account

In [None]:
# Horizontal box plot of failures per account, with three outliers labelled.
red_square = dict(markerfacecolor='r', marker='s')  # style for outlier markers
arrow_style = dict(facecolor='black', shrink=0.1, width=1)

plt.figure(figsize = (10,5))
plt.title('Outliers from Failures By Account', fontsize=16)
plt.xlabel('Failures')

# (label, arrow tip, label position). The coordinates are hand-placed;
# presumably the x values are those accounts' failure counts in this dataset,
# so they need revisiting if the data changes -- TODO confirm.
outlier_labels = [
    ('cep', (128, 1.05), (127, 1.25)),
    ('plantain', (88, .95), (81, .75)),
    ('tips', (65, 1.05), (65, 1.25)),
]
for label, tip_xy, label_xy in outlier_labels:
    # The annotation text is passed positionally: the `s=` keyword was renamed
    # to `text` in Matplotlib 3.3 and later removed, so `s=` raises a TypeError
    # on current releases, while the positional form works on old and new ones.
    plt.annotate(label, xy=tip_xy, xytext=label_xy, fontsize=12,
                 arrowprops=dict(arrow_style))

plt.boxplot(accre_df_failures_by_account,
            vert = False,
            flierprops=red_square);