# Results of simulation of different fairness policies

These experiments use accumulated deficits to try to ensure that applications always receive their computed allocation of GPU time, even when new jobs arrive and old jobs finish. This is done by keeping track of the difference between the GPU time each application should have received and the GPU time it actually received. Allocation of jobs to GPUs is performed in a round-based fashion: in each round, every GPU is instructed to run its assigned job for a fixed interval of time.

# Import statements

In [None]:
# Imports for plotting.
from matplotlib import pyplot as plt
from matplotlib.backends.backend_pdf import PdfPages
import matplotlib.lines as mlines
import matplotlib.patches as mpatches
from matplotlib.ticker import MultipleLocator
from pylab import *
import seaborn as sns
from matplotlib import rc
# Global plotting configuration. These calls mutate matplotlib/seaborn
# global state, so their order matters: later calls may override earlier ones.

# Render figure text with LaTeX (requires a working LaTeX installation).
rc('text',
   usetex=True)
sns.set_style('ticks')
# Font settings, passed to seaborn as a style dictionary below.
font = {
    'font.family':'Times New Roman',
    'font.weight': 200,
    'font.size': 10,
}
sns.set_style(font)
# Custom color cycle. NOTE(review): '#8EBA42' appears twice (3rd and 7th
# entries), so two series can end up with the same color -- confirm intended.
flatui = ['#002A5E', '#FD151B', '#8EBA42', '#348ABD', '#988ED5', '#BDB76B', '#8EBA42', '#FFB5B8']
sns.set_palette(flatui)
# Line/marker sizes and math-text fonts, supplied as rc overrides to the
# "paper" plotting context.
paper_rc = {
    'lines.linewidth': 2,
    'lines.markersize': 10,
    'mathtext.fontset': 'custom',
    'mathtext.rm': 'Times New Roman',
    'mathtext.bf': 'Times New Roman:bold',
}
sns.set_context("paper", font_scale=2,  rc=paper_rc)
# Snapshot of the active palette after the settings above.
current_palette = sns.color_palette()

In [None]:
# Other imports.
import json
import os
import re

# Get logfile paths

In [None]:
def get_logfile_paths_helper(directory_name):
    """Recursively list every file found under `directory_name`.

    Args:
        directory_name: Root directory to walk.

    Returns:
        A flat list of file paths, each built by joining the directory
        reported by os.walk with the file name. Directories themselves are
        not included; the order follows os.walk's traversal order.
    """
    collected = []
    for parent_dir, _subdirs, names in os.walk(directory_name):
        for name in names:
            collected.append(os.path.join(parent_dir, name))
    return collected

def get_logfile_paths(directory_name):
    """Find experiment logfiles under `directory_name` and parse their paths.

    A logfile path is expected to end with
    `v100=<int>.p100=<int>.k80=<int>/<policy>/seed=<int>/lambda=<float>.log`
    (with '/' as the path separator). Files whose path does not follow this
    scheme are skipped instead of aborting the whole scan.

    Args:
        directory_name: Root directory that is searched recursively.

    Returns:
        A list of tuples
        `(v100s, p100s, k80s, policy, seed, lam, logfile_path)`, one per
        matching file, in the order the files were discovered.
    """
    # Both fragments are raw strings so that `\d` and `\.` reach the regex
    # engine verbatim rather than relying on invalid string escapes.
    path_pattern = re.compile(
        r'.*v100=(\d+)\.p100=(\d+)\.k80=(\d+)/(.*)/seed=(\d+)/'
        r'lambda=(\d+\.\d+)\.log')
    logfile_paths = []
    for logfile_path in get_logfile_paths_helper(directory_name):
        m = path_pattern.match(logfile_path)
        if m is None:
            # Not an experiment log (e.g. a stray or temporary file).
            continue
        v100s = int(m.group(1))
        p100s = int(m.group(2))
        k80s = int(m.group(3))
        policy = m.group(4)
        seed = int(m.group(5))
        lam = float(m.group(6))
        logfile_paths.append((v100s, p100s, k80s, policy, seed,
                              lam, logfile_path))
    return logfile_paths

# Munging functions

In [None]:
def prune(logfile_paths, v100s, p100s, k80s, policy, seed=None):
    """Select the logfile entries for one cluster configuration and policy.

    Args:
        logfile_paths: Iterable of entries indexed as
            (v100s, p100s, k80s, policy, seed, lambda, path).
        v100s, p100s, k80s: Required values for entry fields 0, 1 and 2.
        policy: Required value for entry field 3.
        seed: If given, only entries with this seed (field 4) are kept.

    Returns:
        A sorted list. Without a seed each element is (lambda, path, seed);
        with a seed each element is (lambda, path).
    """
    def _same_config(entry):
        return (entry[0] == v100s and entry[1] == p100s and
                entry[2] == k80s and entry[3] == policy)

    candidates = filter(_same_config, logfile_paths)
    if seed is not None:
        rows = [(entry[5], entry[6]) for entry in candidates
                if entry[4] == seed]
    else:
        rows = [(entry[5], entry[6], entry[4]) for entry in candidates]
    rows.sort()
    return rows

In [None]:
def get_events(logfile_path):
    """Parse a scheduler log into per-worker job events and utilization.

    Four kinds of lines are recognized, each starting with a decimal
    timestamp: "scheduled" and "succeeded" lines for a single job
    (`Job ID: <int>`) and for a job pair (`Job ID: (<int>, <int>)`). A line
    starting with `Cluster utilization: <float>` is also parsed.

    Args:
        logfile_path: Path of the log file to read.

    Returns:
        A tuple `(events, utilization)`:
        - `events` maps worker ID to a list of
          `[job_id, worker_type, scale_factor, start_timestamp,
          end_timestamp]` lists in log order. `job_id` is an int or, for a
          job pair, a 2-tuple of ints. `scale_factor` is the number of
          workers named on the "scheduled" line. `end_timestamp` stays None
          until a matching "succeeded" line is seen.
        - `utilization` is the last reported cluster utilization multiplied
          by 100, or None if the log has no such line.

    Raises:
        AssertionError: If a "succeeded" line does not match the most recent
            event of its worker (different job or worker type, or that event
            already has an end timestamp).
    """
    # All fragments are raw strings so that regex escapes are not subject to
    # Python string-escape processing.
    scheduled_single = re.compile(
        r'(\d+\.\d+).*scheduled.*Job ID: (\d+)\t'
        r'Worker type: (.*)\tWorker ID\(s\): ([\d+\,]*\d+).*')
    scheduled_pair = re.compile(
        r'(\d+\.\d+).*scheduled.*Job ID: \((\d+), (\d+)\)\t'
        r'Worker type: (.*)\tWorker ID\(s\): ([\d+\,]*\d+).*')
    succeeded_single = re.compile(
        r'(\d+\.\d+).*succeeded.*Job ID: (\d+)\t'
        r'Worker type: (.*)\tWorker ID: (\d+).*')
    succeeded_pair = re.compile(
        r'(\d+\.\d+).*succeeded.*Job ID: \((\d+), (\d+)\)\t'
        r'Worker type: (.*)\tWorker ID: (\d+).*')
    utilization_line = re.compile(r'Cluster utilization: (\d+\.\d+)')

    events = {}
    utilization = None
    with open(logfile_path, 'r') as f:
        for line in f:
            m = scheduled_single.match(line)
            if m is not None:
                start_timestamp = float(m.group(1))
                job_id = int(m.group(2))
                worker_type = m.group(3)
                worker_ids = [int(x) for x in m.group(4).split(',')]
                scale_factor = len(worker_ids)
                for worker_id in worker_ids:
                    events.setdefault(worker_id, []).append(
                        [job_id, worker_type, scale_factor,
                         start_timestamp, None])

            m = scheduled_pair.match(line)
            if m is not None:
                start_timestamp = float(m.group(1))
                job_id = (int(m.group(2)), int(m.group(3)))
                worker_type = m.group(4)
                worker_ids = [int(x) for x in m.group(5).split(',')]
                # The scale factor must come from this line's worker list,
                # not from whichever single-job line was parsed before it.
                scale_factor = len(worker_ids)
                for worker_id in worker_ids:
                    events.setdefault(worker_id, []).append(
                        [job_id, worker_type, scale_factor,
                         start_timestamp, None])

            m = succeeded_single.match(line)
            if m is not None:
                end_timestamp = float(m.group(1))
                job_id = int(m.group(2))
                worker_type = m.group(3)
                worker_id = int(m.group(4))
                if worker_id not in events:
                    # Completion on a worker with no recorded start: ignore
                    # the line.
                    continue
                assert (events[worker_id][-1][0] == job_id and
                        events[worker_id][-1][1] == worker_type and
                        events[worker_id][-1][4] is None)
                events[worker_id][-1][4] = end_timestamp

            m = succeeded_pair.match(line)
            if m is not None:
                end_timestamp = float(m.group(1))
                job_id = (int(m.group(2)), int(m.group(3)))
                worker_type = m.group(4)
                worker_id = int(m.group(5))
                if worker_id not in events:
                    continue
                assert (events[worker_id][-1][0] == job_id and
                        events[worker_id][-1][1] == worker_type and
                        events[worker_id][-1][4] is None)
                events[worker_id][-1][4] = end_timestamp

            m = utilization_line.match(line)
            if m is not None:
                utilization = float(m.group(1)) * 100.

    return events, utilization


In [None]:
# Integer code for each worker type name that may appear in the logs.
worker_type_map = {
    'k80': 0,
    'p100': 1,
    'v100': 2,
}


def get_num_promotions_per_job(logfile_path):
    """Count, for every job, how often its worker type changed between runs.

    The events of all workers are merged into one timeline per job, ordered
    by start time. A "promotion" is counted each time two consecutive
    timeline entries have different worker types; the direction of the
    change is not considered. A job scheduled as part of a job pair
    contributes an entry to the timeline of each job in the pair.

    Args:
        logfile_path: Path of the scheduler log to analyze.

    Returns:
        A dict mapping job ID to the number of worker-type changes.

    Raises:
        KeyError: If the log names a worker type missing from
            `worker_type_map`.
    """
    events, _ = get_events(logfile_path)

    # job ID -> list of (worker type code, start time). Every entry has the
    # same shape whether the job ran alone or as part of a pair.
    per_job_timeline = {}
    for worker_events in events.values():
        for job_id, worker_type, _scale, start_time, _end in worker_events:
            worker_type_int = worker_type_map[worker_type]
            job_ids = job_id if isinstance(job_id, tuple) else (job_id,)
            for single_job_id in job_ids:
                per_job_timeline.setdefault(single_job_id, []).append(
                    (worker_type_int, start_time))

    num_promotions_per_job = {}
    for job_id, timeline in per_job_timeline.items():
        timeline.sort(key=lambda entry: entry[1])
        num_promotions_per_job[job_id] = sum(
            1 for previous, current in zip(timeline, timeline[1:])
            if previous[0] != current[0])
    return num_promotions_per_job

In [None]:
def sweep_get_num_promotions_per_job(relevant_logfile_paths):
    """Print promotion statistics for every logfile in a sweep.

    Args:
        relevant_logfile_paths: Iterable of (lambda, logfile_path, seed)
            triples.

    For each triple this prints the mean and standard deviation of the
    per-job promotion counts, the set of distinct counts, and a blank line.
    """
    for entry in relevant_logfile_paths:
        lam, logfile_path, seed = entry
        all_num_promotions = list(
            get_num_promotions_per_job(logfile_path).values())
        mean_promotions = np.mean(all_num_promotions)
        stddev_promotions = np.std(all_num_promotions)
        summary = 'Lambda=%f, seed=%d: Average number of promotions: %f (stddev %f)' % (
            lam, seed, mean_promotions, stddev_promotions)
        print(summary)
        print('Distinct all_num_promotions:', set(all_num_promotions))
        print()

# Perform sweep

In [None]:
# Index all logs under the "multigpu_support_singlegpu" log directory and
# keep the parsed entries in sorted order.
logfile_paths = get_logfile_paths(
    "/lfs/1/deepak/gpusched/scheduler/logs/multigpu_support_singlegpu/")
logfile_paths = sorted(logfile_paths)

In [None]:
# Report promotion statistics for the 'fifo' policy on the 8/8/8 cluster,
# going through the runs in descending sorted order.
relevant_logfile_paths = list(reversed(prune(logfile_paths, 8, 8, 8, 'fifo')))
sweep_get_num_promotions_per_job(relevant_logfile_paths)

In [None]:
# Report promotion statistics for the 'fifo_perf' policy on the 8/8/8
# cluster, going through the runs in descending sorted order.
relevant_logfile_paths = list(
    reversed(prune(logfile_paths, 8, 8, 8, 'fifo_perf')))
sweep_get_num_promotions_per_job(relevant_logfile_paths)

In [None]:
# Re-index using the "multigpu_support_multigpu" log directory; this
# replaces the previously loaded `logfile_paths`.
logfile_paths = get_logfile_paths(
    "/lfs/1/deepak/gpusched/scheduler/logs/multigpu_support_multigpu/")
logfile_paths = sorted(logfile_paths)

In [None]:
# Report promotion statistics for the 'fifo' policy on the 8/8/8 cluster,
# going through the runs in descending sorted order.
relevant_logfile_paths = list(reversed(prune(logfile_paths, 8, 8, 8, 'fifo')))
sweep_get_num_promotions_per_job(relevant_logfile_paths)

In [None]:
# Report promotion statistics for the 'fifo_perf' policy on the 8/8/8
# cluster, going through the runs in descending sorted order.
relevant_logfile_paths = list(
    reversed(prune(logfile_paths, 8, 8, 8, 'fifo_perf')))
sweep_get_num_promotions_per_job(relevant_logfile_paths)