Code to infer the fleet usage statistics for our GPU machines

In [None]:
import clickhouse_connect
import pandas as pd
import os
import re
import logging
import matplotlib.pyplot as plt

# Env loader: load_dotenv() is called once at import time so that settings kept
# in a local .env file (presumably via python-dotenv -- confirm) are available
# through os.environ before the cells below read them.
import dotenv
dotenv.load_dotenv()

# Use the root logger at INFO level. The level is set explicitly on the logger
# as well as through basicConfig, so it ends up at INFO either way.
logger = logging.getLogger()
logger.setLevel("INFO")
logging.basicConfig(level=logging.INFO)

In [None]:
# ClickHouse connection settings, read from the process environment.
# Define these three variables in a local .env file; a missing variable
# raises KeyError here.
CLICKHOUSE_HOST, CLICKHOUSE_USER, CLICKHOUSE_PASSWORD = (
    os.environ['CLICKHOUSE_HOST'],
    os.environ['CLICKHOUSE_USER'],
    os.environ['CLICKHOUSE_PASSWORD'],
)


In [None]:
# Build the ClickHouse client from the environment-provided settings,
# requesting a secure connection (secure=True).
_client_settings = {
    'host': CLICKHOUSE_HOST,
    'user': CLICKHOUSE_USER,
    'password': CLICKHOUSE_PASSWORD,
    'secure': True,
}
client = clickhouse_connect.get_client(**_client_settings)

In [None]:
# Matches only at the start of a dot-delimited segment (start of the string or
# right after a '.'), so that e.g. 'c.' is not stripped out of 'arc.'.
_SEGMENT_START = r'(?:^|(?<=\.))'

# Applied in this order; every match is removed from the label.
_RUNNER_TYPE_NOISE = tuple(
    re.compile(pattern)
    for pattern in (
        _SEGMENT_START + r'am2\.',
        _SEGMENT_START + r'amz2\.',
        _SEGMENT_START + r'amz2023\.',
        _SEGMENT_START + r'c\.',
        r'\.canary$',  # literal '.canary' suffix only
        _SEGMENT_START + r'lf\.',
    )
)


def clean_runner_type(runner_type):
    """Normalize a runner label by dropping variant markers.

    Removes the dot-delimited segments ``am2``, ``amz2``, ``amz2023``, ``c``
    and ``lf`` (each together with its trailing dot) and a trailing
    ``.canary`` suffix, so that variants of the same runner type collapse to a
    single label.

    Only whole segments are removed: ``linux.arc.gpu`` is left untouched even
    though it contains the text ``c.``, and ``gpu-canary`` is left untouched
    because the suffix must be a literal ``.canary``.

    Args:
        runner_type: The raw runner label string.

    Returns:
        The label with the markers above removed.
    """
    for pattern in _RUNNER_TYPE_NOISE:
        runner_type = pattern.sub('', runner_type)

    return runner_type


def get_nvidia_jobs_run(client, weeks_ago: int = 2):
    """Fetch completed NVIDIA GPU workflow jobs started in the last N weeks.

    Queries the ``workflow_job`` table for jobs whose first label other than
    ``'self-hosted'`` contains ``gpu``, ``nvidia`` and a ``.``, and whose
    status is ``'completed'``.

    Args:
        client: Object exposing ``query(sql)`` whose result has a
            ``result_set`` attribute (the notebook passes the
            clickhouse_connect client).
        weeks_ago: How many weeks back from now to include, based on
            ``started_at``. Must be a positive integer.

    Returns:
        A DataFrame with the columns ``started_at``, ``completed_at``,
        ``duration_mins``, ``label``, ``status``, ``conclusion``, ``name`` and
        ``url``. ``label`` has been passed through ``clean_runner_type``.

    Raises:
        ValueError: If ``weeks_ago`` is smaller than 1.
    """
    # Coerce to int before interpolating so that only a plain integer can ever
    # reach the SQL text.
    weeks = int(weeks_ago)
    if weeks < 1:
        raise ValueError(
            f"weeks_ago must be a positive integer, got {weeks_ago!r}"
        )

    query = f"""
    SELECT
        started_at,
        completed_at,
        age('minute', started_at, completed_at) AS duration_mins,
        arrayFirst(x -> x != 'self-hosted', labels) AS label,
        status,
        conclusion,
        name,
        url
    FROM
        workflow_job
    WHERE
        started_at >= subtractWeeks(now(), {weeks})
        AND length(arrayFilter(x -> x != 'self-hosted', labels)) > 0
        AND arrayFirst(x -> x != 'self-hosted', labels) LIKE '%gpu%'
        AND arrayFirst(x -> x != 'self-hosted', labels) LIKE '%nvidia%'
        AND arrayFirst(x -> x != 'self-hosted', labels) LIKE '%.%'
        AND status = 'completed'
    """

    data = client.query(query).result_set

    # Column names follow the order of the SELECT list above.
    df = pd.DataFrame(
        data,
        columns=[
            'started_at',
            'completed_at',
            'duration_mins',
            'label',
            'status',
            'conclusion',
            'name',
            'url']
    )

    # clean the data
    df['started_at'] = pd.to_datetime(df['started_at'])
    df['completed_at'] = pd.to_datetime(df['completed_at'])
    df['duration_mins'] = df['duration_mins'].astype(int)
    df['label'] = df['label'].astype(str)
    df['status'] = df['status'].astype(str)
    df['conclusion'] = df['conclusion'].astype(str)
    df['name'] = df['name'].astype(str)

    # Collapse runner-label variants into a single label per runner type.
    df['label'] = df['label'].apply(clean_runner_type)

    return df


def get_runner_count_stats(job_run_df):
    """Count, minute by minute, how many jobs of each label are running.

    The timeline runs from the earliest ``started_at`` to the latest
    ``completed_at`` in one-minute steps. A job counts as running at sample
    time ``t`` when ``started_at <= t < completed_at``.

    Args:
        job_run_df: DataFrame with ``started_at``, ``completed_at`` and
            ``label`` columns.

    Returns:
        A DataFrame indexed by the one-minute sample times, with one integer
        column per unique label holding the number of running jobs. An empty
        frame with an empty DatetimeIndex is returned when there are no jobs
        (or no usable timestamps) to build a timeline from.
    """
    # Without any rows the min/max below are NaT and date_range would raise.
    if job_run_df.empty:
        return pd.DataFrame(index=pd.DatetimeIndex([]))

    # start when the first job was started_at
    start_time = job_run_df['started_at'].min()
    end_time = job_run_df['completed_at'].max()
    if pd.isna(start_time) or pd.isna(end_time):
        return pd.DataFrame(index=pd.DatetimeIndex([]))

    interval = pd.Timedelta(minutes=1)
    periods = pd.date_range(start=start_time, end=end_time, freq=interval)

    # Initialize a DataFrame to store period stats
    period_stats = pd.DataFrame(index=periods)

    # For each unique label, at each time period we compute how many jobs are running in parallel
    for label in job_run_df['label'].unique():
        # Filter jobs by label
        label_df = job_run_df[job_run_df['label'] == label]

        # Jobs that do not end after they start (or have a missing timestamp)
        # are never "running" at any sample time, so leave them out. This also
        # makes the subtraction below exact.
        has_duration = label_df['completed_at'] > label_df['started_at']
        starts = label_df.loc[has_duration, 'started_at'].sort_values()
        ends = label_df.loc[has_duration, 'completed_at'].sort_values()

        # running(t) = #(started_at <= t) - #(completed_at <= t)
        started_so_far = starts.searchsorted(periods, side='right')
        finished_so_far = ends.searchsorted(periods, side='right')
        period_stats[label] = started_so_far - finished_so_far

    return period_stats

In [None]:
# Look-back window, in weeks, requested for the job query
num_weeks = 3
gpu_jobs_df = get_nvidia_jobs_run(client, weeks_ago=num_weeks)

In [None]:
# Per-minute count of running jobs, one column per runner label
gpu_stats = get_runner_count_stats(job_run_df=gpu_jobs_df)

In [None]:
# Keep only the weekday samples (dayofweek: Monday = 0, Friday = 4)
weekday_stats = gpu_stats[gpu_stats.index.dayofweek < 5]

# For each label, get the p10, p90, p95, p99 and p100 number of jobs in
# progress (one row per label after the transpose), sorted by label
quantiles = weekday_stats.quantile([0.1, 0.9, 0.95, 0.99, 1], axis=0).T.sort_index()
quantiles

In [None]:
# Line chart of concurrent jobs over time: time on the x-axis, number of jobs
# in progress on the y-axis, one line per label.

# Only the final week of the timeline is plotted
week_start = gpu_stats.index.max() - pd.Timedelta(weeks=1)
gpu_week_stats = gpu_stats[gpu_stats.index >= week_start]

plt.figure(figsize=(20, 10))
for label in gpu_week_stats.columns:
    running_jobs = gpu_week_stats[label]
    plt.plot(gpu_week_stats.index, running_jobs, label=label)

plt.title('Number of jobs in progress over time')
plt.xlabel('Time')
plt.ylabel('Number of jobs in progress')
plt.legend()
plt.show()

