In [None]:
import clickhouse_connect
import pandas as pd
import requests
import yaml
import os
import re
import logging
import time
import matplotlib.pyplot as plt
import seaborn as sns


# Grab the root logger first, then let basicConfig attach its default handler
# to it. Both the handler setup and the logger itself are pinned to INFO.
logger = logging.getLogger()
logging.basicConfig(level=logging.INFO)
logger.setLevel("INFO")

In [None]:
# set env variable
# Each variable only receives the fallback below when it is not already
# present in the process environment. A plain assignment would overwrite
# credentials exported before the notebook was started with an empty string.
# Fill in the fallbacks locally if needed, but never commit real secrets.
_ENV_FALLBACKS = {
    'VANTAGE_TOKEN': '',
    'CLICKHOUSE_HOST': '',
    'CLICKHOUSE_USER': '',
    'CLICKHOUSE_PASSWORD': '',
}
for _env_name, _env_fallback in _ENV_FALLBACKS.items():
    os.environ.setdefault(_env_name, _env_fallback)

In [None]:
# ClickHouse client used by the query cell. Host, user and password are read
# from the CLICKHOUSE_* environment variables (None when a variable is unset);
# secure=True is always passed.
client = clickhouse_connect.get_client(
    secure=True,
    host=os.environ.get("CLICKHOUSE_HOST"),
    user=os.environ.get("CLICKHOUSE_USER"),
    password=os.environ.get("CLICKHOUSE_PASSWORD"),
)

# Raw GitHub URLs of the two scale-config YAML files in pytorch/test-infra.
SCALE_CONFIGS = [
    'https://raw.githubusercontent.com/pytorch/test-infra/main/.github/scale-config.yml',
    'https://raw.githubusercontent.com/pytorch/test-infra/main/.github/lf-scale-config.yml',
]

In [None]:
def clean_runner_type(runner_type):
    """Strip variant prefixes and the canary suffix from a runner label.

    The segments ``am2.``, ``amz2.``, ``amz2023.`` and ``c.`` are removed when
    they form a whole dot-separated segment (at the start of the label or
    right after another dot). A trailing ``.canary`` is removed as well.

    Matching whole segments matters: an unanchored ``c\\.`` would also eat the
    end of an unrelated segment, e.g. turn ``linux.idc.xpu`` into
    ``linux.idxpu``.

    Args:
        runner_type: Runner label string, e.g. ``"lf.c.linux.2xlarge"``.

    Returns:
        The label with the prefixes/suffix removed, e.g. ``"lf.linux.2xlarge"``.
    """
    for prefix in ('am2', 'amz2', 'amz2023', 'c'):
        # (?:^|(?<=\.)) anchors the prefix to the start of a segment.
        segment_pattern = r'(?:^|(?<=\.))' + re.escape(prefix) + r'\.'
        runner_type = re.sub(segment_pattern, '', runner_type)

    # The dot is escaped so only a literal ".canary" suffix is removed.
    return re.sub(r'\.canary$', '', runner_type)

In [None]:
# Fetch every workflow_job row whose started_at falls inside the last
# `days_back` days and load the result into a DataFrame.

days_back = 14
start_time = time.time()

# Window bounds are plain dates: today minus `days_back` days, up to today.
start_date = pd.Timestamp.today().date() - pd.Timedelta(days=days_back)
logger.info(f"Will fetch data from the last {days_back} days")

COLUMNS = [
    'started_at', 'name', 'labels', 'dynamoKey', 'completed_at',
    'conclusion', 'created_at', 'steps', 'head_branch', 'workflow_name',
]

end_date = pd.Timestamp.today().date()
logger.info(f"Getting data from {start_date} to {end_date} (inclusive)")

# NOTE(review): the upper bound compares started_at against DATE(end_date);
# presumably that is midnight, so jobs started later on end_date would not be
# returned -- confirm against the ClickHouse semantics.
selected_columns = ','.join(COLUMNS)
query = f"select {selected_columns} from workflow_job where started_at >= DATE('{start_date}') and started_at <= DATE('{end_date}')"

query_start_time = time.time()
workflow_jobs = client.query(query).result_set
query_seconds = time.time() - query_start_time
logger.info(f"Query took {query_seconds:.2f} seconds")

# One DataFrame column per selected ClickHouse column, in the same order.
workflow_jobs_df = pd.DataFrame(workflow_jobs, columns=COLUMNS)
logger.info(f"Found {len(workflow_jobs_df)} entries")


In [None]:
# remove everything that's not finished yet (completed_at is still 1970).
# .copy() makes the filtered result an independent frame, so the column
# assignments below do not write into a slice of the query result
# (which triggers pandas' SettingWithCopyWarning).
workflow_jobs_df = workflow_jobs_df[workflow_jobs_df['completed_at'].dt.year > 2000].copy()

# remove the item 'self-hosted' from the list of runner types
workflow_jobs_df['labels'] = workflow_jobs_df['labels'].map(
    lambda job_labels: [label for label in job_labels if label != 'self-hosted'])

# drop rows where labels is empty
workflow_jobs_df = workflow_jobs_df[workflow_jobs_df['labels'].map(len) > 0].copy()

# Every remaining row has at least one label; the first one is the runner type.
workflow_jobs_df['runner_type'] = workflow_jobs_df['labels'].map(
    lambda job_labels: job_labels[0])

# First two '/'-separated components of dynamoKey.
workflow_jobs_df['group_repo'] = workflow_jobs_df['dynamoKey'].map(
    lambda key: '/'.join(key.split('/')[:2]))



In [None]:
# Zero-initialised per-minute counters for every runner type.
# Keys are "YYYY-MM-DD_HH-MM" strings (zero-padded), one per minute from
# start_date 00:00 through the last minute of start_date + days_back days.
minutes_in_window = (days_back + 1) * 24 * 60
minute_keys = list(
    pd.date_range(start=start_date, periods=minutes_in_window, freq='min')
    .strftime('%Y-%m-%d_%H-%M')
)

out = {}
for runner_type in workflow_jobs_df['runner_type'].unique():
    # dict.fromkeys builds a separate dict per runner type.
    out[runner_type] = dict.fromkeys(minute_keys, 0)

In [None]:
# Walk over every job and add +1 to each minute bucket during which it was
# running: a job that ran from 01:20:30 to 01:22:10 increments the 01:20,
# 01:21 and 01:22 buckets of its runner type.
job_count = len(workflow_jobs_df)
one_minute = pd.Timedelta(minutes=1)

print("TOTAL:    "+"."*100)
print("PROGRESS: ", end="")
tot = 0
skipped_minutes = 0
dots_printed = 0

job_rows = zip(
    workflow_jobs_df['started_at'],
    workflow_jobs_df['completed_at'],
    workflow_jobs_df['runner_type'],
)
for position, (start, end, runner_type) in enumerate(job_rows, start=1):
    # Progress is based on the row position, not the index label (labels are
    # non-contiguous after filtering). Exactly 100 dots are printed in total
    # for any non-empty frame, and an empty frame never divides by zero.
    dots_due = position * 100 // job_count
    if dots_due > dots_printed:
        print("." * (dots_due - dots_printed), end="")
        dots_printed = dots_due

    minute_counts = out[runner_type]
    cur_dt = start.floor('min')

    while cur_dt < end:
        cur_dt_str = cur_dt.strftime('%Y-%m-%d_%H-%M')
        if cur_dt_str in minute_counts:
            minute_counts[cur_dt_str] += 1
            tot += 1
        else:
            # Minute lies outside the pre-built window; count it instead of
            # aborting the whole run with a KeyError.
            skipped_minutes += 1
        cur_dt += one_minute

print("\nFINISHED")
print(f"Total: {tot}")
if skipped_minutes:
    logger.warning(f"Skipped {skipped_minutes} job-minutes outside the counted window")

# now make a df from the out dict: one row per runner type, one column per minute
out_df = pd.DataFrame(out).T
out_df.columns = pd.to_datetime(out_df.columns, format='%Y-%m-%d_%H-%M')


In [None]:
# Line chart of parallel runners over time for the ten busiest runner types
# whose name contains "nvidia".
fig, ax = plt.subplots(figsize=(20, 10))

# Format x-axis ticks as calendar dates instead of full timestamps.
ax.xaxis.set_major_formatter(plt.matplotlib.dates.DateFormatter('%Y-%m-%d'))

ax.set_title(f"# runners per type in the last {days_back} days")
ax.set_ylabel("Parallel runners")
ax.set_xlabel("Date")

# Case-insensitive match on the runner type (the row index).
nvidia_rows = out_df.index.str.contains('nvidia', case=False)
out_df_nvidia = out_df[nvidia_rows]

# Rank runner types by their total count over the window, largest first.
busiest_first = out_df_nvidia.sum(axis=1).sort_values(ascending=False)
for runner_type in busiest_first.index[:10]:
    sns.lineplot(data=out_df_nvidia.loc[runner_type], label=runner_type, ax=ax)
plt.show()

In [None]:
# Per-runner-type statistics over all minute columns of out_df: mean, 90th
# and 99th percentile, each rounded to the nearest integer.
summary_stats = {
    'average': out_df.mean(axis=1),
    'p90': out_df.quantile(0.9, axis=1),
    'p99': out_df.quantile(0.99, axis=1),
}

# The statistics are appended as extra columns to a copy of the minute data.
out_df_summary = out_df.copy()
for stat_name, stat_values in summary_stats.items():
    out_df_summary[stat_name] = stat_values.round(0).astype(int)

# Busiest runner types first, then keep only the "nvidia" ones.
out_df_summary = out_df_summary.sort_values('average', ascending=False)
is_nvidia = out_df_summary.index.str.contains('nvidia', case=False)
out_df_summary = out_df_summary[is_nvidia]

# Temporarily lift the row limit so the full table is rendered.
pd.set_option('display.max_rows', None)
display(out_df_summary[['average', 'p90', 'p99']])
pd.reset_option('display.max_rows')