In [1]:
import time
import os

from runs_collector.dataset import RunsDataSet
from runs_analysis.resource_usage import (get_resource_usage_summary, 
                                          triggering_events_proportion, 
                                          triggering_events_time_proportion, 
                                          get_avg_runtime_by_event,
                                          get_tiers,
                                          calc_costs_by_event as calc_costs,
                                          get_avg_runtime_rest)
from runs_analysis.name_based_analysis import get_topk_jobs_names,  get_topk_jobs_time, simplify_and_map

In [2]:
import warnings
# Silences every warning category for the rest of the kernel session, so library
# warnings raised by later cells are not shown in the outputs.
warnings.filterwarnings('ignore')

## Loading the data

* Loading the dataset takes around 1 minute
* The data size in RAM is around 9 GB

In [4]:
# Restore the dataset from the checkpoint files in the current directory and
# measure the load with wall-clock timestamps taken right before and after it.
start = time.time()
data_set = RunsDataSet(None, None, from_checkpoint=True, checkpoint_dir="./")
end = time.time()

elapsed_seconds = round(end - start, 0)
print("Time taken to load the dataset:", elapsed_seconds, "seconds")

Loading dataset from checkpoints
Time taken to load the dataset: 63.0 seconds


## Resource usage by event name (Table 1 in PDF)

In [5]:
# Two repository tiers. The table printed further down labels the *_1 values
# "Paid" and the *_2 values "Free".
repos_list_1, repos_list_2 = get_tiers(data_set)

# Each statistic is computed for tier 1 first, then tier 2, keyed by event name.
runs_prop_1, runs_prop_2 = (
    triggering_events_proportion(data_set, tier_repos)
    for tier_repos in (repos_list_1, repos_list_2)
)
time_prop_1, time_prop_2 = (
    triggering_events_time_proportion(data_set, tier_repos)
    for tier_repos in (repos_list_1, repos_list_2)
)
avg_time_1, avg_time_2 = (
    get_avg_runtime_by_event(data_set, tier_repos)
    for tier_repos in (repos_list_1, repos_list_2)
)

In [9]:
def show_event_summary(event):
    """Print one fixed-width row of the per-event table for `event`.

    Reads the notebook-level mappings time_prop_1/2, runs_prop_1/2 and
    avg_time_1/2, all indexed by event name. Columns, in order: VM-time share
    (tier 1, tier 2), run share (tier 1, tier 2), then per tier the first and
    the last element of the avg_time entry (the last one is printed inside
    "iqr="), and finally calc_costs applied to the first avg_time element of
    each tier.

    Shares and runtimes are rounded to one decimal, costs to two.
    """
    row_template = "{:<30} {:<6} {:<6} {:<6} {:<6} {:<6} (iqr={:<6}) {:<6} (iqr={:<6}) {:<6} {:<6}"

    vm_time_1 = round(time_prop_1[event], 1)
    vm_time_2 = round(time_prop_2[event], 1)
    runs_1 = round(runs_prop_1[event], 1)
    runs_2 = round(runs_prop_2[event], 1)

    stats_1 = avg_time_1[event]
    stats_2 = avg_time_2[event]

    # Costs are derived from the unrounded runtime value.
    cost_1 = round(calc_costs(stats_1[0], event), 2)
    cost_2 = round(calc_costs(stats_2[0], event), 2)

    print(row_template.format(
        event,
        vm_time_1,
        vm_time_2,
        runs_1,
        runs_2,
        round(stats_1[0], 1),
        round(stats_1[-1], 1),
        round(stats_2[0], 1),
        round(stats_2[-1], 1),
        cost_1,
        cost_2,
    ))

In [10]:
def calc_others():
    """Compute the "Others" row: everything not triggered by a named event.

    Reads the notebook-level mappings time_prop_1/2 and runs_prop_1/2 and calls
    get_avg_runtime_rest once per tier.

    Returns:
        An 8-tuple:
        (VM-time share tier 1, VM-time share tier 2,
         run share tier 1, run share tier 2,
         first and last element of the tier-1 "rest" runtime result,
         first and last element of the tier-2 "rest" runtime result).
        Each share is 100 minus the sum of the named events' shares.
    """
    # A single list drives both the share subtraction and the runtime
    # exclusion. The exclusion lists used to spell one event
    # "target_pull_request", which does not match the "pull_request_target"
    # key used everywhere else, so that event was never excluded from the
    # "rest" runtime.
    named_events = ["pull_request", "push", "schedule", "pull_request_target",
                    "workflow_dispatch", "workflow_run", "release"]

    # The already-rounded shares are summed, i.e. exactly the values printed in
    # the per-event rows, so each printed column adds up to 100.
    time_share_1 = sum(round(time_prop_1[event], 1) for event in named_events)
    time_share_2 = sum(round(time_prop_2[event], 1) for event in named_events)
    runs_share_1 = sum(round(runs_prop_1[event], 1) for event in named_events)
    runs_share_2 = sum(round(runs_prop_2[event], 1) for event in named_events)

    # Each call gets its own list copy so the callee cannot alter named_events.
    rest_2 = get_avg_runtime_rest(data_set, repos_list_2, list(named_events))
    rest_1 = get_avg_runtime_rest(data_set, repos_list_1, list(named_events))

    return (
        100 - time_share_1,
        100 - time_share_2,
        100 - runs_share_1,
        100 - runs_share_2,
        rest_1[0],
        rest_1[-1],
        rest_2[0],
        rest_2[-1],
    )

In [11]:
# Table 1: three header lines, one row per named event, then the "Others" row.
group_rule = " " * 30 + "-" * 80

print("{:<30} {:<48} {:<48}".format("", "       Overall(%)", "Avg. per run"))
print(group_rule)
print("{:<30} {:<15} {:<12} {:<36} {:<12}".format(
    "", " VM time", " Runs", "  VM time (min)", "VM cost ($)"))
print(group_rule)
# Every metric group has a Paid and a Free column.
print("{:<30} {:<6} {:<6} {:<6} {:<6} {:<18} {:<18} {:<6} {:<6}".format(
    "Event", *(["Paid", "Free"] * 4)))
print("-" * 110)

for event in ["pull_request", "push", "schedule", "pull_request_target", "workflow_dispatch", "workflow_run", "release"]:
    show_event_summary(event)

# calc_others() yields eight values in the same column order as the event rows.
# The two cost cells are computed from elements 4 and 6 (the first runtime
# element of each tier) with the event label "others".
others = calc_others()
others_cells = [round(value, 1) for value in others]
others_costs = [round(calc_costs(others[position], "others"), 2) for position in (4, 6)]
print("{:<30} {:<6} {:<6} {:<6} {:<6} {:<6} (iqr={:<6}) {:<6} (iqr={:<6}) {:<6} {:<6}".format(
    "Others", *others_cells, *others_costs))

                                      Overall(%)                                Avg. per run                                    
                              --------------------------------------------------------------------------------
                                VM time         Runs          VM time (min)                      VM cost ($) 
                              --------------------------------------------------------------------------------
Event                          Paid   Free   Paid   Free   Paid               Free               Paid   Free  
--------------------------------------------------------------------------------------------------------------
pull_request                   50.7   35.5   38.6   25.3   31.1   (iqr=20.1  ) 3.6    (iqr=3.6   ) 0.38   0.04  
push                           30.9   47.8   26.4   28.6   28.4   (iqr=19.5  ) 4.3    (iqr=4.2   ) 0.35   0.05  
schedule                       15.5   14.5   26.2   40.3   13.8   (iqr=1.3   ) 0.9    (iqr=

## Resource usage by task (Table 2 in PDF)

In [9]:
# Most frequent job names per tier; later cells read the `job_name` and `count`
# columns of these frames. Tier 2 is queried with a much larger k (100 vs 10):
# the next cell asserts that every tier-1 top-10 name also occurs in the tier-2 list.
top_k_jobs_1 = get_topk_jobs_names(data_set, repos_list_1, k=10).reset_index()
top_k_jobs_2 = get_topk_jobs_names(data_set, repos_list_2, k=100).reset_index()

In [10]:
# Raw run and job tables.
all_runs = data_set.get_all_runs()
all_jobs = data_set.get_all_jobs()

# Runs belong to a tier through their repository; jobs follow their run.
all_runs_1 = all_runs.loc[all_runs.repo_id.isin(repos_list_1)]
all_runs_2 = all_runs.loc[all_runs.repo_id.isin(repos_list_2)]
all_jobs_1 = all_jobs.loc[all_jobs.run_id.isin(all_runs_1.id)]
all_jobs_2 = all_jobs.loc[all_jobs.run_id.isin(all_runs_2.id)]

# Fraction of the listed jobs that each job name accounts for, per tier.
top_k_total_1 = top_k_jobs_1["count"].sum()
top_k_total_2 = top_k_jobs_2["count"].sum()
top_k_jobs_1["prop"] = top_k_jobs_1["count"] / top_k_total_1
top_k_jobs_2["prop"] = top_k_jobs_2["count"] / top_k_total_2

# Every tier-1 top name must also be present among the tier-2 names, because
# later cells look each of them up in top_k_jobs_2.
top_k_jobs_names = top_k_jobs_1.job_name
assert set(top_k_jobs_names).issubset(set(top_k_jobs_2.job_name))

### Calculating Proportions

In [11]:
# Share of jobs per top-k task name, in percent rounded to one decimal.
#
# Removed from the previous version of this cell:
#   * `top_k_jobs_1.sort_values(...)`: its result was discarded, and
#     sort_values returns a new frame, so the call had no effect;
#   * `free_prop = []`: overwritten by the dict on the very next line.

# Tier 1: the `prop` column is already a fraction of the tier-1 top-k total.
paid_prop = {
    name: round(top_k_jobs_1.loc[top_k_jobs_1.job_name == name, "prop"].to_list()[0] * 100, 1)
    for name in top_k_jobs_names
}

# Tier 2: re-normalise over the rows whose name is in the tier-1 top-k list,
# so both tiers are expressed over the same set of tasks.
free_top_k = top_k_jobs_2[top_k_jobs_2.job_name.isin(top_k_jobs_names)]
sum_free = free_top_k["count"].sum()
free_prop = {
    name: round(free_top_k.loc[free_top_k.job_name == name, "count"].to_list()[0] / sum_free * 100, 1)
    for name in top_k_jobs_names
}

#### get_topk_jobs_time(data_set)

In [12]:
def get_topk_jobs_time(data_set, repos_list, k=10):
    """Sum `up_time` per top-k task and compute each task's share of that sum.

    NOTE(review): this definition shadows the `get_topk_jobs_time` imported from
    runs_analysis.name_based_analysis in the first cell; every later call in the
    notebook resolves to this version.

    Args:
        data_set: dataset object passed through to simplify_and_map and
            get_topk_jobs_names.
        repos_list: repositories to restrict the analysis to.
        k: number of top job names requested from get_topk_jobs_names.

    Returns:
        DataFrame with one row per task and the columns `sub_names`, `sum`
        (total `up_time` of the task's jobs) and `prop` (that total as a
        percentage of the summed `up_time` of all returned tasks).
    """
    all_jobs = simplify_and_map(data_set, repos_list)
    top_k = get_topk_jobs_names(data_set, repos_list, k=k)

    # Only jobs whose simplified name is one of the top-k names are kept.
    # The previous version also wrote an `up_time_min` column onto this filtered
    # slice; nothing read it, and assigning to a slice is the chained-assignment
    # pattern pandas warns about, so it is dropped.
    top_tasks_jobs = all_jobs[all_jobs.sub_names.isin(top_k.job_name)]

    sum_time_tasks = top_tasks_jobs.groupby("sub_names").up_time.agg(["sum"]).reset_index("sub_names")
    sum_time_tasks["prop"] = sum_time_tasks["sum"]/sum_time_tasks["sum"].sum()*100
    return sum_time_tasks

In [13]:
# Resolves to the get_topk_jobs_time defined in this notebook (it shadows the
# imported one), called with its default k=10 for the tier-1 repositories.
topk_jobs_time = get_topk_jobs_time(data_set, repos_list_1)

### Average task time

In [14]:
# Task names shown in Table 2, in row order.
tasks = [
    "test",
    "build",
    "release",
    "analyze",
    "lint",
    "linux",
    "update",
    "integration",
    "deploy",
    "sync",
]

# From here on all_jobs_1 / all_jobs_2 hold the simplify_and_map results, which
# replace the per-tier job frames assigned in an earlier cell.
all_jobs_1 = simplify_and_map(data_set, repos_list_1)
all_jobs_2 = simplify_and_map(data_set, repos_list_2)

# up_time_min is up_time divided by 60 (presumably seconds -> minutes; confirm
# the unit of up_time). The column is added in place on both frames.
for tier_jobs in (all_jobs_1, all_jobs_2):
    tier_jobs["up_time_min"] = tier_jobs.up_time / 60

In [15]:
def calc_costs_by_task(time, task):
    """Convert a runtime into a cost using one fixed multiplier and one fixed rate.

    Args:
        time: runtime to price; callers in this notebook pass minutes.
            The name shadows the `time` module inside this function only.
        task: task name; accepted but not used in the computation.

    Returns:
        time * 1.52 * 0.008
    """
    # NOTE(review): 1.52 is named an OS factor and 0.008 looks like a price per
    # minute; where either figure comes from is not recorded here.
    os_weight = 1.52
    price_per_unit = 0.008
    weighted_time = time * os_weight
    return weighted_time * price_per_unit

In [16]:
# Total up_time over the listed tasks, per tier. The total is accumulated task
# by task, in the order of `tasks`, and is the denominator of the VM-time shares.
sum_t1 = sum(all_jobs_1.loc[all_jobs_1.sub_names == t, "up_time"].sum() for t in tasks)
sum_t2 = sum(all_jobs_2.loc[all_jobs_2.sub_names == t, "up_time"].sum() for t in tasks)

In [17]:
# Number of job rows whose simplified name is one of the listed tasks, per tier.
# It is the denominator of the "Runs" shares.
total_runs_1 = len(all_jobs_1[all_jobs_1.sub_names.isin(tasks)])
total_runs_2 = len(all_jobs_2[all_jobs_2.sub_names.isin(tasks)])

In [18]:
# Table 2: three header lines, then one row per task.
indent_rule = " " * 30 + "-" * 80
task_row = "{:<30} {:<6} {:<6} {:<6} {:<6} {:<6} (iqr={:<6}) {:<6} (iqr={:<6}) {:<6} {:<6}"

print("{:<30} {:<48} {:<48}".format("", "       Overall(%)", "Avg. per run"))
print(indent_rule)
print("{:<30} {:<15} {:<12} {:<36} {:<12}".format(
    "", "  VM time", " Runs", "  VM time (min)", "VM cost ($)"))
print(indent_rule)
# Every metric group has a Paid and a Free column.
print("{:<30} {:<6} {:<6} {:<6} {:<6} {:<18} {:<18} {:<6} {:<6}".format(
    "Task", *(["Paid", "Free"] * 4)))
print("-" * 110)

for t in tasks:
    # Filter each tier once per task and reuse the subset for every column.
    jobs_1 = all_jobs_1[all_jobs_1.sub_names == t]
    jobs_2 = all_jobs_2[all_jobs_2.sub_names == t]

    # Mean runtime in minutes (sum / row count) and interquartile range.
    avg_min_1 = round(jobs_1.up_time_min.sum() / jobs_1.shape[0], 1)
    iqr_min_1 = round(jobs_1.up_time_min.quantile(0.75) - jobs_1.up_time_min.quantile(0.25), 1)
    avg_min_2 = round(jobs_2.up_time_min.sum() / jobs_2.shape[0], 1)
    iqr_min_2 = round(jobs_2.up_time_min.quantile(0.75) - jobs_2.up_time_min.quantile(0.25), 1)

    print(task_row.format(
        t,
        round(jobs_1.up_time.sum() / sum_t1 * 100, 1),
        round(jobs_2.up_time.sum() / sum_t2 * 100, 1),
        round(jobs_1.shape[0] / total_runs_1 * 100, 1),
        round(jobs_2.shape[0] / total_runs_2 * 100, 1),
        avg_min_1,
        iqr_min_1,
        avg_min_2,
        iqr_min_2,
        # The cost is computed from the already-rounded mean shown in the row.
        round(calc_costs_by_task(avg_min_1, t), 2),
        round(calc_costs_by_task(avg_min_2, t), 2),
    ))

                                      Overall(%)                                Avg. per run                                    
                              --------------------------------------------------------------------------------
                                 VM time        Runs          VM time (min)                      VM cost ($) 
                              --------------------------------------------------------------------------------
Task                           Paid   Free   Paid   Free   Paid               Free               Paid   Free  
--------------------------------------------------------------------------------------------------------------
test                           54.6   37.3   50.9   36.2   8.1    (iqr=7.2   ) 1.5    (iqr=1.3   ) 0.1    0.02  
build                          36.6   50.8   28.5   49.9   9.7    (iqr=8.4   ) 1.5    (iqr=1.3   ) 0.12   0.02  
release                        3.5    1.3    2.4    2.0    11.0   (iqr=20.1  ) 1.0    (iqr=

## Termination status (Table 3 in PDF)

In [19]:
# Start again from the raw tables; all_jobs_1 / all_jobs_2 are rebound here.
all_runs = data_set.get_all_runs()
all_jobs = data_set.get_all_jobs()

# NOTE(review): one job id is excluded from both tiers; the reason is not
# recorded in this notebook. Confirm before reusing it with other data.
is_kept_job = all_jobs.id != 3253494537

all_runs_1 = all_runs[all_runs.repo_id.isin(repos_list_1)]
all_jobs_1 = all_jobs[all_jobs.run_id.isin(all_runs_1.id) & is_kept_job]

all_runs_2 = all_runs[all_runs.repo_id.isin(repos_list_2)]
all_jobs_2 = all_jobs[all_jobs.run_id.isin(all_runs_2.id) & is_kept_job]

# Runs per conclusion and their percentage of the tier's counted runs.
conclusion_1 = (
    all_runs_1
    .groupby("conclusion")
    .agg({"id": "count"})
    .reset_index()
    .assign(prop=lambda counts: counts["id"] * 100 / counts["id"].sum())
)
conclusion_2 = (
    all_runs_2
    .groupby("conclusion")
    .agg({"id": "count"})
    .reset_index()
    .assign(prop=lambda counts: counts["id"] * 100 / counts["id"].sum())
)

In [20]:
# Attach each job's up_time to its run. The merge uses the default join, so
# the result has one row per (run, job) pair.
job_times_1 = all_jobs_1[["run_id", "up_time"]]
job_times_2 = all_jobs_2[["run_id", "up_time"]]
runs_with_time_1 = all_runs_1.merge(job_times_1, left_on="id", right_on="run_id")
runs_with_time_2 = all_runs_2.merge(job_times_2, left_on="id", right_on="run_id")

# Total up_time per run conclusion and its percentage of the tier's total.
time_per_conclusion_1 = (
    runs_with_time_1
    .groupby("conclusion")
    .agg({"up_time": "sum"})
    .reset_index()
    .assign(prop=lambda totals: totals["up_time"] * 100 / totals.up_time.sum())
)
time_per_conclusion_2 = (
    runs_with_time_2
    .groupby("conclusion")
    .agg({"up_time": "sum"})
    .reset_index()
    .assign(prop=lambda totals: totals["up_time"] * 100 / totals.up_time.sum())
)

In [21]:
# Table 3: share of runs and share of VM time per run conclusion, for both tiers.
print("{:<30} {:<24} {:<20}".format(
        "",
        "Runs proportion %",
        "VM time proportion %",
         ))
print(" "*30 + "-"*50)
print("{:<30} {:<12} {:<12} {:<12} {:<12}".format(
        "Conclusion",
        "Paid",
        "Free",
        "Paid",
        "Free"
         ))
print("-"*80)

# One {conclusion: prop} lookup per printed column, in column order.
# A conclusion that never occurs in a tier has no row in the grouped frame, and
# one whose runs have no job rows has none in the time frames, so indexing
# `.to_list()[0]` raised IndexError for it. It is printed as 0.0 instead.
share_lookups = [
    dict(zip(frame["conclusion"].to_list(), frame["prop"].to_list()))
    for frame in (conclusion_1, conclusion_2, time_per_conclusion_1, time_per_conclusion_2)
]

for c in ["success", "failure", "skipped", "cancelled", "startup_failure", "action_required", "stale"]:
    print("{:<30} {:<12} {:<12} {:<12} {:<12}".format(
        c,
        *(round(lookup.get(c, 0.0), 1) for lookup in share_lookups)
         ))

                               Runs proportion %        VM time proportion %
                              --------------------------------------------------
Conclusion                     Paid         Free         Paid         Free        
--------------------------------------------------------------------------------
success                        78.7         88.9         66.4         81.1        
failure                        17.4         10.0         30.9         18.0        
skipped                        2.2          0.6          0.0          0.0         
cancelled                      1.5          0.3          2.7          0.8         
startup_failure                0.1          0.1          0.0          0.0         
action_required                0.0          0.1          0.0          0.0         
stale                          0.0          0.0          0.0          0.0         
